In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload 
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import warnings 

In [3]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/torch deprecation and chained-assignment warnings;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_swiss_roll

In [5]:
import sys

# Put the parent directory on the import path so the project-local imports
# that follow can resolve (presumably that is where `fmds` lives — confirm).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [6]:
from fmds.models.diffusion.diffusion_sde_noise_schedule import   DiffusionSDE

In [7]:
from fmds.utils.io.read import RawDataReader

In [8]:
import  fmds.config as cfg 
from fmds.settings import Settings

In [9]:
# Build the project's raw-data reader, pointed at the directory configured in
# Settings.backfill.paths.RAW_DATA_PATH.
rdr = RawDataReader(raw_data_directory=Settings.backfill.paths.RAW_DATA_PATH)

In [10]:
# Load the equity price data through the reader.
# NOTE(review): despite the method name, the recorded run further down reports
# 32 distinct tickers — confirm the intended universe.
equity_data = rdr.read_30_equities()

In [11]:
# n_samples = 1000

In [12]:
# data , _ = make_swiss_roll(n_samples, noise=0.5)

In [13]:
# data=data[:,[0,2]]

In [14]:
# data.shape

In [15]:
# Inspect the frame as loaded. In the recorded output the first row has an
# empty tradeDate, which is why incomplete rows are dropped later.
equity_data

Unnamed: 0.1,Unnamed: 0,index,tradeDate,ClosePrice,Ticker
0,0,0,,132.21,LLY
1,1,1,2020-01-03 00:00:00,131.77,LLY
2,2,2,2020-01-06 00:00:00,132.26,LLY
3,3,3,2020-01-07 00:00:00,132.51,LLY
4,4,4,2020-01-08 00:00:00,133.71,LLY
...,...,...,...,...,...
42167,42167,1362,2025-06-04 00:00:00,71.37,KO
42168,42168,1363,2025-06-05 00:00:00,70.91,KO
42169,42169,1364,2025-06-06 00:00:00,71.35,KO
42170,42170,1365,2025-06-09 00:00:00,71.77,KO


In [16]:
# List the column names; only tradeDate, ClosePrice and Ticker are kept by the next cell.
equity_data.columns.tolist()

['Unnamed: 0', 'index', 'tradeDate', 'ClosePrice', 'Ticker']

In [17]:
# Keep only the columns used downstream, discard rows with any missing value,
# and renumber the index from 0.
kept_columns = ['tradeDate', 'ClosePrice', 'Ticker']
equity_data = equity_data[kept_columns].dropna().reset_index(drop=True)

In [18]:
# Re-inspect the frame after column selection and dropping incomplete rows.
equity_data

Unnamed: 0,tradeDate,ClosePrice,Ticker
0,2020-01-03 00:00:00,131.77,LLY
1,2020-01-06 00:00:00,132.26,LLY
2,2020-01-07 00:00:00,132.51,LLY
3,2020-01-08 00:00:00,133.71,LLY
4,2020-01-09 00:00:00,135.92,LLY
...,...,...,...
42055,2025-06-04 00:00:00,71.37,KO
42056,2025-06-05 00:00:00,70.91,KO
42057,2025-06-06 00:00:00,71.35,KO
42058,2025-06-09 00:00:00,71.77,KO


In [19]:
# Count distinct tickers (the recorded run reports 32).
equity_data['Ticker'].nunique()

32

In [20]:
# List the distinct ticker symbols.
# NOTE(review): the recorded output contains 'DO' and 'JMP' — possibly mis-keyed
# symbols in the source data; verify against the intended universe.
equity_data['Ticker'].unique()

array(['LLY', 'MCD', 'MMM', 'MRK', 'MSFT', 'NVDA', 'PFE', 'PG', 'TRV',
       'TSLA', 'UNH', 'V', 'VZ', 'WMT', 'XOM', 'AAPL', 'AXP', 'BA', 'CAT',
       'COST', 'CSCO', 'CVX', 'DO', 'DIS', 'GE', 'GS', 'HD', 'IBM',
       'INTC', 'JNJ', 'JMP', 'KO'], dtype=object)

In [21]:
# Parse the trade dates (values that cannot be parsed become NaT) and keep
# only the calendar-date part.
parsed_dates = pd.to_datetime(equity_data['tradeDate'], errors='coerce')
equity_data['tradeDate'] = parsed_dates.dt.date

# Coerce prices to numeric; values that cannot be parsed become NaN.
numeric_close = pd.to_numeric(equity_data['ClosePrice'], errors='coerce')
equity_data['ClosePrice'] = numeric_close


In [22]:
# Drop, in place, every row that has a missing value in any column
# (including dates/prices that were coerced to NaT/NaN by the previous cell).
equity_data.dropna(inplace = True) 

In [23]:
# Inspect the frame after type coercion; tradeDate now shows dates without a time part.
equity_data

Unnamed: 0,tradeDate,ClosePrice,Ticker
0,2020-01-03,131.77,LLY
1,2020-01-06,132.26,LLY
2,2020-01-07,132.51,LLY
3,2020-01-08,133.71,LLY
4,2020-01-09,135.92,LLY
...,...,...,...
42055,2025-06-04,71.37,KO
42056,2025-06-05,70.91,KO
42057,2025-06-06,71.35,KO
42058,2025-06-09,71.77,KO


In [24]:
# equity_data = equity_data[equity_data['Ticker'].isin([  'MSFT', 'NVDA',  'XOM', 'AAPL',  'IBM' ])]

In [25]:
# Order the rows chronologically, in place, so that the per-ticker shift(1) used
# for the return calculation in the next cell refers to the previous trading row.
equity_data.sort_values(by ='tradeDate', ascending = True, inplace = True)

In [26]:
# Simple close-to-close return per ticker: today's close divided by the same
# ticker's previous row, minus one. The first row of each ticker has no
# predecessor and therefore gets NaN.
previous_close = equity_data.groupby('Ticker')['ClosePrice'].shift(1)
equity_data['prior_1D_return'] = equity_data['ClosePrice'] / previous_close - 1

In [27]:
# Inspect the frame with the new return column; the earliest rows (2020-01-03 in
# the recorded output) have no prior close and therefore show an empty return.
equity_data

Unnamed: 0,tradeDate,ClosePrice,Ticker,prior_1D_return
0,2020-01-03,131.7700,LLY,
17759,2020-01-03,39.2967,WMT,
4098,2020-01-03,87.0092,MRK,
6830,2020-01-03,5.9018,NVDA,
33394,2020-01-03,231.5800,GS,
...,...,...,...,...
28686,2025-06-10,65.1500,CSCO,-0.011381
9562,2025-06-10,24.3000,PFE,0.013767
27320,2025-06-10,1007.2700,COST,0.001083
10928,2025-06-10,162.8400,PG,0.001722


In [28]:
# Spot-check a single ticker ('DO') in date order. In the recorded output its
# history only spans 2022-03-31 to 2024-08-30, shorter than the full sample.
equity_data[equity_data['Ticker'] == 'DO'].sort_values(by = 'tradeDate', ascending = True)

Unnamed: 0,tradeDate,ClosePrice,Ticker,prior_1D_return
30053,2022-03-31,7.13,DO,
30054,2022-04-01,6.89,DO,-0.033661
30055,2022-04-04,7.00,DO,0.015965
30056,2022-04-05,6.78,DO,-0.031429
30057,2022-04-06,6.75,DO,-0.004425
...,...,...,...,...
30657,2024-08-27,14.34,DO,-0.007612
30658,2024-08-28,14.21,DO,-0.009066
30659,2024-08-29,14.29,DO,0.005630
30660,2024-08-30,14.34,DO,0.003499


In [29]:
# Convert tradeDate to plain datetime.date objects.
# NOTE(review): In [21] already performs this conversion, so this cell looks
# redundant — confirm before removing.
equity_data['tradeDate'] = pd.to_datetime(equity_data['tradeDate'] ).dt.date

In [30]:
# Check the column dtype; it is 'O' (object) in the recorded output because the
# column holds Python date objects rather than datetime64 values.
equity_data['tradeDate'].dtype

dtype('O')

In [31]:
# equity_data['tradeDate'] = pd.to_datetime(equity_data['tradeDate'] ) 


In [32]:
# Reshape to a date x ticker matrix of one-day returns.
#  - pivot_table aggregates with the mean by default, so duplicate
#    (date, ticker) rows would be averaged silently.
#  - Columns and rows that are entirely NaN are removed (e.g. the first date,
#    where no ticker has a prior close).
#  - Every remaining gap is filled with 0.
# NOTE(review): tickers with a shorter history (e.g. 'DO', which the recorded
# output shows only from 2022-03-31 to 2024-08-30) therefore contribute runs of
# exact-zero returns to the training matrix — confirm this is intended.
data = (
    equity_data
    .pivot_table(index='tradeDate', columns='Ticker', values='prior_1D_return')
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='all')
    .fillna(0)
)

In [33]:
# Largest absolute one-day return per ticker, as a quick outlier check
# (JMP peaks near 0.45 in the recorded output).
data.abs().max()

Ticker
AAPL    0.153288
AXP     0.218823
BA      0.243186
CAT     0.142822
COST    0.124513
CSCO    0.137304
CVX     0.227407
DIS     0.144123
DO      0.121569
GE      0.151592
GS      0.175803
HD      0.197938
IBM     0.129642
INTC    0.260585
JMP     0.448692
JNJ     0.079977
KO      0.096725
LLY     0.156798
MCD     0.181255
MMM     0.229906
MRK     0.098630
MSFT    0.147390
NVDA    0.243696
PFE     0.108552
PG      0.120090
TRV     0.208004
TSLA    0.226900
UNH     0.223797
V       0.138426
VZ      0.092705
WMT     0.117083
XOM     0.126868
dtype: float64

In [34]:
# (rows = trade dates, columns = tickers); (1365, 32) in the recorded run.
data.shape

(1365, 32)

In [35]:
# Convert the pivoted returns into a float32 tensor for the model.
# Note: this rebinds `data` from the pivoted DataFrame to a torch.Tensor, so the
# date index and ticker column labels are no longer attached from here on.
data = torch.tensor(np.array(data), dtype = torch.float32)

In [36]:
# Eyeball the tensor of returns.
data

tensor([[ 0.0080, -0.0043,  0.0029,  ..., -0.0022, -0.0020,  0.0077],
        [-0.0047, -0.0052,  0.0106,  ..., -0.0111, -0.0093, -0.0082],
        [ 0.0161,  0.0173, -0.0175,  ...,  0.0018, -0.0034, -0.0151],
        ...,
        [ 0.0164,  0.0220,  0.0085,  ...,  0.0115, -0.0050,  0.0240],
        [-0.0121, -0.0030,  0.0318,  ..., -0.0034, -0.0002,  0.0067],
        [ 0.0061, -0.0068, -0.0082,  ...,  0.0110, -0.0013,  0.0214]])

In [37]:
# Take the feature width from the training matrix (one column per ticker)
# instead of hard-coding 32, so the notebook keeps working if the ticker
# universe changes.
input_dimension = data.shape[1]
# The original notebook sets both widths to the same value; keep them tied.
output_dimension = input_dimension

In [38]:
# Seed the global random generators so that training and sampling are
# repeatable under Restart & Run All.
# NOTE(review): assumes DiffusionSDE draws from torch's / NumPy's global
# generators rather than a private one — confirm.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Configure the diffusion model on the returns tensor. Hyperparameters are
# unchanged from the original cell.
diffusion_sde = DiffusionSDE(
    data,
    input_dimension=input_dimension,
    hidden_dimension=256,
    number_hidden_layers=5,
    output_dimension=output_dimension,
    dropout_prob=0.01,
    epochs=10000,
    lr=0.0001,
    num_noise_levels=10,
    beta_min=0.1,
    beta_max=1.0,
)

In [39]:
# Alternative training entry point, kept for reference:
# diffusion_sde.fit_single_noise_level(sigma_fixed=10)
# Train the model.
# NOTE(review): in the recorded log the average loss stays roughly between 1450
# and 1670 from epoch 0 through epoch 2800 with no downward trend, and the log
# stops at epoch 2800 although epochs=10000 was requested — check input
# scaling / learning rate and whether the run was interrupted.
diffusion_sde.fit()

[Epoch 0] Avg Loss: 1518.060952
[Epoch 100] Avg Loss: 1505.988512
[Epoch 200] Avg Loss: 1558.802290
[Epoch 300] Avg Loss: 1549.612036
[Epoch 400] Avg Loss: 1605.771704
[Epoch 500] Avg Loss: 1618.497303
[Epoch 600] Avg Loss: 1534.505826
[Epoch 700] Avg Loss: 1499.679036
[Epoch 800] Avg Loss: 1509.576926
[Epoch 900] Avg Loss: 1589.426536
[Epoch 1000] Avg Loss: 1575.535536
[Epoch 1100] Avg Loss: 1597.464655
[Epoch 1200] Avg Loss: 1665.559420
[Epoch 1300] Avg Loss: 1530.692042
[Epoch 1400] Avg Loss: 1652.835333
[Epoch 1500] Avg Loss: 1549.799802
[Epoch 1600] Avg Loss: 1610.796109
[Epoch 1700] Avg Loss: 1498.450806
[Epoch 1800] Avg Loss: 1664.928747
[Epoch 1900] Avg Loss: 1477.605008
[Epoch 2000] Avg Loss: 1561.258215
[Epoch 2100] Avg Loss: 1571.133964
[Epoch 2200] Avg Loss: 1531.425828
[Epoch 2300] Avg Loss: 1450.143910
[Epoch 2400] Avg Loss: 1457.348579
[Epoch 2500] Avg Loss: 1517.702997
[Epoch 2600] Avg Loss: 1541.021457
[Epoch 2700] Avg Loss: 1604.795116
[Epoch 2800] Avg Loss: 1500.5046

In [40]:
# Shape of the training tensor, for comparison with the generated samples below.
data.shape

torch.Size([1365, 32])

In [41]:
# Sampling settings passed to the trained model, collected in one place.
generation_settings = dict(
    n_samples=1000,
    steps=1000,
    num_corrector_steps=5,
    batch_size=100,
)
# Draw synthetic return vectors from the trained model.
synthetic_data = diffusion_sde.generate(**generation_settings)

In [42]:
# One generated row per requested sample, one column per ticker
# (torch.Size([1000, 32]) in the recorded run).
synthetic_data.shape

torch.Size([1000, 32])

In [43]:
def _to_complete_rows(tensor):
    """Return `tensor` as a NumPy array with every NaN-containing row removed.

    The tensor is detached from the autograd graph and moved to host memory
    first, so this also works when the tensor lives on an accelerator
    (`Tensor.numpy()` only accepts CPU tensors).

    Parameters:
        tensor: 2-D torch tensor (rows = observations).

    Returns:
        A NumPy array holding only the rows of `tensor` that contain no NaN.
    """
    array = tensor.detach().cpu().numpy()
    has_nan = np.isnan(array).any(axis=1)
    return array[~has_nan]


# Real and generated returns as NumPy arrays, restricted to fully populated rows.
data_np = _to_complete_rows(data)
synthetic_data_np = _to_complete_rows(synthetic_data)

In [44]:
# Scratch check: draw one uniform random number as a Python float.
# The value is not used by any later cell shown in this notebook.
torch.rand(1).item()

0.4683074355125427

In [45]:
# Scratch check: a fresh uniform draw scaled by 0.5.
# The value is not used by any later cell shown in this notebook.
torch.rand(1).item() * 0.5

0.40238189697265625

In [46]:
# Condition number of the covariance matrix of the real returns (columns = tickers).
real_covariance = pd.DataFrame(data_np).cov()
np.linalg.cond(real_covariance)

113.73615544814133

In [47]:
# Condition number of the covariance matrix of the generated returns, for
# comparison with the value computed on the real data.
synthetic_covariance = pd.DataFrame(synthetic_data_np).cov()
np.linalg.cond(synthetic_covariance)

139.62410170285185

In [48]:
# Project-local Q-Q plotting helpers.
# NOTE(review): the recorded run fails on this import with
# "No module named 'seaborn'", and every cell after it is unexecuted —
# install seaborn in the kernel environment before re-running.
from research.plots import qqplot, qqplot_against_normal

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Q-Q plot of each of the first five real return series against a normal
# distribution (per the helper's name), one figure per column.
for column_index in range(5):
    real_series = data_np[:, column_index]
    _ = qqplot_against_normal(data=real_series)
    plt.show()
    plt.close()

In [None]:
# Same normal Q-Q check for the first five generated return series, one figure
# per column.
for column_index in range(5):
    synthetic_series = synthetic_data_np[:, column_index]
    _ = qqplot_against_normal(data=synthetic_series)
    plt.show()
    plt.close()

In [None]:
# Leftover interactive help lookup for `qqplot`; safe to delete from the
# finished notebook once the signature is known.
?qqplot

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Pool every ticker's returns into a single 1-D sample per data set.
synthetic_flat = synthetic_data_np.flatten()
real_flat = data_np.flatten()

# Take the stock count from the data instead of hard-coding it in the title.
n_stocks = data_np.shape[1]

# Q-Q plot of pooled synthetic returns against pooled real returns.
# NOTE(review): the two samples can differ in length (1000 generated rows vs
# 1365 real rows in the recorded run) — presumably qqplot compares quantiles
# and tolerates that; confirm.
_ = qqplot(
    x=synthetic_flat,
    y=real_flat,
    title=f'Q-Q Plot: Synthetic vs Real ({n_stocks} stocks)',  # fixes the "Synthetuc" typo
    xlabel='X Quantiles - Synthetic data',
    ylabel='Y Quantiles - Real data ',
)
plt.show()
plt.close()

In [None]:
# Per-column Q-Q plot of generated vs real returns. The column count comes from
# the data rather than a hard-coded 32, and each figure is titled with its
# column index so the plots can be told apart.
for i in range(data_np.shape[1]):
    _ = qqplot(
        x=synthetic_data_np[:, i],
        y=data_np[:, i],
        title=f'Q-Q Plot: Synthetic vs Real (column {i})',
    )
    plt.show()
    plt.close()

In [None]:
# Q-Q plots for the first five columns with the axes swapped relative to the
# previous cell: real returns on x, generated returns on y.
for column_index in range(5):
    real_series = data_np[:, column_index]
    synthetic_series = synthetic_data_np[:, column_index]
    _ = qqplot(x=real_series, y=synthetic_series)
    plt.show()
    plt.close()