In [1]:
from datetime import datetime
from typing import Any
import pandas as pd
import numpy as np
from math import floor
import torch
import json

from utils.table_feature_calculation import financial_feature_calculation
from tqdm.notebook import tqdm

In [2]:
def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path, 'r') as handle:
        return json.load(handle)


# General experiment settings (data paths, checkpoints, random state, ...).
config = _read_json('config/config.json')

# Output-path templates for the embeddings written by this notebook.
config_emb = _read_json('config/config_emb_path.json')

# Seed shared by the stochastic estimators further down.
rs = config['random_state']

In [3]:
# Data strictly before January 1st of this year is used to build the embeddings.
year_start = 2021

# Substitute the year into every output-path template. The comprehension is
# fully evaluated before `update` runs, so the dict is never resized mid-iteration.
config_emb.update(
    {name: template.format(year_start) for name, template in config_emb.items()}
)

# Data loading

In [4]:
# Build the pre-`year_start` return matrix with tickers as rows.
# Note: `df` is re-read from the same CSV at the top of the next cell.
returns_by_date = (
    pd.read_csv(config['ticker_data_preprocessed'], index_col=[0])
    .drop(columns=['sector'])
    .T
)
returns_by_date.index = pd.to_datetime(returns_by_date.index)

train_mask = returns_by_date.index < datetime(year_start, 1, 1)
df = returns_by_date[train_mask].T

In [5]:
# Preprocessed table: one row per ticker, a `sector` column plus one column per date.
df = pd.read_csv(config['ticker_data_preprocessed'], index_col=0)
print(df.shape)

# Transpose so that dates form the (datetime) index and tickers the columns.
df_pct = df.drop(columns=['sector']).T
df_pct.index = pd.to_datetime(df_pct.index)

# Ticker order used as the row index of every embedding saved below.
tickers_list = list(df_pct.columns)

# Keep only observations strictly before the start of `year_start`.
train_cutoff = datetime(year_start, 1, 1)
df_pct_train = df_pct.loc[df_pct.index < train_cutoff]

df_pct_train.tail()

(482, 1196)


Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADI,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
2019-12-24,0.000469,-0.004096,0.003751,0.000951,-0.004432,-0.002103,0.010913,-0.000801,0.0037,0.004615,...,-0.005634,-0.002539,-0.003841,0.000525,-0.003927,0.00521,-0.002446,-0.009348,0.003701,0.004155
2019-12-26,0.001758,0.016792,-0.008096,0.01984,-0.000223,-0.004331,-0.007487,0.0,0.002079,8.4e-05,...,0.010757,-0.001114,0.001571,-0.011886,-0.000509,0.014552,-0.003778,0.000354,0.002523,0.000828
2019-12-27,-0.000351,-0.041456,-0.005776,-0.000379,-0.007013,-0.001646,-0.010175,0.001375,0.000802,-0.002339,...,-0.008018,0.005256,-0.003422,0.005484,0.003053,0.001081,-0.001929,0.00621,-0.003097,0.001654
2019-12-30,-0.006088,-0.004923,0.004863,0.005935,-0.007623,-0.014957,-0.011697,-0.006865,-0.007445,-0.004939,...,-0.003076,0.002693,-0.005866,-0.007741,-0.001268,-0.012365,-0.008399,-0.007383,0.005243,-0.007805
2019-12-31,0.004829,0.013428,0.006536,0.007307,0.000226,0.016499,0.019726,0.000691,-0.000332,-0.000168,...,-0.003659,0.00316,0.004318,0.003369,0.000635,0.000894,0.006252,0.005234,0.002897,0.001059


In [6]:
df_original = pd.read_csv(config['ticker_data_close'], index_col=0)

# Number of missing observations per column (ticker).
df_na = df_original.isna().sum()

# Columns with more than 5% missing rows are removed entirely ...
thresh = 0.05 * len(df_original)
stocks_to_drop = [ticker for ticker, n_missing in df_na.items() if n_missing > thresh]

# ... and afterwards every row that still contains a NaN is discarded.
df_original = df_original.drop(columns=stocks_to_drop).dropna(axis=0)
df_original.index = pd.to_datetime(df_original.index)

# Training window: everything strictly before January 1st of `year_start`.
df_original_train = df_original.loc[df_original.index < datetime(year_start, 1, 1)]
df_original_train.tail()

Unnamed: 0_level_0,CMCSA,LVS,EXC,GPN,MSFT,ALL,VTR,NXPI,COST,DRI,...,CERN,A,STT,EA,KO,MDLZ,HRL,AMP,MET,MCHP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-24,44.529999,69.290001,32.325249,182.839996,157.380005,111.169998,57.369999,129.199997,294.230011,108.5,...,499.640015,85.300003,79.639999,108.550003,54.709999,54.990002,45.349998,167.720001,50.990002,52.525002
2019-12-26,44.970001,69.940002,32.325249,183.490005,158.669998,111.599998,57.610001,128.679993,295.730011,108.910004,...,503.23999,85.449997,79.849998,108.510002,55.02,54.849998,45.299999,167.679993,51.240002,52.965
2019-12-27,45.099998,69.739998,32.389442,182.889999,158.960007,111.870003,57.830002,128.190002,294.109985,108.010002,...,503.01001,85.419998,80.120003,108.43,55.349998,55.279999,45.259998,166.710007,50.830002,52.740002
2019-12-30,45.18,69.300003,32.368046,181.350006,157.589996,111.769997,57.98,127.550003,295.140015,108.959999,...,500.839996,84.900002,79.879997,106.93,55.27,55.0,45.0,166.070007,50.700001,52.494999
2019-12-31,44.970001,69.040001,32.517834,182.559998,157.699997,112.449997,57.740002,127.260002,293.920013,109.010002,...,502.700012,85.309998,79.099998,107.510002,55.349998,55.080002,45.110001,166.580002,50.970001,52.360001


In [7]:
# Market series (config key `ticker_data_sp500`), stored as a single column.
market_levels = pd.read_csv(config['ticker_data_sp500'], index_col=0)
market_levels.columns = ['market']
market_levels.index = pd.to_datetime(market_levels.index)

# Period-over-period returns; the first row is NaN by construction and is dropped.
df_market = market_levels.pct_change().iloc[1:]

# Training window: everything strictly before January 1st of `year_start`.
df_market_train = df_market.loc[df_market.index < datetime(year_start, 1, 1)]

df_market_train

Unnamed: 0_level_0,market
Date,Unnamed: 1_level_1
2018-01-03,0.006399
2018-01-04,0.004029
2018-01-05,0.007034
2018-01-08,0.001662
2018-01-09,0.001303
...,...
2019-12-24,-0.000195
2019-12-26,0.005128
2019-12-27,0.000034
2019-12-30,-0.005781


# Table Data

In [8]:
# Annual riskless rate from the config divided by 252
# (presumably trading days per year -- confirm against `financial_feature_calculation`).
daily_riskless_rate = config['riskless_rate'] / 252

# NOTE(review): `year_split` is hard-coded to 2020 and does not follow `year_start`;
# confirm this is intended whenever `year_start` is changed.
table_features_df = financial_feature_calculation(
    df_pct_train.T,
    df_market_train.T,
    year_split=2020,
    riskless_rate=daily_riskless_rate,
)

table_features_df.to_csv(config_emb['table_finance_features'])

100%|██████████| 482/482 [00:03<00:00, 131.57it/s]


# Autoencoders

In [9]:
from autoencoders import Conv1dAutoEncoder, LSTMAutoEncoder, TickerDataModule, MLPAutoEncoder

In [10]:
# Constructor arguments passed alongside each checkpoint; every model works on
# windows of length 100 and uses a 100-dimensional latent/embedding size.
mlp_kwargs = dict(in_features=100, latent_features=100)
lstm_kwargs = dict(seq_len=100, n_features=1, embedding_dim=100)
cae_kwargs = dict(in_channels=1, n_latent_features=100, seq_len=100)

# Checkpoint paths are looked up per `year_start` in the main config.
model_mlp = MLPAutoEncoder.load_from_checkpoint(
    config[f'nn_mlp_checkpoint_{year_start}'], **mlp_kwargs
)
model_lstm = LSTMAutoEncoder.load_from_checkpoint(
    config[f'nn_lstm_checkpoint_{year_start}'], **lstm_kwargs
)
model_cae = Conv1dAutoEncoder.load_from_checkpoint(
    config[f'nn_conv_checkpoint_{year_start}'], **cae_kwargs
)

# Switch all three networks to evaluation mode before encoding.
for _autoencoder in (model_mlp, model_lstm, model_cae):
    _autoencoder.eval()

1


In [11]:
# Window length fed to the autoencoders and size of the embedding they return
# (both match the arguments used when the checkpoints were loaded).
WINDOW_LEN = 100
EMBEDDING_DIM = 100

# One embedding row per ticker, in `tickers_list` order.
n_tickers = len(tickers_list)
mlp_encoded = np.zeros((n_tickers, EMBEDDING_DIM))
lstm_encoded = np.zeros((n_tickers, EMBEDDING_DIM))
cae_encoded = np.zeros((n_tickers, EMBEDDING_DIM))

# Pure inference: no autograd graph is needed.
with torch.no_grad():
    # `total` is required because tqdm cannot infer a length from `enumerate`.
    for i, name_ticker in tqdm(enumerate(tickers_list), total=n_tickers):
        ts_name = df_pct_train[name_ticker].values.flatten()

        # Cut the series into non-overlapping windows; the trailing remainder
        # shorter than WINDOW_LEN is discarded.
        n_windows = ts_name.shape[0] // WINDOW_LEN
        windows = torch.tensor(
            ts_name[:WINDOW_LEN * n_windows].reshape(n_windows, 1, WINDOW_LEN)
        ).float()

        mlp_sample = model_mlp.predict_step(windows).detach().numpy()
        # Explicit reshape instead of a bare `squeeze()`: squeeze would also drop
        # the window axis when n_windows == 1, making the mean below collapse
        # the embedding to a scalar.
        cae_sample = (
            model_cae.predict_step(windows)
            .detach()
            .numpy()
            .reshape(n_windows, EMBEDDING_DIM)
        )
        lstm_sample = model_lstm.predict_step(windows).detach().numpy()

        # A ticker's embedding is the mean of its per-window embeddings.
        mlp_encoded[i, :] = mlp_sample.mean(axis=0)
        cae_encoded[i, :] = cae_sample.mean(axis=0)
        lstm_encoded[i, :] = lstm_sample.mean(axis=0)

0it [00:00, ?it/s]

In [12]:
# Wrap each embedding matrix in a frame indexed by ticker.
df_mlp = pd.DataFrame(mlp_encoded, index=tickers_list)
df_cae = pd.DataFrame(cae_encoded, index=tickers_list)
df_lstm = pd.DataFrame(lstm_encoded, index=tickers_list)

# Write every frame to the path configured under its key.
for _path_key, _emb_frame in (
    ('autoencoder_mlp', df_mlp),
    ('autoencoder_conv', df_cae),
    ('autoencoder_lstm', df_lstm),
):
    _emb_frame.to_csv(config_emb[_path_key])

## TSFRESH

In [13]:
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

from sklearn.feature_selection import SelectKBest

In [14]:
# Long format for tsfresh: one row per (date, ticker) pair.
# `reset_index` exposes the dates as a column named 'index'; `melt` stacks the
# ticker columns into 'ticker' / 'value'.
df_to_tsfresh = df_pct_train.reset_index().melt(id_vars=['index'], var_name='ticker')
df_to_tsfresh.head()

Unnamed: 0,index,ticker,value
0,2018-01-03,A,0.025444
1,2018-01-04,A,-0.007501
2,2018-01-05,A,0.015988
3,2018-01-08,A,0.002146
4,2018-01-09,A,0.024554


In [15]:
# Compute the tsfresh feature set for each ticker, ordering values by date.
data_tsfresh = extract_features(df_to_tsfresh, column_id='ticker', n_jobs=4, column_sort='index')

# scikit-learn pairs X and y by position, not by pandas index, and the result is
# later labelled with `tickers_list`. Align both the features and the sector
# labels to that order explicitly instead of relying on the row order tsfresh
# happens to return (a missing ticker raises KeyError here).
data_tsfresh = data_tsfresh.loc[tickers_list]
sector_labels = df['sector'].loc[tickers_list]

# Drop features containing NaN, then keep the 100 best-scoring features
# with respect to the sector label.
features_filtered = SelectKBest(k=100).fit_transform(data_tsfresh.dropna(axis=1), sector_labels)
features_filtered  # n_instances x output_dims

Feature Extraction: 100%|██████████| 20/20 [01:29<00:00,  4.48s/it]
  59  60  61  62  63  64  68  69  70  71  72  73  74  75  76  77  78  79
  80  81  82  83  92 105 365 670 671 672 737 738] are constant.
  f = msb / msw


array([[1.66735588e-02, 1.62418356e-02, 1.62528043e-02, ...,
        2.53081501e-05, 2.25099602e-01, 6.84318037e-02],
       [2.46994151e-02, 2.38312922e-02, 2.38496884e-02, ...,
        3.97356257e-05, 2.60956175e-01, 8.06922900e-02],
       [2.00086841e-02, 1.83641343e-02, 1.83907223e-02, ...,
        2.61593485e-05, 2.27091633e-01, 7.41182877e-02],
       ...,
       [2.51243635e-02, 2.46688217e-02, 2.47576863e-02, ...,
        4.15407864e-05, 2.03187251e-01, 1.13576634e-01],
       [1.73599059e-02, 1.58615403e-02, 1.58624901e-02, ...,
        2.33084913e-05, 2.66932271e-01, 5.32557822e-02],
       [1.46346358e-02, 1.39547917e-02, 1.40167761e-02, ...,
        2.07907827e-05, 2.19123506e-01, 5.48109374e-02]])

In [16]:
# Label the selected tsfresh features by ticker and save them.
tsfresh_output_path = config_emb['tsfresh']
features_filtered_df = pd.DataFrame(data=features_filtered, index=tickers_list)
features_filtered_df.to_csv(tsfresh_output_path)

# TS2Vec

In [17]:
from ts2vec.ts2vec import TS2Vec

# TS2Vec input: (n_tickers, n_timestamps, 1) -- one univariate series per ticker.
data = np.expand_dims(df_pct_train.values.T, axis=2)

# Use the first CUDA device when one is available; otherwise fall back to the
# CPU so the cell still runs on machines without a GPU.
ts2vec_device = 0 if torch.cuda.is_available() else 'cpu'

# Train a TS2Vec model
model = TS2Vec(
    input_dims=1,
    device=ts2vec_device,
    output_dims=100
)
loss_log = model.fit(
    data,
    verbose=True
)

# One 100-dimensional vector per ticker, pooled over the full series.
emb_ts2vec = model.encode(data, encoding_window='full_series')
ts2vec_df = pd.DataFrame(emb_ts2vec, index=tickers_list)
ts2vec_df.to_csv(config_emb['ts2vec'])

Epoch #0: loss=3.4616939942042033
Epoch #1: loss=3.3099417448043824
Epoch #2: loss=3.4099957307179767
Epoch #3: loss=3.215883255004883
Epoch #4: loss=3.3122122128804525
Epoch #5: loss=3.3394102017084757
Epoch #6: loss=3.407264264424642
Epoch #7: loss=3.3642369985580443
Epoch #8: loss=3.2915334701538086
Epoch #9: loss=3.263402891159058
Epoch #10: loss=3.4207643985748293
Epoch #11: loss=3.2955001751581827
Epoch #12: loss=3.3701553344726562
Epoch #13: loss=3.3103437185287476
Epoch #14: loss=3.3611316045125323
Epoch #15: loss=3.32905855178833
Epoch #16: loss=3.24954133828481
Epoch #17: loss=3.309320505460103
Epoch #18: loss=3.2871153950691223
Epoch #19: loss=3.263795351982117


## PCA, FastICA, t-SNE and UMAP

In [18]:
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import TSNE
from umap import UMAP

In [19]:
# Every reducer is fitted on the same matrix: one row per ticker, one column per date.
returns_by_ticker = df_pct_train.T

pca_data = PCA(n_components=100, random_state=rs).fit_transform(returns_by_ticker)
fastica_data = FastICA(n_components=100, random_state=rs).fit_transform(returns_by_ticker)
# t-SNE is reduced to 3 components, unlike the 100 used by the other methods.
tnse_data = TSNE(n_components=3, random_state=rs).fit_transform(returns_by_ticker)
umap_data = UMAP(n_components=100, random_state=rs).fit_transform(returns_by_ticker)



In [20]:
# Wrap each reduced representation in a frame indexed by ticker.
pca_df = pd.DataFrame(pca_data, index=tickers_list)
# Fix: this frame was previously built from `pca_data`, so the FastICA file
# silently contained a copy of the PCA embedding.
fastica_df = pd.DataFrame(fastica_data, index=tickers_list)
tnse_df = pd.DataFrame(tnse_data, index=tickers_list)
umap_df = pd.DataFrame(umap_data, index=tickers_list)

# Persist every embedding to the path configured under its key.
pca_df.to_csv(config_emb['pca'])
fastica_df.to_csv(config_emb['fastica'])
tnse_df.to_csv(config_emb['tnse'])
umap_df.to_csv(config_emb['umap'])