In [17]:
from datetime import datetime
from datetime import timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import os
import json
from tqdm.notebook import tqdm
import plotly.express as px
import plotly

from comet_ml import Experiment

In [18]:
with open('../config/config.json', 'r') as file:
    config = json.load(file)
    
with open('../config/config_emb_path.json', 'r') as file:
    config_emb = json.load(file)
    
rs = config['random_state']

In [19]:
def change_features(df):
    df['AVG_returns'] = (df['AVG_returns'] + 1) ** 252 - 1
    df['Risk'] = (df['Risk'] + 1) ** np.sqrt(252) - 1
    df['Alpha'] = df['Alpha'].round(4)
    df['Drawdown'] = -df['Drawdown']
    

    return df 

# Building table

In [20]:
year_start = 2020
fine_tune = False

In [21]:
def filter_risk_return(df, filter_baselines=['sp500', 'sectors'], save_methods=['sectors', 'sp500']):      
        
    for baseline in filter_baselines:

        risk_baseline = df[df.emb_model == baseline]['Risk'].values[0]
        mask_risk = df['Risk'] <= risk_baseline
        mask = mask_risk

        return_baseline = df[df.emb_model == baseline]['AVG_returns'].values[0]
        mask_return = df['AVG_returns'] >= return_baseline
        mask = mask & mask_return
        
        drd_baseline = df[df.emb_model == baseline]['Drawdown'].values[0]
        mask_drd = df['Drawdown'] <= drd_baseline
        mask = mask & mask_drd
        
        mask_save = df['emb_model'].isin(save_methods)
        mask = mask | mask_save

        df = df[mask]
        
    mask_save = df['emb_model'].isin(save_methods)
    df = df[(df['AVG_returns'] > 0) | mask_save]
    return df.sort_values('Sharpe', ascending=False)

In [22]:
config['financial_metric_path'].format(year_start, fine_tune)

'results/financial_metric_2020_False_december_.csv'

In [7]:
financial_metrics = {year_start:pd.read_csv('../'+config['financial_metric_path'].format(year_start, fine_tune), index_col=0) 
                     for year_start in [2020, 2021]}

FileNotFoundError: [Errno 2] No such file or directory: '../results/financial_metric_2021_False_december_.csv'

In [23]:
df_2020 = pd.read_csv('../'+config['financial_metric_path'].format(2020, False), index_col=0)
# df_2020_tuned = pd.read_csv('../'+config['financial_metric_path'].format(2020, True), index_col=0)
# df_2021 = pd.read_csv('../'+config['financial_metric_path'].format(2021, False), index_col=0)

df_2020 = change_features(df_2020)
# df_2020_tuned = change_features(df_2020_tuned)
# df_2021 = change_features(df_2021)

print(df_2020.shape)
df_2020.head()

(40, 10)


Unnamed: 0,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,emb_model
port,0.453507,0.43705,0.554507,0.0012,0.059204,-0.038122,44.948793,202.0,KMeans,ts2vec
port,0.483981,0.468114,0.515221,0.0013,0.059237,-0.040397,45.108995,190.0,Agg,ts2vec
port,0.293635,0.307741,0.78059,0.0007,0.053087,-0.028124,37.728144,142.0,KMeans,PersImage_2
port,0.353085,0.310419,0.779779,0.0009,0.063074,-0.028341,39.016213,144.0,Agg,PersImage_2
port,0.326064,0.334986,0.733832,0.0008,0.054621,-0.030305,33.676552,201.0,KMeans,PersLandscape_1


In [24]:
df_2020_filtered = filter_risk_return(df_2020, filter_baselines=['sectors'], 
                                      save_methods=['sectors', 'sp500'])
df_2020_filtered.head()

IndexError: index 0 is out of bounds for axis 0 with size 0

In [25]:
df_2020['emb_model'].unique()

array(['ts2vec', 'PersImage_2', 'PersLandscape_1',
       'transformer_embds_sum_23', 'transformer_embds3',
       'transformer_embds2', 'table_finance_features', 'autoencoder_mlp',
       'PersLandscape_2', 'tsfresh', 'pca', 'topo_features',
       'autoencoder_conv', 'umap', 'fastica', 'transformer_embds1',
       'autoencoder_lstm', 'transformer_embds_sum_all', 'tnse',
       'PersImage_1'], dtype=object)

In [26]:
dict_category2emb = {
    'topological' : [
        'PersImage_2',
        'PersLandscape_1'
        'PersImage_1',
        'PersLandscape_2',
        'topo_features'                 
      ],
    
    'dim_reduction' : ['umap', 'pca', 'fastica', 'tnse'],
    
    'transformers' : [ 'transformer_embds1',
 'transformer_embds2',
 'transformer_embds3',
 'transformer_embds_sum_23',
 'transformer_embds_sum_all',],
    
    'autoencoders' : [ 'autoencoder_conv',
 'autoencoder_lstm',
 'autoencoder_mlp',],
    
    'baselines' : ['sp500', 'sectors'],
    
    'interpretable' : [ 'tsfresh', 'table_finance_features'],
    
    'time_clustering' : ['KShape',
 'MSTcorr',
 'TimeSeriesKMeans'],
    
    'time_embedding' : ['ts2vec', 'signal2vec_embds']
}

In [27]:
dict_emb2cat = {method:categ for categ, list_methods in dict_category2emb.items() for method in list_methods }
#dict_emb2cat

In [39]:
df_2020 = pd.read_csv('../' + 'results/financial_metric_{}_{}_{}.csv'.format(2020, False, 'december_'), index_col=0)
df_2020 = change_features(df_2020)

df_to_rebuild = df_2020

df_2020_clusters = pd.read_csv('../'+config['clust_metric_path'].format(2020, False), index_col=0)
df_to_rebuild = pd.merge(df_2020_clusters.drop(columns=['clust_model']), df_to_rebuild, on='emb_model', how='outer')

df_best_clustering = df_to_rebuild.sort_values(['Risk'], ascending=False)
df_best_clustering = df_best_clustering.groupby('emb_model', as_index=False, sort=True).first()
df_best_clustering['category'] = df_best_clustering['emb_model'].map(dict_emb2cat)
df_best_clustering.drop(columns=['Beta', 'Alpha', 'VaR']).to_csv('../results/table_all_methods_december_.csv')
df_best_clustering.sort_values('Risk')

Unnamed: 0,emb_model,DB,HC,Sil,hom,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,category
0,PersImage_1,2.160083,38.085767,0.111894,0.091855,0.097029,0.281114,0.811275,0.0,0.015911,-0.025951,31.648254,210.0,Agg,
1,PersImage_2,2.107882,51.1742,0.087259,0.087995,0.353085,0.310419,0.779779,0.0009,0.063074,-0.028341,39.016213,144.0,Agg,topological
9,table_finance_features,1.121949,115.549853,0.194085,0.154215,0.168078,0.326285,0.750731,0.0003,0.027825,-0.029613,38.889481,201.0,Agg,interpretable
15,transformer_embds_sum_23,1.134028,184.085481,0.321149,0.043921,0.404346,0.359416,0.674966,0.0011,0.063034,-0.032226,31.763108,143.0,Agg,transformers
14,transformer_embds3,1.202013,176.910613,0.3121,0.03804,0.381981,0.362079,0.655524,0.001,0.059386,-0.032433,37.899521,129.0,KMeans,transformers
10,tnse,0.985253,127.193142,0.230976,0.298667,0.263784,0.379746,0.697887,0.0006,0.039646,-0.0338,41.695679,174.0,Agg,dim_reduction
11,topo_features,1.546094,101.950124,0.134255,0.131627,0.323281,0.380808,0.686822,0.0008,0.04845,-0.033881,34.49791,203.0,KMeans,topological
13,transformer_embds2,2.758432,24.476381,0.058524,0.1014,0.457257,0.381613,0.641843,0.0012,0.066989,-0.033943,36.27247,129.0,Agg,transformers
2,PersLandscape_1,1.918107,39.956934,0.070351,0.108945,0.411442,0.406893,0.642528,0.0011,0.057555,-0.035868,34.575551,203.0,Agg,
3,PersLandscape_2,2.279878,22.778592,0.033095,0.062482,0.530376,0.413514,0.640465,0.0014,0.071354,-0.036367,34.431483,205.0,KMeans,topological


In [40]:
df_2020 = pd.read_csv('../' + 'results/financial_metric_{}_{}_{}.csv'.format(2020, False, 'may'), index_col=0)
df_2020 = change_features(df_2020)

df_to_rebuild = df_2020

df_2020_clusters = pd.read_csv('../'+config['clust_metric_path'].format(2020, False), index_col=0)
df_to_rebuild = pd.merge(df_2020_clusters.drop(columns=['clust_model']), df_to_rebuild, on='emb_model', how='outer')

df_best_clustering = df_to_rebuild.sort_values(['Risk'], ascending=False)
df_best_clustering = df_best_clustering.groupby('emb_model', as_index=False, sort=True).first()
df_best_clustering['category'] = df_best_clustering['emb_model'].map(dict_emb2cat)
df_best_clustering.drop(columns=['Beta', 'Alpha', 'VaR']).to_csv('../results/table_all_methods_december_.csv')
df_best_clustering.sort_values('Risk')

Unnamed: 0,emb_model,DB,HC,Sil,hom,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,category
2,PersImage_1,2.160083,38.085767,0.111894,0.091855,0.122132,0.257027,0.838556,0.0001,0.02343,-0.023948,28.280809,195.0,KMeans,
13,sp500,,,,,0.098372,0.289888,1.0,0.0,0.015781,-0.026672,33.92496,200.0,sp500,baselines
3,PersImage_2,2.107882,51.1742,0.087259,0.087995,0.306826,0.291371,0.801863,0.0007,0.058206,-0.026793,37.563553,69.0,Agg,topological
14,table_finance_features,1.121949,115.549853,0.194085,0.154215,0.22193,0.338905,0.731522,0.0005,0.03656,-0.030616,42.134962,195.0,Agg,interpretable
0,KShape,,,,,0.256991,0.340169,0.763,0.0006,0.042477,-0.030716,34.778614,211.0,KShape,time_clustering
18,transformer_embds2,2.666001,23.045589,0.04022,0.138601,0.399068,0.34103,0.713682,0.001,0.065185,-0.030784,32.533257,143.0,KMeans,transformers
12,sectors,,,,,0.311464,0.348002,0.687612,0.0008,0.050514,-0.031333,41.943386,219.0,sectors,baselines
20,transformer_embds_sum_23,1.134028,184.085481,0.321149,0.043921,0.470947,0.36266,0.698392,0.0012,0.071898,-0.032479,32.269475,143.0,KMeans,transformers
6,TimeSeriesKMeans,,,,,0.202942,0.362845,0.714069,0.0004,0.03129,-0.032493,35.403761,366.0,TimeSeriesKMeans,time_clustering
16,topo_features,1.546094,101.950124,0.134255,0.131627,0.327747,0.374868,0.696038,0.0008,0.049773,-0.033424,35.052341,201.0,KMeans,topological


In [34]:
df_2020

Unnamed: 0,DB,HC,Sil,hom,clust_model,emb_model
0,1.987504,84.010727,0.083092,0.163971,KMeans,ts2vec
0,2.024043,80.150906,0.079267,0.153958,Agg,ts2vec
0,1.954198,54.617123,0.123347,0.093071,KMeans,PersImage_2
0,2.107882,51.1742,0.087259,0.087995,Agg,PersImage_2
0,1.918107,39.956934,0.070351,0.108945,KMeans,PersLandscape_1
0,2.054301,36.666365,0.051789,0.106513,Agg,PersLandscape_1
0,1.157965,188.370477,0.321772,0.045826,KMeans,transformer_embds_sum_23
0,1.134028,184.085481,0.321149,0.043921,Agg,transformer_embds_sum_23
0,1.202013,176.910613,0.3121,0.03804,KMeans,transformer_embds3
0,1.198022,173.640291,0.305755,0.041086,Agg,transformer_embds3


In [29]:
df_best_clustering = pd.read_csv('table_all_methods_old.csv')
df_best_clustering.sort_values('Risk')

Unnamed: 0.1,Unnamed: 0,emb_model,DB,HC,Sil,hom,AVG_returns,Risk,Sharpe,Drawdown,Recovery,clust_model,category
10,10,sp500,,,,,0.068939,0.288985,0.009137,33.92496,179.0,sp500,baselines
36,36,umap,1.173824,24.756191,-0.243332,0.223404,0.20899,0.314975,0.03656,37.997495,133.0,KMeans,dim_reduction
17,17,takens_mult_PI_2_sigma=0.0005,0.350201,8.322796,0.321104,0.027223,0.508435,0.319754,0.085943,32.473692,55.0,KMeans,topological
23,23,takens_one_PI_0_sigma=0.0005,2.343354,57.314122,0.079197,0.131069,0.269434,0.320425,0.046981,36.170465,81.0,KMeans,topological
20,20,takens_one_BC_0,1.20672,219.953984,0.243884,0.084446,0.213173,0.322478,0.036582,35.774649,121.0,KMeans,topological
25,25,takens_one_PI_2_sigma=0.0005,0.467841,6.003348,0.262037,0.025424,0.394508,0.324655,0.067334,32.822206,82.0,KMeans,topological
15,15,takens_mult_PI_0_sigma=0.0005,0.423499,14.30994,0.377641,0.028869,0.102995,0.327738,0.015084,43.476745,219.0,Agg,topological
24,24,takens_one_PI_1_sigma=0.0005,0.319803,10.388697,0.432609,0.027312,0.393651,0.33366,0.065602,37.237865,100.0,KMeans,topological
14,14,takens_mult_BC_2,0.457607,10.709994,0.240211,0.027779,0.340816,0.339595,0.056336,36.801444,76.0,KMeans,topological
1,1,MSTcorr,3.731503,0.081506,-0.751448,0.024262,0.037729,0.341597,0.001588,49.249316,239.0,MSTcorr,time_clustering


In [15]:
df_best_clustering.sort_values('Risk')

Unnamed: 0,emb_model,DB,HC,Sil,hom,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,category
0,PersImage_1,2.160083,38.085767,0.111894,0.091855,0.097029,0.281114,0.811275,0.0,0.015911,-0.025951,31.648254,210.0,Agg,
1,PersImage_2,2.107882,51.1742,0.087259,0.087995,0.353085,0.310419,0.779779,0.0009,0.063074,-0.028341,39.016213,144.0,Agg,topological
9,table_finance_features,1.121949,115.549853,0.194085,0.154215,0.168078,0.326285,0.750731,0.0003,0.027825,-0.029613,38.889481,201.0,Agg,interpretable
15,transformer_embds_sum_23,1.134028,184.085481,0.321149,0.043921,0.404346,0.359416,0.674966,0.0011,0.063034,-0.032226,31.763108,143.0,Agg,transformers
14,transformer_embds3,1.202013,176.910613,0.3121,0.03804,0.381981,0.362079,0.655524,0.001,0.059386,-0.032433,37.899521,129.0,KMeans,transformers
10,tnse,0.985253,127.193142,0.230976,0.298667,0.263784,0.379746,0.697887,0.0006,0.039646,-0.0338,41.695679,174.0,Agg,dim_reduction
11,topo_features,1.546094,101.950124,0.134255,0.131627,0.323281,0.380808,0.686822,0.0008,0.04845,-0.033881,34.49791,203.0,KMeans,topological
13,transformer_embds2,2.758432,24.476381,0.058524,0.1014,0.457257,0.381613,0.641843,0.0012,0.066989,-0.033943,36.27247,129.0,Agg,transformers
2,PersLandscape_1,1.918107,39.956934,0.070351,0.108945,0.411442,0.406893,0.642528,0.0011,0.057555,-0.035868,34.575551,203.0,Agg,
3,PersLandscape_2,2.279878,22.778592,0.033095,0.062482,0.530376,0.413514,0.640465,0.0014,0.071354,-0.036367,34.431483,205.0,KMeans,topological


## COMET_ML

In [78]:
year_start = 2020
fine_tune = True

for key in config_emb.keys():
    config_emb[key] = config_emb[key].format(year_start)

In [79]:
df_finance = pd.read_csv('../'+config['financial_metric_path'].format(year_start, fine_tune), index_col=0)
df_clust = pd.read_csv('../'+config['clust_metric_path'].format(year_start, fine_tune), index_col=0)

df_finance = change_features(df_finance)

In [80]:
df_finance.head()

Unnamed: 0,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,emb_model
port,0.495915,0.41104,0.601512,0.0014,0.067593,-0.03618,43.955665,75.0,KMeans,ts2vec
port,0.191167,0.363129,0.640662,0.0005,0.029286,-0.032515,36.463147,154.0,Agg,ts2vec
port,0.404731,0.356168,0.696695,0.0011,0.06359,-0.031973,35.514546,115.0,KMeans,takens_one_BC_2
port,-0.065147,0.247098,0.941967,-0.0005,-0.027456,-0.023112,31.13078,642.0,Agg,takens_one_BC_2
port,0.606091,0.355021,0.674516,0.0017,0.091323,-0.031883,24.775282,57.0,KMeans,takens_one_PL_2


In [81]:
model_del = 'Agg' if model_name == 'KMeans' else 'KMeans'

df_finance = df_finance[df_finance.clust_model != model_del]
df_clust = df_clust[df_clust.clust_model != model_del]

df_finance = df_finance.set_index('emb_model').drop(columns=['clust_model'])
df_clust = df_clust.set_index('emb_model').drop(columns=['clust_model'])

df_data = df_finance.join(df_clust)

assert len(df_data) == len(df_clust) + 1 == len(df_finance)
df_data.head()

Unnamed: 0_level_0,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,DB,HC,Sil,hom
emb_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ts2vec,0.191167,0.363129,0.640662,0.0005,0.029286,-0.032515,36.463147,154.0,0.320173,18.543424,0.51673,0.028243
takens_one_BC_2,-0.065147,0.247098,0.941967,-0.0005,-0.027456,-0.023112,31.13078,642.0,0.45379,6.411142,0.273747,0.0289
takens_one_PL_2,0.156138,0.317826,0.740148,0.0003,0.02615,-0.028937,28.214303,182.0,0.311802,10.268861,0.379736,0.023011
transformer_embds_sum_23,0.330917,0.359422,0.667278,0.0009,0.052108,-0.032227,36.297709,180.0,1.142738,130.819923,0.216855,0.051649
transformer_embds3,0.350557,0.34313,0.696339,0.001,0.057361,-0.030949,34.166709,180.0,1.202075,129.118784,0.223701,0.054524


In [13]:
# for model_name, data in tqdm(df_model.iterrows()):
    
#     experiment = Experiment(
#     api_key="W4Exl5JlKMoVHRelfS04Tc0Lg",
#     project_name="stock-clustering-with-time-series-embeddings",
#     workspace="petrsokerin",
#     )

#     experiment.set_name(model_name)

#     hyper_params = {
#         "model": model_name,
#         "include_2020": year_start == 2021,
#         "fine_tune": fine_tune,
#     }

#     experiment.log_parameters(hyper_params)
    
#     print(model_name)
#     metrics = data.to_dict()
#     experiment.log_metrics(metrics)