In [32]:
from datetime import datetime
from datetime import timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import os
import json
from tqdm.notebook import tqdm
import plotly.express as px
import plotly

from comet_ml import Experiment

In [33]:
# Main project configuration; the visible cells read 'random_state',
# 'financial_metric_path' and 'clust_metric_path' from it.
with open('../config/config.json', 'r') as file:
    config = json.load(file)
    
# Embedding path templates keyed by name; each value is later formatted with a year.
with open('../config/config_emb_path.json', 'r') as file:
    config_emb = json.load(file)
    
# Random seed taken from the project configuration.
rs = config['random_state']

In [34]:
def change_features(df):
    """Return a rescaled copy of ``df`` for reporting.

    The input frame is left untouched, so calling this function again on the
    same frame (e.g. when a notebook cell is re-run) cannot compound the
    transformation. Callers must use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``AVG_returns``, ``Risk``, ``Alpha`` and
        ``Drawdown``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * ``AVG_returns`` is compounded over 252 periods: ``(1 + r) ** 252 - 1``;
        * ``Risk`` is rescaled as ``(1 + risk) ** sqrt(252) - 1``;
        * ``Alpha`` is rounded to 4 decimals;
        * ``Drawdown`` has its sign flipped.
    """
    # Work on a copy: the previous version modified the caller's frame in place.
    df = df.copy()

    df['AVG_returns'] = (df['AVG_returns'] + 1) ** 252 - 1
    # NOTE(review): 252 is presumably the number of trading days per year — confirm.
    df['Risk'] = (df['Risk'] + 1) ** np.sqrt(252) - 1
    df['Alpha'] = df['Alpha'].round(4)
    df['Drawdown'] = -df['Drawdown']

    return df

# Building table

In [35]:
# Values substituted into the result-file path templates from the config
# (first placeholder: year, second placeholder: fine-tune flag).
year_start = 2020
fine_tune = True

In [36]:
def filter_risk_return(df, filter_baselines=['sp500', 'sectors'], save_methods=['sectors', 'sp500']):
    """Keep the rows that are at least as good as every given baseline.

    For each baseline in ``filter_baselines`` (applied one after another) a
    row survives only if its ``Risk`` and ``Drawdown`` are not larger and its
    ``AVG_returns`` is not smaller than those of the first row whose
    ``emb_model`` equals the baseline. Rows whose ``emb_model`` is listed in
    ``save_methods`` always survive. Afterwards rows with non-positive
    ``AVG_returns`` are dropped (again except ``save_methods``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``emb_model``, ``Risk``, ``AVG_returns``,
        ``Drawdown`` and ``Sharpe``. It is not modified.
    filter_baselines : list of str
        ``emb_model`` values used as reference rows. The default list is
        never mutated.
    save_methods : list of str
        ``emb_model`` values that are never filtered out. The default list is
        never mutated.

    Returns
    -------
    pandas.DataFrame
        The surviving rows sorted by ``Sharpe`` in descending order.

    Raises
    ------
    IndexError
        If a baseline has no row in the (already filtered) frame. The
        exception type matches what the bare ``.values[0]`` lookup used to
        raise, but the message now names the missing baseline.
    """
    for baseline in filter_baselines:
        # Look the reference row up once instead of once per metric.
        baseline_rows = df[df['emb_model'] == baseline]
        if baseline_rows.empty:
            raise IndexError(
                "baseline {!r} not found in column 'emb_model'; add it to the data "
                "or to save_methods so earlier filters keep it".format(baseline)
            )

        risk_cap = baseline_rows['Risk'].iloc[0]
        return_floor = baseline_rows['AVG_returns'].iloc[0]
        drawdown_cap = baseline_rows['Drawdown'].iloc[0]

        not_worse = (
            (df['Risk'] <= risk_cap)
            & (df['AVG_returns'] >= return_floor)
            & (df['Drawdown'] <= drawdown_cap)
        )
        # Protected methods stay regardless of how they compare to the baseline.
        df = df[not_worse | df['emb_model'].isin(save_methods)]

    profitable_or_saved = (df['AVG_returns'] > 0) | df['emb_model'].isin(save_methods)
    return df[profitable_or_saved].sort_values('Sharpe', ascending=False)

In [37]:
# Preview the metrics path that the current year / fine-tune settings resolve to.
config['financial_metric_path'].format(year_start, fine_tune)

'results/financial_metric_2020_True.csv'

In [38]:
# Financial metrics per start year, all read with the current `fine_tune` flag.
financial_metrics = {
    year: pd.read_csv(
        '../' + config['financial_metric_path'].format(year, fine_tune),
        index_col=0,
    )
    for year in [2020, 2021]
}

In [39]:
# Load the (year, fine-tune flag) variants of the financial metrics and rescale
# each of them with change_features.
df_2020, df_2020_tuned, df_2021 = (
    change_features(
        pd.read_csv('../' + config['financial_metric_path'].format(year, tuned), index_col=0)
    )
    for year, tuned in [(2020, False), (2020, True), (2021, False)]
)

In [40]:
# 2020 metrics (fine-tune flag False): keep rows that match or beat the
# 'sectors' baseline; the baseline rows themselves are always kept.
df_2020_filtered = filter_risk_return(
    df_2020,
    filter_baselines=['sectors'],
    save_methods=['sectors', 'sp500'],
)


In [41]:
# 2020 metrics (fine-tune flag True): keep rows that match or beat the
# 'sectors' baseline; the baseline rows themselves are always kept.
df_2020_filtered_tuned = filter_risk_return(
    df_2020_tuned,
    filter_baselines=['sectors'],
    save_methods=['sectors', 'sp500'],
)
df_2020_filtered_tuned

Unnamed: 0,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,emb_model
port,0.606091,0.355021,0.674516,0.0017,0.091323,-0.031883,24.775282,57.0,KMeans,takens_one_PL_2
port,0.508435,0.319754,0.716834,0.0014,0.085943,-0.029091,32.473692,55.0,KMeans,takens_mult_PI_2_sigma=0.0005
port,0.40448,0.324876,0.747942,0.0011,0.068878,-0.029501,36.53091,91.0,KMeans,table_finance_features
port,0.464832,0.377153,0.635457,0.0013,0.068685,-0.0336,33.899962,82.0,KMeans,takens_one_BC_1
port,0.457139,0.375732,0.662357,0.0013,0.067878,-0.033491,37.609756,122.0,KMeans,autoencoder_lstm
port,0.394508,0.324655,0.76198,0.0011,0.067334,-0.029483,32.822206,82.0,KMeans,takens_one_PI_2_sigma=0.0005
port,0.393651,0.33366,0.660439,0.0011,0.065602,-0.0302,37.237865,100.0,KMeans,takens_one_PI_1_sigma=0.0005
port,0.38132,0.323331,0.743312,0.0011,0.065456,-0.029378,29.859144,108.0,KMeans,takens_mult_PL_2
port,0.405292,0.346878,0.705258,0.0011,0.065156,-0.031244,33.04888,145.0,KMeans,transformer_embds_sum_23
port,0.404731,0.356168,0.696695,0.0011,0.06359,-0.031973,35.514546,115.0,KMeans,takens_one_BC_2


In [42]:
# 2021 metrics (fine-tune flag False): keep rows that match or beat the
# 'sectors' baseline; the baseline rows themselves are always kept.
df_2021_filtered = filter_risk_return(
    df_2021,
    filter_baselines=['sectors'],
    save_methods=['sectors', 'sp500'],
)
df_2021_filtered

Unnamed: 0,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,emb_model
port,0.672739,0.270982,0.501459,0.0021,0.126564,-0.025113,12.020327,65.0,KMeans,table_finance_features
port,0.414883,0.208473,0.605803,0.0014,0.105069,-0.0198,11.293906,78.0,Agg,table_finance_features
port,0.491682,0.315458,0.468096,0.0016,0.084426,-0.028746,16.850101,73.0,Agg,tsfresh
port,0.354555,0.328862,0.421275,0.0012,0.060186,-0.029819,20.176808,88.0,sectors,sectors
port,-0.042238,0.203246,1.0,0.0,-0.024612,-0.019344,25.081503,179.0,sp500,sp500


In [43]:
# Show the fine-tuned 2020 rows whose clustering model is anything other than KMeans.
df_2020_tuned[df_2020_tuned.clust_model != 'KMeans']

Unnamed: 0,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,emb_model
port,0.191167,0.363129,0.640662,0.0005,0.029286,-0.032515,36.463147,154.0,Agg,ts2vec
port,-0.065147,0.247098,0.941967,-0.0005,-0.027456,-0.023112,31.13078,642.0,Agg,takens_one_BC_2
port,0.156138,0.317826,0.740148,0.0003,0.02615,-0.028937,28.214303,182.0,Agg,takens_one_PL_2
port,0.330917,0.359422,0.667278,0.0009,0.052108,-0.032227,36.297709,180.0,Agg,transformer_embds_sum_23
port,0.350557,0.34313,0.696339,0.001,0.057361,-0.030949,34.166709,180.0,Agg,transformer_embds3
port,0.356413,0.411002,0.58341,0.001,0.049856,-0.036178,41.246102,288.0,Agg,transformer_embds2
port,0.102995,0.327738,0.691486,0.0002,0.015084,-0.029729,43.476745,219.0,Agg,takens_mult_PI_0_sigma=0.0005
port,0.261016,0.375108,0.585676,0.0007,0.03964,-0.033443,39.231977,180.0,Agg,takens_mult_PL_2
port,0.064499,0.327195,0.737279,0.0,0.007268,-0.029686,41.084186,196.0,Agg,takens_one_BC_1
port,0.165177,0.43037,0.574173,0.0004,0.021467,-0.037626,43.611548,148.0,Agg,table_finance_features


In [44]:
# Category name -> list of `emb_model` identifiers that belong to it.
dict_category2emb = {
    'topological': [
        'takens_mult_BC_0',
        'takens_mult_BC_1',
        'takens_mult_BC_2',
        'takens_mult_PI_0_sigma=0.0005',
        'takens_mult_PI_1_sigma=0.0005',
        'takens_mult_PI_2_sigma=0.0005',
        'takens_mult_PL_1',
        'takens_mult_PL_2',
        'takens_one_BC_0',
        'takens_one_BC_1',
        'takens_one_BC_2',
        'takens_one_PI_0_sigma=0.0005',
        'takens_one_PI_1_sigma=0.0005',
        'takens_one_PI_2_sigma=0.0005',
        'takens_one_PL_1',
        'takens_one_PL_2',
    ],
    # NOTE(review): 'tnse' looks like a transposition of 'tsne' — confirm against
    # the emb_model values in the metrics files before changing it.
    'dim_reduction': ['umap', 'pca', 'fastica', 'tnse'],
    'transformers': [
        'transformer_embds1',
        'transformer_embds2',
        'transformer_embds3',
        'transformer_embds_sum_23',
        'transformer_embds_sum_all',
    ],
    'autoencoders': [
        'autoencoder_conv',
        'autoencoder_lstm',
        'autoencoder_mlp',
    ],
    'baselines': ['sp500', 'sectors'],
    'interpretable': ['tsfresh', 'table_finance_features'],
    'time_clustering': [
        'KShape',
        'MSTcorr',
        'TimeSeriesKMeans',
    ],
    'time_embedding': ['ts2vec', 'signal2vec_embds'],
}

In [45]:
# Inverse lookup: emb_model identifier -> category name.
dict_emb2cat = {
    emb_name: category
    for category, emb_names in dict_category2emb.items()
    for emb_name in emb_names
}

In [46]:
df_2020_clusters = pd.read_csv('../'+config['clust_metric_path'].format(2020, True), index_col=0)

# Join clustering metrics to financial metrics on BOTH keys. Joining on
# 'emb_model' alone (after dropping 'clust_model' from the clustering table)
# pairs every clustering-metric row of an embedding with every financial row of
# that embedding, so metrics of one clustering model get attached to the
# portfolio of another.
# NOTE(review): this cell overwrites df_2020_tuned, so it is not safe to re-run
# without re-running the cell that loads df_2020_tuned first.
df_2020_tuned = pd.merge(
    df_2020_clusters,
    df_2020_tuned,
    on=['emb_model', 'clust_model'],
    how='outer',
)

# One row per embedding: the first entry after ordering by Risk, highest first.
# NOTE(review): descending order keeps the highest-Risk row per embedding —
# confirm this is the intended notion of "best".
# groupby(...).first() takes the first non-null value per column, so rows with
# missing values can be combined within a group.
df_best_clustering = df_2020_tuned.sort_values(['Risk'], ascending=False)
df_best_clustering = df_best_clustering.groupby('emb_model', as_index=False, sort=True).first()
# Embeddings absent from dict_emb2cat get NaN as their category.
df_best_clustering['category'] = df_best_clustering['emb_model'].map(dict_emb2cat)
df_best_clustering.drop(columns=['Beta', 'Alpha', 'VaR']).to_csv('../results/table_all_methods.csv')

## COMET_ML

In [78]:
year_start = 2020
fine_tune = True

# Fill the year into every embedding path template, updating config_emb in place.
for key, path_template in config_emb.items():
    config_emb[key] = path_template.format(year_start)

In [79]:
# Financial metrics (rescaled by change_features) and clustering metrics for
# the selected year / fine-tune flag.
df_finance = change_features(
    pd.read_csv('../' + config['financial_metric_path'].format(year_start, fine_tune), index_col=0)
)
df_clust = pd.read_csv('../' + config['clust_metric_path'].format(year_start, fine_tune), index_col=0)

In [80]:
# Quick look at the first rows of the rescaled financial metrics.
df_finance.head()

Unnamed: 0,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,clust_model,emb_model
port,0.495915,0.41104,0.601512,0.0014,0.067593,-0.03618,43.955665,75.0,KMeans,ts2vec
port,0.191167,0.363129,0.640662,0.0005,0.029286,-0.032515,36.463147,154.0,Agg,ts2vec
port,0.404731,0.356168,0.696695,0.0011,0.06359,-0.031973,35.514546,115.0,KMeans,takens_one_BC_2
port,-0.065147,0.247098,0.941967,-0.0005,-0.027456,-0.023112,31.13078,642.0,Agg,takens_one_BC_2
port,0.606091,0.355021,0.674516,0.0017,0.091323,-0.031883,24.775282,57.0,KMeans,takens_one_PL_2


In [81]:
# Clustering model whose rows are kept for the joined table.
# `model_name` was not assigned anywhere before this cell, so a fresh
# "Restart & Run All" stopped here with NameError. 'Agg' reproduces the table in
# this cell's saved output (e.g. the ts2vec row carries the Agg metrics).
model_name = 'Agg'
# Of the two labels handled here ('KMeans' / 'Agg'), drop the one not selected.
model_del = 'Agg' if model_name == 'KMeans' else 'KMeans'

df_finance = df_finance[df_finance.clust_model != model_del]
df_clust = df_clust[df_clust.clust_model != model_del]

# Index both tables by embedding so they can be joined row by row.
df_finance = df_finance.set_index('emb_model').drop(columns=['clust_model'])
df_clust = df_clust.set_index('emb_model').drop(columns=['clust_model'])

# Left join: every financial row is kept, clustering metrics are attached where available.
df_data = df_finance.join(df_clust)

# NOTE(review): the "+ 1" presumably accounts for a single financial row that
# has no clustering metrics — confirm.
assert len(df_data) == len(df_clust) + 1 == len(df_finance)
df_data.head()

Unnamed: 0_level_0,AVG_returns,Risk,Beta,Alpha,Sharpe,VaR,Drawdown,Recovery,DB,HC,Sil,hom
emb_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ts2vec,0.191167,0.363129,0.640662,0.0005,0.029286,-0.032515,36.463147,154.0,0.320173,18.543424,0.51673,0.028243
takens_one_BC_2,-0.065147,0.247098,0.941967,-0.0005,-0.027456,-0.023112,31.13078,642.0,0.45379,6.411142,0.273747,0.0289
takens_one_PL_2,0.156138,0.317826,0.740148,0.0003,0.02615,-0.028937,28.214303,182.0,0.311802,10.268861,0.379736,0.023011
transformer_embds_sum_23,0.330917,0.359422,0.667278,0.0009,0.052108,-0.032227,36.297709,180.0,1.142738,130.819923,0.216855,0.051649
transformer_embds3,0.350557,0.34313,0.696339,0.001,0.057361,-0.030949,34.166709,180.0,1.202075,129.118784,0.223701,0.054524


In [13]:
# for model_name, data in tqdm(df_model.iterrows()):
    
#     experiment = Experiment(
#     api_key=os.environ["COMET_API_KEY"],  # read the key from the environment; the key previously committed here must be revoked and rotated
#     project_name="stock-clustering-with-time-series-embeddings",
#     workspace="petrsokerin",
#     )

#     experiment.set_name(model_name)

#     hyper_params = {
#         "model": model_name,
#         "include_2020": year_start == 2021,
#         "fine_tune": fine_tune,
#     }

#     experiment.log_parameters(hyper_params)
    
#     print(model_name)
#     metrics = data.to_dict()
#     experiment.log_metrics(metrics)