In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import os
import json
from tqdm.notebook import tqdm

import sklearn

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import (KMeans, Birch, SpectralClustering, 
                             MiniBatchKMeans, AgglomerativeClustering)

from sklearn.mixture import GaussianMixture

C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
with open('config/config.json', 'r') as file:
    config = json.load(file)

# Loading Data

In [3]:
df = pd.read_csv(config['ticker_data_preprocessed'], index_col=0)
df_vol = pd.read_csv(config['ticker_data_volume'], index_col=0)
df_index = pd.read_csv(config['ticker_data_sp500'], index_col=0)
df_sectors = pd.read_csv(config['tickers_sectors_path'], index_col=0)
print(df.shape)
df.head()

(484, 853)


Unnamed: 0,2018-10-24,2018-10-25,2018-10-26,2018-10-29,2018-10-30,2018-10-31,2018-11-01,2018-11-02,2018-11-05,2018-11-06,...,2022-03-02,2022-03-03,2022-03-04,2022-03-07,2022-03-08,2022-03-09,2022-03-10,2022-03-11,2022-03-14,sector
A,-0.025506,0.024211,0.001917,-0.010362,0.025129,0.01807,0.006637,0.010886,-0.000303,0.001062,...,0.009323,0.030114,-0.023912,-0.026514,-0.007826,0.030467,0.008029,-0.029926,-0.005218,Healthcare
AAL,-0.063002,0.066908,0.00278,0.004313,0.06319,0.012118,0.036773,-0.0011,-0.016791,-0.015398,...,0.004911,-0.040318,-0.071292,-0.119945,0.052181,0.058475,-0.006993,-0.012676,0.021284,Industrials
AAP,-0.029213,0.025066,0.008457,0.001945,0.000607,-0.031582,0.023473,0.009113,0.000667,0.018049,...,0.03927,-0.003826,-0.007922,-0.025699,-0.003626,0.021438,0.000195,-0.004099,-0.024745,Consumer Cyclical
AAPL,-0.034302,0.021898,-0.015924,-0.01877,0.004994,0.026067,0.015352,-0.066331,-0.028388,0.010814,...,0.020588,-0.001981,-0.018408,-0.023718,-0.011676,0.034997,-0.027186,-0.023909,-0.018742,Technology
ABBV,-0.042792,0.029215,-0.011501,0.007055,0.002827,-0.045839,0.030315,-0.008104,0.037959,0.013078,...,0.012729,0.005616,0.000997,-0.007372,-0.01539,0.011485,0.002217,-0.000737,0.02442,Healthcare


# Preprocessing

In [4]:
dict_features = dict()
dict_features['original;;'] = df.drop(['sector'], axis=1).values

## table data

In [5]:
table_features = pd.read_csv(config['features_path'], index_col=0)

from sklearn.preprocessing import StandardScaler 
table_features = StandardScaler().fit_transform(table_features)
dict_features['table_data;;'] = table_features

## neural network

# Clustering

In [6]:
modelname = 'namepreprocessing;surnamepreprocessing;parameterspreproc|clustmodelname;parsmetrsclustering'
parametrs = 'par1name:par1value,par2name:par2value'

example = 'row;originalnowindows;|random;random_state:42'

In [7]:
def score_func(estimator, X, y):
    labels_predicted = estimator.fit_predict(X)
    score = silhouette_score(X, labels_predicted)
    return score

def gridsearch(model, X, y, parameters:dict):
    custom_cv=[(slice(None), slice(None))]
    grid_search = GridSearchCV(model, param_grid=parameters, scoring=score_func, cv=custom_cv)#, refit=False)
    grid_search.fit(X, y)
    labels_predicted = grid_search.best_estimator_.fit_predict(X)
    return grid_search.best_params_, labels_predicted

def parse_dict(best_params_dict, model_name, preproc_label):
    name = preproc_label + '|' + model_name + ';'
    for key, value in best_params_dict.items():
        name = name + key + ':' + str(value) + ','
    return name[:-1]
    

def train_all(models_dict, params_dict, preproc_label, X, y=None, grid_search=True, n_clusters_=config['n_clusters']):
    output = {}
    for model_name, model in models_dict.items():
        print(model_name)
        param_grid = params_dict[model_name]
        
        if grid_search:
            best_params_dict, labels_predicted = gridsearch(model, X, y, param_grid)
            final_key = parse_dict(best_params_dict, model_name, preproc_label)
        else:
            if model_name == 'Gauss_Mix': 
                model=model(n_components=n_clusters_)
            else:
                model=model(n_clusters=n_clusters_)
                
            model_loc = model.fit(X)
            labels_predicted = model_loc.labels_
            final_key = preproc_label + '|' + 'original'

        
        output[final_key] = labels_predicted
        print('done')
    return output



In [8]:
models_dict = {'Kmeans': KMeans(),
               'Birch': Birch(),
               'Agglomer': AgglomerativeClustering(),
               'MiniBatchKMeans': MiniBatchKMeans(),
               'Gauss_Mix': GaussianMixture(),
               'Spectral': SpectralClustering()}

n_clusters = np.arange(9, 14)

params_dict = {'Kmeans': 
               {
                   'n_clusters':n_clusters,
                   'init': ['k-means++', 'random'],
                   'algorithm':['auto', 'full', 'elkan']
               },
               'Birch': 
               {
                   #'threshold': np.linspace(0, 1, num=10),
                   'n_clusters': n_clusters,
                   'branching_factor': np.arange(10, 80, step=5)
               },
               'Agglomer': 
               {
                   'n_clusters': n_clusters,
                   'linkage': ['ward', 'complete', 'average', 'single'],
                   #'affinity':['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
               },
               'MiniBatchKMeans':
               {
                   'n_clusters': n_clusters,
                   'init':['k-means++', 'random'],
               },
               'Gauss_Mix': 
               {
                   'n_components': n_clusters,
                   'covariance_type':['full', 'tied', 'diag', 'spherical'],
                   'init_params': ['kmeans', 'random']
               },
               'Spectral': 
               {
                   'n_clusters': n_clusters,
                   'eigen_solver':['arpack', 'lobpcg'],
                   #'affinity': ['nearest_neigbors', 'rbf']
               }
              }

In [9]:
n_clusters_ = df_sectors['sector'].nunique()

df_predictions = pd.DataFrame(df['sector'].values, index=df.index, columns=['original'])
df_predictions['original_n'] = LabelEncoder().fit_transform(df_predictions['original'])
df_predictions.head()

Unnamed: 0,original,original_n
A,Healthcare,6
AAL,Industrials,7
AAP,Consumer Cyclical,2
AAPL,Technology,9
ABBV,Healthcare,6


In [10]:
import warnings
warnings.filterwarnings("ignore")

for prepr_name, data in dict_features.items():
    print(f"--------------------- {prepr_name} ---------------------")
    X = data
    y = df['sector'].values
    gridsearch_output = train_all(models_dict, params_dict, preproc_label=prepr_name, X=X, y=y)
    prepr_predictions = pd.DataFrame(gridsearch_output, index=df_predictions.index)
    
    df_predictions = pd.concat([df_predictions, prepr_predictions], axis=1)
    print()

--------------------- original;; ---------------------
Kmeans
done
Birch
done
Agglomer
done
MiniBatchKMeans
done
Gauss_Mix
done
Spectral
done

--------------------- table_data;; ---------------------
Kmeans
done
Birch
done
Agglomer
done
MiniBatchKMeans
done
Gauss_Mix
done
Spectral
done



In [11]:
df_predictions.head()

Unnamed: 0,original,original_n,"original;;|Kmeans;algorithm:auto,init:k-means++,n_clusters:9","original;;|Birch;branching_factor:10,n_clusters:9","original;;|Agglomer;linkage:ward,n_clusters:9","original;;|MiniBatchKMeans;init:k-means++,n_clusters:9","original;;|Gauss_Mix;covariance_type:full,init_params:kmeans,n_components:9","original;;|Spectral;eigen_solver:arpack,n_clusters:9","table_data;;|Kmeans;algorithm:auto,init:k-means++,n_clusters:9","table_data;;|Birch;branching_factor:10,n_clusters:9","table_data;;|Agglomer;linkage:ward,n_clusters:9","table_data;;|MiniBatchKMeans;init:k-means++,n_clusters:9","table_data;;|Gauss_Mix;covariance_type:full,init_params:kmeans,n_components:9","table_data;;|Spectral;eigen_solver:arpack,n_clusters:9"
A,Healthcare,6,7,3,6,3,4,0,8,6,6,6,3,2
AAL,Industrials,7,5,0,1,7,7,3,4,0,4,5,2,2
AAP,Consumer Cyclical,2,4,3,2,5,6,7,5,2,2,7,0,2
AAPL,Technology,9,6,3,6,3,1,0,8,6,6,2,3,2
ABBV,Healthcare,6,7,3,2,4,4,6,3,2,2,6,1,2


In [12]:
df_predictions['original;;|original;'] = df_predictions['original_n']

In [13]:
model_name = 'original;;|random;random_state:42'

X = df.drop(['sector'], axis=1).values

np.random.seed(42)
clust_pred = np.random.choice(df_predictions['original;;|original;'].unique(), size=len(df))
df_predictions[model_name] = clust_pred

# Calculating metrics

In [14]:
def renaming(name):
    data_name, model_name = name.split('|')
    model_name = model_name[:4]
    data_name, data_surname, params = data_name.split(';')
    new_name = data_name[:4] + '_' + data_surname[:4] + model_name
    return new_name
    
name = 'original;;|random;random_state:42'
renaming(name)

'orig_rand'

In [15]:
from sklearn.metrics import (davies_bouldin_score, 
                            silhouette_score,
                            calinski_harabasz_score,
                            homogeneity_score)

metrics = {'silhouette':silhouette_score, 
           'davies_bouldin':davies_bouldin_score, 
           'calinski_harabasz':calinski_harabasz_score, 
           'homogeneity':homogeneity_score}


metrics_df = pd.DataFrame(columns = list(metrics.keys()))



for features_model in df_predictions.columns[2:]:
    metrics_list = []
    for metric_name, metric_formula in metrics.items():
        if metric_name == 'homogeneity':
            metric_meaning = metric_formula(df_predictions['original'], df_predictions[features_model])
        else:
            features = features_model.split('|')[0]
#             print(features)
#             print(dict_features[features])
            metric_meaning = metric_formula(dict_features[features], df_predictions[features_model])
        metrics_list.append(metric_meaning)
    metrics_df.loc[features_model] = metrics_list

model_params = metrics_df.index.tolist()
metrics_df.index = metrics_df.reset_index()['index'].apply(lambda x: renaming(x))
model_short_names = metrics_df.index.tolist()

In [16]:
df_params = pd.DataFrame({'short_name': model_short_names, 
                         'all_params': model_params})
df_params['model'] = df_params['all_params'].apply(lambda x: x.split('|')[1].split(';')[0])
df_params['f_name'] = df_params['all_params'].apply(lambda x: x.split('|')[0].split(';')[0])
df_params['f_surname'] = df_params['all_params'].apply(lambda x: x.split('|')[0].split(';')[1])

df_params

Unnamed: 0,short_name,all_params,model,f_name,f_surname
0,orig_Kmea,"original;;|Kmeans;algorithm:auto,init:k-means+...",Kmeans,original,
1,orig_Birc,"original;;|Birch;branching_factor:10,n_clusters:9",Birch,original,
2,orig_Aggl,"original;;|Agglomer;linkage:ward,n_clusters:9",Agglomer,original,
3,orig_Mini,"original;;|MiniBatchKMeans;init:k-means++,n_cl...",MiniBatchKMeans,original,
4,orig_Gaus,"original;;|Gauss_Mix;covariance_type:full,init...",Gauss_Mix,original,
5,orig_Spec,"original;;|Spectral;eigen_solver:arpack,n_clus...",Spectral,original,
6,tabl_Kmea,"table_data;;|Kmeans;algorithm:auto,init:k-mean...",Kmeans,table_data,
7,tabl_Birc,"table_data;;|Birch;branching_factor:10,n_clust...",Birch,table_data,
8,tabl_Aggl,"table_data;;|Agglomer;linkage:ward,n_clusters:9",Agglomer,table_data,
9,tabl_Mini,"table_data;;|MiniBatchKMeans;init:k-means++,n_...",MiniBatchKMeans,table_data,


In [17]:
new_col_names = []
for col in df_predictions.columns:
    if len(col.split('|')) == 2:
        col = renaming(col)
    new_col_names.append(col)
    
df_predictions.columns = new_col_names
df_predictions.head()

Unnamed: 0,original,original_n,orig_Kmea,orig_Birc,orig_Aggl,orig_Mini,orig_Gaus,orig_Spec,tabl_Kmea,tabl_Birc,tabl_Aggl,tabl_Mini,tabl_Gaus,tabl_Spec,orig_orig,orig_rand
A,Healthcare,6,7,3,6,3,4,0,8,6,6,6,3,2,6,5
AAL,Industrials,7,5,0,1,7,7,3,4,0,4,5,2,2,7,9
AAP,Consumer Cyclical,2,4,3,2,5,6,7,5,2,2,7,0,2,2,1
AAPL,Technology,9,6,3,6,3,1,0,8,6,6,2,3,2,9,0
ABBV,Healthcare,6,7,3,2,4,4,6,3,2,2,6,1,2,6,3


In [18]:
def rank_methods(df):
    
    for c in df.columns[2:]:
        if c == 'davies_bouldin':
            df[c] = df[c].rank(method='dense',ascending=True).astype(int)
        else:
            df[c] = df[c].rank(method='dense',ascending=False).astype(int)
            
    return df.mean(axis=1)

def choose_method(df_mean_ranked):
    method = df_mean_ranked.argmin()
    return method

df_mean_ranked = rank_methods(metrics_df.copy()) 
metrics_df['mean_ranks'] = df_mean_ranked
method = choose_method(df_mean_ranked)
print('best_method: '+ df_mean_ranked.index[method])

metrics_df = metrics_df.sort_values(by=['mean_ranks'])
metrics_df

best_method: tabl_Kmea


Unnamed: 0_level_0,silhouette,davies_bouldin,calinski_harabasz,homogeneity,mean_ranks
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tabl_Kmea,0.195768,1.067041,119.248035,0.130744,2.565702
tabl_Mini,0.170787,1.418123,102.224753,0.146355,3.147228
orig_Aggl,0.019951,2.800897,21.6536,0.331075,3.205212
tabl_Aggl,0.166914,1.084454,104.14363,0.116102,3.312842
tabl_Birc,0.178363,1.13171,103.317828,0.121801,3.327518
orig_Kmea,0.018787,2.825767,22.899585,0.293566,3.461138
orig_Spec,0.03029,2.930691,20.682965,0.36055,3.740245
orig_Mini,0.036069,3.209643,21.527921,0.298839,3.811428
orig_Gaus,0.046832,2.418138,21.115347,0.247624,4.366242
tabl_Gaus,0.141359,1.408759,91.268208,0.11408,4.38753


# Saving results

In [19]:
metrics_df.to_csv(config['metrics_path'])
df_predictions.to_csv(config['predictions_path'])
df_params.to_csv(config['params_path'])