In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import os
import json

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

In [21]:
# Load the notebook configuration (a JSON object; later cells read path entries from it).
# The encoding is pinned to UTF-8: without it, open() falls back to the platform
# locale encoding, so the same config file could decode differently on another machine.
with open('config/config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Loading data

In [9]:
# Read both CSV files the same way: the first column of each becomes the row index.
# `df` holds the per-ticker close series; `df_index` is presumably the S&P 500
# index series (judging by the file name) and is not used by the cells shown here.
close_csv, index_csv = 'data/ticker_data_Close.csv', 'data/ticker_data_SP500.csv'
df, df_index = (pd.read_csv(csv_path, index_col=0) for csv_path in (close_csv, index_csv))

df.head()

Unnamed: 0_level_0,SPGI,MCO,CPRT,EFX,FLT,CRL,OMC,IPG,RHI,NLSN,...,ETR,CMS,CNP,AES,EVRG,LNT,ATO,NI,NRG,PNW
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,168.119995,146.130005,43.599998,119.540001,193.869995,110.650002,72.769997,20.25,55.91,36.66,...,82.599998,46.950001,28.02,10.88,53.139999,42.110001,85.040001,25.360001,28.9,83.889999
2018-01-03,170.820007,148.860001,43.389999,120.160004,194.960007,110.309998,70.360001,19.76,55.810001,35.990002,...,81.690002,46.66,27.959999,10.87,52.32,41.740002,84.339996,25.200001,28.879999,83.120003
2018-01-04,173.380005,151.600006,43.740002,121.639999,195.460007,109.470001,71.059998,20.17,55.98,35.790001,...,80.540001,46.139999,27.99,10.83,52.099998,41.25,83.980003,25.09,28.549999,82.510002
2018-01-05,175.699997,154.119995,43.529999,122.830002,197.0,111.879997,72.169998,20.190001,56.310001,36.0,...,79.919998,45.810001,27.870001,10.87,51.66,41.080002,83.190002,24.790001,28.73,82.419998
2018-01-08,177.179993,155.070007,43.549999,121.989998,197.160004,111.889999,72.410004,20.35,56.77,36.16,...,80.839996,46.34,28.040001,10.87,51.720001,41.540001,83.449997,25.0,29.17,83.050003


In [10]:
# Number of missing values per column (one column per ticker).
df_na = df.isnull().sum()

# Any ticker with at least one missing value is removed entirely,
# so the remaining columns are complete over the whole date range.
stocks_to_drop = [ticker for ticker, n_missing in df_na.items() if n_missing > 0]
df = df.drop(columns=stocks_to_drop)

# Preprocessing

In [11]:
# Convert price levels to period-over-period relative changes.
# NOTE: `df` is rebuilt from itself, so running this cell twice would compute
# changes of changes and transpose back — run it once after loading.
pct_changes = df.pct_change()

# The first row has no predecessor and is all-NaN, so it is skipped; the frame is
# then transposed so that each row is one ticker and each column one date.
df = pct_changes.iloc[1:].transpose()
df.head()

Date,2018-01-03,2018-01-04,2018-01-05,2018-01-08,2018-01-09,2018-01-10,2018-01-11,2018-01-12,2018-01-16,2018-01-17,...,2022-01-19,2022-01-20,2022-01-21,2022-01-24,2022-01-25,2022-01-26,2022-01-27,2022-01-28,2022-01-31,2022-02-01
SPGI,0.01606,0.014987,0.013381,0.008423,0.000339,-0.009197,0.004499,0.00703,-0.010189,0.00546,...,-0.002829,-0.011751,-0.002512,0.002447,-0.044072,-0.004981,-0.006188,0.029209,0.02115,0.005395
MCO,0.018682,0.018407,0.016623,0.006164,0.006771,-0.008391,0.005878,0.009825,-0.010938,0.012088,...,0.006417,-0.003924,-0.009847,0.006464,-0.055365,-0.001508,0.004252,0.032493,0.019286,0.002624
CPRT,-0.004816,0.008066,-0.004801,0.000459,-0.001607,0.00092,0.022518,0.007865,-0.011817,0.000226,...,-0.007823,-0.026535,-0.013318,0.021549,-0.018003,-0.030687,-0.013637,0.032178,0.030537,0.001393
EFX,0.005187,0.012317,0.009783,-0.006839,0.002131,-0.008753,0.002558,0.011194,-0.005617,0.008677,...,-0.028006,-0.022529,-0.013077,0.0378,-0.034715,0.002433,-0.007458,0.040592,0.024396,0.01026
FLT,0.005622,0.002565,0.007879,0.000812,-0.001268,0.020314,0.008312,0.003406,-0.007232,0.007929,...,-0.002687,-0.016794,-0.030738,-0.00424,0.001419,-0.008726,-0.003173,0.036846,0.030046,0.010199


# Feature engineering

In [12]:
# Ticker -> sector reference table. The file location comes from the config;
# the file is semicolon-separated and encoded as windows-1251.
sectors_csv_path = config['tickers_sectors_path']
df_sectors = pd.read_csv(sectors_csv_path, delimiter=';', encoding='windows-1251')
df_sectors.head()

Unnamed: 0,Название компании,Тикер,Сектор
0,S&P Global Inc.,SPGI,Commercial Services
1,Moody’s Corporation,MCO,Commercial Services
2,"Copart, Inc.",CPRT,Commercial Services
3,"Equifax, Inc.",EFX,Commercial Services
4,"FleetCor Technologies, Inc.",FLT,Commercial Services


In [13]:
# Lookup table: ticker symbol -> sector name, built row by row from the sector table.
# If a ticker appeared more than once, the last occurrence would win.
dict_tick_sect = {
    ticker: sector
    for ticker, sector in zip(df_sectors['Тикер'].values.tolist(),
                              df_sectors['Сектор'].values.tolist())
}

# Attach the sector as an extra column; the row index of `df` holds the tickers.
# Tickers absent from the lookup table get a missing value.
sector_by_row = df.index.map(dict_tick_sect)
df['sector'] = sector_by_row
df.head()

Date,2018-01-03,2018-01-04,2018-01-05,2018-01-08,2018-01-09,2018-01-10,2018-01-11,2018-01-12,2018-01-16,2018-01-17,...,2022-01-20,2022-01-21,2022-01-24,2022-01-25,2022-01-26,2022-01-27,2022-01-28,2022-01-31,2022-02-01,sector
SPGI,0.01606,0.014987,0.013381,0.008423,0.000339,-0.009197,0.004499,0.00703,-0.010189,0.00546,...,-0.011751,-0.002512,0.002447,-0.044072,-0.004981,-0.006188,0.029209,0.02115,0.005395,Commercial Services
MCO,0.018682,0.018407,0.016623,0.006164,0.006771,-0.008391,0.005878,0.009825,-0.010938,0.012088,...,-0.003924,-0.009847,0.006464,-0.055365,-0.001508,0.004252,0.032493,0.019286,0.002624,Commercial Services
CPRT,-0.004816,0.008066,-0.004801,0.000459,-0.001607,0.00092,0.022518,0.007865,-0.011817,0.000226,...,-0.026535,-0.013318,0.021549,-0.018003,-0.030687,-0.013637,0.032178,0.030537,0.001393,Commercial Services
EFX,0.005187,0.012317,0.009783,-0.006839,0.002131,-0.008753,0.002558,0.011194,-0.005617,0.008677,...,-0.022529,-0.013077,0.0378,-0.034715,0.002433,-0.007458,0.040592,0.024396,0.01026,Commercial Services
FLT,0.005622,0.002565,0.007879,0.000812,-0.001268,0.020314,0.008312,0.003406,-0.007232,0.007929,...,-0.016794,-0.030738,-0.00424,0.001419,-0.008726,-0.003173,0.036846,0.030046,0.010199,Commercial Services


# Clustering

In [14]:
# Number of clusters to request = number of distinct sectors in the reference table.
# NOTE(review): this counts sectors in the full table, not only those of the tickers
# still present in `df` — confirm the two agree.
n_clusters_ = df_sectors['Сектор'].nunique()

# One row per ticker: the sector name ('original') and its integer code ('original_n').
# Columns for each clustering model are appended to this frame by later cells.
df_predictions = df['sector'].to_frame(name='original')
sector_encoder = LabelEncoder()
df_predictions['original_n'] = sector_encoder.fit_transform(df_predictions['original'])

# Model name -> feature matrix that the model was fitted on.
dict_features = {}

df_predictions

Unnamed: 0,original,original_n
SPGI,Commercial Services,0
MCO,Commercial Services,0
CPRT,Commercial Services,0
EFX,Commercial Services,0
FLT,Commercial Services,0
...,...,...
LNT,Utilities,18
ATO,Utilities,18
NI,Utilities,18
NRG,Utilities,18


In [15]:
model_name = 'Kmeans_original'

# Feature matrix: every column of `df` except the sector label.
X = df.drop(['sector'], axis=1).values
dict_features[model_name] = X

# n_init is pinned explicitly. The library default changed from 10 to 'auto'
# in scikit-learn 1.4 (with a FutureWarning in 1.2-1.3), so relying on the default
# would make the cluster labels depend on the installed version even though
# random_state is fixed.
kmeans = KMeans(n_clusters=n_clusters_, n_init=10, random_state=0).fit(X)
clust_pred = kmeans.labels_
df_predictions[model_name] = clust_pred

In [16]:
model_name = 'Random'

# Same feature matrix as the clustering model, so the feature-based metrics
# are computed on identical inputs.
X = df.drop(['sector'], axis=1).values
dict_features[model_name] = X

# Baseline: assign every ticker a label drawn uniformly from the encoded sector ids.
# A dedicated seeded generator is used (seed 0, mirroring random_state=0 of KMeans)
# so that the baseline labels and the saved metrics are identical on every run;
# the global unseeded np.random state would give different numbers each time.
rng = np.random.default_rng(0)
clust_pred = rng.choice(df_predictions['original_n'].unique(), size=len(df))
df_predictions[model_name] = clust_pred

# Calculating metrics

In [18]:
from sklearn.metrics import (davies_bouldin_score,
                             silhouette_score,
                             calinski_harabasz_score,
                             homogeneity_score)

# Metric name -> scoring function, in the column order of the result table.
metrics = {'silhouette': silhouette_score,
           'davies_bouldin': davies_bouldin_score,
           'calinski_harabasz': calinski_harabasz_score,
           'homogeneity': homogeneity_score}

# 'homogeneity' is scored against the encoded sector labels; the other metrics
# are scored on the feature matrix the model was fitted on.
label_based_metrics = {'homogeneity'}

# Collect one list of scores per model, then build the table in a single step.
# Growing an initially empty DataFrame row by row with .loc starts from
# object-dtype columns; building it once lets pandas infer numeric columns.
metrics_rows = {}
for model, features in dict_features.items():
    predicted_labels = df_predictions[model]
    metrics_list = []
    for metric_name, metric_formula in metrics.items():
        if metric_name in label_based_metrics:
            metric_meaning = metric_formula(df_predictions['original_n'], predicted_labels)
        else:
            metric_meaning = metric_formula(features, predicted_labels)
        metrics_list.append(metric_meaning)
    metrics_rows[model] = metrics_list

# Rows = models (in insertion order), columns = metric names.
metrics_df = pd.DataFrame.from_dict(metrics_rows, orient='index', columns=list(metrics.keys()))

metrics_df

Unnamed: 0,silhouette,davies_bouldin,calinski_harabasz,homogeneity
Kmeans_original,0.027872,2.550901,12.701693,0.389196
Random,-0.060587,7.899208,1.008638,0.163584


# Saving results

In [23]:
# Make sure the destination folders exist before writing; to_csv does not create
# missing directories and would otherwise fail after all the computation is done.
for output_path in (config['metrics_path'], config['predictions_path']):
    output_dir = os.path.dirname(output_path)
    if output_dir:  # a bare file name has no directory part to create
        os.makedirs(output_dir, exist_ok=True)

# Both tables are written with their row index (model names / tickers) as the first column.
metrics_df.to_csv(config['metrics_path'])
df_predictions.to_csv(config['predictions_path'])