In [6]:
import numpy as np
from scipy import stats
import networkx as nx
from networkx.algorithms import tree

In [7]:
from algorithm.louvain import louvain
from algorithm.spectral_clustering import spectral_clustering, normalized_spectral_clustering
from algorithm.hierarchical_clustering import mst_cut_clustering
from utils.analyze import get_rs_from_fixed_weighted_degree, compute_clustering, validation, metrics_to_df, set_zero_weights_to_very_low,get_cor_from_cov

## Stocks clustering

In [8]:
dow30_tickers = ['AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 
                'CSCO', 'CVX', 'GS', 'HD', 'HON',
                'IBM', 'INTC', 'JNJ', 'KO', 'JPM',
                'MCD', 'MMM', 'MRK', 'MSFT', 'NKE',
                'PG', 'TRV', 'UNH', 'CRM', 'VZ',
                'V', 'WBA', 'WMT', 'DIS', 'DOW'
                ]
len(dow30_tickers)

30

In [9]:
import yfinance as yf

START_DATE  ="2022-01-01"
END_DATE ="2023-01-01"
data = yf.download(' '.join(dow30_tickers), start=START_DATE, end=END_DATE,
                                 group_by='ticker')

[*********************100%***********************]  30 of 30 completed


In [10]:
data = data.apply(lambda x: np.log(x/x.shift(1)))
data = data.dropna()
data

Unnamed: 0_level_0,AXP,AXP,AXP,AXP,AXP,AXP,CSCO,CSCO,CSCO,CSCO,...,HON,HON,HON,HON,MCD,MCD,MCD,MCD,MCD,MCD
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2022-01-03,0.002495,0.019367,0.007020,0.027789,0.027789,0.618510,-0.010281,-0.007392,-0.013035,-0.003319,...,-0.003981,-0.008235,-0.008235,0.517740,0.005544,0.005965,-0.006194,0.001901,0.001900,0.593635
2022-01-04,0.031179,0.033001,0.031140,0.031598,0.031598,0.394050,-0.007340,-0.008561,-0.022491,-0.030707,...,0.000438,0.010582,0.010582,0.233376,0.000037,0.001550,0.010153,-0.000261,-0.000260,-0.093131
2022-01-05,0.028977,0.012901,0.011141,-0.010713,-0.010713,-0.066236,-0.020222,-0.018803,-0.014173,-0.015963,...,0.016733,0.009808,0.009808,0.227556,-0.000742,-0.003214,-0.005120,-0.004928,-0.004928,-0.013256
2022-01-06,-0.012498,-0.014972,-0.010257,0.006615,0.009122,-0.527943,-0.019639,-0.008308,-0.005659,0.010561,...,0.007147,-0.001138,-0.001138,-0.223266,-0.004391,0.001774,0.003366,0.009313,0.009313,0.162171
2022-01-07,-0.000290,0.009569,0.012353,0.008523,0.008523,0.138511,0.014396,0.006034,0.012606,0.003441,...,0.004500,0.023116,0.023116,-0.067576,0.001640,-0.002181,-0.002992,-0.009800,-0.009800,-0.275018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,-0.005630,0.002995,0.012533,0.011699,0.011699,-0.551189,-0.005067,0.000000,0.006830,0.003376,...,0.011749,0.007321,0.007321,-0.534605,-0.005139,0.000857,0.004315,0.006750,0.006750,-0.623267
2022-12-27,0.014085,0.004881,0.007926,-0.004294,-0.004294,0.105869,0.008850,0.004622,0.004457,0.001053,...,0.010584,0.004758,0.004758,0.116219,0.010326,0.001936,0.006812,-0.002732,-0.002732,0.277405
2022-12-28,-0.007359,-0.007058,-0.011947,-0.016392,-0.016392,-0.220859,0.000419,0.001257,-0.005096,-0.009725,...,-0.007987,-0.012644,-0.012644,0.095435,-0.002460,-0.002719,-0.005755,-0.006504,-0.006505,-0.159499
2022-12-29,-0.007759,0.005434,0.003191,0.022724,0.022724,0.165551,-0.009057,-0.000628,0.005942,0.009094,...,0.003578,0.011992,0.011992,0.156663,-0.007716,-0.001232,-0.000717,0.003088,0.003088,-0.024029


In [11]:
cov = np.cov(data.T)
cov = set_zero_weights_to_very_low(cov)

In [12]:
def check(col1, col2):
    # вычислить коэффициент преобразования
    # при этом надо найти ненулевые значения иначе
    first1 = -1
    first2 = -1

    for i in range(len(col1)):
        if col1[i] != 0 and first1 != -1:
            first1 = i

        if col2[i] != 0 and first2 != -1:
            first2 = i

    # если ненулевых элементов не найдено - выйти
    if first1 == 0 and first2 == 0:
        return True

    # найти коэффициент
    first = max(first1, first2)
    coeff = col1[first] / col2[first]

    # проверить элементы столбцов - можно ли их представить в виде elem2 = elem1 * const
    for i in range(len(col1)):
        if abs(col1[i] - col2[i] * coeff) > 1e-10:
            return False

    return True

# проверить, является ли матрица вырожденной
def is_invertible(matrix):
    n = len(matrix)
    for i in range(n - 1):
        for j in range(i + 1, n):
            if check(matrix[i], matrix[j]):
                return True
    return False

is_invertible(cov)

True

In [14]:
import pandas as pd

In [15]:
cor = get_cor_from_cov(cov)
k=2
algos = [spectral_clustering, normalized_spectral_clustering, louvain, mst_cut_clustering ]
result = dict()
for algo in algos:
    result[algo.__name__] = algo(cor,k)
pd.DataFrame(result)

Unnamed: 0,spectral_clustering,normalized_spectral_clustering,louvain,mst_cut_clustering
0,0,0,2,0
1,0,0,2,0
2,0,0,2,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
175,0,0,2,0
176,0,0,2,0
177,0,0,1,0
178,0,0,1,0


In [19]:
from utils.generation import multivariate_t_rvs
sample_vol = 250
distribution = np.random.multivariate_normal
mean = np.mean(data)

distribution = multivariate_t_rvs
samples = distribution(mean, cov, sample_vol).T
esimated_cor = np.corrcoef(samples)

In [20]:
k=2
algos = [spectral_clustering, normalized_spectral_clustering, louvain, mst_cut_clustering ]
result_e = dict()
for algo in algos:
    result_e[algo.__name__] = algo(esimated_cor,k)
pd.DataFrame(result)

Unnamed: 0,spectral_clustering,normalized_spectral_clustering,louvain,mst_cut_clustering
0,0,0,2,0
1,0,0,2,0
2,0,0,2,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
175,0,0,2,0
176,0,0,2,0
177,0,0,1,0
178,0,0,1,0


In [29]:
def stock_uncertainty(cor, num_repeats, num_clusters, algos, distribution = np.random.multivariate_normal, **kwargs):
    from utils.generation import generate_samples_bag
    from sklearn.metrics.cluster import rand_score
    #from sklearn.metrics import rand_score
    from sklearn.metrics.cluster import adjusted_rand_score
    from sklearn.metrics.cluster import adjusted_mutual_info_score
    from sklearn.metrics.cluster import mutual_info_score

    result = dict()    
    sample_vol = 250

    mean = np.mean(data)
    samples_bag = generate_samples_bag(mean, cov, bags = num_repeats, sample_size=sample_vol, distribution = distribution, **kwargs)

    estimated_graphs_bag = [set_zero_weights_to_very_low(np.corrcoef(sample)) for sample in samples_bag]
    true_labels = dict()
    for algo in algos:
        true_labels[algo.__name__] = algo(cor, num_clusters)
        algo_result = []
        for estimated_graph in estimated_graphs_bag:
            algo_result.append(algo(estimated_graph, num_clusters))       
        result[algo.__name__] = algo_result
    
    metrics_by_algos = dict()
    for algo in result:
        metrics = dict()
        metrics['RI'] = np.array([rand_score(true_labels[algo], labels) for labels in result[algo]]).T
        metrics['ARI'] = np.array([adjusted_rand_score(true_labels[algo], labels) for labels in result[algo]]).T
        metrics_by_algos[algo] = metrics    
    return true_labels, result, estimated_graphs_bag, metrics_by_algos


def stock_uncertainty_different_k(cor, num_repeats, ks, algos, distribution = distribution, **kwargs):
    from utils.analyze import nested_dict_to_dict

    metrics = dict()
    for k in ks:
        true_labels, result, estimated_graphs_bag, metrics[k] = stock_uncertainty(cor, num_repeats, k, algos, distribution = distribution, **kwargs)
    for k in metrics:
        for algo in metrics[k]:
            for metric_type in metrics[k][algo]:
                metrics[k][algo][metric_type] = np.mean(metrics[k][algo][metric_type])
    
    metric_by_k = dict()
    for k in metrics:
        for algo in metrics[k]:
            metric_by_k[algo] = dict()
            for metric_type in metrics[k][algo]:
                metric_by_k[algo][metric_type] = []
    for k in metrics:
        for algo in metrics[k]:
            for metric_type in metrics[k][algo]:
                metric_by_k[algo][metric_type].append(metrics[k][algo][metric_type])
    df  = pd.DataFrame(nested_dict_to_dict(metric_by_k), index = None)
    df['k'] = ks
    df.set_index('k', inplace=True)
    return df


In [22]:
algos = [spectral_clustering, normalized_spectral_clustering, louvain, mst_cut_clustering ]
true_labels, result, graphs, metrics = stock_uncertainty(cor, 400, 2, algos)

In [23]:
from utils.analyze import nested_dict_to_dict

In [24]:

df  = pd.DataFrame(nested_dict_to_dict(metrics), index = None)
df

Unnamed: 0_level_0,spectral_clustering,spectral_clustering,normalized_spectral_clustering,normalized_spectral_clustering,louvain,louvain,mst_cut_clustering,mst_cut_clustering
Unnamed: 0_level_1,RI,ARI,RI,ARI,RI,ARI,RI,ARI
0,0.988889,0.972173,1.000000,1.000000,0.850962,0.667795,1.000000,1.000000
1,1.000000,1.000000,0.988889,0.972173,0.771074,0.483571,0.988889,0.972173
2,1.000000,1.000000,1.000000,1.000000,0.844072,0.651862,1.000000,1.000000
3,1.000000,1.000000,1.000000,1.000000,0.818746,0.535508,1.000000,1.000000
4,1.000000,1.000000,0.988889,0.972173,0.850962,0.667795,1.000000,1.000000
...,...,...,...,...,...,...,...,...
395,1.000000,1.000000,1.000000,1.000000,0.793606,0.488995,1.000000,1.000000
396,0.988889,0.972173,0.988889,0.972173,0.845065,0.651521,1.000000,1.000000
397,1.000000,1.000000,0.988889,0.972173,0.916511,0.796660,1.000000,1.000000
398,1.000000,1.000000,1.000000,1.000000,0.896710,0.754225,1.000000,1.000000


In [25]:
metrics_by_k_normal = stock_uncertainty_different_k(cor, 400, [2,3,4,5,6,7], algos)

In [30]:
metrics_by_k_student_df3 = stock_uncertainty_different_k(cor, 400, [2,3,4,5,6,7], algos, distribution=multivariate_t_rvs, **{'df':3})

In [35]:
metrics_by_k_student_df3

Unnamed: 0_level_0,spectral_clustering,spectral_clustering,normalized_spectral_clustering,normalized_spectral_clustering,louvain,louvain,mst_cut_clustering,mst_cut_clustering
Unnamed: 0_level_1,RI,ARI,RI,ARI,RI,ARI,RI,ARI
k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2,0.876728,0.603785,0.901458,0.790409,0.816651,0.598168,0.805478,0.317737
3,0.88305,0.668444,0.789813,0.561102,0.798781,0.560964,0.848072,0.481888
4,0.899851,0.735427,0.794876,0.49305,0.802453,0.5682,0.888811,0.635033
5,0.89231,0.711241,0.813722,0.478851,0.807034,0.57831,0.89918,0.68233
6,0.890353,0.720138,0.827044,0.475365,0.811217,0.587347,0.916753,0.752456
7,0.891834,0.733673,0.847004,0.455457,0.796872,0.552963,0.887839,0.712289


In [32]:
metrics_by_k_normal

Unnamed: 0_level_0,spectral_clustering,spectral_clustering,normalized_spectral_clustering,normalized_spectral_clustering,louvain,louvain,mst_cut_clustering,mst_cut_clustering
Unnamed: 0_level_1,RI,ARI,RI,ARI,RI,ARI,RI,ARI
k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2,0.972914,0.908062,0.996306,0.990746,0.900837,0.781927,0.993745,0.978135
3,0.994642,0.98477,0.972652,0.942133,0.934107,0.855369,0.997545,0.99394
4,0.995803,0.989587,0.923553,0.807356,0.927942,0.841741,0.996524,0.991676
5,0.993363,0.983084,0.910599,0.747638,0.927394,0.840494,0.988616,0.973136
6,0.992195,0.980351,0.917914,0.75484,0.928604,0.843125,0.978553,0.950028
7,0.99165,0.979539,0.902387,0.65904,0.930492,0.847499,0.952299,0.891096


In [33]:
metrics_by_k_student_df2 = stock_uncertainty_different_k(cor, 400, [2,3,4,5,6,7], algos, distribution=multivariate_t_rvs, **{'df':2})
metrics_by_k_student_df2

Unnamed: 0_level_0,spectral_clustering,spectral_clustering,normalized_spectral_clustering,normalized_spectral_clustering,louvain,louvain,mst_cut_clustering,mst_cut_clustering
Unnamed: 0_level_1,RI,ARI,RI,ARI,RI,ARI,RI,ARI
k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2,0.781166,0.38509,0.728184,0.424608,0.698127,0.351638,0.735822,0.072314
3,0.771248,0.416308,0.683545,0.343215,0.701585,0.318196,0.757456,0.17315
4,0.747205,0.375887,0.710942,0.304942,0.680421,0.314076,0.764114,0.223007
5,0.771854,0.438166,0.747566,0.316844,0.702584,0.358923,0.788715,0.326139
6,0.758705,0.431875,0.763515,0.303816,0.689534,0.333694,0.782799,0.326169
7,0.765643,0.437929,0.795878,0.30704,0.697347,0.348085,0.760241,0.346149


In [36]:
metrics_by_k_student_df2.columns

MultiIndex([(           'spectral_clustering',  'RI'),
            (           'spectral_clustering', 'ARI'),
            ('normalized_spectral_clustering',  'RI'),
            ('normalized_spectral_clustering', 'ARI'),
            (                       'louvain',  'RI'),
            (                       'louvain', 'ARI'),
            (            'mst_cut_clustering',  'RI'),
            (            'mst_cut_clustering', 'ARI')],
           )

In [37]:
metrics_by_k_student_df2 - metrics_by_k_normal

Unnamed: 0_level_0,spectral_clustering,spectral_clustering,normalized_spectral_clustering,normalized_spectral_clustering,louvain,louvain,mst_cut_clustering,mst_cut_clustering
Unnamed: 0_level_1,RI,ARI,RI,ARI,RI,ARI,RI,ARI
k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2,-0.191748,-0.522972,-0.268123,-0.566139,-0.202711,-0.430289,-0.257923,-0.90582
3,-0.223394,-0.568462,-0.289107,-0.598917,-0.232522,-0.537173,-0.240089,-0.82079
4,-0.248598,-0.6137,-0.21261,-0.502413,-0.247521,-0.527665,-0.23241,-0.768668
5,-0.22151,-0.544918,-0.163033,-0.430794,-0.22481,-0.481571,-0.199901,-0.646997
6,-0.233489,-0.548475,-0.154399,-0.451024,-0.23907,-0.509431,-0.195754,-0.623859
7,-0.226007,-0.541611,-0.106509,-0.352,-0.233145,-0.499413,-0.192058,-0.544947


In [92]:
#metrics_by_k_student_df2[[col for col in metrics_by_k_student_df2.columns if 'ARI' in col[-1]]]
metrics = {'normal':metrics_by_k_normal,  'student_df3':metrics_by_k_student_df3,'student_df2':metrics_by_k_student_df2}
for name in metrics:
    metrics[name] = metrics[name][[col for col in metrics[name].columns if 'ARI' in col[-1]]]

In [93]:
metrics_per_k = {}
for k in list(metrics.values())[0].index:
    metrics_per_k[k] = pd.DataFrame({name:metrics[name].loc[k] for name in metrics})


In [72]:
metrics_by_k_student_df3

Unnamed: 0_level_0,spectral_clustering,spectral_clustering,normalized_spectral_clustering,normalized_spectral_clustering,louvain,louvain,mst_cut_clustering,mst_cut_clustering
Unnamed: 0_level_1,RI,ARI,RI,ARI,RI,ARI,RI,ARI
k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2,0.876728,0.603785,0.901458,0.790409,0.816651,0.598168,0.805478,0.317737
3,0.88305,0.668444,0.789813,0.561102,0.798781,0.560964,0.848072,0.481888
4,0.899851,0.735427,0.794876,0.49305,0.802453,0.5682,0.888811,0.635033
5,0.89231,0.711241,0.813722,0.478851,0.807034,0.57831,0.89918,0.68233
6,0.890353,0.720138,0.827044,0.475365,0.811217,0.587347,0.916753,0.752456
7,0.891834,0.733673,0.847004,0.455457,0.796872,0.552963,0.887839,0.712289


In [94]:
metrics_per_k[2]

Unnamed: 0,Unnamed: 1,normal,student_df3,student_df2
spectral_clustering,ARI,0.908062,0.603785,0.38509
normalized_spectral_clustering,ARI,0.990746,0.790409,0.424608
louvain,ARI,0.781927,0.598168,0.351638
mst_cut_clustering,ARI,0.978135,0.317737,0.072314


In [95]:
metrics_per_k[3]

Unnamed: 0,Unnamed: 1,normal,student_df3,student_df2
spectral_clustering,ARI,0.98477,0.668444,0.416308
normalized_spectral_clustering,ARI,0.942133,0.561102,0.343215
louvain,ARI,0.855369,0.560964,0.318196
mst_cut_clustering,ARI,0.99394,0.481888,0.17315


In [107]:
metrics_per_k[7].T.diff()

Unnamed: 0_level_0,spectral_clustering,normalized_spectral_clustering,louvain,mst_cut_clustering
Unnamed: 0_level_1,ARI,ARI,ARI,ARI
normal,,,,
student_df3,-0.245867,-0.203583,-0.294536,-0.178807
student_df2,-0.295744,-0.148417,-0.204878,-0.36614


In [103]:
metrics['normal']

Unnamed: 0_level_0,spectral_clustering,normalized_spectral_clustering,louvain,mst_cut_clustering
Unnamed: 0_level_1,ARI,ARI,ARI,ARI
k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,0.908062,0.990746,0.781927,0.978135
3,0.98477,0.942133,0.855369,0.99394
4,0.989587,0.807356,0.841741,0.991676
5,0.983084,0.747638,0.840494,0.973136
6,0.980351,0.75484,0.843125,0.950028
7,0.979539,0.65904,0.847499,0.891096


In [121]:
metrics['normal']

Unnamed: 0_level_0,spectral_clustering,normalized_spectral_clustering,louvain,mst_cut_clustering
Unnamed: 0_level_1,ARI,ARI,ARI,ARI
k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,0.908062,0.990746,0.781927,0.978135
3,0.98477,0.942133,0.855369,0.99394
4,0.989587,0.807356,0.841741,0.991676
5,0.983084,0.747638,0.840494,0.973136
6,0.980351,0.75484,0.843125,0.950028
7,0.979539,0.65904,0.847499,0.891096


In [122]:
for name in metrics:
    metrics[name].to_csv(f'{name}_distribution_overview.csv')

In [124]:
metrics_per_k

{2:                                       normal  student_df3  student_df2
 spectral_clustering            ARI  0.908062     0.603785     0.385090
 normalized_spectral_clustering ARI  0.990746     0.790409     0.424608
 louvain                        ARI  0.781927     0.598168     0.351638
 mst_cut_clustering             ARI  0.978135     0.317737     0.072314,
 3:                                       normal  student_df3  student_df2
 spectral_clustering            ARI  0.984770     0.668444     0.416308
 normalized_spectral_clustering ARI  0.942133     0.561102     0.343215
 louvain                        ARI  0.855369     0.560964     0.318196
 mst_cut_clustering             ARI  0.993940     0.481888     0.173150,
 4:                                       normal  student_df3  student_df2
 spectral_clustering            ARI  0.989587     0.735427     0.375887
 normalized_spectral_clustering ARI  0.807356     0.493050     0.304942
 louvain                        ARI  0.841741     0.5

In [125]:
for k in metrics_per_k:
    metrics_per_k[k].to_csv(f'detailed_view_per_{k}_clusters.csv')