In [3]:
import numpy as np
from scipy import stats
import networkx as nx
from networkx.algorithms import tree

In [4]:
from algorithm.louvain import louvain
from algorithm.spectral_clustering import spectral_clustering, normalized_spectral_clustering
from algorithm.hierarchical_clustering import mst_cut_clustering
from utils.analyze import get_rs_from_fixed_weighted_degree, compute_clustering, validation, metrics_to_df, set_zero_weights_to_very_low,get_cor_from_cov

## Stocks clustering

In [5]:
dow30_tickers = ['AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 
                'CSCO', 'CVX', 'GS', 'HD', 'HON',
                'IBM', 'INTC', 'JNJ', 'KO', 'JPM',
                'MCD', 'MMM', 'MRK', 'MSFT', 'NKE',
                'PG', 'TRV', 'UNH', 'CRM', 'VZ',
                'V', 'WBA', 'WMT', 'DIS', 'DOW'
                ]
len(dow30_tickers)

30

In [4]:
import yfinance as yf

START_DATE  ="2022-01-01"
END_DATE ="2023-01-01"
data = yf.download(' '.join(dow30_tickers), start=START_DATE, end=END_DATE,
                                 group_by='ticker')

[*********************100%***********************]  30 of 30 completed


In [5]:
data = data[[(ticker, 'Close') for ticker in dow30_tickers]].copy()
data = data.rename(columns = {name:name[0] for name in data.columns})
data.columns = dow30_tickers
#data = data.rename(columns = {name:name[0] for name in data.columns})
data.head()

Unnamed: 0_level_0,AXP,AMGN,AAPL,BA,CAT,CSCO,CVX,GS,HD,HON,...,PG,TRV,UNH,CRM,VZ,V,WBA,WMT,DIS,DOW
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-31,163.600006,224.970001,177.570007,201.320007,206.740005,63.369999,117.349998,382.549988,415.01001,208.509995,...,163.580002,156.429993,502.140015,254.130005,51.959999,216.710007,52.16,144.690002,154.889999,56.720001
2022-01-03,168.210007,226.690002,182.009995,207.860001,207.0,63.16,119.260002,395.329987,408.640015,206.800003,...,162.899994,155.75,502.279999,255.460007,52.439999,221.429993,53.060001,144.649994,156.759995,56.869999
2022-01-04,173.610001,227.839996,179.699997,213.630005,218.080002,61.25,121.43,407.480011,412.839996,209.0,...,163.470001,159.0,490.899994,248.229996,53.470001,222.460007,53.48,142.0,155.729996,58.41
2022-01-05,171.759995,225.139999,174.919998,213.070007,219.75,60.279999,122.220001,398.630005,407.23999,211.059998,...,164.210007,159.770004,489.690002,227.669998,54.02,220.0,54.0,143.919998,155.190002,58.349998
2022-01-06,172.899994,225.169998,172.0,211.339996,221.990005,60.919998,123.260002,396.929993,405.76001,210.820007,...,162.830002,162.330002,469.649994,229.149994,53.759998,219.75,52.439999,143.520004,156.899994,58.240002


In [7]:
data = data.apply(lambda x: np.log(x/x.shift(1)))
data = data.dropna()
data

Unnamed: 0_level_0,AXP,AMGN,AAPL,BA,CAT,CSCO,CVX,GS,HD,HON,...,PG,TRV,UNH,CRM,VZ,V,WBA,WMT,DIS,DOW
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,0.027789,0.007616,0.024697,0.031969,0.001257,-0.003319,0.016145,0.032861,-0.015468,-0.008235,...,-0.004166,-0.004356,0.000279,0.005220,0.009195,0.021546,0.017107,-0.000277,0.012001,0.002641
2022-01-04,0.031598,0.005060,-0.012773,0.027381,0.052143,-0.030707,0.018032,0.030271,0.010225,0.010582,...,0.003493,0.020652,-0.022917,-0.028710,0.019451,0.004641,0.007884,-0.018490,-0.006592,0.026719
2022-01-05,-0.010713,-0.011921,-0.026960,-0.002625,0.007629,-0.015963,0.006485,-0.021958,-0.013657,0.009808,...,0.004517,0.004831,-0.002468,-0.086459,0.010234,-0.011120,0.009676,0.013431,-0.003474,-0.001028
2022-01-06,0.006615,0.000133,-0.016834,-0.008153,0.010142,0.010561,0.008473,-0.004274,-0.003641,-0.001138,...,-0.008439,0.015896,-0.041785,0.006480,-0.004825,-0.001137,-0.029314,-0.002783,0.010958,-0.001887
2022-01-07,0.008523,0.009371,0.000988,0.019493,0.009862,0.003441,0.014258,0.001460,-0.030401,0.023116,...,-0.000553,0.023801,-0.023809,-0.003672,0.008889,-0.012778,0.026347,0.009500,0.005910,0.014658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,0.011699,-0.005064,-0.002802,0.004294,0.009761,0.003376,0.030448,-0.000203,0.008223,0.007321,...,0.002821,0.012106,0.007974,0.001933,0.002607,0.003748,0.004931,0.002019,0.015343,0.013660
2022-12-27,-0.004294,-0.002010,-0.013976,0.001797,0.013540,0.001053,0.012492,-0.010299,0.002569,0.004758,...,0.008677,0.005264,0.001279,0.009381,0.021634,0.002232,-0.008318,0.000278,-0.018810,0.006467
2022-12-28,-0.016392,-0.007508,-0.031166,-0.005400,-0.015585,-0.009725,-0.014862,-0.003222,-0.012026,-0.012644,...,-0.013010,-0.012414,-0.006676,-0.016903,-0.011273,-0.006322,-0.019239,-0.017678,-0.025802,-0.023721
2022-12-29,0.022724,0.006634,0.027931,0.002810,0.003420,0.009094,0.007543,0.007482,0.014714,0.011992,...,0.004137,0.006939,0.002702,0.031189,0.011528,0.014865,-0.002931,0.006068,0.035136,0.013116


In [10]:
cov = np.cov(data.T)
cov = set_zero_weights_to_very_low(cov)

In [11]:
def check(col1, col2):
    # вычислить коэффициент преобразования
    # при этом надо найти ненулевые значения иначе
    first1 = -1
    first2 = -1

    for i in range(len(col1)):
        if col1[i] != 0 and first1 != -1:
            first1 = i

        if col2[i] != 0 and first2 != -1:
            first2 = i

    # если ненулевых элементов не найдено - выйти
    if first1 == 0 and first2 == 0:
        return True

    # найти коэффициент
    first = max(first1, first2)
    coeff = col1[first] / col2[first]

    # проверить элементы столбцов - можно ли их представить в виде elem2 = elem1 * const
    for i in range(len(col1)):
        if abs(col1[i] - col2[i] * coeff) > 1e-10:
            return False

    return True

# проверить, является ли матрица вырожденной
def is_invertible(matrix):
    n = len(matrix)
    for i in range(n - 1):
        for j in range(i + 1, n):
            if check(matrix[i], matrix[j]):
                return True
    return False

is_invertible(cov)

False

In [15]:
import pandas as pd

In [16]:
cor = get_cor_from_cov(cov)
k=2
algos = [spectral_clustering, normalized_spectral_clustering, louvain, mst_cut_clustering ]
result = dict()
for algo in algos:
    result[algo.__name__] = algo(cor,k)
pd.DataFrame(result)

Unnamed: 0,spectral_clustering,normalized_spectral_clustering,louvain,mst_cut_clustering
0,0,1,2,0
1,0,0,3,0
2,0,1,0,0
3,0,1,2,0
4,0,1,1,0
5,0,1,3,0
6,1,1,1,0
7,0,1,2,0
8,0,1,0,0
9,0,1,4,0


In [17]:
sample_vol = 250
distribution = np.random.multivariate_normal
mean = np.mean(data)
samples = distribution(mean, cov, sample_vol).T
esimated_cor = np.corrcoef(samples)

In [18]:
k=2
algos = [spectral_clustering, normalized_spectral_clustering, louvain, mst_cut_clustering ]
result_e = dict()
for algo in algos:
    result_e[algo.__name__] = algo(esimated_cor,k)
pd.DataFrame(result)

Unnamed: 0,spectral_clustering,normalized_spectral_clustering,louvain,mst_cut_clustering
0,0,1,2,0
1,0,0,3,0
2,0,1,0,0
3,0,1,2,0
4,0,1,1,0
5,0,1,3,0
6,1,1,1,0
7,0,1,2,0
8,0,1,0,0
9,0,1,4,0


In [19]:
def stock_uncertainty(cor, num_repeats, num_clusters, algos):
    
    result = dict()    
    sample_vol = 250

    distribution = np.random.multivariate_normal

    mean = np.mean(data)
    samples = np.hsplit(distribution(mean, cov, sample_vol * num_repeats).T, num_repeats)
    samples_bag = generate_samples_bag(mean, cov, bags = num_repeats, sample_size=sample_vol)
    estimated_graphs_bag = [set_zero_weights_to_very_low(np.corrcoef(sample)) for sample in samples_bag]
    true_labels = dict()
    for algo in algos:
        true_labels[algo.__name__] = algo(cor, num_clusters)
        algo_result = []
        for estimated_graph in estimated_graphs_bag:
            algo_result.append(algo(estimated_graph, num_clusters))       
        result[algo.__name__] = algo_result
    
    metrics_by_algos = dict()
    for algo in result:
        metrics = dict()
        metrics['RI'] = np.array([rand_score(true_labels[algo], labels) for labels in result[algo]]).T
        metrics['ARI'] = np.array([adjusted_rand_score(true_labels[algo], labels) for labels in result[algo]]).T
        metrics_by_algos[algo] = metrics    
    return true_labels, result, estimated_graphs_bag, metrics_by_algos


def stock_uncertainty_different_k(cor, num_repeats, ks, algos):
    metrics = dict()
    for k in ks:
        true_labels, result, estimated_graphs_bag, metrics[k] = stock_uncertainty(cor, num_repeats, k, algos)
    for k in metrics:
        for algo in metrics[k]:
            for metric_type in metrics[k][algo]:
                metrics[k][algo][metric_type] = np.mean(metrics[k][algo][metric_type])
    
    metric_by_k = dict()
    for k in metrics:
        for algo in metrics[k]:
            metric_by_k[algo] = dict()
            for metric_type in metrics[k][algo]:
                metric_by_k[algo][metric_type] = []
    for k in metrics:
        for algo in metrics[k]:
            for metric_type in metrics[k][algo]:
                metric_by_k[algo][metric_type].append(metrics[k][algo][metric_type])
    df  = pd.DataFrame(nested_dict_to_dict(metric_by_k), index = None)
    df['k'] = ks
    df.set_index('k', inplace=True)
    return df


In [20]:
algos = [spectral_clustering, normalized_spectral_clustering, louvain, mst_cut_clustering ]
true_labels, result, graphs, metrics = stock_uncertainty(cor, 400, 2, algos)

NameError: name 'generate_samples_bag' is not defined

In [196]:
df  = pd.DataFrame(nested_dict_to_dict(metrics), index = None)
df

Unnamed: 0_level_0,spectral_clustering,spectral_clustering,normalized_spectral_clustering,normalized_spectral_clustering,louvain,louvain,mst_cut_clustering,mst_cut_clustering
Unnamed: 0_level_1,RI,ARI,RI,ARI,RI,ARI,RI,ARI
0,1.000000,1.000000,0.813793,0.627407,0.983908,0.946141,1.000000,1.000000
1,0.871264,-0.034483,0.668966,0.339519,0.864368,0.574597,0.871264,-0.034483
2,1.000000,1.000000,0.871264,0.742582,0.926437,0.766121,1.000000,1.000000
3,1.000000,1.000000,0.813793,0.627202,0.924138,0.768307,0.871264,-0.034483
4,1.000000,1.000000,0.933333,0.866603,0.889655,0.664460,1.000000,1.000000
...,...,...,...,...,...,...,...,...
395,1.000000,1.000000,0.813793,0.627407,0.974713,0.915365,0.871264,-0.034483
396,1.000000,1.000000,0.871264,0.742582,0.947126,0.842647,0.871264,-0.034483
397,1.000000,1.000000,0.871264,0.742299,0.908046,0.727503,1.000000,1.000000
398,0.871264,-0.034483,1.000000,1.000000,0.931034,0.770279,0.871264,-0.034483


In [199]:
metrics_by_k = stock_uncertainty_different_k(cor, 400, [2,3,4,5,6,7], algos)

100%|██████████| 4/4 [00:00<00:00,  5.98it/s]
100%|██████████| 4/4 [00:00<00:00,  5.93it/s]
100%|██████████| 4/4 [00:00<00:00,  5.98it/s]
100%|██████████| 4/4 [00:00<00:00,  5.83it/s]
100%|██████████| 4/4 [00:00<00:00,  5.77it/s]
100%|██████████| 4/4 [00:00<00:00,  5.75it/s]


In [200]:
metrics_by_k

Unnamed: 0_level_0,spectral_clustering,spectral_clustering,normalized_spectral_clustering,normalized_spectral_clustering,louvain,louvain,mst_cut_clustering,mst_cut_clustering
Unnamed: 0_level_1,RI,ARI,RI,ARI,RI,ARI,RI,ARI
k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2,0.948822,0.7067,0.793966,0.588575,0.901557,0.716364,0.947644,0.586053
3,0.903339,0.702523,0.866943,0.69818,0.909552,0.714407,0.922897,0.674871
4,0.840603,0.6376,0.76442,0.379552,0.886443,0.672291,0.886925,0.651238
5,0.793885,0.580203,0.810621,0.402393,0.910408,0.718182,0.887069,0.717196
6,0.742822,0.491021,0.845006,0.47053,0.903839,0.704055,0.856724,0.680197
7,0.77304,0.535907,0.867563,0.476195,0.904149,0.714463,0.817793,0.622928
