In [1]:
from src.StocksReader import ReaderStocksData
from src.correlation import CorrelationMeasurement
from src.clustering_methods import ClusteringMethods
from src.multivariate_distribution import MultivariateDistribution
from src.CBM import CorrelationBlockModel
import itertools
from sklearn.metrics.cluster import adjusted_rand_score
import numpy as np
import pandas as pd
from tqdm import tqdm


# Download data

In [15]:
start_date = "2020-01-01"
end_date =   '2023-01-01'

ReaderData = ReaderStocksData("./data/DataStocks/SP500")
DATA_OF_STOCKS, TICKERS = ReaderData.load_data(str(start_date), str(end_date))

In [3]:
import random
selected_indices = random.sample(range(100), 60)

In [4]:
DATA_OF_STOCKS = [DATA_OF_STOCKS[i] for i in selected_indices]
TICKERS = [TICKERS[i] for i in selected_indices]

IndexError: list index out of range

# Calculate the mean vector and covariance matrix

In [10]:
import numpy as np
import pandas as pd

def get_covariance_matrix(Stocks, tickers) -> pd.DataFrame:
    """
    Calculates the covariance matrix for the given list of stock objects.
    Returns:
    - pd.DataFrame: Covariance matrix.
    """
    covariance_matrix = []
    for stock_1 in Stocks:
        covv = []
        for stock_2 in Stocks:
            covv.append(np.cov(stock_1.returns, stock_2.returns)[0, 1])
        covariance_matrix.append(covv)

    return pd.DataFrame(covariance_matrix, columns = tickers, index = tickers)


def get_mean_vector(Stocks):
    mean_vector = []
    for stock in Stocks:
        mean_vector.append(stock.returns.mean())
    return np.array(mean_vector)

In [11]:
true_cov_matrix = get_covariance_matrix(DATA_OF_STOCKS, TICKERS)
true_mean_vec = get_mean_vector(DATA_OF_STOCKS)

In [12]:
cluster_method = ClusteringMethods()
correlation_method = CorrelationMeasurement()
multivariate_distribution = MultivariateDistribution()
correlation_block_model = CorrelationBlockModel()

params = {
    'clustering_method': [
                          cluster_method.single_clustering, 
                          cluster_method.louvain_clustering,
                          cluster_method.spectral_clustering,
                          cluster_method.normalized_spectral_clustering
                        ],
    'correlation_network': [
                            correlation_method.Pearson, 
                            correlation_method.Kendall,
                            correlation_method.Fechner
                        ],
    'multivariate_distribution': [
                            multivariate_distribution.normal_distribution, 
                            multivariate_distribution.student_distribution
                        ],
    'number_clusters': [2, 4, 6],
    'sample_size_of_observations': [10, 20, 40, 60]
}

params_name = {
    'clustering_method': [
                          'single_clustering', 
                          'louvain_clustering',
                          'spectral_clustering',
                          'normalized_spectral_clustering'
                        ],
    'correlation_network': [
                            'Pearson', 
                            'Kendall',
                            'Fechner'
                        ],
    'multivariate_distribution': [
                            'normal_distribution', 
                            'student_distribution'
                        ],
    'number_clusters': [2, 4, 6],
    'sample_size_of_observations': [10, 20, 40, 60]

}

# Создание всех возможных комбинаций параметров
keys, values = zip(*params.items())
combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]

keys_name, values_name = zip(*params_name.items())
combinations_name = [dict(zip(keys_name, combination)) for combination in itertools.product(*values_name)]

In [8]:
len(combinations)

288

In [9]:
def one_experiment(cluster_method,
                   correlation_method,
                   multivariate_distribution,
                   number_clusters,
                   sample_size_of_observations,
                   Stocks,
                   true_cov_matrix,
                   true_mean_vec,
) -> float:
   correlation_matrix = []
   for stock_1 in Stocks:
      row = []
      for stock_2 in Stocks:
         row.append(correlation_method(stock_1.returns, stock_2.returns))
      correlation_matrix.append(row)
   
   true_labels = cluster_method(np.array(correlation_matrix), number_clusters)

   gen_labels_ = correlation_block_model.clustering(multivariate_distribution = multivariate_distribution,
                                                     mean_vector = true_mean_vec,
                                                     cov_matrix = true_cov_matrix,
                                                     sample_size_of_observations = sample_size_of_observations,
                                                     correlation_method = correlation_method,
                                                     clustering_method = cluster_method,
                                                     number_clusters = number_clusters
                                                     )

   return adjusted_rand_score(true_labels, gen_labels_)
    

In [10]:
results_experiments = pd.DataFrame()
number_repetitions = 100

for combination, combination_name in tqdm(zip(combinations, combinations_name), leave=False):
    ari_score_results = []
    for count in range(number_repetitions):
        result_score = one_experiment(cluster_method = combination['clustering_method'],
                   correlation_method = combination['correlation_network'],
                   multivariate_distribution = combination['multivariate_distribution'],
                   number_clusters = combination['number_clusters'],
                   sample_size_of_observations = combination['sample_size_of_observations'],
                   Stocks = DATA_OF_STOCKS,
                   true_cov_matrix = true_cov_matrix,
                   true_mean_vec = true_mean_vec,
                )
        ari_score_results.append(result_score)
    combination_name['ARI'] = np.mean(ari_score_results)
    results_experiments = results_experiments._append(pd.Series(combination_name), ignore_index=True)
        

                             

In [11]:
# results_experiments.to_csv('./Russia_60_stocks_all_results.csv')

In [12]:
results_experiments

Unnamed: 0,clustering_method,correlation_network,multivariate_distribution,number_clusters,sample_size_of_observations,ARI
0,single_clustering,Pearson,normal_distribution,2,10,0.044896
1,single_clustering,Pearson,normal_distribution,2,20,0.064168
2,single_clustering,Pearson,normal_distribution,2,40,0.121485
3,single_clustering,Pearson,normal_distribution,2,60,0.070888
4,single_clustering,Pearson,normal_distribution,4,10,0.071804
...,...,...,...,...,...,...
283,normalized_spectral_clustering,Fechner,student_distribution,4,60,0.198829
284,normalized_spectral_clustering,Fechner,student_distribution,6,10,0.033827
285,normalized_spectral_clustering,Fechner,student_distribution,6,20,0.064043
286,normalized_spectral_clustering,Fechner,student_distribution,6,40,0.118824


In [33]:
import pandas as pd
def convert_table(data: pd.DataFrame) -> None:
    columns =  params_name['number_clusters']
    rows = params_name['clustering_method']
    for multi_distribution in params_name['multivariate_distribution']:
        for corr_network in params_name['correlation_network']:
            for size_samples in params_name['sample_size_of_observations']:

                filtered_data = data[(data['correlation_network'] == corr_network) & 
                            (data['multivariate_distribution'] == multi_distribution) & 
                            (data['sample_size_of_observations'] == size_samples)]
                
                result = pd.DataFrame(index=list(rows), columns=list(columns))

                for row in rows:
                    for column in columns:
                        ari_value = filtered_data[(filtered_data['clustering_method'] == row) & 
                                                (filtered_data['number_clusters'] == column)]['ARI']
                        
                        if not ari_value.empty:
                            result.loc[row, column] = ari_value.iloc[0]
                result.to_csv(f'./results/Russia/stocks_60/Russia_60_stocks_{multi_distribution}_{corr_network}_{str(size_samples)}.csv')
    

            

In [34]:
df = pd.read_csv("Russia_60_stocks_results.csv")

convert_table(df)