In [1]:
import itertools
import random

import numpy as np
import pandas as pd
from sklearn.metrics.cluster import adjusted_rand_score
from tqdm import tqdm
import pathlib

from src.artificial_cluster_structure import ArtificialСlusterStructure
from src.correlation_block_model import CorrelationBlockModel
from src.params import PARAMS, PARAMS_NAME
from src.Stocks import Stocks
from src.StocksReader import ReaderStocksData

# Load data

In [2]:
# Date window handed to the data reader.
start_date = '2016-01-01'
end_date = '2018-12-31'

# Folder the reader scans for stock files, and folder prefix for all result files.
path_to_data = './data/DataStocks/SP100'
path_to_save = './data/results/SP100_1618_true/'

# Name of the combined results table and base name of the per-setting tables.
name_common_file = 'SP100_60_stocks_all_results_test.csv'
name_specific_files = 'SP100_1618_60_stocks_true'

# Only referenced by the commented-out random-sampling cell in this notebook.
number_stocks = 60

# The dates are already strings, so they are passed to the reader as-is.
ReaderData = ReaderStocksData(path_to_data)
DATA_OF_STOCKS, TICKERS = ReaderData.load_data(start_date, end_date)

ERROR:root:Error processing file data/DataStocks/SP100/.DS_Store: 'utf-8' codec can't decode byte 0x86 in position 27: invalid start byte


In [3]:
# Tickers of the 60 stocks kept for the study; used below to filter the loaded data.
true_tickers = [
    'GILD', 'META', 'LIN', 'V', 'AMGN', 'TSLA', 'ORCL', 'AAPL', 'MA', 'GOOG',
    'GS', 'NKE', 'DIS', 'UPS', 'CHTR', 'GD', 'CRM', 'AMT', 'MET', 'COF',
    'SPG', 'LMT', 'USB', 'ACN', 'COP', 'AIG', 'QCOM', 'BK', 'CAT', 'PM',
    'INTC', 'MRK', 'AXP', 'JPM', 'DUK', 'BA', 'CMCSA', 'BLK', 'AVGO', 'CVS',
    'EMR', 'XOM', 'ABBV', 'GOOGL', 'TMUS', 'AMD', 'TXN', 'CSCO', 'NEE', 'PYPL',
    'MO', 'GE', 'BMY', 'NFLX', 'DHR', 'SBUX', 'F', 'VZ', 'LOW', 'MS',
]

In [4]:
# Keep only the loaded stocks whose ticker is listed in `true_tickers`
# (the loader's ordering is preserved).
res = [stock for stock in DATA_OF_STOCKS if stock.ticker in true_tickers]


In [5]:
# Replace the full universe with the filtered subset.
DATA_OF_STOCKS = res

# Derive the labels from the filtered stocks themselves. The filter keeps the
# loader's ordering, which need not match the ordering of `true_tickers`, so
# reusing `true_tickers` here could attach the wrong ticker to a row/column of
# the covariance matrix built later.
TICKERS = [stock.ticker for stock in DATA_OF_STOCKS]

# Fail early (with the offending names) if some requested ticker has no data.
missing_tickers = sorted(set(true_tickers) - set(TICKERS))
assert not missing_tickers, f'No data loaded for tickers: {missing_tickers}'

In [6]:
# selected_indices = random.sample(range(99), number_stocks)

# DATA_OF_STOCKS = [DATA_OF_STOCKS[i] for i in selected_indices]
# TICKERS = [TICKERS[i] for i in selected_indices]

In [7]:
# Sanity check: number of tickers after filtering.
len(TICKERS)

60

In [8]:
# Sanity check: number of return observations held by the first stock.
# NOTE(review): the recorded output is 250 for a 2016-01-01..2018-12-31 window —
# confirm the loader returns the intended span.
len(DATA_OF_STOCKS[0].returns)

250

# Calculate the mean vector and covariance matrix

In [9]:
def get_covariance_matrix(Stocks: list, tickers: "np.ndarray | list") -> pd.DataFrame:
    """
    Calculates the sample covariance matrix of the stocks' return series.

    Args:
    - Stocks (list): Stock objects, each having a 'returns' attribute. All
      return series must have the same length.
    - tickers (np.ndarray | list): Labels for the rows/columns, given in the
      same order as `Stocks`.

    Returns:
    - pd.DataFrame: Square covariance matrix (np.cov defaults, i.e. normalised
      by N - 1) with `tickers` as both index and columns.

    Raises:
    - ValueError: If the number of tickers differs from the number of stocks,
      or if the return series do not share a common length.
    """
    returns = [np.asarray(stock.returns, dtype=float) for stock in Stocks]

    if len(tickers) != len(returns):
        raise ValueError(
            f'Got {len(tickers)} tickers for {len(returns)} stocks; '
            'they must match one-to-one.'
        )
    if not returns:
        return pd.DataFrame(index=tickers, columns=tickers, dtype=float)

    # One np.cov call on the stacked (stocks x observations) matrix replaces
    # the n^2 pairwise calls. atleast_2d keeps the single-stock case a 1x1
    # matrix, because np.cov returns a 0-d array for a single variable.
    covariance_matrix = np.atleast_2d(np.cov(np.vstack(returns)))

    return pd.DataFrame(covariance_matrix, columns=tickers, index=tickers)


def get_mean_vector(Stocks: list) -> np.ndarray:
    """
    Calculates the mean return of every stock in the given list.

    Args:
    - Stocks (list): Stock objects, each having a 'returns' attribute that
      exposes a `.mean()` method.

    Returns:
    - np.ndarray: One mean return per stock, in the order of `Stocks`
      (an empty array when `Stocks` is empty).
    """
    return np.array([stock.returns.mean() for stock in Stocks])

In [10]:
# Moments of the selected stocks; passed to every experiment below as the
# mean vector / covariance matrix arguments.
true_mean_vec = get_mean_vector(Stocks=DATA_OF_STOCKS)
true_cov_matrix = get_covariance_matrix(Stocks=DATA_OF_STOCKS, tickers=TICKERS)

In [11]:
# Build every possible combination of parameters (Cartesian product of the
# option lists), once for the callables and once for their display names.
keys, values = zip(*PARAMS.items())
combinations = [
    dict(zip(keys, option))
    for option in itertools.product(*values)
]

keys_name, values_name = zip(*PARAMS_NAME.items())
combinations_name = [
    dict(zip(keys_name, option_name))
    for option_name in itertools.product(*values_name)
]
print(f'Total number of combinations: {len(combinations)}')

Total number of combinations: 54


In [12]:
# Inspect the parameter grid: one dict per combination, mapping each parameter
# name to the bound method or value used for that run.
combinations

[{'clustering_method': <bound method ClusteringMethods.normalized_spectral_clustering of <src.clustering_methods.ClusteringMethods object at 0x155af3190>>,
  'correlation_network': <bound method CorrelationMeasurement.Pearson of <src.correlation.CorrelationMeasurement object at 0x155af2f10>>,
  'multivariate_distribution': <bound method MultivariateDistribution.normal_distribution of <src.multivariate_distribution.MultivariateDistribution object at 0x155af2ad0>>,
  'number_clusters': 2,
  'sample_size_of_observations': 40},
 {'clustering_method': <bound method ClusteringMethods.normalized_spectral_clustering of <src.clustering_methods.ClusteringMethods object at 0x155af3190>>,
  'correlation_network': <bound method CorrelationMeasurement.Pearson of <src.correlation.CorrelationMeasurement object at 0x155af2f10>>,
  'multivariate_distribution': <bound method MultivariateDistribution.normal_distribution of <src.multivariate_distribution.MultivariateDistribution object at 0x155af2ad0>>,
  

In [13]:
def one_experiment(
    cluster_method,
    correlation_method,
    multivariate_distribution,
    number_clusters,
    sample_size_of_observations,
    Stocks,
    true_cov_matrix,
    true_mean_vec,
    true_labels=None,
) -> float:
    """
    Runs one repetition of the real-data experiment and returns its ARI.

    The reference partition is obtained by applying `cluster_method` to the
    pairwise correlation matrix of the stocks' returns. It is compared, via
    the adjusted Rand index, with the labels returned by
    `ArtificialСlusterStructure.clustering` for the given mean vector and
    covariance matrix.

    Args:
    - cluster_method: Callable invoked as `cluster_method(matrix, number_clusters)`.
    - correlation_method: Callable invoked as
      `correlation_method(data_1=..., data_2=...)` on two return series.
    - multivariate_distribution: Forwarded to `ArtificialСlusterStructure.clustering`.
    - number_clusters: Number of clusters requested from both clusterings.
    - sample_size_of_observations: Forwarded to `ArtificialСlusterStructure.clustering`.
    - Stocks: Re-iterable collection of stock objects with a 'returns' attribute.
      Ignored when `true_labels` is supplied.
    - true_cov_matrix: Covariance matrix forwarded as `cov_matrix`.
    - true_mean_vec: Mean vector forwarded as `mean_vector`.
    - true_labels: Optional precomputed reference labels. When given, the
      n x n correlation matrix and its clustering are not recomputed, which
      lets a caller hoist that work out of a repetition loop. Defaults to
      None, which keeps the original behaviour of recomputing on every call.

    Returns:
    - float: Adjusted Rand index between reference and generated labels.
    """
    # NOTE: the class name contains a Cyrillic 'С'; it must match the import exactly.
    artificial_cluster_structure = ArtificialСlusterStructure()

    if true_labels is None:
        correlation_matrix = np.array([
            [
                correlation_method(data_1=stock_1.returns, data_2=stock_2.returns)
                for stock_2 in Stocks
            ]
            for stock_1 in Stocks
        ])
        true_labels = cluster_method(correlation_matrix, number_clusters)

    gen_labels_ = artificial_cluster_structure.clustering(
        multivariate_distribution=multivariate_distribution,
        mean_vector=true_mean_vec,
        cov_matrix=true_cov_matrix,
        sample_size_of_observations=sample_size_of_observations,
        correlation_method=correlation_method,
        clustering_method=cluster_method,
        number_clusters=number_clusters,
    )

    return adjusted_rand_score(true_labels, gen_labels_)
    

In [14]:
number_repetitions = 100

# The two grids are zipped pairwise, so a size mismatch would silently drop
# combinations and mislabel the rest.
assert len(combinations) == len(combinations_name), \
    'PARAMS and PARAMS_NAME produce grids of different sizes'

# Rows are collected in a plain list and turned into a DataFrame once. This
# avoids the private `DataFrame._append` API and the quadratic row-by-row growth.
experiment_records = []
try:
    for combination, combination_name in tqdm(
        zip(combinations, combinations_name),
        total=len(combinations),  # zip has no len(); lets tqdm show progress and ETA
        leave=False,
    ):
        print('=======', combination_name)

        ari_score_results = [
            one_experiment(
                cluster_method=combination['clustering_method'],
                correlation_method=combination['correlation_network'],
                multivariate_distribution=combination['multivariate_distribution'],
                number_clusters=combination['number_clusters'],
                sample_size_of_observations=combination['sample_size_of_observations'],
                Stocks=DATA_OF_STOCKS,
                true_cov_matrix=true_cov_matrix,
                true_mean_vec=true_mean_vec,
            )
            for _ in range(number_repetitions)
        ]

        # Build a new dict instead of writing 'ARI' into `combination_name`,
        # so the shared name grid stays clean for other cells and re-runs.
        experiment_records.append({**combination_name, 'ARI': np.mean(ari_score_results)})
finally:
    # Runs even on KeyboardInterrupt, so partial results of a long run are kept.
    results_experiments = pd.DataFrame(experiment_records)
        

0it [00:00, ?it/s]



1it [03:49, 229.30s/it]



2it [07:32, 225.60s/it]



3it [11:14, 223.89s/it]



4it [14:54, 222.54s/it]



5it [18:34, 221.75s/it]



6it [22:16, 221.62s/it]



7it [25:54, 220.52s/it]



8it [30:02, 229.13s/it]



9it [33:50, 228.98s/it]



10it [37:38, 228.45s/it]



11it [41:26, 228.32s/it]



12it [45:15, 228.53s/it]



13it [48:57, 226.78s/it]



14it [52:35, 224.01s/it]



15it [56:14, 222.46s/it]



16it [59:52, 221.05s/it]



17it [1:03:29, 220.08s/it]



18it [1:07:08, 219.71s/it]



19it [1:10:47, 219.40s/it]



20it [1:14:26, 219.19s/it]



21it [1:18:05, 219.24s/it]



22it [1:21:44, 219.10s/it]



23it [1:25:23, 218.99s/it]



24it [1:29:02, 219.14s/it]



25it [1:32:41, 218.99s/it]



26it [1:36:20, 218.97s/it]



27it [1:40:00, 219.31s/it]



28it [1:43:39, 219.24s/it]



29it [1:47:17, 219.06s/it]



30it [1:50:57, 219.30s/it]



31it [1:54:36, 219.20s/it]



32it [1:58:15, 219.00s/it]



33it [2:01:54, 219.17s/it]



34it [2:05:33, 219.02s/it]



35it [2:09:12, 218.95s/it]



36it [2:12:52, 219.24s/it]



37it [2:13:28, 164.48s/it]



38it [2:14:05, 126.16s/it]



39it [2:14:43, 99.60s/it] 



40it [2:15:20, 80.74s/it]



41it [2:15:56, 67.60s/it]



42it [2:16:34, 58.59s/it]



43it [2:17:11, 52.04s/it]



44it [2:17:48, 47.48s/it]



45it [2:18:25, 44.54s/it]



46it [2:19:02, 42.16s/it]



47it [2:19:39, 40.53s/it]



48it [2:20:16, 39.67s/it]



49it [2:20:53, 38.83s/it]



50it [2:21:30, 38.23s/it]



51it [2:22:08, 38.07s/it]



52it [2:22:44, 37.67s/it]



53it [2:23:21, 37.45s/it]



                         

In [10]:
# Display the table of mean ARI values per parameter combination.
results_experiments

Unnamed: 0,clustering_method,correlation_network,multivariate_distribution,number_clusters,sample_size_of_observations,ARI
0,single_clustering,Pearson,normal_distribution,2,10,0.110729
1,single_clustering,Pearson,normal_distribution,2,20,0.146771
2,single_clustering,Pearson,normal_distribution,2,40,0.300695
3,single_clustering,Pearson,normal_distribution,2,60,0.058854
4,single_clustering,Pearson,normal_distribution,4,10,0.090857
...,...,...,...,...,...,...
211,spectral_clustering,Fechner,student_distribution,4,60,0.349267
212,spectral_clustering,Fechner,student_distribution,6,10,0.512834
213,spectral_clustering,Fechner,student_distribution,6,20,0.150226
214,spectral_clustering,Fechner,student_distribution,6,40,0.235105


In [15]:
# Make sure the output folder exists so a long run is not lost at save time.
pathlib.Path(path_to_save).mkdir(parents=True, exist_ok=True)
# index=False: the default RangeIndex would otherwise be written as an extra
# column and come back as 'Unnamed: 0' when the file is read again.
results_experiments.to_csv(path_to_save + name_common_file, index=False)
# '/data/results/Brazil/stocks_20'
# /data/results/Brazil/stocks_20

# Saving data in the required form

In [16]:
def convert_table(data: pd.DataFrame, path: "str | pathlib.Path", base_name_file: str) -> None:
    """
    Splits the combined results table into one pivot table per setting and
    saves each of them as a CSV file.

    For every (multivariate distribution, correlation network, sample size)
    triple listed in PARAMS_NAME, the matching rows of `data` are pivoted into
    a clustering-method (rows) by number-of-clusters (columns) table of ARI
    values. Triples without matching rows are skipped.

    Args:
    - data (pd.DataFrame): Results with the columns 'clustering_method',
      'correlation_network', 'multivariate_distribution',
      'sample_size_of_observations', 'number_clusters' and 'ARI'.
    - path (str | pathlib.Path): Output location. A string is used as a plain
      prefix (so it should end with a path separator to denote a folder); a
      path object is treated as the output directory.
    - base_name_file (str): Leading part of every generated file name.

    Returns:
    - None. Files named
      '<base_name_file>_<distribution>_<correlation>_<sample size>.csv' are written.
    """
    clustering_order = PARAMS_NAME['clustering_method']

    # itertools.product iterates in the same order as the equivalent nested loops.
    for multi_distribution, corr_network, size_samples in itertools.product(
        PARAMS_NAME['multivariate_distribution'],
        PARAMS_NAME['correlation_network'],
        PARAMS_NAME['sample_size_of_observations'],
    ):
        filtered_data = data[(data['correlation_network'] == corr_network) &
                             (data['multivariate_distribution'] == multi_distribution) &
                             (data['sample_size_of_observations'] == size_samples)]

        if filtered_data.empty:
            continue

        # `.assign` returns a new frame, so the categorical column is never
        # written into a slice of `data` (avoids SettingWithCopyWarning).
        # The ordered categorical fixes the row order to `clustering_order`.
        filtered_data = (
            filtered_data[filtered_data['clustering_method'].isin(clustering_order)]
            .assign(
                clustering_method=lambda frame: pd.Categorical(
                    frame['clustering_method'],
                    categories=clustering_order,
                    ordered=True,
                )
            )
        )

        result = pd.pivot_table(
            filtered_data,
            values='ARI',
            index='clustering_method',
            columns='number_clusters',
            aggfunc='first',
            sort=False,
            observed=False,
        )

        file_name = f'{base_name_file}_{multi_distribution}_{corr_network}_{size_samples}.csv'
        if isinstance(path, str):
            # Keep the historical prefix semantics for string paths.
            destination = path + file_name
        else:
            # `Path + str` is a TypeError; join path objects properly.
            destination = pathlib.Path(path) / file_name
        result.to_csv(destination)


In [17]:
# name_specific_files = 'SP100_1618_60_stocks'
# Reload the combined results from disk, then write the per-setting tables
# next to them.
df = pd.read_csv(f'{path_to_save}{name_common_file}')
convert_table(df, path_to_save, name_specific_files)

In [14]:
# TODO: rename the files



# Correlation block model


In [28]:
def one_experiment_cbm(
    cluster_method,
    correlation_method,
    multivariate_distribution,
    number_clusters,
    sample_size_of_observations,
    true_cov_matrix,
    true_mean_vec,
    true_labels,
) -> float:
    """
    Runs one repetition against a known partition and returns its ARI.

    The labels returned by `ArtificialСlusterStructure.clustering` for the
    given mean vector / covariance matrix are compared with `true_labels`
    using the adjusted Rand index.

    Args:
    - cluster_method: Forwarded as `clustering_method`.
    - correlation_method: Forwarded as `correlation_method`.
    - multivariate_distribution: Forwarded as `multivariate_distribution`.
    - number_clusters: Forwarded as `number_clusters`.
    - sample_size_of_observations: Forwarded as `sample_size_of_observations`.
    - true_cov_matrix: Forwarded as `cov_matrix`.
    - true_mean_vec: Forwarded as `mean_vector`.
    - true_labels: Reference labels the generated labels are scored against.

    Returns:
    - float: Adjusted Rand index between `true_labels` and the generated labels.
    """
    generator = ArtificialСlusterStructure()
    clustering_kwargs = {
        'multivariate_distribution': multivariate_distribution,
        'mean_vector': true_mean_vec,
        'cov_matrix': true_cov_matrix,
        'sample_size_of_observations': sample_size_of_observations,
        'correlation_method': correlation_method,
        'clustering_method': cluster_method,
        'number_clusters': number_clusters,
    }
    predicted_labels = generator.clustering(**clustering_kwargs)

    return adjusted_rand_score(true_labels, predicted_labels)
    

In [3]:
# Build every possible combination of parameters (Cartesian product of the
# option lists), once for the callables and once for their display names.
keys, values = zip(*PARAMS.items())
combinations = [
    dict(zip(keys, option))
    for option in itertools.product(*values)
]

keys_name, values_name = zip(*PARAMS_NAME.items())
combinations_name = [
    dict(zip(keys_name, option_name))
    for option_name in itertools.product(*values_name)
]
print(f'Total number of combinations: {len(combinations)}')
print(f'Total number of name combinations: {len(combinations_name)}')

Total number of combinations: 216
Total number of name combinations: 216


In [29]:
number_repetitions = 100
number_vertices = 60

# The two grids are zipped pairwise, so a size mismatch would silently drop
# combinations and mislabel the rest.
assert len(combinations) == len(combinations_name), \
    'PARAMS and PARAMS_NAME produce grids of different sizes'

# Rows are collected in a plain list and turned into a DataFrame once. This
# avoids the private `DataFrame._append` API and the quadratic row-by-row growth.
experiment_records = []
try:
    for combination, combination_name in tqdm(
        zip(combinations, combinations_name),
        total=len(combinations),  # zip has no len(); lets tqdm show progress and ETA
        leave=False,
    ):
        print('=======', combination_name)

        # Floor division: when number_vertices is not a multiple of the cluster
        # count, the model ends up with fewer than number_vertices vertices.
        block_model = CorrelationBlockModel(
            num_clusters=combination['number_clusters'],
            size_cluster=number_vertices // combination['number_clusters'],
            r_in=0.8,
            r_out=0.1,
        )
        # `cbm` is indexed below by 'covariance_matrix', 'mean_vector' and 'labels'.
        cbm = block_model.create_correlation_block_model()

        ari_score_results = [
            one_experiment_cbm(
                cluster_method=combination['clustering_method'],
                correlation_method=combination['correlation_network'],
                multivariate_distribution=combination['multivariate_distribution'],
                number_clusters=combination['number_clusters'],
                sample_size_of_observations=combination['sample_size_of_observations'],
                true_cov_matrix=cbm['covariance_matrix'],
                true_mean_vec=cbm['mean_vector'],
                true_labels=cbm['labels'],
            )
            for _ in range(number_repetitions)
        ]

        # Build a new dict instead of writing 'ARI' into `combination_name`,
        # so the shared name grid stays clean for other cells and re-runs.
        experiment_records.append({**combination_name, 'ARI': np.mean(ari_score_results)})
finally:
    # Runs even on KeyboardInterrupt, so partial results of a long run are kept.
    results_experiments = pd.DataFrame(experiment_records)
        

0it [00:00, ?it/s]



1it [01:49, 109.84s/it]



                       

KeyboardInterrupt: 