In [2]:
import pandas as pd

idx = []
for year in [2021, 2022, 2023, 2024]:
    for quarter in ['Q1', 'Q2', 'Q3', 'Q4']:
        idx.append(f"{year}_{quarter}")

df_list = [pd.read_csv(f'../data/processed/quarterly/{id}.csv') for id in idx]

In [55]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

cols = df_list[0].columns[3:]
scores_list = []
wcss_list = []

for df in df_list:
    cluster_range = range(2, 40)
    silhouette_scores = []
    wcss_scores = []

    for n_clusters in cluster_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(df[cols])
        score = silhouette_score(df[cols], cluster_labels)
        silhouette_scores.append(score)
        wcss_scores.append(kmeans.inertia_)
    scores_list.append(silhouette_scores)
    wcss_list.append(wcss_scores)


In [None]:
import matplotlib.pyplot as plt

for i, df in enumerate(scores_list):
    plt.figure(figsize=(8, 5))
    plt.plot(cluster_range, silhouette_scores, marker='o')
    plt.title(f'Q{i}')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.grid(True)
    plt.show()

In [None]:
from sklearn.metrics import adjusted_rand_score
import numpy as np

common_symbols = set(df_list[0]['symbol'])
for df in df_list[1:]:
    common_symbols.intersection_update(set(df['symbol']))

# Filter each dataframe to include only common symbols
filtered_dfs = [df[df['symbol'].isin(common_symbols)] for df in df_list]

# Fit k-means models only on these filtered dataframes
kmeans_models = []
for df in filtered_dfs:
    kmeans = KMeans(n_clusters=15, random_state=42, n_init=10)
    kmeans.fit(df[cols])
    kmeans_models.append(kmeans)

# Now compare the clustering results using ARS, but only for common symbols
ars_scores = []
n_models = len(kmeans_models)

# Compare each model with every other model
for i in range(n_models):
    for j in range(i + 1, n_models):
        labels_i = kmeans_models[i].labels_
        labels_j = kmeans_models[j].labels_
        score = adjusted_rand_score(labels_i, labels_j)
        ars_scores.append((i, j, score))

# Print the ARS scores
for i, j, score in ars_scores:
    print(f"Adjusted Rand Score between model {i+1} and model {j+1}: {score}")


In [67]:
def get_ars(k):
    models = []
    for df in filtered_dfs:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(df[cols])
        models.append(kmeans)

    ars_scores = []
    n_models = len(models)
    for i in range(n_models):
        for j in range(i + 1, n_models):
            labels_i = models[i].labels_
            labels_j = models[j].labels_
            score = adjusted_rand_score(labels_i, labels_j)
            ars_scores.append((i, j, score))
    
    return ars_scores

meta_ars_scores = []

for k in range(5,30):
    ars_scores = get_ars(k)
    meta_ars_scores.append((k, ars_scores))

In [69]:
for k, list in enumerate(meta_ars_scores):
    array = np.array(list[1]).T
    print(k)
    print(np.max(array[2]))
    print(np.mean(array[2]))

0
0.6015226938500317
0.3243386308911614
1
0.5550597304817292
0.2535612836378158
2
0.539438994386315
0.26969039213383267
3
0.3999335426685302
0.19677332437868392
4
0.5647754245432869
0.20017506059216353
5
0.4121254833922778
0.1763536325647382
6
0.39047299931351526
0.18431753547365115
7
0.3808104174467782
0.18758308517520017
8
0.37078537292139696
0.1756015471683154
9
0.4150036664916371
0.19202966397815532
10
0.34026766544162035
0.18112211060404662
11
0.4159492196738275
0.19876425878754814
12
0.3367616586830515
0.18872176004188368
13
0.39533902032630724
0.19211512475172293
14
0.3702385943018315
0.18642911563806816
15
0.3244285464428416
0.19509201572280926
16
0.29348561396375134
0.18273530719701186
17
0.4156656337863127
0.1800013186183764
18
0.2978142061218619
0.18368851379056206
19
0.3054639504819712
0.17727916350395448
20
0.3331512349567854
0.1806046074605287
21
0.3294859039809701
0.18100526894598012
22
0.34271840420250277
0.18625543891809213
23
0.30480854576463057
0.18094734272913685
24

In [5]:
from sklearn.metrics import adjusted_rand_score
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

common_symbols = set(df_list[0]['symbol'])
for df in df_list[1:]:
    common_symbols.intersection_update(set(df['symbol']))

filtered_dfs = [df[df['symbol'].isin(common_symbols)].sort_values('symbol') for df in df_list]

cols = filtered_dfs[0].columns[3:]  # Adjust this index based on your actual data structure

kmeans_models = []
for df in filtered_dfs:
    kmeans = KMeans(n_clusters=17, random_state=42, n_init=10)
    kmeans.fit(df[cols])
    kmeans_models.append(kmeans)

# Now compare the clustering results using ARS, but only for common symbols
ars_scores = []
n_models = len(kmeans_models)

# Compare each model with every other model
for i in range(n_models):
    for j in range(i + 1, n_models):
        labels_i = kmeans_models[i].labels_
        labels_j = kmeans_models[j].labels_
        score = adjusted_rand_score(labels_i, labels_j)
        ars_scores.append((i, j, score))

# Print the ARS scores
for i, j, score in ars_scores:
    print(f"Adjusted Rand Score between model {i+1} and model {j+1}: {score}")


Adjusted Rand Score between model 1 and model 2: 0.26248789001059003
Adjusted Rand Score between model 1 and model 3: 0.206102636127567
Adjusted Rand Score between model 1 and model 4: 0.13059965375233443
Adjusted Rand Score between model 1 and model 5: 0.18711848499570904
Adjusted Rand Score between model 1 and model 6: 0.11645774279958483
Adjusted Rand Score between model 1 and model 7: 0.14016603516226947
Adjusted Rand Score between model 1 and model 8: 0.14686107513730262
Adjusted Rand Score between model 1 and model 9: 0.21491689433781092
Adjusted Rand Score between model 1 and model 10: 0.15213197621212093
Adjusted Rand Score between model 1 and model 11: 0.1877252743897785
Adjusted Rand Score between model 1 and model 12: 0.14176047804788508
Adjusted Rand Score between model 1 and model 13: 0.15756595308077542
Adjusted Rand Score between model 1 and model 14: 0.13019363140490742
Adjusted Rand Score between model 1 and model 15: 0.11955650158280873
Adjusted Rand Score between mod