In [None]:
import os # https://docs.python.org/3/library/os.html
import pandas as pd # https://pandas.pydata.org/docs/getting_started/install.html

# Folder path where CSV files are located
folder_path = '' # Your path to a 'Train' folder

# Fail early with a readable message when the placeholder path was not filled in
if not os.path.isdir(folder_path):
    raise FileNotFoundError(
        f"folder_path={folder_path!r} is not an existing directory; "
        "set it to the 'Train' folder that contains the CSV files."
    )

# List all CSV files in the folder.
# os.listdir returns entries in arbitrary order, so sort them to make the
# row order of the combined DataFrame reproducible between runs.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
if not csv_files:
    # pd.concat([]) would otherwise fail with "No objects to concatenate"
    raise FileNotFoundError(f"No .csv files found in {folder_path!r}")

# Initialize an empty list to store DataFrames
dfs = []

# Iterate over each CSV file and read it into a DataFrame
for csv_file in csv_files:
    # Build the full path to the CSV file
    file_path = os.path.join(folder_path, csv_file)

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path) # https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

    # Append the DataFrame to the Python list (list.append, not DataFrame.append)
    dfs.append(df) # https://docs.python.org/3/tutorial/datastructures.html#more-on-lists

# Combine all DataFrames into a single DataFrame with a fresh 0..n-1 index
full_df = pd.concat(dfs, ignore_index=True) # https://pandas.pydata.org/docs/reference/api/pandas.concat.html

# Print the first few rows of the combined DataFrame
print(full_df.head())

In [None]:
# Parse the 'Date' column into datetimes, then order all rows by that column.
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
full_df = (
    full_df
    .assign(Date=pd.to_datetime(full_df['Date']))
    .sort_values(by='Date')
)
print(full_df)

In [None]:
from sklearn.preprocessing import LabelEncoder # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

# Turn every ticker symbol into an integer code
encoder = LabelEncoder().fit(full_df['Symbol'])
full_df['Symbol_Encoded'] = encoder.transform(full_df['Symbol'])

# Keep lookups from integer code back to the symbol and to the coin name;
# later cells use name_mapping to print readable cluster members.
symbol_mapping = {
    code: symbol
    for code, symbol in zip(full_df["Symbol_Encoded"], full_df["Symbol"])
}
name_mapping = {
    code: name
    for code, name in zip(full_df["Symbol_Encoded"], full_df["Name"])
}

# The text columns are now redundant with Symbol_Encoded, so drop them
df_encoded = full_df.drop(['Symbol', 'Name'], axis=1)

print("Symbol Mapping:", symbol_mapping)
# Show the first rows of the encoded DataFrame
print(df_encoded.head())

In [None]:
# Feature engineering

import seaborn as sns # https://seaborn.pydata.org/tutorial/introduction.html
import matplotlib.pyplot as plt # https://matplotlib.org/2.0.2/users/pyplot_tutorial.html

# Calculate the correlation matrix on numeric/boolean columns only.
# df_encoded still carries the datetime 'Date' column; since pandas 2.0
# DataFrame.corr defaults to numeric_only=False and no longer silently drops
# non-numeric columns, so they are filtered out explicitly here. This also
# matches what older pandas versions did implicitly.
correlation_matrix = df_encoded.select_dtypes(include=['number', 'bool']).corr() # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html

# Get correlations with the target column
target_column = 'Close'  # Replace with your target column name
correlation_with_target = correlation_matrix[target_column]

# Display the correlation with target, sorted in descending order
print(correlation_with_target.sort_values(ascending=False))

In [None]:
# Count the missing (NaN/None) entries per column of the encoded dataset
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isnull.html
null_mask = df_encoded.isnull()
missing_values = null_mask.sum()
print(missing_values)

In [None]:
# Experiment 1
from tslearn.clustering import TimeSeriesKMeans # https://tslearn.readthedocs.io/en/stable/gen_modules/clustering/tslearn.clustering.TimeSeriesKMeans.html
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import silhouette_score, silhouette_samples

# Step 1: Aggregate the data for each cryptocurrency to get a single row for each
# Group by cryptocurrency symbol and calculate mean for each feature
feature_columns = ['High', 'Low', 'Open', 'Marketcap', 'Volume',
                   'trend', 'MA_7d', 'MA_14d', 'MA_30d']
crypto_features = df_encoded.groupby('Symbol_Encoded')[feature_columns].mean()
n_clusters = 6

# Step 2: Snapshot the pure feature values (no scaling in this experiment).
# crypto_features gets a label column below; clustering and scoring must only
# ever see the features, otherwise the labels leak into the silhouette score.
feature_matrix = crypto_features[feature_columns].to_numpy()

# Step 3: Apply KMeans clustering
# NOTE(review): each row is a vector of per-feature means, which tslearn treats
# as a univariate series of length 9 under DTW - confirm this is intended.
kmeans = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw",
                          max_iter=10, random_state=42)
crypto_features['KMeans_Cluster'] = kmeans.fit_predict(feature_matrix)

# Step 4: Create a mapping of Symbol_Encoded to the cluster assignment
# NOTE(review): this rebinds symbol_mapping, which an earlier cell defined as
# code -> Symbol; the name is kept so cells relying on it keep working.
symbol_mapping = dict(zip(crypto_features.index, crypto_features['KMeans_Cluster']))

# Step 5: Merge the cluster labels back into the original DataFrame
# Merge based on Symbol_Encoded to ensure each cryptocurrency gets its assigned cluster
df_encoded['KMeans_Cluster'] = df_encoded['Symbol_Encoded'].map(symbol_mapping) # https://pandas.pydata.org/docs/reference/api/pandas.Series.map.html

# Step 6: Group cryptocurrencies by cluster and display the results
clustered_cryptos = {}

# Create a dictionary of clusters and their corresponding cryptocurrencies
for cluster in df_encoded['KMeans_Cluster'].unique(): # https://pandas.pydata.org/docs/reference/api/pandas.unique.html
    cluster_symbols = df_encoded[df_encoded['KMeans_Cluster'] == cluster]['Symbol_Encoded'].unique()
    cluster_names = [name_mapping[symbol] for symbol in cluster_symbols]
    clustered_cryptos[cluster] = cluster_names
# Display the clusters
print("\nCryptocurrencies Grouped by Cluster:")
for cluster, names in clustered_cryptos.items(): # https://python-reference.readthedocs.io/en/latest/docs/dict/items.html
    print(f"Cluster {cluster}: {', '.join(names)}") # https://python-reference.readthedocs.io/en/latest/docs/str/join.html

# Step 7: Calculate Silhouette Score on the features only (label column excluded)
# NOTE(review): sklearn scores with Euclidean distance while the clustering
# used DTW, so the two are not measured in the same metric.
silhouette_avg = silhouette_score(feature_matrix, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

# Visualize silhouette scores for each data point
silhouette_values = silhouette_samples(feature_matrix, kmeans.labels_)
# Plotting the silhouette scores
plt.figure(figsize=(8, 6))
y_lower = 10  # Initial y position for the first cluster
for i in range(n_clusters):
    # Aggregate the silhouette scores for samples belonging to the current cluster
    ith_cluster_silhouette_values = silhouette_values[kmeans.labels_ == i]
    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i  # Update y position for the next cluster
    plt.fill_betweenx(
        range(y_lower, y_upper),
        ith_cluster_silhouette_values,
        alpha=0.7,
        label=f'Cluster {i}'  # same 0-based ids as the printed cluster listing
    )
    # Leave a 10-unit gap before the next cluster's band
    y_lower = y_upper + 10

plt.axvline(x=silhouette_avg, color='red', linestyle='--')
plt.title(f'Silhouette Scores for n = {n_clusters}')
plt.xlabel('Silhouette Coefficient')
plt.ylabel('Cluster Label')
plt.legend()
plt.show()


In [None]:
# Experiment 2
from tslearn.clustering import TimeSeriesKMeans
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import silhouette_score, silhouette_samples

# Step 1: Aggregate the data for each cryptocurrency to get a single row for each
# Group by cryptocurrency symbol and calculate mean for each feature.
# (The previous expression called .mean() on a plain Python list inside the
# column selector, which raises AttributeError before anything is computed.)
feature_columns = ['High', 'Low', 'Open', 'Marketcap', 'Volume',
                   'trend', 'MA_7d', 'MA_14d', 'MA_30d']
crypto_features = df_encoded.groupby('Symbol_Encoded')[feature_columns].mean()
n_clusters = 2

# Step 2: Scale the data with RobustScaler
scaler = RobustScaler()
crypto_scaled = scaler.fit_transform(crypto_features)

# Step 3: Apply KMeans clustering on the scaled features
kmeans = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw",
                          max_iter=10, random_state=42)
crypto_features['KMeans_Cluster'] = kmeans.fit_predict(crypto_scaled)

# Step 4: Create a mapping of Symbol_Encoded to the cluster assignment
# NOTE(review): this rebinds symbol_mapping, which an earlier cell defined as
# code -> Symbol; the name is kept so cells relying on it keep working.
symbol_mapping = dict(zip(crypto_features.index, crypto_features['KMeans_Cluster']))

# Step 5: Merge the cluster labels back into the original DataFrame
# Merge based on Symbol_Encoded to ensure each cryptocurrency gets its assigned cluster
df_encoded['KMeans_Cluster'] = df_encoded['Symbol_Encoded'].map(symbol_mapping)

# Step 6: Group cryptocurrencies by cluster and display the results
clustered_cryptos = {}
# Create a dictionary of clusters and their corresponding cryptocurrencies
for cluster in df_encoded['KMeans_Cluster'].unique():
    cluster_symbols = df_encoded[df_encoded['KMeans_Cluster'] == cluster]['Symbol_Encoded'].unique()
    cluster_names = [name_mapping[symbol] for symbol in cluster_symbols]
    clustered_cryptos[cluster] = cluster_names
# Display the clusters
print("\nCryptocurrencies Grouped by Cluster:")
for cluster, names in clustered_cryptos.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

# Step 7: Calculate Silhouette Score on the scaled features
silhouette_avg = silhouette_score(crypto_scaled, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

# Visualize silhouette scores for each data point
silhouette_values = silhouette_samples(crypto_scaled, kmeans.labels_)
# Plotting the silhouette scores
plt.figure(figsize=(8, 6))
y_lower = 10  # Initial y position for the first cluster
for i in range(n_clusters):
    # Aggregate the silhouette scores for samples belonging to the current cluster
    ith_cluster_silhouette_values = silhouette_values[kmeans.labels_ == i]
    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i  # Update y position for the next cluster
    plt.fill_betweenx(
        range(y_lower, y_upper),
        ith_cluster_silhouette_values,
        alpha=0.7,
        label=f'Cluster {i}'  # same 0-based ids as the printed cluster listing
    )
    # Leave a 10-unit gap before the next cluster's band
    y_lower = y_upper + 10

plt.axvline(x=silhouette_avg, color='red', linestyle='--')
plt.title(f'Silhouette Scores for n = {n_clusters}')
plt.xlabel('Silhouette Coefficient')
plt.ylabel('Cluster Label')
plt.legend()
plt.show()


In [None]:
# Experiment 3
from tslearn.clustering import TimeSeriesKMeans
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
from sklearn.decomposition import PCA # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
from sklearn.metrics import silhouette_score, silhouette_samples # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_samples.html
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: One row per cryptocurrency - the mean of every selected feature
feature_columns = ['High', 'Low', 'Open', 'Marketcap', 'Volume',
                   'trend', 'MA_7d', 'MA_14d', 'MA_30d']
crypto_features = df_encoded.groupby('Symbol_Encoded')[feature_columns].mean()
n_clusters = 2

# Step 2: Rescale the aggregated features with MinMaxScaler
scaler = MinMaxScaler()
crypto_scaled = scaler.fit_transform(crypto_features)

# Step 3: Project the scaled features onto 2 principal components (for plotting)
pca = PCA(n_components=2)
crypto_pca = pca.fit_transform(crypto_scaled)

# Step 4: Cluster the scaled (non-reduced) features
kmeans_original = TimeSeriesKMeans(
    n_clusters=n_clusters, metric="dtw", max_iter=10, random_state=42
).fit(crypto_scaled)
crypto_features['KMeans_Cluster_Original'] = kmeans_original.labels_

# Step 5: Cluster the PCA-reduced features with the same settings
kmeans_pca = TimeSeriesKMeans(
    n_clusters=n_clusters, metric="dtw", max_iter=10, random_state=42
).fit(crypto_pca)
crypto_features['KMeans_Cluster_PCA'] = kmeans_pca.labels_

# Step 6: Carry both cluster assignments back to the row-level DataFrame
symbol_mapping_original = dict(zip(crypto_features.index,
                                   crypto_features['KMeans_Cluster_Original']))
symbol_mapping_pca = dict(zip(crypto_features.index,
                              crypto_features['KMeans_Cluster_PCA']))

encoded_symbols = df_encoded['Symbol_Encoded']
df_encoded['KMeans_Cluster_Original'] = encoded_symbols.map(symbol_mapping_original)
df_encoded['KMeans_Cluster_PCA'] = encoded_symbols.map(symbol_mapping_pca)

# Step 7: Silhouette score of each clustering, measured in its own feature space
silhouette_original = silhouette_score(crypto_scaled, kmeans_original.labels_)
silhouette_pca = silhouette_score(crypto_pca, kmeans_pca.labels_)

print(f"Silhouette Score (Original Features): {silhouette_original}")
print(f"Silhouette Score (PCA-Reduced Features): {silhouette_pca}")

# Step 8: Scatter the two principal components, coloured by the PCA clustering
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=crypto_pca[:, 0],
    y=crypto_pca[:, 1],
    hue=crypto_features['KMeans_Cluster_PCA'],
    palette='viridis',
)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Scatter Plot of Cryptocurrencies')
plt.legend(title='Cluster')
plt.show()

# Step 9: For each clustering, list the coin names that fall in every cluster
clustered_cryptos_original = {
    cluster: [
        name_mapping[symbol]
        for symbol in df_encoded.loc[
            df_encoded['KMeans_Cluster_Original'] == cluster, 'Symbol_Encoded'
        ].unique()
    ]
    for cluster in df_encoded['KMeans_Cluster_Original'].unique()
}
clustered_cryptos_pca = {
    cluster: [
        name_mapping[symbol]
        for symbol in df_encoded.loc[
            df_encoded['KMeans_Cluster_PCA'] == cluster, 'Symbol_Encoded'
        ].unique()
    ]
    for cluster in df_encoded['KMeans_Cluster_PCA'].unique()
}

print("\nCryptocurrencies Grouped by Clusters (Original Features):")
for cluster, names in clustered_cryptos_original.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

print("\nCryptocurrencies Grouped by Cluster (PCA Features):")
for cluster, names in clustered_cryptos_pca.items():
    print(f"Cluster {cluster}: {', '.join(names)}")


In [None]:
# Experiment 4
from tslearn.clustering import TimeSeriesKMeans
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from kneed import KneeLocator # https://kneed.readthedocs.io/en/stable/parameters.html

# Step 1: Aggregate the data for each cryptocurrency
crypto_features = df_encoded.groupby('Symbol_Encoded')[['High','Low','Open',
                                                        'Marketcap',
                                                    'Volume','trend','MA_7d','MA_14d', 'MA_30d']].mean()

# Step 2: Scale the data with MinMaxScaler
scaler = MinMaxScaler()
crypto_scaled = scaler.fit_transform(crypto_features)

# Step 3: Apply PCA for dimensionality reduction (2 and 3 components)
pca2 = PCA(n_components=2)
crypto_pca2 = pca2.fit_transform(crypto_scaled)

pca3 = PCA(n_components=3)
crypto_pca3 = pca3.fit_transform(crypto_scaled)

# Step 4: Score every candidate k with the Silhouette Score
silhouette_scores_original = []
silhouette_scores_pca2 = []
silhouette_scores_pca3 = []
k_values = range(2, 11)  # Silhouette Score requires at least 2 clusters

for k in k_values:
    kmeans_original = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=10, random_state=42)
    labels_original = kmeans_original.fit_predict(crypto_scaled)
    silhouette_scores_original.append(silhouette_score(crypto_scaled, labels_original))

    kmeans_pca2 = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=10, random_state=42)
    labels_pca2 = kmeans_pca2.fit_predict(crypto_pca2)
    silhouette_scores_pca2.append(silhouette_score(crypto_pca2, labels_pca2))

    kmeans_pca3 = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=10, random_state=42)
    labels_pca3 = kmeans_pca3.fit_predict(crypto_pca3)
    silhouette_scores_pca3.append(silhouette_score(crypto_pca3, labels_pca3))

# Find the optimal number of clusters using kneed
knee_original = KneeLocator(k_values, silhouette_scores_original, curve="convex", direction="decreasing")
knee_pca2 = KneeLocator(k_values, silhouette_scores_pca2, curve="convex", direction="decreasing")
knee_pca3 = KneeLocator(k_values, silhouette_scores_pca3, curve="convex", direction="decreasing")

# KneeLocator.knee is None when no knee is detected on the curve. In that case
# fall back to the k with the highest silhouette score, so that the clustering
# and plotting below never receive n_clusters=None / x=None.
optimal_clusters_original = (
    int(knee_original.knee) if knee_original.knee is not None
    else k_values[int(np.argmax(silhouette_scores_original))]
)
optimal_clusters_pca2 = (
    int(knee_pca2.knee) if knee_pca2.knee is not None
    else k_values[int(np.argmax(silhouette_scores_pca2))]
)
optimal_clusters_pca3 = (
    int(knee_pca3.knee) if knee_pca3.knee is not None
    else k_values[int(np.argmax(silhouette_scores_pca3))]
)

print(f"Optimal number of clusters (Original Features): {optimal_clusters_original}")
print(f"Optimal number of clusters (PCA2 Features): {optimal_clusters_pca2}")
print(f"Optimal number of clusters (PCA3 Features): {optimal_clusters_pca3}")

# Step 5: Apply KMeans clustering on the original and PCA features with optimal clusters
kmeans_original = TimeSeriesKMeans(n_clusters=optimal_clusters_original, metric="dtw", max_iter=10, random_state=42)
kmeans_original.fit(crypto_scaled)
crypto_features['KMeans_Cluster_Original'] = kmeans_original.labels_

kmeans_pca2 = TimeSeriesKMeans(n_clusters=optimal_clusters_pca2, metric="dtw", max_iter=10, random_state=42)
kmeans_pca2.fit(crypto_pca2)
crypto_features['KMeans_Cluster_PCA2'] = kmeans_pca2.labels_

kmeans_pca3 = TimeSeriesKMeans(n_clusters=optimal_clusters_pca3, metric="dtw", max_iter=10, random_state=42)
kmeans_pca3.fit(crypto_pca3)
crypto_features['KMeans_Cluster_PCA3'] = kmeans_pca3.labels_

# Step 6: Compare the clustering results
symbol_mapping_original = dict(zip(crypto_features.index, crypto_features['KMeans_Cluster_Original']))
symbol_mapping_pca2 = dict(zip(crypto_features.index, crypto_features['KMeans_Cluster_PCA2']))
symbol_mapping_pca3 = dict(zip(crypto_features.index, crypto_features['KMeans_Cluster_PCA3']))

df_encoded['KMeans_Cluster_Original'] = df_encoded['Symbol_Encoded'].map(symbol_mapping_original)
df_encoded['KMeans_Cluster_PCA2'] = df_encoded['Symbol_Encoded'].map(symbol_mapping_pca2)
df_encoded['KMeans_Cluster_PCA3'] = df_encoded['Symbol_Encoded'].map(symbol_mapping_pca3)

# Step 7: Group cryptocurrencies by cluster
clustered_cryptos_original = {}
clustered_cryptos_pca2 = {}
clustered_cryptos_pca3 = {}

for cluster in df_encoded['KMeans_Cluster_Original'].unique():
    cluster_symbols = df_encoded[df_encoded['KMeans_Cluster_Original'] == cluster]['Symbol_Encoded'].unique()
    cluster_names = [name_mapping[symbol] for symbol in cluster_symbols]
    clustered_cryptos_original[cluster] = cluster_names

for cluster in df_encoded['KMeans_Cluster_PCA2'].unique():
    cluster_symbols = df_encoded[df_encoded['KMeans_Cluster_PCA2'] == cluster]['Symbol_Encoded'].unique()
    cluster_names = [name_mapping[symbol] for symbol in cluster_symbols]
    clustered_cryptos_pca2[cluster] = cluster_names

for cluster in df_encoded['KMeans_Cluster_PCA3'].unique():
    cluster_symbols = df_encoded[df_encoded['KMeans_Cluster_PCA3'] == cluster]['Symbol_Encoded'].unique()
    cluster_names = [name_mapping[symbol] for symbol in cluster_symbols]
    clustered_cryptos_pca3[cluster] = cluster_names

print("\nCryptocurrencies Grouped by Cluster (Original Features):")
for cluster, names in clustered_cryptos_original.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

print("\nCryptocurrencies Grouped by Cluster (PCA2 Features):")
for cluster, names in clustered_cryptos_pca2.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

print("\nCryptocurrencies Grouped by Cluster (PCA3 Features):")
for cluster, names in clustered_cryptos_pca3.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

# Step 8: Compare the Silhouette Scores across k, marking each chosen k
plt.figure(figsize=(8, 6))
plt.plot(k_values, silhouette_scores_original, marker='o', linestyle='-', color='b', label='Original Features')
plt.plot(k_values, silhouette_scores_pca2, marker='s', linestyle='-', color='g', label='PCA2 Features')
plt.plot(k_values, silhouette_scores_pca3, marker='^', linestyle='-', color='r', label='PCA3 Features')
plt.axvline(x=optimal_clusters_original, color='b', linestyle='--', label=f'Optimal k (Original) = {optimal_clusters_original}')
plt.axvline(x=optimal_clusters_pca2, color='g', linestyle='--', label=f'Optimal k (PCA2) = {optimal_clusters_pca2}')
plt.axvline(x=optimal_clusters_pca3, color='r', linestyle='--', label=f'Optimal k (PCA3) = {optimal_clusters_pca3}')
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score Comparison: Original vs PCA2 vs PCA3")
plt.legend()
plt.show()


In [None]:
# Experiment 5
from tslearn.clustering import TimeSeriesKMeans
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from kneed import KneeLocator

# Step 1: Aggregate the data for each cryptocurrency
crypto_features = df_encoded.groupby('Symbol_Encoded')[['High','Low','Open',
                                                        'Marketcap',
                                                    'Volume','trend','MA_7d','MA_14d', 'MA_30d']].mean()

# Step 2: (no scaling step in this cell - PCA and clustering run on the raw
# aggregated features; NOTE(review): presumably intentional for comparison
# with the scaled experiment, confirm.)

# Step 3: Apply PCA for dimensionality reduction (2 and 3 components)
pca2 = PCA(n_components=2)
crypto_pca2 = pca2.fit_transform(crypto_features)

pca3 = PCA(n_components=3)
crypto_pca3 = pca3.fit_transform(crypto_features)

# Step 4: Score every candidate k with the Silhouette Score
silhouette_scores_original = []
silhouette_scores_pca2 = []
silhouette_scores_pca3 = []
k_values = range(2, 11)  # Silhouette Score requires at least 2 clusters

for k in k_values:
    kmeans_original = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=10, random_state=42)
    labels_original = kmeans_original.fit_predict(crypto_features)
    silhouette_scores_original.append(silhouette_score(crypto_features, labels_original))

    kmeans_pca2 = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=10, random_state=42)
    labels_pca2 = kmeans_pca2.fit_predict(crypto_pca2)
    silhouette_scores_pca2.append(silhouette_score(crypto_pca2, labels_pca2))

    kmeans_pca3 = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=10, random_state=42)
    labels_pca3 = kmeans_pca3.fit_predict(crypto_pca3)
    silhouette_scores_pca3.append(silhouette_score(crypto_pca3, labels_pca3))
# Find the optimal number of clusters using kneed
knee_original = KneeLocator(k_values, silhouette_scores_original, curve="convex", direction="decreasing")
knee_pca2 = KneeLocator(k_values, silhouette_scores_pca2, curve="convex", direction="decreasing")
knee_pca3 = KneeLocator(k_values, silhouette_scores_pca3, curve="convex", direction="decreasing")

# KneeLocator.knee is None when no knee is detected on the curve. In that case
# fall back to the k with the highest silhouette score, so that the clustering
# and plotting below never receive n_clusters=None / x=None.
optimal_clusters_original = (
    int(knee_original.knee) if knee_original.knee is not None
    else k_values[int(np.argmax(silhouette_scores_original))]
)
optimal_clusters_pca2 = (
    int(knee_pca2.knee) if knee_pca2.knee is not None
    else k_values[int(np.argmax(silhouette_scores_pca2))]
)
optimal_clusters_pca3 = (
    int(knee_pca3.knee) if knee_pca3.knee is not None
    else k_values[int(np.argmax(silhouette_scores_pca3))]
)

print(f"Optimal number of clusters (Original Features): {optimal_clusters_original}")
print(f"Optimal number of clusters (PCA2 Features): {optimal_clusters_pca2}")
print(f"Optimal number of clusters (PCA3 Features): {optimal_clusters_pca3}")

# Step 5: Apply KMeans clustering on the original and PCA features with optimal clusters
# (kmeans_original is fitted before any label column is added to crypto_features)
kmeans_original = TimeSeriesKMeans(n_clusters=optimal_clusters_original, metric="dtw", max_iter=10, random_state=42)
kmeans_original.fit(crypto_features)
crypto_features['KMeans_Cluster_Original'] = kmeans_original.labels_
kmeans_pca2 = TimeSeriesKMeans(n_clusters=optimal_clusters_pca2, metric="dtw", max_iter=10, random_state=42)
kmeans_pca2.fit(crypto_pca2)
crypto_features['KMeans_Cluster_PCA2'] = kmeans_pca2.labels_

kmeans_pca3 = TimeSeriesKMeans(n_clusters=optimal_clusters_pca3, metric="dtw", max_iter=10, random_state=42)
kmeans_pca3.fit(crypto_pca3)
crypto_features['KMeans_Cluster_PCA3'] = kmeans_pca3.labels_

# Step 6: Compare the clustering results
symbol_mapping_original = dict(zip(crypto_features.index, crypto_features['KMeans_Cluster_Original']))
symbol_mapping_pca2 = dict(zip(crypto_features.index, crypto_features['KMeans_Cluster_PCA2']))
symbol_mapping_pca3 = dict(zip(crypto_features.index, crypto_features['KMeans_Cluster_PCA3']))

df_encoded['KMeans_Cluster_Original'] = df_encoded['Symbol_Encoded'].map(symbol_mapping_original)
df_encoded['KMeans_Cluster_PCA2'] = df_encoded['Symbol_Encoded'].map(symbol_mapping_pca2)
df_encoded['KMeans_Cluster_PCA3'] = df_encoded['Symbol_Encoded'].map(symbol_mapping_pca3)

# Step 7: Group cryptocurrencies by cluster
clustered_cryptos_original = {}
clustered_cryptos_pca2 = {}
clustered_cryptos_pca3 = {}
for cluster in df_encoded['KMeans_Cluster_Original'].unique():
    cluster_symbols = df_encoded[df_encoded['KMeans_Cluster_Original'] == cluster]['Symbol_Encoded'].unique()
    cluster_names = [name_mapping[symbol] for symbol in cluster_symbols]
    clustered_cryptos_original[cluster] = cluster_names

for cluster in df_encoded['KMeans_Cluster_PCA2'].unique():
    cluster_symbols = df_encoded[df_encoded['KMeans_Cluster_PCA2'] == cluster]['Symbol_Encoded'].unique()
    cluster_names = [name_mapping[symbol] for symbol in cluster_symbols]
    clustered_cryptos_pca2[cluster] = cluster_names

for cluster in df_encoded['KMeans_Cluster_PCA3'].unique():
    cluster_symbols = df_encoded[df_encoded['KMeans_Cluster_PCA3'] == cluster]['Symbol_Encoded'].unique()
    cluster_names = [name_mapping[symbol] for symbol in cluster_symbols]
    clustered_cryptos_pca3[cluster] = cluster_names

print("\nCryptocurrencies Grouped by Cluster (Original Features):")
for cluster, names in clustered_cryptos_original.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

print("\nCryptocurrencies Grouped by Cluster (PCA2 Features):")
for cluster, names in clustered_cryptos_pca2.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

print("\nCryptocurrencies Grouped by Cluster (PCA3 Features):")
for cluster, names in clustered_cryptos_pca3.items():
    print(f"Cluster {cluster}: {', '.join(names)}")

# Step 8: Compare the Silhouette Scores across k, marking each chosen k
plt.figure(figsize=(8, 6))
plt.plot(k_values, silhouette_scores_original, marker='o', linestyle='-', color='b', label='Original Features')
plt.plot(k_values, silhouette_scores_pca2, marker='s', linestyle='-', color='g', label='PCA2 Features')
plt.plot(k_values, silhouette_scores_pca3, marker='^', linestyle='-', color='r', label='PCA3 Features')
plt.axvline(x=optimal_clusters_original, color='b', linestyle='--', label=f'Optimal k (Original) = {optimal_clusters_original}')
plt.axvline(x=optimal_clusters_pca2, color='g', linestyle='--', label=f'Optimal k (PCA2) = {optimal_clusters_pca2}')
plt.axvline(x=optimal_clusters_pca3, color='r', linestyle='--', label=f'Optimal k (PCA3) = {optimal_clusters_pca3}')
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score Comparison: Original vs PCA2 vs PCA3")
plt.legend()
plt.show()

# Report the silhouette score obtained at the chosen k for each feature set
# (the lists hold one score per candidate k, in k_values order).
final_score_original = silhouette_scores_original[list(k_values).index(optimal_clusters_original)]
final_score_pca2 = silhouette_scores_pca2[list(k_values).index(optimal_clusters_pca2)]
final_score_pca3 = silhouette_scores_pca3[list(k_values).index(optimal_clusters_pca3)]

print(f"Final Silhouette Score (Original Features): {final_score_original}")
print(f"Final Silhouette Score (PCA2-Reduced Features): {final_score_pca2}")
print(f"Final Silhouette Score (PCA3-Reduced Features): {final_score_pca3}")
