In [36]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [37]:
# Read the crypto market CSV, using the coin identifier as the row index
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Preview the first ten rows
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [38]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for each column
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [39]:
# Line chart of the DataFrame's columns; x-axis tick labels are rotated 90 degrees
df_market_data.hvplot.line(rot=90, height=400, width=800)

---

### Prepare the Data

In [40]:
# Standardize the price-change columns with scikit-learn's StandardScaler.
# `coin_id` is the DataFrame index rather than a column, so every column passed
# to the scaler is numeric.
scaler = StandardScaler()
scaler.fit(df_market_data)
scaled_data = scaler.transform(df_market_data)



In [41]:
# Wrap the scaled array in a DataFrame that reuses the original column names
# and the original `coin_id` index
scaled_df = pd.DataFrame(
    scaled_data,
    columns=df_market_data.columns,
    index=df_market_data.index,
)

# Preview the first five scaled rows
scaled_df.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Scaled DataFrame.

In [42]:
# Candidate cluster counts: k = 1 through 10 (the range end, 11, is exclusive)
k_values = [*range(1, 11)]

# Show the candidate values
print(k_values)


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [43]:
# Compute the K-Means inertia on the scaled data for every candidate k.
# KMeans is already imported in the notebook's first cell, so it is not
# re-imported here.
inertia_values_original = []

# Iterate over the shared `k_values` list (1 through 10) so the inertia list
# always lines up with the k-values used for the elbow plot
for k in k_values:
    # The throwaway model gets its own name so it is never confused with the
    # final `kmeans` model that the notebook creates further down
    elbow_model = KMeans(n_clusters=k, random_state=42)

    # Fit on the scaled feature frame
    elbow_model.fit(scaled_df)

    # Record the inertia for this k
    inertia_values_original.append(elbow_model.inertia_)

# Display the inertia values for each k
print(inertia_values_original)


[287.0, 212.12334207486256, 170.19357087970297, 79.02243535120975, 71.0979135124442, 66.10882678044752, 54.488123156241166, 43.22469266658042, 32.860884471226804, 28.404884636873096]




In [44]:


# Pair each k (1 through 10) with the inertia measured on the original scaled data
elbow_data = dict(k=list(range(1, 11)), inertia=inertia_values_original)

# Tabulate the pairs so they can be plotted
elbow_curve_orignal = pd.DataFrame(elbow_data)

# Show the table to check the values
print(elbow_curve_orignal)


    k     inertia
0   1  287.000000
1   2  212.123342
2   3  170.193571
3   4   79.022435
4   5   71.097914
5   6   66.108827
6   7   54.488123
7   8   43.224693
8   9   32.860884
9  10   28.404885


In [45]:
# Plot inertia against k for the original scaled data to locate the elbow
elbow_plot_options = dict(
    x='k',
    y='inertia',
    title='Elbow Curve',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    width=800,
    height=400,
    rot=90,
)
elbow_curve_orignal.hvplot.line(**elbow_plot_options)


#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The best value for `k` is 4, as this is where the Elbow Curve shows a significant reduction in inertia, followed by a much slower decrease. This suggests that using 4 clusters strikes a balance between minimizing inertia and avoiding overfitting the data by using too many clusters.

This value indicates that 4 clusters provide the best clustering results with a reasonable trade-off between complexity and accuracy.

---

### Cluster Cryptocurrencies with K-means Using the Original Scaled DataFrame

In [46]:
# Initialize the K-Means model using the best value for k (k=4);
# random_state is fixed so repeated runs start from the same seed
kmeans = KMeans(n_clusters=4, random_state=42)


In [47]:
# Fit the K-Means model using the scaled DataFrame (all price-change columns)
kmeans.fit(scaled_df)



In [48]:
# Predict the cluster label of every coin in the scaled data
predicted_clusters = kmeans.predict(scaled_df)

# Print the resulting array of cluster values (one label per coin, in row order)
print(predicted_clusters)


[0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 3 0 2 2 1
 2 2 2 2]


In [49]:
# Create a copy of the scaled DataFrame so that adding cluster labels
# does not modify `scaled_df` itself
df_market_data_scaled_clustered = scaled_df.copy()

In [50]:
# Attach the predicted cluster label to every coin in the copied frame
df_market_data_scaled_clustered = df_market_data_scaled_clustered.assign(
    Cluster=predicted_clusters
)

# Preview the first rows, now including the 'Cluster' column
df_market_data_scaled_clustered.head()




Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637,0
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352,0
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061,2
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546,2
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317,0


In [51]:
# Scatter the scaled 24h change against the scaled 7d change, coloring each
# point by its K-Means cluster and listing `coin_id` in `hover_cols` so the
# cryptocurrency behind each point can be identified
df_market_data_scaled_clustered.hvplot.scatter(
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    c='Cluster',
    cmap='Set1',
    hover_cols=['coin_id'],
    title="Cryptocurrency Clusters",
    xlabel="Price Change 24h (%)",
    ylabel="Price Change 7d (%)",
    width=800,
    height=400,
)


---

### Optimize Clusters with Principal Component Analysis.

In [52]:
# Create a PCA model instance that keeps three principal components.
# PCA is already imported in the notebook's first cell, so it is not
# re-imported here.
pca = PCA(n_components=3, random_state=3)



In [53]:
# Reduce the original scaled DataFrame down to three principal components
pca_data = pca.fit_transform(scaled_df)

# Label the components and keep the coin_id index so each row stays identifiable
pca_columns = ["PC1", "PC2", "PC3"]
pca_df = pd.DataFrame(pca_data, index=scaled_df.index, columns=pca_columns)

# Preview the PCA data
pca_df.head()




Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


In [54]:
# Share of the variance attributed to each principal component
explained_variance_ratio = pca.explained_variance_ratio_

# Report the per-component ratios (heading and array on separate lines)
print("Explained variance ratio for each principal component:", explained_variance_ratio, sep="\n")

# Report the combined share covered by the three components
total_explained_variance = explained_variance_ratio.sum()
print("\nTotal explained variance by the 3 principal components:", total_explained_variance)


Explained variance ratio for each principal component:
[0.3719856  0.34700813 0.17603793]

Total explained variance by the 3 principal components: 0.8950316570309841


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** Explained variance ratio for each principal component:
[0.3719856  0.34700813 0.17603793]

The total explained variance of the three principal components is 0.895 or 89.5%.

This means that the three principal components together capture 89.5% of the variance in the original data, which indicates that a large portion of the original data's information is retained with this dimensionality reduction.

In [55]:
# Build the PCA DataFrame directly on the coin_id index of the scaled DataFrame
pca_df = pd.DataFrame(
    pca_data,
    index=scaled_df.index,
    columns=["PC1", "PC2", "PC3"],
)

# Preview the first five rows of the PCA DataFrame
pca_df.head()



Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the Scaled PCA DataFrame

In [56]:
# Candidate cluster counts: k = 1 through 10 (the range end, 11, is exclusive)
k_values = [*range(1, 11)]

# Show the list to confirm it is correct
print(k_values)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [57]:

# Compute the K-Means inertia on the PCA data for every candidate k.
inertia_values_pca = []

# Iterate over the shared `k_values` list (1 through 10) so the inertia list
# always lines up with the k-values used for the elbow plot
for k in k_values:
    # Use a dedicated name for the throwaway model. Binding it to `kmeans`
    # would overwrite the fitted k=4 model for the original scaled data,
    # whose labels are still needed by the comparison plots at the end.
    elbow_model_pca = KMeans(n_clusters=k, random_state=42)

    # Fit on the PCA DataFrame
    elbow_model_pca.fit(pca_df)

    # Record the inertia for this k
    inertia_values_pca.append(elbow_model_pca.inertia_)

# Print the inertia values to verify
print(inertia_values_pca)

[256.87408556789256, 182.33953007775648, 140.43183298408374, 49.66549665179736, 42.051901097917245, 31.749005244389007, 28.031869172552753, 19.45174720604135, 13.814081971558094, 10.630647573870965]




In [58]:

# Pair each k (1 through 10) with the inertia measured on the PCA data
elbow_data = dict(k=range(1, 11), inertia=inertia_values_pca)

# Tabulate the pairs so they can be plotted
elbow_curve_pca = pd.DataFrame(elbow_data)

# Preview the table to verify
elbow_curve_pca.head()



Unnamed: 0,k,inertia
0,1,256.874086
1,2,182.33953
2,3,140.431833
3,4,49.665497
4,5,42.051901


In [59]:
# Plot inertia against k for the PCA data to visually identify the optimal k
elbow_pca_plot_options = dict(
    x='k',
    y='inertia',
    title="Elbow Method for Optimal k",
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    width=800,
    height=400,
)
elbow_curve_pca.hvplot.line(**elbow_pca_plot_options)


#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** Based on the elbow method applied to the scaled PCA data, the best value for k is 4. This is determined by the point at which the inertia starts to decrease at a slower rate, indicating that adding more clusters does not significantly improve the clustering.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No, the best value for k found using the scaled PCA data is the same as the best value for k found using the original scaled data, which is also 4. Both datasets show an inflection point at k=4, where the decrease in inertia becomes slower, suggesting that 4 clusters provide the optimal grouping for the cryptocurrencies.

### Cluster Cryptocurrencies with K-means Using the Scaled PCA DataFrame

In [60]:

# Initialize the K-Means model with the best value for k (k=4); it has its own
# name, `kmeans_pca`, to keep it separate from the model for the original data
kmeans_pca = KMeans(n_clusters=4, random_state=42)


In [61]:

# Fit the K-Means model using the PCA DataFrame (pca_df: columns PC1, PC2, PC3)
kmeans_pca.fit(pca_df)



In [62]:

# Predict the cluster label of every coin using the PCA DataFrame
predicted_clusters_pca = kmeans_pca.predict(pca_df)

# Print the resulting array of cluster values (one label per coin, in row order)
print(predicted_clusters_pca)


[0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 3 0 2 2 1
 2 2 2 2]


In [63]:
# Build a copy of the PCA DataFrame that also carries the predicted cluster
# labels; `assign` returns a new frame, so `pca_df` itself is left untouched
pca_df_with_clusters = pca_df.assign(Cluster=predicted_clusters_pca)

# Preview the copy with the new 'Cluster' column
pca_df_with_clusters.head()



Unnamed: 0_level_0,PC1,PC2,PC3,Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,0
ethereum,-0.458261,0.458466,0.952877,0
tether,-0.43307,-0.168126,-0.641752,2
ripple,-0.471835,-0.22266,-0.479053,2
bitcoin-cash,-1.1578,2.041209,1.859715,0


In [64]:
# Scatter PC1 against PC2, coloring each point by the cluster label found
# with K-Means on the PCA data.
# `hover_cols` lists `coin_id` (the index name) so that hovering a point shows
# which cryptocurrency it represents, as the exercise asks; the cluster label
# is already conveyed by the point color.
pca_df_with_clusters.hvplot.scatter(
    x='PC1',
    y='PC2',
    c='Cluster',
    cmap='viridis',
    title="Cryptocurrency Clusters (PCA)",
    xlabel="Principal Component 1 (PC1)",
    ylabel="Principal Component 2 (PC2)",
    hover_cols=['coin_id'],
    width=800,
    height=400
)


### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [65]:
# Composite plot to contrast the Elbow curves
def _elbow_plot(inertia_values, title):
    """Return an hvPlot line chart of `inertia_values` against the shared `k_values`."""
    return pd.DataFrame({
        'k': k_values,
        'inertia': inertia_values
    }).hvplot.line(
        x='k', y='inertia', title=title,
        xlabel="Number of Clusters (k)", ylabel="Inertia", width=800, height=400
    )


# The plot objects get their own names: assigning a plot to `elbow_curve_pca`
# would overwrite the DataFrame of that name built earlier and break
# re-running the cell that plots it.
elbow_plot_original = _elbow_plot(inertia_values_original, "Elbow Curve for Original Scaled Data")
elbow_plot_pca = _elbow_plot(inertia_values_pca, "Elbow Curve for Scaled PCA Data")

# Combine the two plots using the "+" operator to create a composite plot
elbow_plot_original + elbow_plot_pca

In [66]:
# Composite plot to contrast the clusters found with and without PCA.
#
# Plot from the frames that already hold the labels predicted by each final
# k=4 model (`df_market_data_scaled_clustered` and `pca_df_with_clusters`)
# instead of re-reading `labels_` from a model variable, and leave `scaled_df`
# and `pca_df` unmodified: adding a 'Cluster' column to them would feed the
# labels back in as a feature whenever an earlier fit cell is re-run.
scatter_original = df_market_data_scaled_clustered.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    c="Cluster",
    cmap="viridis",
    hover_cols=["coin_id"],
    title="Clusters - Original Scaled Data",
    xlabel="24h Price Change (%)",
    ylabel="7d Price Change (%)",
    width=800,
    height=400
)

scatter_pca = pca_df_with_clusters.hvplot.scatter(
    x="PC1",
    y="PC2",
    c="Cluster",
    cmap="viridis",
    hover_cols=["coin_id"],
    title="Clusters - PCA Transformed Data",
    xlabel="Principal Component 1 (PC1)",
    ylabel="Principal Component 2 (PC2)",
    width=800,
    height=400
)

# Combine the two scatter plots to show side by side
scatter_original + scatter_pca


#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** Compared to the clusters built from the original scaled data, the clusters found with PCA are more defined and clearly distinguishable. The difference is most visible for two coins, ethlend and celsius-degree-token: when we apply PCA with fewer features, those coins are separated from the bulk of the other coins by a significant distance.

In the plot of the original data, the clusters overlap and are not clearly separated. With PCA, and with the added evidence of lower inertia, the data is cleaner and can be used to draw clearer conclusions. In conclusion, using fewer features to cluster the data with K-Means appears to have a positive impact.