## CryptoClustering with Unsupervised Machine Learning

In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the crypto market CSV into a DataFrame, using `coin_id` as the row index
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Preview the first ten rows
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for each
# price-change column; note how different the column scales are, which is why
# the features are standardized before clustering
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [4]:
# Draw an hvPlot line chart of the DataFrame for a quick visual overview
df_market_data.hvplot.line(width=800, height=400, rot=90)

In [5]:
# List the DataFrame's column labels; these are the feature names reused
# below when scaling the data
df_market_data.columns

Index(['price_change_percentage_24h', 'price_change_percentage_7d',
       'price_change_percentage_14d', 'price_change_percentage_30d',
       'price_change_percentage_60d', 'price_change_percentage_200d',
       'price_change_percentage_1y'],
      dtype='object')

---

### Prepare the Data

In [6]:
# Price-change features to standardize before clustering
feature_columns = [
    'price_change_percentage_24h',
    'price_change_percentage_7d',
    'price_change_percentage_14d',
    'price_change_percentage_30d',
    'price_change_percentage_60d',
    'price_change_percentage_200d',
    'price_change_percentage_1y',
]

# Fit scikit-learn's `StandardScaler` on those columns and transform them
scaler = StandardScaler()
market_scaled = scaler.fit_transform(df_market_data[feature_columns])

# Preview the first five rows of the scaled array
market_scaled[:5]

array([[ 0.50852937,  0.49319307,  0.77220043,  0.23545963, -0.0674951 ,
        -0.35595348, -0.25163688],
       [ 0.18544589,  0.93444504,  0.55869212, -0.05434093, -0.27348273,
        -0.11575947, -0.19935211],
       [ 0.02177396, -0.70633685, -0.02168042, -0.06103015,  0.00800452,
        -0.55024692, -0.28206051],
       [-0.04076438, -0.81092807,  0.24945797, -0.05038797, -0.37316402,
        -0.45825882, -0.29554614],
       [ 1.19303608,  2.00095907,  1.76061001,  0.54584206, -0.29120287,
        -0.49984776, -0.27031695]])

In [10]:
# Create a DataFrame with the scaled data.
# The original index (named `coin_id`) is passed straight to the constructor,
# so every scaled row is already labelled with its coin. No extra `coin_id`
# column or in-place `set_index` call is needed.
market_scale_df = pd.DataFrame(
    data=market_scaled,
    columns=['price_change_percentage_24h', 'price_change_percentage_7d',
             'price_change_percentage_14d', 'price_change_percentage_30d',
             'price_change_percentage_60d', 'price_change_percentage_200d',
             'price_change_percentage_1y'],
    index=df_market_data.index
)

# Display the scaled DataFrame
market_scale_df.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Scaled DataFrame.

In [11]:
# Candidate cluster counts to evaluate: every integer k from 1 through 11
k_values = [*range(1, 12)]

In [None]:
# For each candidate k, fit a KMeans model (fixed seed) on the scaled data
# and record its inertia, in the same order as `k_values`
inertia = [
    KMeans(n_clusters=k, random_state=0).fit(market_scale_df).inertia_
    for k in k_values
]

In [13]:
# Pair every k with its inertia so the Elbow curve can be plotted
elbow_data = dict(k=k_values, inertia=inertia)

# Turn the paired values into a DataFrame for plotting
elbow_df = pd.DataFrame(data=elbow_data)

In [78]:
# Elbow curve for the scaled data: inertia as a function of k
elbow_plot_options = {
    'x': 'k',
    'y': 'inertia',
    'title': 'Scaled Elbow Curve',
    'xlabel': 'Number of Clusters (k)',
    'ylabel': 'Inertia',
    'color': 'blue',
    'width': 800,
    'height': 400,
}
elbow_plot = elbow_df.hvplot.line(**elbow_plot_options)

elbow_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The best value for `k` is `4`. This means that `4` is the optimal number of clusters.

---

### Cluster Cryptocurrencies with K-means Using the Original Scaled DataFrame

In [15]:
# Initialize the K-Means model using the best value for k
k_model = KMeans(n_clusters=4, random_state=0)

# Fit the model to the scaled data
k_model.fit(market_scale_df)

In [16]:
# Assign each cryptocurrency in the scaled DataFrame to one of the fitted clusters
clusters = k_model.predict(X=market_scale_df)

# Show the cluster label computed for every coin
print(clusters)

[2 2 0 0 2 2 2 2 2 0 0 0 0 2 0 2 0 0 2 0 0 2 0 0 0 0 0 0 2 0 0 0 1 2 0 0 3
 0 0 0 0]


In [60]:
# Work on an independent (deep) copy so the scaled DataFrame stays untouched
scaled_df = market_scale_df.copy(deep=True)

In [61]:
# Attach the predicted cluster labels to the copy as `predicted_clusters`
scaled_df = scaled_df.assign(predicted_clusters=clusters)

# Preview the copy with its new cluster column
scaled_df.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,predicted_clusters
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637,2
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352,2
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061,0
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546,0
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317,2


In [94]:
# Create a scatter plot
market_scaled_plot = scaled_df.hvplot.scatter(
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    by='predicted_clusters',
    hover_cols=['coin_id'],
    title='Scaled Cryptocurrency Clusters',
    xlabel='Price Change Percentage (24h)',
    ylabel='Price Change Percentage (7d)',
    width=800,
    height=400
)
market_scaled_plot

---

### Optimize Clusters with Principal Component Analysis.

In [20]:
# Number of principal components to keep
n_pca_components = 3

# Instantiate the PCA model with that many components
pca = PCA(n_components=n_pca_components)

In [21]:
# Fit the PCA model on the scaled DataFrame and project it onto the components
pca_data = pca.fit_transform(X=market_scale_df)

# Preview the first five rows of the transformed array
pca_data[0:5]

array([[-0.60066733,  0.84276006,  0.46159457],
       [-0.45826071,  0.45846566,  0.95287678],
       [-0.43306981, -0.16812638, -0.64175193],
       [-0.47183495, -0.22266008, -0.47905316],
       [-1.15779997,  2.04120919,  1.85971527]])

In [None]:
# Retrieve the explained variance ratio of each principal component from the
# fitted PCA model (one entry per component; summed in the next cell)
explained_variance = pca.explained_variance_ratio_

In [24]:
# Sum the per-component ratios and report the total as a percentage
total_explained_variance = explained_variance.sum()
print(f"Total variance explained by PCA: {total_explained_variance:.2%}")

Total variance explained by PCA: 89.50%


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** About 89.50% of the total variance is condensed into the 3 PCA variables. This means that the three principal components capture 89.5% of the variability in the original dataset. This indicates that most of the information in the original features is retained, and the dimensionality reduction is effective. This is beneficial because the high explained variance indicates that PCA has successfully reduced the complexity of the data while preserving its essential characteristics, making it a powerful tool for clustering and analysis in the cryptocurrency market.

In [25]:
# Create a new DataFrame with the PCA data.
# The scaled DataFrame's index (named `coin_id`) is passed straight to the
# constructor, so every row is already labelled with its coin. No extra
# `coin_id` column or in-place `set_index` call is needed.
pca_df = pd.DataFrame(
    data=pca_data,
    columns=["PC1", "PC2", "PC3"],
    index=market_scale_df.index
)

# Display the scaled PCA DataFrame
pca_df.head()

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the Scaled PCA DataFrame

In [45]:
# Candidate cluster counts for the PCA data: every integer k from 1 through 11
k_pca_values = [k for k in range(1, 12)]

In [46]:
# Create an empty list to store the inertia values
inertia_pca = []

# Loop through the k-values
for k in k_pca_values:
    # Create the KMeans model under its own name. Reusing `k_model` here would
    # overwrite the fitted 4-cluster model from the scaled-data section, so
    # re-running that section's predict cell would use a model trained on the
    # 3 PCA features instead.
    pca_candidate_model = KMeans(n_clusters=k, random_state=0)

    # Fit the model to the PCA data
    pca_candidate_model.fit(pca_df)

    # Append the inertia to the inertia list
    inertia_pca.append(pca_candidate_model.inertia_)

In [47]:
# Pair every k with the inertia measured on the PCA data
elbow_pca_data = dict(k=k_pca_values, inertia=inertia_pca)

# Turn the paired values into a DataFrame for plotting
elbow_pca_df = pd.DataFrame(data=elbow_pca_data)

In [76]:
# Elbow curve for the PCA data: inertia as a function of k
elbow_pca_plot_options = {
    'x': 'k',
    'y': 'inertia',
    'title': 'PCA Elbow Curve',
    'xlabel': 'Number of Clusters (k)',
    'ylabel': 'Inertia',
    'color': 'red',
    'width': 800,
    'height': 400,
}
elbow_pca_plot = elbow_pca_df.hvplot.line(**elbow_pca_plot_options)

elbow_pca_plot

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** The best value for `k` is `4`.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No, the best k value does not differ from the one found with the original data: both analyses yield a k value of `4`. The PCA-transformed data does, however, have a smaller inertia value. This means that the clustering algorithm (K-means) is able to form tighter and more cohesive clusters in the reduced-dimensional space. Part of this drop is expected simply because the discarded components no longer contribute to the distances being summed. The PCA reduces the dimensionality of the data by removing noise and redundancy, which can make clustering more efficient. Also, a smaller inertia suggests that the PCA-transformed data captures the underlying patterns in the data more effectively, allowing K-means to group similar cryptocurrencies more accurately. For cryptocurrencies, this means that the clusters formed using PCA-transformed data may better represent groups of cryptocurrencies with similar behaviors or trends, even though some variance (information) is lost during dimensionality reduction.

### Cluster Cryptocurrencies with K-means Using the Scaled PCA DataFrame

In [49]:
# Build the K-Means model with the k chosen from the PCA elbow curve (4 clusters)
k_pca_model = KMeans(random_state=0, n_clusters=4)

# Train it on the PCA DataFrame
k_pca_model.fit(pca_df)

In [50]:
# Assign each cryptocurrency in the PCA DataFrame to one of the fitted clusters
pca_clusters = k_pca_model.predict(X=pca_df)

# Show the cluster label computed for every coin
print(pca_clusters)

[2 2 0 0 2 2 2 2 2 0 0 0 0 2 0 2 0 0 2 0 0 2 0 0 0 0 0 0 2 0 0 0 1 2 0 0 3
 0 0 0 0]


In [64]:
# Build a new DataFrame from the PCA data with the predicted cluster labels
# added as a `Predicted Clusters` column; `assign` returns a copy, so `pca_df`
# itself is left unchanged
pca_market_df = pca_df.assign(**{'Predicted Clusters': pca_clusters})

# Preview the PCA DataFrame with its cluster column
pca_market_df.head()

Unnamed: 0_level_0,PC1,PC2,PC3,Predicted Clusters
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,2
ethereum,-0.458261,0.458466,0.952877,2
tether,-0.43307,-0.168126,-0.641752,0
ripple,-0.471835,-0.22266,-0.479053,0
bitcoin-cash,-1.1578,2.041209,1.859715,2


In [65]:
# Scatter of PC1 vs PC2, with points grouped by predicted cluster
pca_scatter_options = {
    'x': 'PC1',
    'y': 'PC2',
    'by': 'Predicted Clusters',
    'hover_cols': ['coin_id'],
    'title': 'Cryptocurrency PCA Clusters',
    'xlabel': 'PC1',
    'ylabel': 'PC2',
    'width': 800,
    'height': 400,
}
market_pca_plot = pca_market_df.hvplot.scatter(**pca_scatter_options)
market_pca_plot

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [93]:
# Composite plot to contrast the elbow curves.
# Both curves are relabelled before being overlaid: the legend entries of an
# overlay come from the element labels, and neither curve was given one, so
# `show_legend=True` alone would leave the two lines indistinguishable.
elbow_comparison_plot = (
    elbow_plot.relabel("Original scaled data")
    * elbow_pca_plot.relabel("PCA data")
).opts(
    title="Comparison of Elbow Curves: Original Data vs PCA Data",
    legend_position="top_left",  # Position the legend
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    width=800,
    height=400,
    show_legend=True
)
# Display the plot
elbow_comparison_plot

In [91]:
# Composite plot to contrast the clusters.
# The two scatter plots live in different coordinate systems (scaled 24h/7d
# price change vs. PC1/PC2), so they are placed in separate panels with `+`.
# Overlaying them with `*` would draw unrelated quantities on one pair of axes
# and label the PCA points with price-change axis titles.
composite_plot = (market_scaled_plot + market_pca_plot).cols(1).opts(
    title="Comparison of Clusters: Original Data vs PCA Data"
)

# Display the composite plot
composite_plot

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** The impact of using fewer features (via PCA) to cluster the data using K-Means can be summarized as follows:
    1. Improved Clustering Efficiency 
    * PCA reduces the dimensionality of the data, focusing only on the most important components that explain the majority of the variance. This simplifies the clustering process, making it computationally faster and more efficient, especially for high-dimensional datasets like cryptocurrency data.

    2. Reduced Noise and Redundancy
    * The original data may include "noisy" or "redundant" features that impact clustering results. By removing less significant features, PCA reduces noise and redundancy. This often results in tighter, more distinct clusters, as the clustering algorithm focuses on the core structure of the data.

    3. Better Visualization
    * With the original data, clustering happens in a high-dimensional space that is difficult to visualize and interpret. With the PCA data, reducing the features to two or three principal components allows the clusters to be shown in 2D or 3D plots, which makes it easier to identify patterns and relationships between clusters.

    4. Potential Loss of Information
    * The original data retains all features, which may be important if specific features are critical for analysis. While PCA retains most of the variance, some information is inevitably lost during dimensionality reduction. If the lost variance contains meaningful information, it could slightly affect the accuracy of the clusters.

    5. Practical Implications for Cryptocurrencies
    * In the original data, clusters may be influenced by both short-term and long-term price changes, leading to less distinct groupings. In the PCA data, clusters are based on the most significant patterns in the data, which may better represent groups of cryptocurrencies with similar behaviors or trends. PCA-based clustering can provide a clearer and more meaningful segmentation of cryptocurrencies, aiding in market analysis and decision-making.
    

  **Conclusion:**
  * Using fewer features via PCA generally improves clustering efficiency, reduces noise, and enhances visualization, but it may result in a slight loss of information. For cryptocurrencies, PCA-based clustering often leads to more distinct and interpretable clusters, making it a valuable tool for analysis.

Data for this dataset was generated by _edX Boot Camps LLC_, and is intended for educational purposes only.