In [2]:
# Import required libraries and dependencies
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Function to find the Best Value for k with K-means Using the Original/ pca Data.

In [3]:
def compute_elbow_curve(data, k):
    # Create a list to store the inertia values
    inertia = []
    # Create a for loop to compute the inertia with each possible value of k
    # Inside the loop:
    # 1. Create a KMeans model using the loop counter for the n_clusters
    # 2. Fit the model to the data using `df_market_data_scaled`
    # 3. Append the model.inertia_ to the inertia list
    for k in k:
        model = KMeans(n_clusters=k, random_state=0)
        model.fit(data)
        inertia.append(model.inertia_)
    return inertia

### Function to Plot a line chart with all the inertia values computed with the different values of k to visually identify the optimal value for k for both Original/ pca Data

In [4]:
def plot_elbow_curve(data, k_values, title):
    # Compute inertia values using the compute_elbow_curve function
    inertia_values = compute_elbow_curve(data, k)
    # Create a DataFrame with the data to plot the Elbow curve
    df_elbow = pd.DataFrame({"k": k, "inertia": inertia_values})
    # Plot a line chart with all the inertia values
    return df_elbow.hvplot.line(x="k", y="inertia", title=title, xticks=k)

### Function to find Cluster Cryptocurrencies with optimal K and K-means for both the Original/pca Data

In [5]:
def perform_kmeans_clustering(data, n_clusters):
    # Initialise the K-Means model using the specified value for k
    k_model = KMeans(n_clusters=n_clusters, random_state=0)
    # Fit the K-Means model using the data
    k_model.fit(data)
    # Predict the clusters
    return k_model.predict(data)

### Function to create a scatter plot for the Clustering of Cryptocurrencies for both Original/pca data

In [6]:
def visualise_clusters(data, x_col, y_col, cluster_col, hover_cols, title):
    # Create a scatter plot using hvPlot
    return data.hvplot.scatter(x=x_col, y=y_col, by=cluster_col, hover_cols=hover_cols, title=title)

### main body coding to load and prepare data

In [7]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")
# Display sample data
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [8]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [9]:
# Plot your data to see what's in your DataFrame
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90,
    title='plot of the data in df_market_data'
)


### Prepare the Data

In [10]:
# Specify columns to scale
columns_to_scale = ['price_change_percentage_24h', 'price_change_percentage_7d', 'price_change_percentage_14d',
                     'price_change_percentage_30d', 'price_change_percentage_60d', 'price_change_percentage_200d',
                     'price_change_percentage_1y']

# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
df_market_data_scaled = StandardScaler().fit_transform(df_market_data,columns_to_scale)

# Diplay the first five rows of the scaled data
df_market_data_scaled[0:5]

array([[ 0.50852937,  0.49319307,  0.77220043,  0.23545963, -0.0674951 ,
        -0.35595348, -0.25163688],
       [ 0.18544589,  0.93444504,  0.55869212, -0.05434093, -0.27348273,
        -0.11575947, -0.19935211],
       [ 0.02177396, -0.70633685, -0.02168042, -0.06103015,  0.00800452,
        -0.55024692, -0.28206051],
       [-0.04076438, -0.81092807,  0.24945797, -0.05038797, -0.37316402,
        -0.45825882, -0.29554614],
       [ 1.19303608,  2.00095907,  1.76061001,  0.54584206, -0.29120287,
        -0.49984776, -0.27031695]])

In [11]:
# Create a DataFrame with the scaled data
df_market_data_scaled = pd.DataFrame(
    df_market_data_scaled,
    columns=['price_change_percentage_24h', 'price_change_percentage_7d', 'price_change_percentage_14d','price_change_percentage_30d', 'price_change_percentage_60d',
             'price_change_percentage_200d','price_change_percentage_1y']
)

# Copy the crypto names from the original data
df_market_data_scaled["coin_id"] = df_market_data.index
# Set the coinid column as index
df_market_data_scaled = df_market_data_scaled.set_index("coin_id")

# Display sample data
df_market_data_scaled.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


### Find the Best Value for k Using the Original Data using compute_elbow_curvefunction and plot it using plot_elbow_curve function

In [11]:
# Create a list with the number of k-values
k = list(range(1, 12))
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_original=plot_elbow_curve(df_market_data_scaled, k, "Elbow Curve")
elbow_original

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


#### Answer the following question: 

**Question:** What is the best value for `k`? 

**Answer:** Based on the elbow graph K=6 is the best value

### Cluster Cryptocurrencies with K-means and optimal K Using the Original Data-visualise_clusters function used to plot it

In [12]:
# Perform K-Means clustering on original scaled data
crypto_clusters_original = perform_kmeans_clustering(df_market_data_scaled, 6)

# Add cluster labels to the original DataFrame
df_market_data_scaled["cluster_original"] = crypto_clusters_original

# Visualise clusters for original scaled data
scatter_original=visualise_clusters(df_market_data_scaled, "price_change_percentage_24h", "price_change_percentage_7d",
                    "cluster_original", ["coin_id"], "K-Means Clustering of Cryptocurrencies - Original Scaled Data")
scatter_original

  super()._check_params_vs_input(X, default_n_init=10)


### Optimise Clusters with Principal Component Analysis.

In [13]:
# Create a PCA model instance and set `n_components=3`.
pca = PCA(n_components=3)
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
pca_market_data = pca.fit_transform(df_market_data_scaled)

# View the first five rows of the DataFrame. 
pca_market_data[:5]

array([[ 2.49185406, -0.50193803, -0.33669683],
       [ 2.4272637 , -0.38525243, -0.90874669],
       [-0.7198418 , -0.45747863,  0.43125843],
       [-0.68065639, -0.49843262,  0.2867426 ],
       [ 3.65721715, -0.94933472, -0.19695419]])

In [14]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
explained_variance=pca.explained_variance_ratio_
explained_variance

array([0.43042171, 0.27018512, 0.19955527])

In [15]:
total_explained_variance = np.sum(explained_variance)
total_explained_variance

0.9001621091639058

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** 0.90

In [16]:
# Create a new DataFrame with the PCA data.

# Creating a DataFrame with the PCA data
pca_market_data_df = pd.DataFrame(data=pca_market_data, columns=['PC1', 'PC2', 'PC3'])

# Copy the crypto names from the original data
pca_market_data_df["coin_id"] = df_market_data_scaled.index


# Set the coinid column as index
pca_market_data_df = pca_market_data_df.set_index("coin_id")

# Display sample data
pca_market_data_df.head()

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,2.491854,-0.501938,-0.336697
ethereum,2.427264,-0.385252,-0.908747
tether,-0.719842,-0.457479,0.431258
ripple,-0.680656,-0.498433,0.286743
bitcoin-cash,3.657217,-0.949335,-0.196954


### Find the Best Value for k Using the pca Data using compute_elbow_curvefunction and plot it using plot_elbow_curve function

In [17]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k
elbow_pca=plot_elbow_curve(pca_market_data_df, k, "Elbow Curve Using PCA Data")
elbow_pca

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** K=6


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** Based on this Elbow Curve ploted by PCA method k=5 looks good but the same k (k=6) looks the correct answer.

### Cluster Cryptocurrencies with K-means and optimal K Using the pca Data-visualise_clusters function used to plot it

In [18]:
# Perform K-Means clustering on PCA data

crypto_clusters_pca = perform_kmeans_clustering(pca_market_data_df, 6)

# Add cluster labels to the PCA DataFrame
pca_market_data_df["cluster_pca"] = crypto_clusters_pca

# Visualise clusters for PCA data
scatter_pca=visualise_clusters(pca_market_data_df, "PC1", "PC2", "cluster_pca", ["coin_id"],
                    "K-Means Clustering of Cryptocurrencies - PCA Data")
scatter_pca

  super()._check_params_vs_input(X, default_n_init=10)


### Visualise and Compare the Results

In this section, you will visually analyse the cluster analysis results by contrasting the outcome with and without using the optimisation techniques.

In [20]:
# Composite plot to contrast the Elbow curves
composite_plot = elbow_original + elbow_pca

# # Display the composite plot
composite_plot

In [21]:
# Composite plot to contrast the clusters
composite_plot_clusters = (scatter_original + scatter_pca)

# Display the composite plot
composite_plot_clusters

#### Answer the following question: 

  * **Question:** After visually analysing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** It's crucial to recognize that the consequences of reducing features before employing K-Means clustering, illustrated by employing PCA to transform the data into a three-dimensional space, can manifest differently depending on factors such as the data's characteristics, the clustering algorithm employed, and the specific objectives of the analysis. In the context of this challenge, the impact includes:

1-Enhanced Interpretability:
The resulting clusters may become more interpretable, simplifying the understanding of distinct patterns within the data.

2-Improved Computational Efficiency:
The computational efficiency of the clustering process may improve, leading to faster execution times, especially beneficial for handling larger datasets.

3-Noise Reduction:
Irrelevant or noisy features are potentially mitigated, contributing to more robust clustering results by focusing on the most pertinent information.