# Clustering Crypto

In [1]:
import statsmodels.api as sm

In [279]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from pathlib import Path
#import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [280]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [281]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
#s = requests.get(url).content
#c = pd.read_csv(s)

In [282]:
# Alternatively, use the provided csv file:
file_path = Path('Resources/crypto_data.csv')
# Create a DataFrame
crypto_df = pd.read_csv(file_path, index_col = 0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [283]:
# Keep only necessary columns:
#'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'


In [284]:
# new Data Frame
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [285]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df['IsTrading'] == True]
print(crypto_df.shape)
crypto_df.head()

(1144, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [286]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df['Algorithm'] != 'N/A']
print(crypto_df.shape)

(1144, 6)


In [287]:
# Remove the "IsTrading" column
del crypto_df['IsTrading']
print(crypto_df.shape)

(1144, 5)


In [288]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna()
print(crypto_df.shape)

(685, 5)


In [289]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df['TotalCoinsMined'] > 0]
print(crypto_df.shape)

(532, 5)


In [290]:
# Drop rows where there are 'N/A' text values
crypto_df.dropna()
crypto_df.isna().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [291]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name = crypto_df[['CoinName']]
coin_name.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [292]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df_1 = crypto_df.drop(columns = 'CoinName')
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0


In [293]:
dummy_crypto.head()

Unnamed: 0,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,Algorithm_CryptoNight,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [294]:
# Create dummy variables for text features
dummy_crypto = pd.get_dummies(crypto_df_1[['Algorithm', 'ProofType']], drop_first = True)
 
# Combine dummy variables with Dataframe
crypto = pd.concat([crypto_df_1, dummy_crypto], axis= 1)

#Delete text variables
del crypto['Algorithm'], crypto['ProofType']
crypto.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [295]:
# Standardize data
crypto_scaled = StandardScaler().fit_transform(crypto)
crypto_scaled

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

### Reducing Dimensions Using PCA

In [296]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components = 3)
crypto_pca = pca.fit_transform(crypto_scaled)

In [297]:
# Create a DataFrame with the principal components data
df_crypto_pca = pd.DataFrame(
    data = crypto_pca, columns = ['Var 1', 'Var 2', 'Var 3' ])
df_crypto_pca.head()

Unnamed: 0,Var 1,Var 2,Var 3
0,-0.335432,1.028073,-0.504153
1,-0.318738,1.028085,-0.504643
2,2.300667,1.659223,-0.603566
3,-0.151691,-1.302558,0.18282
4,-0.15017,-2.044063,0.369828


In [298]:
# This shows me the significance of each variable 
pca.explained_variance_ratio_

array([0.0285122 , 0.02182119, 0.02091789])

### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [299]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters = i, random_state = 0)
    km.fit(df_crypto_pca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {'k' : k, 'inertia' : inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x = 'k', y = 'inertia', title = 'Elbow Curve', xticks = k)

  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=<your best value for k here>`

In [305]:
# Initialize the K-Means model
model = KMeans(n_clusters = 4, random_state = 23)

# Fit the model
model.fit(df_crypto_pca)

# Predict clusters
predictions = model.predict(df_crypto_pca)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
combined_df = pd.concat([ crypto_df_1.reset_index(), df_crypto_pca], axis = 1)
combined_df['class'] = model.labels_
combined_df.head()


Unnamed: 0,index,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,Var 1,Var 2,Var 3,class
0,42,Scrypt,PoW/PoS,41.99995,42,-0.335432,1.028073,-0.504153,1
1,404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.318738,1.028085,-0.504643,1
2,1337,X13,PoW/PoS,29279420000.0,314159265359,2.300667,1.659223,-0.603566,1
3,BTC,SHA-256,PoW,17927180.0,21000000,-0.151691,-1.302558,0.18282,0
4,ETH,Ethash,PoW,107684200.0,0,-0.15017,-2.044063,0.369828,0


### Visualizing Results

#### 3D-Scatter with Clusters

In [339]:
# Create a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    combined_df,
    x='Var 1',
    y='Var 2',
    z='Var 3',
    hover_name='index',
    hover_data=['Algorithm'],
    width=800,
)
fig.update_layout(legend = dict(x = 0, y = 1))
fig.show()

NameError: name 'px' is not defined

#### Table of Tradable Cryptocurrencies

In [312]:
# Table with tradable cryptos
combined_df[
    [
        'index',
        'Algorithm',
        'ProofType',
        'TotalCoinsMined',
        'TotalCoinSupply',
        'Var 1',
        'Var 2',
        'Var 3',
        'class'
    ]
].hvplot.table()

In [324]:
# Print the total number of tradable cryptocurrencies
total = combined_df['Algorithm'].count()
print(f' There are {total} tradable cryptocurrencies')

 There are 532 tradable cryptocurrencies


#### Scatter Plot with Tradable Cryptocurrencies

In [337]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(
    combined_df[["TotalCoinSupply", "TotalCoinsMined"]]
)
plot_df = pd.DataFrame(
    plot_data, columns=["TotalCoinSupply", "TotalCoinsMined"], index=combined_df.index
)
plot_df["CoinName"] = combined_df["index"]
plot_df["Class"] = combined_df["class"]
plot_df.head()

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
0,4.2e-11,0.0,42,1
1,0.000532,0.001066,404,1
2,0.3141593,0.029576,1337,1
3,2.1e-05,1.8e-05,BTC,0
4,0.0,0.000109,ETH,0


In [338]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
plot_df.hvplot.scatter(
    x = 'TotalCoinsMined',
    y = 'TotalCoinSupply',
    hover_cols = ['index'],
    by = 'Class')