# Clustering Crypto

In [161]:
# Initial imports

import requests as r
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import json
from pathlib import Path
#pip install -U altair

### Fetching Cryptocurrency Data

In [162]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
request = r.get(url)
j = request.json()



In [163]:
# Create a DataFrame 
json_load = json.dumps(j, indent=5, sort_keys=False)
loaded = json.loads(json_load)
crypto_df = pd.DataFrame.from_dict(loaded).reset_index()

crypto_df

Unnamed: 0,index,Response,Message,Data,RateLimit,HasWarning,Type
0,42,Success,Coin list succesfully returned!,"{'Id': '4321', 'Url': '/coins/42/overview', 'I...",,False,100
1,300,Success,Coin list succesfully returned!,"{'Id': '749869', 'Url': '/coins/300/overview',...",,False,100
2,365,Success,Coin list succesfully returned!,"{'Id': '33639', 'Url': '/coins/365/overview', ...",,False,100
3,404,Success,Coin list succesfully returned!,"{'Id': '21227', 'Url': '/coins/404/overview', ...",,False,100
4,433,Success,Coin list succesfully returned!,"{'Id': '926547', 'Url': '/coins/433/overview',...",,False,100
...,...,...,...,...,...,...,...
7951,KNC,Success,Coin list succesfully returned!,"{'Id': '310497', 'Url': '/coins/knc/overview',...",,False,100
7952,LATX,Success,Coin list succesfully returned!,"{'Id': '849939', 'Url': '/coins/latx/overview'...",,False,100
7953,LEO,Success,Coin list succesfully returned!,"{'Id': '930571', 'Url': '/coins/leo/overview',...",,False,100
7954,LC4,Success,Coin list succesfully returned!,"{'Id': '33001', 'Url': '/coins/lc4/overview', ...",,False,100


In [164]:
# Alternatively, use the provided csv file:
file_path = Path("../Instructions/Starter_Files/Resources/crypto_data.csv")

# Create a DataFrame
coin_data = pd.read_csv(file_path)


### Data Preprocessing

In [165]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
coin_data = coin_data.loc[:,["CoinName", "Algorithm", "IsTrading", "ProofType", "TotalCoinsMined", "TotalCoinSupply"]]
coin_data


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1247,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [166]:
# Keep only cryptocurrencies that are trading
coin_data.drop(coin_data[coin_data['IsTrading'] == False ].index, inplace = True)

coin_data.tail()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,980222600.0,1400222610
1246,Horizen,Equihash,True,PoW,7296538.0,21000000
1247,BitcoinPlus,Scrypt,True,PoS,128327.0,1000000


In [167]:
# Remove the "IsTrading" column

coin_data = coin_data.drop(["IsTrading"], axis=1)

In [168]:
# Remove rows with at least 1 null value
coin_data.fillna(0)
coin_data = coin_data.dropna()

coin_data.isnull()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,False,False,False,False,False
2,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
7,False,False,False,False,False
...,...,...,...,...,...
1238,False,False,False,False,False
1242,False,False,False,False,False
1245,False,False,False,False,False
1246,False,False,False,False,False


In [169]:
# Remove rows with cryptocurrencies having no coins mined
coin_data.drop(coin_data[coin_data['TotalCoinsMined'] < 0 ].index, inplace = True)
coin_data.tail(150)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
989,iDealCash,Scrypt,PoW/PoS,1.404158e+09,5121951220
990,Jumpcoin,NIST5,PoW,2.106935e+07,21000000
991,Infinex,Lyra2RE,PoW,5.097690e+06,26280000
992,Bitcoin Incognito,XEVAN,PoS/PoW,1.090496e+07,21000000
993,KEYCO,Tribus,PoW,7.954470e+05,18000000
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [170]:
# Drop rows where there are 'N/A' text values
# I have previusly drop null values

In [171]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name_column_df = pd.DataFrame(coin_data['CoinName'])
coin_name_column_df.astype('str')
coin_name_column_df.tail(150)


Unnamed: 0,CoinName
989,iDealCash
990,Jumpcoin
991,Infinex
992,Bitcoin Incognito
993,KEYCO
...,...
1238,ZEPHYR
1242,Gapcoin
1245,Beldex
1246,Horizen


In [172]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
coin_data = coin_data.drop(["CoinName"], axis=1)
coin_data

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
4,SHA-256,PoW/PoS,0.000000e+00,0
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [173]:
# Create dummy variables for text features
# Binary encode the weekly column to use as new input features for the model
added_dimmies = pd.get_dummies(coin_data, columns=["Algorithm", "ProofType"],drop_first = True)
added_dimmies

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0.000000e+00,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [174]:
# Standardize data
scaler = StandardScaler()  
scaled_coin_data = scaler.fit_transform(added_dimmies)
scaled_coin_data

array([[-0.10312268, -0.03826639, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       [-0.07692584, -0.03826624, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       [-0.10312268, -0.03826639, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       ...,
       [-0.07878691, -0.03826599, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       [-0.10294153, -0.03826638, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       [-0.10311949, -0.03826639, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394]])

### Reducing Dimensions Using PCA

In [175]:
# Use PCA to reduce dimensions to 3 principal components

pca = PCA(n_components=3)
scaled_coin_data = pca.fit_transform(scaled_coin_data)
 

In [176]:
# Create a DataFrame with the principal components data
pricipalDf = pd.DataFrame(data = scaled_coin_data,columns = ['principal component 1', 'principal component 2', 'principal component 3'])
pca_df = pd.concat([coin_name_column_df, pricipalDf], axis=1, join='outer')
pca_df

Unnamed: 0,CoinName,principal component 1,principal component 2,principal component 3
0,42 Coin,-0.176790,-1.183162,-0.401276
1,,-0.162493,-1.181603,-0.402190
2,404Coin,-0.154809,-0.863188,-0.220966
3,,0.294681,-2.024364,-0.349732
4,808,-0.232755,1.435780,0.082503
...,...,...,...,...
1238,ZEPHYR,,,
1242,Gapcoin,,,
1245,Beldex,,,
1246,Horizen,,,


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [177]:

inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pricipalDf)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")



  "KMeans is known to have a memory leak on Windows "


In [178]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=5)
# Fit the model
model.fit(pricipalDf)
# Predict clusters
prediction = model.predict(pricipalDf)
coin_data["class"] = model.labels_


# Create a new DataFrame including predicted clusters and cryptocurrencies features
coin_data = pd.concat([coin_data, pca_df,], axis=1, join='outer')
#clustered_df
#prediction
coin_data

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,class,CoinName,principal component 1,principal component 2,principal component 3
0,Scrypt,PoW/PoS,4.199995e+01,42,0.0,42 Coin,-0.176790,-1.183162,-0.401276
1,,,,,,,-0.162493,-1.181603,-0.402190
2,Scrypt,PoW/PoS,1.055185e+09,532000000,0.0,404Coin,-0.154809,-0.863188,-0.220966
3,,,,,,,0.294681,-2.024364,-0.349732
4,SHA-256,PoW/PoS,0.000000e+00,0,0.0,808,-0.232755,1.435780,0.082503
...,...,...,...,...,...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000,0.0,ZEPHYR,,,
1242,Scrypt,PoW/PoS,1.493105e+07,250000000,0.0,Gapcoin,,,
1245,CryptoNight,PoW,9.802226e+08,1400222610,1.0,Beldex,,,
1246,Equihash,PoW,7.296538e+06,21000000,1.0,Horizen,,,


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [179]:
# Scale data to create the scatter plot
added_dummies = pd.get_dummies(coin_data, columns=["Algorithm", "ProofType", "CoinName"],drop_first = True)
added_dummies
scaler = MinMaxScaler()
scaler.fit(added_dummies)
scaled = scaler.fit_transform(added_dummies)
scaled_df = pd.DataFrame(scaled, columns=added_dummies.columns)
scaled_df.tail()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,class,principal component 1,principal component 2,principal component 3,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,...,CoinName_Zilbercoin,CoinName_Zoin,CoinName_ZoneCoin,CoinName_Zurcoin,CoinName_eBoost,CoinName_eMark,CoinName_gCn Coin,CoinName_iBankCoin,CoinName_iDealCash,CoinName_iOlite
1020,0.002020225,2.168404e-08,0.0,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1021,1.508204e-05,2.710505e-09,0.0,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1022,0.0009901351,1.518124e-08,0.333333,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1023,7.370324e-06,2.276825e-10,0.333333,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1024,1.296247e-07,1.084202e-11,0.0,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"

plot = scaled_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", hover_cols=["CoinName"], height=1000, width=1000)
plot              

#### Table of Tradable Cryptocurrencies

In [181]:
# Table with tradable cryptos
table_plot = coin_data.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'class'], sortable=True, selectable=True)
table_plot

In [191]:
# Print the total number of tradable cryptocurrencies
counts = coin_data.count()[0]
print(f' the total number of tradable cryptocurrencies is: {counts}')

 the total number of tradable cryptocurrencies is: 684
