# Clustering Crypto

In [112]:
# Initial imports
import requests
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [113]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [114]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.


In [115]:
# Alternatively, use the provided csv file:
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
crypto_df = pd.read_csv(file_path)
crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [116]:
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

### Data Preprocessing

In [117]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
#cleaned_crypto_df = crypto_df.loc(["CoinName","Algorithm","IsTrading",
                        #"ProofType","TotalCoinsMined","TotalCoinSupply"]
cleaned_crytpo_df = crypto_df.reset_index

In [118]:
# Keep only cryptocurrencies that are trading
trading_crypto_df = cleaned_crytpo_df(['IsTrading'] == True)
trading_crypto_df = trading_crypto_df.drop(["index"], axis=1)
trading_crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [119]:
# Keep only cryptocurrencies with a working algorithm
trading_crypto_df = trading_crypto_df.sort_values(by='Algorithm', ascending=False)
trading_crypto_df = trading_crypto_df.dropna(axis=0, subset=['Algorithm'])
trading_crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
305,CIRC,CryptoCircuits,vDPOS,True,PoS,0.0,0
1038,SAFE,SafeCoin,Zhash,True,PoW/PoS,0.0,36000000
945,ELIC,Elicoin,YescryptR16,True,PoW,,10000000
257,XG,XG Sports,XG Hash,True,PoW/PoS,,0
1140,BEET,Beetle Coin,XEVAN,True,PoW/PoS,198938100.0,500000000
1087,BTXC,Bettex coin,XEVAN,True,PoS,6787405.0,50000000
1051,URALS,Urals Coin,XEVAN,True,PoW,14746150.0,210000000
992,XBI,Bitcoin Incognito,XEVAN,True,PoS/PoW,10904960.0,21000000
1042,XGS,GenesisX,XEVAN,False,PoS,0.0,19000000
1101,BITM,BitMoney,XEVAN,True,Pos,208515900.0,70000000000


In [120]:
# Remove the "IsTrading" column
trading_crypto_df = trading_crypto_df.drop(['IsTrading'], axis=1)
trading_crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
305,CIRC,CryptoCircuits,vDPOS,PoS,0.0,0
1038,SAFE,SafeCoin,Zhash,PoW/PoS,0.0,36000000
945,ELIC,Elicoin,YescryptR16,PoW,,10000000
257,XG,XG Sports,XG Hash,PoW/PoS,,0
1140,BEET,Beetle Coin,XEVAN,PoW/PoS,198938100.0,500000000
1087,BTXC,Bettex coin,XEVAN,PoS,6787405.0,50000000
1051,URALS,Urals Coin,XEVAN,PoW,14746150.0,210000000
992,XBI,Bitcoin Incognito,XEVAN,PoS/PoW,10904960.0,21000000
1042,XGS,GenesisX,XEVAN,PoS,0.0,19000000
1101,BITM,BitMoney,XEVAN,Pos,208515900.0,70000000000


In [121]:
# Remove rows with at least 1 null value
trading_crypto_df = trading_crypto_df.dropna()
trading_crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
305,CIRC,CryptoCircuits,vDPOS,PoS,0.0,0
1038,SAFE,SafeCoin,Zhash,PoW/PoS,0.0,36000000
1140,BEET,Beetle Coin,XEVAN,PoW/PoS,198938100.0,500000000
1087,BTXC,Bettex coin,XEVAN,PoS,6787405.0,50000000
1051,URALS,Urals Coin,XEVAN,PoW,14746150.0,210000000
992,XBI,Bitcoin Incognito,XEVAN,PoS/PoW,10904960.0,21000000
1042,XGS,GenesisX,XEVAN,PoS,0.0,19000000
1101,BITM,BitMoney,XEVAN,Pos,208515900.0,70000000000
940,ELP,Ellerium,XEVAN,PoW/PoS,419275.4,60000000
956,REDN,Reden,X16S,PoW,0.0,14000000


In [122]:
# Remove rows with cryptocurrencies having no coins mined,
# by keeping only rows with total coins mined greater than zero
trading_crypto_df = trading_crypto_df.loc[trading_crypto_df['TotalCoinsMined'] > 0]
trading_crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
1140,BEET,Beetle Coin,XEVAN,PoW/PoS,198938100.0,500000000
1087,BTXC,Bettex coin,XEVAN,PoS,6787405.0,50000000
1051,URALS,Urals Coin,XEVAN,PoW,14746150.0,210000000
992,XBI,Bitcoin Incognito,XEVAN,PoS/PoW,10904960.0,21000000
1101,BITM,BitMoney,XEVAN,Pos,208515900.0,70000000000


In [123]:
# Drop rows where there are 'N/A' text values
trading_crypto_df = trading_crypto_df.dropna(axis=0, subset=['CoinName'])
trading_crypto_df = trading_crypto_df.dropna(axis=0, subset=['Algorithm'])
trading_crypto_df = trading_crypto_df.dropna(axis=0, subset=['ProofType'])
trading_crypto_df = trading_crypto_df.dropna(axis=0, subset=['TotalCoinsMined'])
trading_crypto_df = trading_crypto_df.dropna(axis=0, subset=['TotalCoinSupply'])
trading_crypto_df.head(10)


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
1140,BEET,Beetle Coin,XEVAN,PoW/PoS,198938100.0,500000000
1087,BTXC,Bettex coin,XEVAN,PoS,6787405.0,50000000
1051,URALS,Urals Coin,XEVAN,PoW,14746150.0,210000000
992,XBI,Bitcoin Incognito,XEVAN,PoS/PoW,10904960.0,21000000
1101,BITM,BitMoney,XEVAN,Pos,208515900.0,70000000000
940,ELP,Ellerium,XEVAN,PoW/PoS,419275.4,60000000
1034,TRVC,Trivechain,X16R,PoW/PoS,36923120.0,82546564
1030,XMN,Motion,X16R,PoW,8125865.0,22075700
987,PROTON,Proton,X16R,PoS,4403800.0,45000000
1075,XCG,Xchange,X16R,PoW,9753754.0,100000000


In [124]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
CoinName_df = trading_crypto_df[["CoinName"]]
CoinName_df.head(5)

Unnamed: 0,CoinName
1140,Beetle Coin
1087,Bettex coin
1051,Urals Coin
992,Bitcoin Incognito
1101,BitMoney


In [128]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
trading_crypto_df_sans_CoinName = trading_crypto_df.drop(['CoinName'], axis=1)
trading_crypto_df_sans_CoinName = trading_crypto_df.drop(['Unnamed: 0'], axis=1)
trading_crypto_df_sans_CoinName = trading_crypto_df_sans_CoinName.drop(['CoinName'], axis=1)
trading_crypto_df_sans_CoinName.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
1140,XEVAN,PoW/PoS,198938100.0,500000000
1087,XEVAN,PoS,6787405.0,50000000
1051,XEVAN,PoW,14746150.0,210000000
992,XEVAN,PoS/PoW,10904960.0,21000000
1101,XEVAN,Pos,208515900.0,70000000000
940,XEVAN,PoW/PoS,419275.4,60000000
1034,X16R,PoW/PoS,36923120.0,82546564
1030,X16R,PoW,8125865.0,22075700
987,X16R,PoS,4403800.0,45000000
1075,X16R,PoW,9753754.0,100000000


In [129]:
# Create dummy variables for text features
cleaned_crypto_df = pd.get_dummies(trading_crypto_df_sans_CoinName,
                                   columns=['Algorithm', 'ProofType'])
cleaned_crypto_df.head(10)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
1140,198938100.0,500000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1087,6787405.0,50000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1051,14746150.0,210000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
992,10904960.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1101,208515900.0,70000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
940,419275.4,60000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1034,36923120.0,82546564,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1030,8125865.0,22075700,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,4403800.0,45000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1075,9753754.0,100000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [130]:
# Standardize data
standardized_crypto_df = StandardScaler().fit_transform(cleaned_crypto_df)

### Reducing Dimensions Using PCA

In [131]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)

In [132]:
# Create a DataFrame with the principal components data
pca_crypto_df = pca.fit_transform(standardized_crypto_df)
pca_crypto_df 

array([[-0.3368219 ,  1.26459118, -0.39141416],
       [-0.28632018,  0.95718287, -0.08343784],
       [-0.17356728, -0.90254746,  0.15451554],
       ...,
       [-0.14219447, -2.08953828,  0.43356277],
       [ 0.61123796,  1.97767581, -0.71415979],
       [-0.16238632, -2.25088852,  0.33384941]])

### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [133]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values    
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_crypto_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

Running K-Means with k=4

In [135]:
pca_crypto_df = pd.DataFrame(data = pca_crypto_df, 
                             columns=["PC 1", "PC 2", "PC 3"], 
                             index = index_values)
pca_crypto_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
1140,-0.336822,1.264591,-0.391414
1087,-0.28632,0.957183,-0.083438
1051,-0.173567,-0.902547,0.154516
992,-0.334293,0.550353,0.154909
1101,0.447025,0.576839,-0.133451
940,-0.34257,1.264735,-0.391304
1034,-0.361518,0.765942,-0.486656
1030,-0.194607,-1.401252,0.0592
987,-0.305964,0.458389,-0.178773
1075,-0.194014,-1.40129,0.059191


In [136]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pca_crypto_df)

# Predict clusters
predictions = model.predict(pca_crypto_df)
pca_crypto_df["class"] = model.labels_   
print(predictions)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
# index_values = (cleaned_crypto_df.index.tolist())
# pca_crypto_df= pd.DataFrame(data = pca_crypto_df, 
#                             index = index_values)
# frames = [trading_crypto_df,pca_crypto_df]
# clustered_df = pd.concat(frames, sort=False)
# frames_2 = [clustered_df,CoinName_df]
# clustered_df = pd.concat(frames_2, sort=False)
# # clustered_df = cleaned_crypto_df.join(, how='inner')
# clustered_df.head(10)

[1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 1 1 1 1
 1 1 1 0 1 0 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 0 1 1 0 0 1 1 0 1 1 1
 1 1 0 1 0 2 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0
 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1
 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 0
 1 0 1 1 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 1 1 1 0 0 0 1 1 0
 1 0 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 1 0 1 1
 1 1 0 1 1 0 0 0 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1
 1 3 1 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 1 1 0 1 0 1
 1 0 1 0 1 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 3 3
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 0 0 0
 0 0 0 1 1 1 1 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0
 0 3 1 0 0 0 0 0 0 1 0 1 

In [140]:
clustered_df = trading_crypto_df_sans_CoinName.join(pca_crypto_df, how='inner')
clustered_df_complete = clustered_df.join(CoinName_df, how='inner')
clustered_df_complete.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class,CoinName
1140,XEVAN,PoW/PoS,198938100.0,500000000,-0.336822,1.264591,-0.391414,1,Beetle Coin
1087,XEVAN,PoS,6787405.0,50000000,-0.28632,0.957183,-0.083438,1,Bettex coin
1051,XEVAN,PoW,14746150.0,210000000,-0.173567,-0.902547,0.154516,0,Urals Coin
992,XEVAN,PoS/PoW,10904960.0,21000000,-0.334293,0.550353,0.154909,1,Bitcoin Incognito
1101,XEVAN,Pos,208515900.0,70000000000,0.447025,0.576839,-0.133451,1,BitMoney
940,XEVAN,PoW/PoS,419275.4,60000000,-0.34257,1.264735,-0.391304,1,Ellerium
1034,X16R,PoW/PoS,36923120.0,82546564,-0.361518,0.765942,-0.486656,1,Trivechain
1030,X16R,PoW,8125865.0,22075700,-0.194607,-1.401252,0.0592,0,Motion
987,X16R,PoS,4403800.0,45000000,-0.305964,0.458389,-0.178773,1,Proton
1075,X16R,PoW,9753754.0,100000000,-0.194014,-1.40129,0.059191,0,Xchange


### Visualizing Results

#### 3D-Scatter with Clusters

In [146]:
# Create a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df_complete, 
    x="PC 1", 
    y="PC 2", 
    z="PC 3", 
    color="class", 
    symbol="class",
    width=800,
    hover_name="CoinName", 
    hover_data=["Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

#### Table of Tradable Cryptocurrencies

In [148]:
# Table with tradable cryptos
clustered_df_complete.hvplot.table(columns=['CoinName',
                                            'Algorithm',
                                            'ProofType',
                                            'TotalCoinsMined',
                                            'TotalCoinSupply',
                                            'class'],
                                   sortable=True, selectable=True)

In [149]:
# Print the total number of tradable cryptocurrencies
clustered_df_complete['CoinName'].count()

577

#### Scatter Plot with Tradable Cryptocurrencies

In [150]:
# Scale data to create the scatter plot
tradable_crypto_df = clustered_df_complete[['TotalCoinSupply', 'TotalCoinsMined']]
crypto_MinMax = MinMaxScaler().fit_transform(tradable_crypto_df)
crypto_MinMax


array([[5.00000000e-04, 2.00949869e-04],
       [5.00000000e-05, 6.85600057e-06],
       [2.10000000e-04, 1.48952315e-05],
       ...,
       [0.00000000e+00, 1.99842507e-05],
       [5.00000000e-02, 2.30324675e-02],
       [8.19621000e-05, 1.15646844e-05]])

In [151]:
index_values = (clustered_df_complete.index.tolist())
plot_df1 = pd.DataFrame(
    data = crypto_MinMax, columns=["TotalCoinSupply_scaled",
                                   "TotalCoinsMined_scaled"],
    index = index_values)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df2 = plot_df1.join(CoinName_df, how='inner')

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
class_df = clustered_df_complete['class']
plot_df_final = plot_df2.join(class_df, how='inner')

plot_df_final.head(10)

Unnamed: 0,TotalCoinSupply_scaled,TotalCoinsMined_scaled,CoinName,class
1140,0.0005,0.0002009499,Beetle Coin,1
1087,5e-05,6.856001e-06,Bettex coin,1
1051,0.00021,1.489523e-05,Urals Coin,0
992,2.1e-05,1.10152e-05,Bitcoin Incognito,1
1101,0.07,0.0002106244,BitMoney,1
940,6e-05,4.234729e-07,Ellerium,1
1034,8.3e-05,3.729646e-05,Trivechain,1
1030,2.2e-05,8.207996e-06,Motion,0
987,4.5e-05,4.448291e-06,Proton,1
1075,0.0001,9.852346e-06,Xchange,0


In [153]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
plot_df_final.hvplot.scatter(x="TotalCoinsMined_scaled",
                             y="TotalCoinSupply_scaled",
                             by="class",
                          xlabel="Total Cryptocurrency Coins Mined",
                          ylabel="Total Cryptocurrency Coin Supply",
                          )