# Clustering Crypto

In [1]:
!pip install -U altair



In [2]:
# Initial imports
import requests
import pandas as pd
import altair as alt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pathlib import Path

import requests
import json

In [3]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

# Submit request and format output
response_data = requests.get(url).json()
print(json.dumps(response_data, indent=4))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [4]:
# Create DataFrame
crypto_df = pd.DataFrame(response_data['Data'])
crypto_df = crypto_df.transpose(copy=True)
crypto_df.loc["BTC"]

Id                                                                   1182
Url                                                   /coins/btc/overview
ImageUrl                                          /media/37746251/btc.png
ContentCreatedOn                                               1417635237
Name                                                                  BTC
Symbol                                                                BTC
CoinName                                                          Bitcoin
FullName                                                    Bitcoin (BTC)
Description             Bitcoin uses peer-to-peer technology to operat...
AssetTokenStatus                                                      N/A
Algorithm                                                         SHA-256
ProofType                                                             PoW
SortOrder                                                               1
Sponsored                             

### Fetching Cryptocurrency Data

### Data Preprocessing

In [5]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df = crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','CirculatingSupply']].copy()
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,41.999952
300,300 token,,True,,300,0
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
433,433 Token,,False,,,
...,...,...,...,...,...,...
OLT,OneLedger,,True,,1000000000,0
RNT,OneRoot Network,,True,,400000000,0
OPEN,Open Platform,,True,,1745447045,0
SIGNA,Signa,Shabal256,True,PoC,2139048800,2139048800


In [6]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df["IsTrading"]==True].copy()
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,41.999952
300,300 token,,True,,300,0
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
611,SixEleven,SHA-256,True,PoW,0,0
...,...,...,...,...,...,...
OLT,OneLedger,,True,,1000000000,0
RNT,OneRoot Network,,True,,400000000,0
OPEN,Open Platform,,True,,1745447045,0
SIGNA,Signa,Shabal256,True,PoC,2139048800,2139048800


In [7]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df.Algorithm != "N/A"].copy()
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,41.999952
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
611,SixEleven,SHA-256,True,PoW,0,0
808,808,SHA-256,True,PoW/PoS,0,0
...,...,...,...,...,...,...
NANO,Nano,Blake2b,True,PoW,133248290,0
NAV,NavCoin,X13,True,PoW/PoS,72827352.565308,0
NEBL,Neblio,PoS,True,,18959467.798138,0
NVC,NovaCoin,Scrypt,True,PoW/PoS,3582622.714205,0


In [8]:
# Remove the "IsTrading" column
crypto_df.drop(columns="IsTrading", inplace=True)
crypto_df.tail(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
KMD,Komodo,Equihash,dPoW/PoW,132226914.13131,0.0
LTC,Litecoin,Scrypt,PoW,69626008.233471,69626008.233471
MEC,MegaCoin,Scrypt,PoW,39854933.7431,0.0
MONA,MonaCoin,Scrypt,PoW,84796674.971579,0.0
NMC,Namecoin,SHA-256,PoW,17983800.0,0.0
NANO,Nano,Blake2b,PoW,133248290.0,0.0
NAV,NavCoin,X13,PoW/PoS,72827352.565308,0.0
NEBL,Neblio,PoS,,18959467.798138,0.0
NVC,NovaCoin,Scrypt,PoW/PoS,3582622.714205,0.0
SIGNA,Signa,Shabal256,PoC,2139048800.0,2139048800.0


In [9]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna(how='any',axis=0)

In [10]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df.TotalCoinsMined != 0]
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,41.999952
NSR,NuShares,PoS,PoS,6172691537.8311,0
TRI,Triangles Coin,X13,PoW/PoS,191620.842403,0
CMTC,CometCoin,Scrypt,PoW,872830,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000,0
...,...,...,...,...,...
NANO,Nano,Blake2b,PoW,133248290,0
NAV,NavCoin,X13,PoW/PoS,72827352.565308,0
NEBL,Neblio,PoS,,18959467.798138,0
NVC,NovaCoin,Scrypt,PoW/PoS,3582622.714205,0


In [11]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[
    (crypto_df.CoinName != "N/A") & 
    (crypto_df.Algorithm != "N/A") & 
    (crypto_df.ProofType != "N/A") & 
    (crypto_df.TotalCoinsMined != "N/A") &
    (crypto_df.CirculatingSupply != "N/A")
].copy()
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,41.999952
NSR,NuShares,PoS,PoS,6172691537.8311,0
TRI,Triangles Coin,X13,PoW/PoS,191620.842403,0
CMTC,CometCoin,Scrypt,PoW,872830,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000,0
...,...,...,...,...,...
NMC,Namecoin,SHA-256,PoW,17983800,0
NANO,Nano,Blake2b,PoW,133248290,0
NAV,NavCoin,X13,PoW/PoS,72827352.565308,0
NVC,NovaCoin,Scrypt,PoW/PoS,3582622.714205,0


In [12]:
crypto_df_with_name = crypto_df.copy()

In [13]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coinName_df = crypto_df.index
coinName_df

Index(['42', 'NSR', 'TRI', 'CMTC', 'CHAT', 'QRL', 'PURA', 'BTCP', 'ADK',
       'DAPS',
       ...
       'KCASH', 'KMD', 'LTC', 'MEC', 'MONA', 'NMC', 'NANO', 'NAV', 'NVC',
       'SIGNA'],
      dtype='object', length=133)

In [14]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df.drop(columns="CoinName",inplace=True)
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
42,Scrypt,PoW/PoS,41.999952,41.999952
NSR,PoS,PoS,6172691537.8311,0
TRI,X13,PoW/PoS,191620.842403,0
CMTC,Scrypt,PoW,872830,0
CHAT,Scrypt,PoW/PoS,1000000000,0
...,...,...,...,...
NMC,SHA-256,PoW,17983800,0
NANO,Blake2b,PoW,133248290,0
NAV,X13,PoW/PoS,72827352.565308,0
NVC,Scrypt,PoW/PoS,3582622.714205,0


In [15]:
# Create dummy variables for text features
encoded_features_df = pd.get_dummies(crypto_df, columns=["Algorithm", "ProofType"])
encoded_features_df

Unnamed: 0,TotalCoinsMined,CirculatingSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-20 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,Algorithm_C31,...,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_ProgPoW/PoS,ProofType_Proof of Authority,ProofType_Proof-of-Work,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
42,41.999952,41.999952,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,6172691537.8311,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,191620.842403,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,872830,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,1000000000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NMC,17983800,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NANO,133248290,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
NAV,72827352.565308,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NVC,3582622.714205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Standardize data
encoded_features_scaled = StandardScaler().fit_transform(encoded_features_df)
encoded_features_scaled

array([[-0.16752606, -0.10797706, -0.08703883, ..., -0.08703883,
        -0.08703883, -0.08703883],
       [-0.11711374, -0.10797707, -0.08703883, ..., -0.08703883,
        -0.08703883, -0.08703883],
       [-0.1675245 , -0.10797707, -0.08703883, ..., -0.08703883,
        -0.08703883, -0.08703883],
       ...,
       [-0.16693128, -0.10797707, -0.08703883, ..., -0.08703883,
        -0.08703883, -0.08703883],
       [-0.16749681, -0.10797707, -0.08703883, ..., -0.08703883,
        -0.08703883, -0.08703883],
       [-0.15005647,  0.07281094, -0.08703883, ..., -0.08703883,
        -0.08703883, -0.08703883]])

### Reducing Dimensions Using PCA

In [17]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)

In [18]:
# Create a DataFrame with the principal components data
crypto_pca = pca.fit_transform(encoded_features_scaled)

pcs_df = pd.DataFrame(
    data=crypto_pca, index=coinName_df,columns=["PC 1", "PC 2", "PC 3"]
)
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,0.698362,-1.288289,-1.103989
NSR,1.352145,-0.461180,-0.689938
TRI,1.462562,-1.678126,-1.390150
CMTC,-1.047678,-0.149514,-0.267213
CHAT,0.698820,-1.285196,-1.103923
...,...,...,...
NMC,-1.603583,0.588150,0.192789
NANO,-1.669371,0.450630,0.255502
NAV,1.462596,-1.677902,-1.390145
NVC,0.698363,-1.288278,-1.103988


In [19]:
# Fetch the explained variance
pca.explained_variance_ratio_

array([0.02842379, 0.02644755, 0.02477487])

### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [20]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using altair
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
alt.Chart(df_elbow).mark_line().encode(x="k", y="inertia").properties(
    title='Elbow Curve'
).interactive()

Running K-Means with k=4

In [21]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Append class to pcs_df
pcs_df["class"] = model.labels_

In [22]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features
crypto_df_with_class = crypto_df_with_name.join(pcs_df,how="inner")
crypto_df_with_class.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply,PC 1,PC 2,PC 3,class
42,42 Coin,Scrypt,PoW/PoS,41.999952,41.999952,0.698362,-1.288289,-1.103989,0
NSR,NuShares,PoS,PoS,6172691537.8311,0.0,1.352145,-0.46118,-0.689938,0
TRI,Triangles Coin,X13,PoW/PoS,191620.842403,0.0,1.462562,-1.678126,-1.39015,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0.0,-1.047678,-0.149514,-0.267213,1
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,0.0,0.69882,-1.285196,-1.103923,0


In [23]:
# Display coin class based on PC 1 & PC 2
alt.Chart(crypto_df_with_class).mark_circle(size=60).encode(
    x="PC 1",
    y="PC 2",
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "CirculatingSupply"],
    color="class",
).interactive()

### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [24]:
# Scale data to create the scatter plot
x_y_scaled = StandardScaler().fit_transform(crypto_df_with_class[["TotalCoinsMined","CirculatingSupply"]])

crypto_df_with_class[["TotalCoinsMinedScaled","CirculatingSupplyScaled"]] = x_y_scaled
crypto_df_with_class

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply,PC 1,PC 2,PC 3,class,TotalCoinsMinedScaled,CirculatingSupplyScaled
42,42 Coin,Scrypt,PoW/PoS,41.999952,41.999952,0.698362,-1.288289,-1.103989,0,-0.167526,-0.107977
NSR,NuShares,PoS,PoS,6172691537.8311,0,1.352145,-0.461180,-0.689938,0,-0.117114,-0.107977
TRI,Triangles Coin,X13,PoW/PoS,191620.842403,0,1.462562,-1.678126,-1.390150,0,-0.167525,-0.107977
CMTC,CometCoin,Scrypt,PoW,872830,0,-1.047678,-0.149514,-0.267213,1,-0.167519,-0.107977
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000,0,0.698820,-1.285196,-1.103923,0,-0.159359,-0.107977
...,...,...,...,...,...,...,...,...,...,...,...
NMC,Namecoin,SHA-256,PoW,17983800,0,-1.603583,0.588150,0.192789,1,-0.167379,-0.107977
NANO,Nano,Blake2b,PoW,133248290,0,-1.669371,0.450630,0.255502,1,-0.166438,-0.107977
NAV,NavCoin,X13,PoW/PoS,72827352.565308,0,1.462596,-1.677902,-1.390145,0,-0.166931,-0.107977
NVC,NovaCoin,Scrypt,PoW/PoS,3582622.714205,0,0.698363,-1.288278,-1.103988,0,-0.167497,-0.107977


In [25]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
alt.Chart(crypto_df_with_class).mark_circle(size=60).encode(
    x="TotalCoinsMinedScaled",
    y="CirculatingSupplyScaled",
    tooltip=["CoinName"],
    color="CoinName",
).interactive()

#### Table of Tradable Cryptocurrencies

In [26]:
# Table with tradable cryptos
display(crypto_df_with_class[["CoinName", "Algorithm", "ProofType", "TotalCoinsMined", "CirculatingSupply", "class"]])

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply,class
42,42 Coin,Scrypt,PoW/PoS,41.999952,41.999952,0
NSR,NuShares,PoS,PoS,6172691537.8311,0,0
TRI,Triangles Coin,X13,PoW/PoS,191620.842403,0,0
CMTC,CometCoin,Scrypt,PoW,872830,0,1
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000,0,0
...,...,...,...,...,...,...
NMC,Namecoin,SHA-256,PoW,17983800,0,1
NANO,Nano,Blake2b,PoW,133248290,0,1
NAV,NavCoin,X13,PoW/PoS,72827352.565308,0,0
NVC,NovaCoin,Scrypt,PoW/PoS,3582622.714205,0,0


In [27]:
# Print the total number of tradable cryptocurrencies
total_rows = crypto_df_with_class.count()
total_rows["CoinName"]

133