# Clustering Crypto

In [1]:
!pip install -U altair



In [2]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import altair as alt

### Fetching Cryptocurrency Data

In [3]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response = requests.get(url).json()

In [4]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
crypto_df = pd.DataFrame(response["Data"]).T
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,0.643889,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


### Data Preprocessing

In [5]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','CirculatingSupply'
crypto_df = crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','CirculatingSupply']]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,True,PoW/PoS,42.0,42.0
300,300 token,,True,,300.0,0.0
365,365Coin,X11,True,PoW/PoS,0.0,0.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,0.0
433,433 Token,,False,,,


In [6]:
crypto_df.shape

(8538, 6)

In [7]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df["IsTrading"] == True]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,True,PoW/PoS,42,42
300,300 token,,True,,300,0
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
611,SixEleven,SHA-256,True,PoW,0,0


In [8]:
crypto_df.shape

(7049, 6)

In [9]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df["Algorithm"] != "N/A"]
crypto_df.shape

(1645, 6)

In [10]:
# Remove the "IsTrading" column
crypto_df = crypto_df.drop(columns = ["IsTrading"])
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,PoW/PoS,42.0,42.0
365,365Coin,X11,PoW/PoS,0.0,0.0
404,404Coin,Scrypt,PoW/PoS,0.0,0.0
611,SixEleven,SHA-256,PoW,0.0,0.0
808,808,SHA-256,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,PoW/PoS,0.0,0.0
2015,2015 coin,X11,PoW/PoS,0.0,0.0
XBS,Bitstake,X11,PoW/PoS,,
XPY,PayCoin,SHA-256,PoS,,
PRC,ProsperCoin,Scrypt,PoW,,


In [11]:
# Remove rows with at least 1 null value
crypto_df.isnull().sum()
crypto_df = crypto_df.dropna()
crypto_df.shape

(711, 5)

In [12]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] > 0]
crypto_df.shape

(313, 5)

In [13]:
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,PoW/PoS,42.0,42
NSR,NuShares,PoS,PoS,6179180000.0,0
TRI,Triangles Coin,X13,PoW/PoS,199980.0,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,0
PURA,Pura,X11,PoW,188359000.0,0
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,62319462900.0,0
FOIN,Foin,SHA-256,,92631000.0,0
NVL,Nevula,NEP-5,,40000000000.0,0


In [14]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[crypto_df.iloc[:] != "N/A"].dropna()
crypto_df.shape

(140, 5)

In [15]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = crypto_df.index
coins_name 

Index(['42', 'NSR', 'TRI', 'CMTC', 'CHAT', 'PURA', 'ADK', 'DAPS', 'VEIL',
       'RVC',
       ...
       'SMART', 'SC', 'SHIFT', 'SLS', 'SAFEX', 'ETH', 'ETC', 'RDD', 'QTUM',
       'PST'],
      dtype='object', length=140)

In [16]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop("CoinName", axis=1)
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
42,Scrypt,PoW/PoS,42.0,42
NSR,PoS,PoS,6179180000.0,0
TRI,X13,PoW/PoS,199980.0,0
CMTC,Scrypt,PoW,872830.0,0
CHAT,Scrypt,PoW/PoS,1000000000.0,0


In [17]:
# Create dummy variables for text features
X = pd.get_dummies(data = crypto_df, columns = ["Algorithm", "ProofType"])
X.head()

Unnamed: 0,TotalCoinsMined,CirculatingSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-20 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,Algorithm_C31,...,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_ProgPoW/PoS,ProofType_Proof of Authority,ProofType_Proof-of-Work,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
42,42.0,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,6179180000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,199980.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,872830.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,1000000000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Standardize data
X = StandardScaler().fit_transform(X)

### Reducing Dimensions Using PCA

In [19]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(X)

In [20]:
# Create a DataFrame with the principal components data
pcs_df = pd.DataFrame(
    crypto_pca,
    columns = ["PC 1", "PC 2", "PC 3"],
    index = coins_name
)
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.229946,1.084834,-1.437891
NSR,-0.124272,1.417266,-0.284619
TRI,-0.174194,1.86917,-1.747092
CMTC,-0.367994,-0.916143,-0.371211
CHAT,-0.22994,1.084833,-1.437891
PURA,-0.319884,-0.447762,-0.345001
ADK,-0.342485,-1.276736,0.328657
DAPS,-0.116708,2.094297,6.145095
VEIL,-0.176564,1.835805,-1.846389
RVC,-0.378832,-1.741345,0.267136


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [21]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(pcs_df)
    inertia.append(k_model.inertia_)

# Create the Elbow Curve using altair
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Create Elbow plot
alt.Chart(df_elbow).mark_line().encode(
    x="k", 
    y="inertia"
)

Running K-Means with `k=10`

In [22]:
# Initialize the K-Means model
model = KMeans(n_clusters = 10, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
k_10 = model.predict(pcs_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, pcs_df], axis=1)
clustered_df["Class"] = k_10
clustered_df["CoinName"] = coins_name
clustered_df.head(20)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply,PC 1,PC 2,PC 3,Class,CoinName
42,Scrypt,PoW/PoS,42.0,42,-0.229946,1.084834,-1.437891,0,42
NSR,PoS,PoS,6179180000.0,0,-0.124272,1.417266,-0.284619,5,NSR
TRI,X13,PoW/PoS,199980.0,0,-0.174194,1.86917,-1.747092,8,TRI
CMTC,Scrypt,PoW,872830.0,0,-0.367994,-0.916143,-0.371211,7,CMTC
CHAT,Scrypt,PoW/PoS,1000000000.0,0,-0.22994,1.084833,-1.437891,0,CHAT
PURA,X11,PoW,188359000.0,0,-0.319884,-0.447762,-0.345001,7,PURA
ADK,IMesh,PoW,25000000.0,0,-0.342485,-1.276736,0.328657,1,ADK
DAPS,Dagger,PoW/PoS/PoA,62319462900.0,0,-0.116708,2.094297,6.145095,3,DAPS
VEIL,X16RT,PoW/PoS,119516000.0,0,-0.176564,1.835805,-1.846389,8,VEIL
RVC,X16R,PoW,10501500000.0,0,-0.378832,-1.741345,0.267136,1,RVC


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [23]:
# Scale data to create the scatter plot
scaler = MinMaxScaler()
x_minmax = scaler.fit_transform(X)

In [24]:
# Plot the scatter with x="PC 1" and y="PC 2"
# Plot the clusters
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x="PC 1",
    y="PC 2",
    color='Class',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'CirculatingSupply']
).interactive()

#### Table of Tradable Cryptocurrencies

In [25]:
# Table with tradable cryptos
# clustered_df.altair.table(columns=["CoinName", "Algorithm", "TotalCoinsMined", "CirculatingSupply"], sortable=True, selectable=True)
display(clustered_df)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply,PC 1,PC 2,PC 3,Class,CoinName
42,Scrypt,PoW/PoS,42,42,-0.229946,1.084834,-1.437891,0,42
NSR,PoS,PoS,6.17918e+09,0,-0.124272,1.417266,-0.284619,5,NSR
TRI,X13,PoW/PoS,199980,0,-0.174194,1.869170,-1.747092,8,TRI
CMTC,Scrypt,PoW,872830,0,-0.367994,-0.916143,-0.371211,7,CMTC
CHAT,Scrypt,PoW/PoS,1000000000,0,-0.229940,1.084833,-1.437891,0,CHAT
...,...,...,...,...,...,...,...,...,...
ETH,Ethash,PoW,1.21229e+08,1.21229e+08,-0.360040,-1.352094,0.110140,1,ETH
ETC,EtcHash,PoW,1.3404e+08,1.3404e+08,-0.377658,-1.731455,0.265241,1,ETC
RDD,Scrypt,PoW/PoS,3.10338e+10,0,-0.229746,1.084805,-1.437895,0,RDD
QTUM,POS 3.0,PoS,104211489,0,-0.103684,1.735199,-0.377010,5,QTUM


In [26]:
# Print the total number of tradable cryptocurrencies
print(f" Total Number of Tradable Coins: {coins_name.size}")

 Total Number of Tradable Coins: 140
