# Clustering Crypto

In [31]:
# Initial imports
import altair as alt
from vega_datasets import data

import requests
import pandas as pd
#import matplotlib.pyplot as plt
#import hvplot.pandas
#import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import json

In [32]:
#%pip install vega_datasets

### Fetching Cryptocurrency Data

In [33]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
crypto_data=requests.get(url).json()


In [34]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
crypto_url_df=pd.DataFrame(crypto_data["Data"]).T
crypto_url_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,4.461121,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


### Data Preprocessing

In [35]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'

crypto_df = crypto_url_df[['CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'MaxSupply']]

In [36]:
# Keep only cryptocurrencies that are trading
is_Trding = crypto_df['IsTrading']==True
crypto_trading = crypto_df[is_Trding]
crypto_trading.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
300,300 token,,True,,300.0,300
365,365Coin,X11,True,PoW/PoS,0.0,-1
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,True,PoW,0.0,0


In [37]:
# Keep only cryptocurrencies with a working algorithm
[i for i in crypto_df.Algorithm.unique() if i.startswith("N")]

['N/A', 'NIST5', 'NeoScrypt', 'NEP-5', 'NRC20 Token']

In [38]:
crypto_df = crypto_df[crypto_df['Algorithm'] != 'N/A']
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
365,365Coin,X11,True,PoW/PoS,0.0,-1
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,True,PoW,0.0,0
808,808,SHA-256,True,PoW/PoS,0.0,0


In [39]:
# Remove the 'IsTrading' column
crypto_df=crypto_df.drop(columns="IsTrading")
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
365,365Coin,X11,PoW/PoS,0.0,-1
404,404Coin,Scrypt,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,PoW,0.0,0
808,808,SHA-256,PoW/PoS,0.0,0


In [40]:
# Remove rows with at least 1 null value
crypto_df.dropna(inplace=True,how="any",axis=0)


In [41]:
crypto_df.isnull().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
MaxSupply          0
dtype: int64

In [42]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df['TotalCoinsMined']!=0]
crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 342 entries, 42 to KMD
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CoinName         342 non-null    object
 1   Algorithm        342 non-null    object
 2   ProofType        342 non-null    object
 3   TotalCoinsMined  342 non-null    object
 4   MaxSupply        342 non-null    object
dtypes: object(5)
memory usage: 16.0+ KB


In [43]:
# Drop rows where there are 'N/A' text values
crypto_df[crypto_df.iloc[:] != "N/A"]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
NSR,NuShares,PoS,PoS,6173465937.8311,0
TRI,Triangles Coin,X13,PoW/PoS,191620.842403,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,-1


In [44]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df 
coin_name=pd.DataFrame(crypto_df["CoinName"], index = crypto_df.index)
coin_name.head()

Unnamed: 0,CoinName
42,42 Coin
NSR,NuShares
TRI,Triangles Coin
CMTC,CometCoin
CHAT,OpenChat


In [45]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df=crypto_df.drop(columns="CoinName")


In [46]:
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,PoW/PoS,41.999952,42
NSR,PoS,PoS,6173465937.8311,0
TRI,X13,PoW/PoS,191620.842403,0
CMTC,Scrypt,PoW,872830.0,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1


In [47]:
crypto_df= crypto_df.rename(columns={'MaxSupply': 'TotalCoinSupply'})

In [48]:
# Create dummy variables for text features
features_df=crypto_df.copy()
X = pd.get_dummies(features_df)
X.head()

  uniques = Index(uniques)


Unnamed: 0,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-2 Token,Algorithm_BEP-20 Token,Algorithm_BEP2 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,Algorithm_C31,...,TotalCoinSupply_16555000000.0,TotalCoinSupply_21000000000.0,TotalCoinSupply_40000000000.0,TotalCoinSupply_45000000000.0,TotalCoinSupply_70000000000.0,TotalCoinSupply_86712634466.0,TotalCoinSupply_100000000000.0,TotalCoinSupply_184470000000.0,TotalCoinSupply_1000016730264.435,TotalCoinSupply_21000000000000.0
42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Standardize data
scaler=StandardScaler().fit(X)
X_scaled=scaler.transform(X)
X_scaled


array([[-0.05415304, -0.09407209, -0.05415304, ..., -0.05415304,
        -0.05415304, -0.05415304],
       [-0.05415304, -0.09407209, -0.05415304, ..., -0.05415304,
        -0.05415304, -0.05415304],
       [-0.05415304, -0.09407209, -0.05415304, ..., -0.05415304,
        -0.05415304, -0.05415304],
       ...,
       [-0.05415304, -0.09407209, -0.05415304, ..., -0.05415304,
        -0.05415304, -0.05415304],
       [-0.05415304, -0.09407209, -0.05415304, ..., -0.05415304,
        -0.05415304, -0.05415304],
       [-0.05415304, -0.09407209, -0.05415304, ..., -0.05415304,
        -0.05415304, -0.05415304]])

### Reducing Dimensions Using PCA

In [50]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)
print(X_pca)

[[ 3.07234048 -0.37473493 -0.17823064]
 [ 2.9804165  -0.30107037  0.0136334 ]
 [ 2.97278749 -0.33532611 -0.12631186]
 ...
 [ 0.83650696 -0.1472489   0.15265705]
 [ 0.26944357 -0.05705069  0.09719592]
 [ 4.66013096 -0.74323604 -1.09592027]]


In [51]:
# Create a DataFrame with the principal components data
pcs_df=pd.DataFrame(data=X_pca,columns=["PC 1", "PC 2","Pc 3"],index=crypto_df.index)
pcs_df.head()

Unnamed: 0,PC 1,PC 2,Pc 3
42,3.07234,-0.374735,-0.178231
NSR,2.980416,-0.30107,0.013633
TRI,2.972787,-0.335326,-0.126312
CMTC,3.545434,-0.485029,-0.420981
CHAT,0.835138,-0.190191,-0.086727


In [52]:
pca.explained_variance_ratio_

array([0.01160845, 0.00934717, 0.00859976])

### Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [53]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)



In [54]:
# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [55]:
import altair as alt
alt.Chart(df_elbow).mark_line().encode(
    x='k',
    y='inertia'
)

Running K-Means with `k=<4>`

In [56]:
# Initialize the K-Means model
model=KMeans(n_clusters=4,random_state=0)
# Fit the model
model.fit(pcs_df)
# Predict clusters
predictions=model.predict(pcs_df)
print(predictions)

[1 1 1 1 1 1 1 1 1 3 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1
 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 1 1
 1 1 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0
 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 1 1 0
 1 1 1 1 1 1 1 0 1]


In [57]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features
pcs_df["class"]=model.labels_
pcs_df.head()

Unnamed: 0,PC 1,PC 2,Pc 3,class
42,3.07234,-0.374735,-0.178231,1
NSR,2.980416,-0.30107,0.013633,1
TRI,2.972787,-0.335326,-0.126312,1
CMTC,3.545434,-0.485029,-0.420981,1
CHAT,0.835138,-0.190191,-0.086727,1


In [58]:
clustered_df=pd.concat([crypto_df,coin_name,pcs_df],axis="columns", join="inner")
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,CoinName,PC 1,PC 2,Pc 3,class
42,Scrypt,PoW/PoS,41.999952,42,42 Coin,3.07234,-0.374735,-0.178231,1
NSR,PoS,PoS,6173465937.8311,0,NuShares,2.980416,-0.30107,0.013633,1
TRI,X13,PoW/PoS,191620.842403,0,Triangles Coin,2.972787,-0.335326,-0.126312,1
CMTC,Scrypt,PoW,872830.0,0,CometCoin,3.545434,-0.485029,-0.420981,1
CHAT,Scrypt,PoW/PoS,1000000000.0,-1,OpenChat,0.835138,-0.190191,-0.086727,1


### Visualizing Results

#### 3D-Scatter with Clusters

In [59]:
import altair as alt
from vega_datasets import data

source = clustered_df

alt.Chart(source).mark_circle(size=60).encode(
     x="PC 1",
    y="PC 2",
    color='class',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

#### Table of Tradable Cryptocurrencies

In [60]:
# Table with tradable cryptos
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        clustered_df[
            [
                "CoinName",
                "Algorithm",
                "ProofType",
                "TotalCoinSupply",
                "TotalCoinsMined",
                "class",
            ]
        ]
    )

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinSupply,TotalCoinsMined,class
42,42 Coin,Scrypt,PoW/PoS,42.0,41.999952,1
NSR,NuShares,PoS,PoS,0.0,6173465937.8311,1
TRI,Triangles Coin,X13,PoW/PoS,0.0,191620.842403,1
CMTC,CometCoin,Scrypt,PoW,0.0,872830.0,1
CHAT,OpenChat,Scrypt,PoW/PoS,-1.0,1000000000.0,1
QRL,Quantum Resistant Ledger,RandomX,PoW,105000000.0,76026801.631209,1
PURA,Pura,X11,PoW,-1.0,188358976.839698,1
BTCP,Bitcoin Private,Equihash,PoW,22873588.0,3818878.387802,1
ADK,Aidos Kuneen,IMesh,PoW,0.0,25000000.0,1
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,70000000000.0,62319462900.0,3


In [61]:
# Print the total number of tradable cryptocurrencies
print("The total number of tradable cryptocurrencies:"+ str(clustered_df["CoinName"].nunique()))

The total number of tradable cryptocurrencies:341


#### Scatter Plot with Tradable Cryptocurrencies

In [62]:
# Scale data to create the scatter plot
scaled=clustered_df[["TotalCoinsMined","TotalCoinSupply"]].copy()
scaler_1=MinMaxScaler()
cluster_scaled=pd.DataFrame(scaler_1.fit_transform(scaled), 
                            columns=["TotalCoinsMined","TotalCoinSupply"])


In [63]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
source =cluster_scaled

alt.Chart(source).mark_circle(size=60).encode(
     x="TotalCoinsMined",
    y="TotalCoinSupply",
   
).interactive()


In [64]:
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        clustered_df[
            [
                "CoinName",
                "Algorithm",
                "ProofType",
                "TotalCoinSupply",
                "TotalCoinsMined",
                "class",
            ]
        ]
    )

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinSupply,TotalCoinsMined,class
42,42 Coin,Scrypt,PoW/PoS,42.0,41.999952,1
NSR,NuShares,PoS,PoS,0.0,6173465937.8311,1
TRI,Triangles Coin,X13,PoW/PoS,0.0,191620.842403,1
CMTC,CometCoin,Scrypt,PoW,0.0,872830.0,1
CHAT,OpenChat,Scrypt,PoW/PoS,-1.0,1000000000.0,1
QRL,Quantum Resistant Ledger,RandomX,PoW,105000000.0,76026801.631209,1
PURA,Pura,X11,PoW,-1.0,188358976.839698,1
BTCP,Bitcoin Private,Equihash,PoW,22873588.0,3818878.387802,1
ADK,Aidos Kuneen,IMesh,PoW,0.0,25000000.0,1
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,70000000000.0,62319462900.0,3
