# Clustering Crypto

In [2]:
#!pip install -U altair

In [3]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import altair as alt

In [4]:
#alt.renderers.enable('mimetype')
#alt.renderers.enable('notebook')
alt.renderers.enable('default')

RendererRegistry.enable('default')

### Fetching Cryptocurrency Data

In [5]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response = requests.get(url).json()

In [6]:
# Create a DataFrame
df_cryptocompare= pd.DataFrame.from_dict(response['Data']).T

In [7]:
df_cryptocompare.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,AssetLaunchDate,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,BuiltOn,SmartContractAddress,DecimalPoints,Difficulty
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,0000-00-00,0.0,0.0,0.0,0.0,,,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,2017-07-01,300.0,0.0,0.0,0.0,token,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0,
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,0000-00-00,0.0,0.0,0.0,0.0,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,0000-00-00,0.0,0.0,0.0,0.0,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


https://compassmining.io/education/when-will-all-bitcoins-be-mined/

### Data Preprocessing

In [8]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
keep_list = ['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','MaxSupply']
df_cryptocompare = df_cryptocompare[keep_list]

In [9]:
# Keep only cryptocurrencies that are trading
df_cryptocompare = df_cryptocompare[df_cryptocompare['IsTrading'] == True]
df_cryptocompare.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0,0
300,300 token,,True,,300,300
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
611,SixEleven,SHA-256,True,PoW,0,0


In [10]:
# Keep only cryptocurrencies with a working algorithm
df_cryptocompare = df_cryptocompare[~df_cryptocompare.Algorithm.str.match('N/A')]
df_cryptocompare.shape

(1495, 6)

In [11]:
df_cryptocompare.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0,0
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
611,SixEleven,SHA-256,True,PoW,0,0
808,808,SHA-256,True,PoW/PoS,0,0


In [12]:
# Remove the "IsTrading" column
df_cryptocompare.drop(["IsTrading"],axis=1,inplace=True)

In [13]:
df_cryptocompare.tail()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
ACT,Achain,DPoS,DPoS,1000000000.0,0
BTG,Bitcoin Gold,Equihash,PoW,18729704.860885,21000000
ICX,ICON Project,Loopchain,PoS,889168519.418303,-1
DGB,DigiByte,Multiple,PoW,14324667000.795696,21000000000
YOOSHI,YooShi,BEP-20 Token,,443435939231077.2,-1


In [14]:
# Remove rows with at least 1 null value
df_cryptocompare.dropna(axis="rows",inplace=True)
df_cryptocompare.shape

(285, 5)

In [15]:
df_cryptocompare.tail()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
ACT,Achain,DPoS,DPoS,1000000000.0,0
BTG,Bitcoin Gold,Equihash,PoW,18729704.860885,21000000
ICX,ICON Project,Loopchain,PoS,889168519.418303,-1
DGB,DigiByte,Multiple,PoW,14324667000.795696,21000000000
YOOSHI,YooShi,BEP-20 Token,,443435939231077.2,-1


In [16]:
# Remove rows with cryptocurrencies having no coins mined
df_cryptocompare = df_cryptocompare[df_cryptocompare.TotalCoinsMined > 0 ]
df_cryptocompare.shape

(144, 5)

In [17]:
df_cryptocompare.tail()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
ACT,Achain,DPoS,DPoS,1000000000.0,0
BTG,Bitcoin Gold,Equihash,PoW,18729704.860885,21000000
ICX,ICON Project,Loopchain,PoS,889168519.418303,-1
DGB,DigiByte,Multiple,PoW,14324667000.795696,21000000000
YOOSHI,YooShi,BEP-20 Token,,443435939231077.2,-1


In [18]:
# Drop rows where there are 'N/A' text values
df_cryptocompare = df_cryptocompare[df_cryptocompare.iloc[:] != 'N/A']
df_cryptocompare.shape

(144, 5)

In [19]:
df_cryptocompare.tail()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
ACT,Achain,DPoS,DPoS,1000000000.0,0
BTG,Bitcoin Gold,Equihash,PoW,18729704.860885,21000000
ICX,ICON Project,Loopchain,PoS,889168519.418303,-1
DGB,DigiByte,Multiple,PoW,14324667000.795696,21000000000
YOOSHI,YooShi,BEP-20 Token,,443435939231077.2,-1


In [20]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
df_coinsname= df_cryptocompare["CoinName"]
df_coinsname.head()

NVC           NovaCoin
XCP       CounterParty
NSR           NuShares
MONA          MonaCoin
TRI     Triangles Coin
Name: CoinName, dtype: object

In [21]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
df_cryptocompare.drop(columns="CoinName", axis=1, inplace=True)
df_cryptocompare.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
NVC,Scrypt,PoW/PoS,3254038.07424,-1
XCP,SHA-256,PoW,2615025.583979,-1
NSR,PoS,PoS,6158524533.0671,0
MONA,Scrypt,PoW,81732287.471579,-1
TRI,X13,PoW/PoS,178912.59513,0


In [22]:
# Create dummy variables for text features
X= pd.get_dummies(df_cryptocompare,columns=["Algorithm","ProofType"])
X.head()

Unnamed: 0,TotalCoinsMined,MaxSupply,Algorithm_BEP-2,Algorithm_BEP-20 Token,Algorithm_BEP2 Token,Algorithm_BEP20 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,...,ProofType_PoW/PoS,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_Proof of Authority,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW,ProofType_mPoW
NVC,3254038.07424,-1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XCP,2615025.583979,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,6158524533.0671,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MONA,81732287.471579,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,178912.59513,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [23]:
# Standardize data
X = StandardScaler().fit_transform(X)
X[:3]

array([[-0.08431186, -0.12921792, -0.14586499, -0.0836242 , -0.0836242 ,
        -0.0836242 , -0.11867817, -0.0836242 , -0.0836242 , -0.14586499,
        -0.11867817, -0.11867817, -0.0836242 , -0.0836242 , -0.27317918,
        -0.11867817, -0.0836242 , -0.0836242 , -0.0836242 , -0.28758784,
        -0.0836242 , -0.24253563, -0.0836242 , -0.0836242 , -0.0836242 ,
        -0.0836242 , -0.11867817, -0.0836242 , -0.0836242 , -0.0836242 ,
        -0.0836242 , -0.0836242 , -0.0836242 , -0.16903085, -0.11867817,
        -0.0836242 , -0.0836242 , -0.0836242 , -0.11867817, -0.18966081,
        -0.0836242 , -0.14586499, -0.11867817, -0.31501848, -0.11867817,
        -0.0836242 , -0.0836242 , -0.0836242 ,  2.56494588, -0.0836242 ,
        -0.0836242 , -0.0836242 , -0.0836242 , -0.0836242 , -0.16903085,
        -0.0836242 , -0.20851441, -0.0836242 , -0.0836242 , -0.0836242 ,
        -0.0836242 , -0.0836242 , -0.24253563, -0.0836242 , -0.0836242 ,
        -0.11867817, -0.11867817, -0.0836242 , -0.2

### Reducing Dimensions Using PCA

In [24]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
principal_components = pca.fit_transform(X)

In [25]:
pca.components_.shape

(3, 82)

In [26]:
principal_components[:3]

array([[-0.73833363, -0.7514798 ,  0.06511541],
       [ 1.82966475,  0.12134861, -0.05015627],
       [-1.3522361 , -0.36859789,  0.13292491]])

Once you have reduced the data dimensions, create a DataFrame named pcs_df using as columns names "PC 1", "PC 2" and "PC 3"; use the crypto_df.index as the index for this new DataFrame.

In [27]:
# Create a DataFrame with the principal components data
df_pcs = pd.DataFrame( principal_components,
columns= [f"PC {i}" for i in range(1, pca.components_.shape[0] + 1)], index=df_cryptocompare.index)
print(df_pcs.shape)
df_pcs.head()

(144, 3)


Unnamed: 0,PC 1,PC 2,PC 3
NVC,-0.738334,-0.75148,0.065115
XCP,1.829665,0.121349,-0.050156
NSR,-1.352236,-0.368598,0.132925
MONA,1.057846,-0.250667,-0.03341
TRI,-1.47111,-0.874997,0.112899


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [28]:
inertia = []
# creating 10 possible k values 
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_pcs)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_dict = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_dict)
alt.Chart(df_elbow).mark_line().encode(x="k", y="inertia")


Running K-Means with `k=<your best value for k here>`

In [29]:
# Initialize the K-Means model
kmodel = KMeans(n_clusters=4, random_state=0)
# Fit the model
kmodel.fit(df_pcs)
# Predict clusters
predictions = kmodel.predict(df_pcs)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
df_crypto_predict = pd.concat([df_cryptocompare, df_pcs], axis=1, sort=False)
# add coin name deleted before clustering 
df_crypto_predict["CoinName"] = df_coinsname.iloc[:]
df_crypto_predict["Class"] = kmodel.labels_
df_crypto_predict.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,PC 1,PC 2,PC 3,CoinName,Class
NVC,Scrypt,PoW/PoS,3254038.07424,-1,-0.738334,-0.75148,0.065115,NovaCoin,0
XCP,SHA-256,PoW,2615025.583979,-1,1.829665,0.121349,-0.050156,CounterParty,2
NSR,PoS,PoS,6158524533.0671,0,-1.352236,-0.368598,0.132925,NuShares,0
MONA,Scrypt,PoW,81732287.471579,-1,1.057846,-0.250667,-0.03341,MonaCoin,2
TRI,X13,PoW/PoS,178912.59513,0,-1.47111,-0.874997,0.112899,Triangles Coin,0


### Visualizing Results

#### Scatter with Clusters

In [30]:
# Create a Scatter with the PCA data and the clusters
alt.Chart(df_crypto_predict).mark_circle(size=60).encode(
    x="PC 1",
    y="PC 2",
    color=alt.Color(
        "Class",
        scale=alt.Scale(domain=[0, 1, 2, 3], range=["red", "green", "blue", "orange"]),
    ),
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "MaxSupply"],
).interactive()

In [31]:
# Print the total number of tradable cryptocurrencies
print(f"There total number of tradable cryptocurrencies are {df_crypto_predict.shape[0]}")

There total number of tradable cryptocurrencies are 144


#### Scatter Plot with Tradable Cryptocurrencies

In [32]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
scaled = mm_scaler.fit_transform(
    df_crypto_predict[["MaxSupply", "TotalCoinsMined"]]
)
scaled_df = pd.DataFrame(
    scaled, columns=["MaxSupply", "TotalCoinsMined"], index=df_crypto_predict.index
)
scaled_df["CoinName"] = df_crypto_predict["CoinName"]
scaled_df["Class"] = df_crypto_predict["Class"]
scaled_df.head()

Unnamed: 0,MaxSupply,TotalCoinsMined,CoinName,Class
NVC,0.0,7.334687e-09,NovaCoin,0
XCP,0.0,5.893639e-09,CounterParty,2
NSR,9.999833e-13,1.388819e-05,NuShares,0
MONA,0.0,1.843123e-07,MonaCoin,2
TRI,9.999833e-13,3.999185e-10,Triangles Coin,0


In [33]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
#set up styling
col_pal =[ '#203B57','#2387EF','#98EF23','#8914EE','#F20535']
color= ['red', 'steelblue', 'chartreuse', '#F4D03F', '#D35400', '#7D3C98']
#scale for the axis
scale= (scaled_df.Class.min() - .1,scaled_df.Class.max() +.1)

alt.Chart(scaled_df).mark_circle(size=100).encode(
    alt.X('TotalCoinsMined',
    scale=alt.Scale(domain=(scaled_df.TotalCoinsMined.min() - .1,scaled_df.TotalCoinsMined.max() +.1))
    ),
    alt.Y('MaxSupply',scale=alt.Scale(domain=((scaled_df.MaxSupply.min() - .1,scaled_df.MaxSupply.max() + .1)))
    ),
    color=alt.Color(
        "Class",
       
        scale=alt.Scale(domain=(scaled_df.Class.min(),scaled_df.Class.max()),range=col_pal),
    ),
    tooltip=["CoinName", "TotalCoinsMined", "MaxSupply"],
).interactive()

#### Table current tradable cryptocurrencies using the display() command.

In [34]:
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        df_crypto_predict[[
            "CoinName","Algorithm","ProofType", "MaxSupply",          "TotalCoinsMined","Class",
            ]]
    )

Unnamed: 0,CoinName,Algorithm,ProofType,MaxSupply,TotalCoinsMined,Class
NVC,NovaCoin,Scrypt,PoW/PoS,-1.0,3254038.07424,0
XCP,CounterParty,SHA-256,PoW,-1.0,2615025.583979,2
NSR,NuShares,PoS,PoS,0.0,6158524533.0671,0
MONA,MonaCoin,Scrypt,PoW,-1.0,81732287.471579,2
TRI,Triangles Coin,X13,PoW/PoS,0.0,178912.59513,0
EMC,Emercoin,SHA-256,PoW/PoS,-1.0,47444129.697603,0
SAFEX,SafeExchangeCoin,Scrypt,PoC,-1.0,2147483647.0,0
CMTC,CometCoin,Scrypt,PoW,0.0,872830.0,2
XSN,Stakenet,X11,TPoS,-1.0,120256219.132966,0
CHAT,OpenChat,Scrypt,PoW/PoS,-1.0,1000000000.0,0
