# Clustering Crypto

In [4]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from urllib.request import Request, urlopen
import json

### Fetching Cryptocurrency Data

In [5]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"


In [6]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

data = urlopen(Request(url)).read()
json_data = json.loads(data)
df_full = pd.DataFrame(json_data["Data"])
df_full.head()

Unnamed: 0,42,300,365,404,433,611,808,888,1337,2015,...,DHT,XAYA,CHI,CEL,ROT,YAMV2,YAMV1,YAM,SHROOM,JACS
Algorithm,Scrypt,,X11,Scrypt,,SHA-256,SHA-256,,X13,X11,...,,NeoScrypt,,,,,,,,
BlockNumber,197923,0.0,,50759,10821438.0,,0,0.0,2761747,,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BlockReward,0,0.0,,15.6382,2.0,,0,0.0,83.0292,,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BlockTime,0,0.0,,60,0.0,,0,88.0,60,,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BuiltOn,,7605.0,,,7605.0,,,,,,...,7605.0,,7605.0,7605.0,7605.0,7605.0,7605.0,7605.0,7605.0,7605.0


In [8]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
df_transpose = df_full.T
df_transpose.head()

Unnamed: 0,Algorithm,BlockNumber,BlockReward,BlockTime,BuiltOn,CoinName,ContentCreatedOn,DecimalPlaces,FullName,FullyPremined,...,Rating,SmartContractAddress,SortOrder,Sponsored,Symbol,Taxonomy,TotalCoinSupply,TotalCoinsFreeFloat,TotalCoinsMined,Url
42,Scrypt,197923.0,0.0,0.0,,42 Coin,1427211129,0,42 Coin (42),0,...,"{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",,34,False,42,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...",42,,42.0,/coins/42/overview
300,,0.0,0.0,0.0,7605.0,300 token,1517935016,18,300 token (300),0,...,"{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",0xaec98a708810414878c3bcdf46aad31ded4a4557,2212,False,300,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...",300,,300.0,/coins/300/overview
365,X11,,,,,365Coin,1480032918,0,365Coin (365),0,...,"{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",,916,False,365,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...",2300000000,,,/coins/365/overview
404,Scrypt,50759.0,15.6382,60.0,,404Coin,1466100361,0,404Coin (404),0,...,"{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",,602,False,404,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...",532000000,,1379950.0,/coins/404/overview
433,,10821438.0,2.0,0.0,7605.0,433 Token,1541597321,18,433 Token (433),0,...,"{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",0x738505a5f31bf72e0b70298bca81150eb1b7c751,3505,False,433,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...",1000000000,,112518000.0,/coins/433/overview


### Data Preprocessing

In [9]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'

df = df_transpose[["CoinName", "Algorithm", "IsTrading", "ProofType", "TotalCoinsMined", "TotalCoinSupply"]]
df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,42.0,42
300,300 token,,True,,300.0,300
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1379950.0,532000000
433,433 Token,,False,,112518000.0,1000000000


In [10]:
# Keep only cryptocurrencies that are trading

df = df.loc[df.IsTrading != False]
df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,42.0,42
300,300 token,,True,,300.0,300
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1379950.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000


In [11]:
# Keep only cryptocurrencies with a working algorithm

df = df.loc[df.Algorithm != "N/A"]
df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,42.0,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1379950.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [12]:
# Remove the 'IsTrading' column

df = df.drop(columns = "IsTrading")
df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,42.0,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1379950.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [13]:
# Remove rows with at least 1 null value

df = df.dropna()
df.isnull().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [14]:
# Remove rows with cryptocurrencies having no coins mined

df = df.loc[df.TotalCoinsMined != 0]
df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,42.0,42
404,404Coin,Scrypt,PoW/PoS,1379950.0,532000000
1337,EliteCoin,X13,PoW/PoS,29482600000.0,314159265359
BTCD,BitcoinDark,SHA-256,PoW/PoS,1288862.0,22000000
XPY,PayCoin,SHA-256,PoS,11995300.0,12500000


In [15]:
# Drop rows where there are 'N/A' text values

for i in df.columns:
    print (df[df[i] == "N/A"])

# Dropping rows where there are "N/A" text values in ProofType
df = df.loc[df["ProofType"].str.contains("N/A") == False]

# Dropping rows where there are "N/A" text values in TotalCoinsSupply
df = df.loc[df["TotalCoinSupply"].str.contains("N/A") == False]

Empty DataFrame
Columns: [CoinName, Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply]
Index: []
Empty DataFrame
Columns: [CoinName, Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply]
Index: []
                          CoinName      Algorithm ProofType TotalCoinsMined  \
XPD                    PetroDollar       SHA-256D       N/A        63993275   
CREVA                   Creva Coin            X11       N/A        36390750   
UNC                         UnCoin            X11       N/A         8388608   
BCY                    BitCrystals   Counterparty       N/A       100000000   
SCOT                      Scotcoin   Counterparty       N/A      1000000000   
PASC                   Pascal Coin         Pascal       N/A        32607325   
NETKO                        Netko          Blake       N/A        10969740   
WGR                         Wagerr            PoS       N/A       200000000   
GRWI         Growers International           DPoS       N/A     1.22049e+06   
AURS  

In [16]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df 

crypto_name = df.CoinName
crypto_name.head()

42          42 Coin
404         404Coin
1337      EliteCoin
BTCD    BitcoinDark
XPY         PayCoin
Name: CoinName, dtype: object

In [17]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm

cluster_df = df.drop(columns = "CoinName")
print(cluster_df.shape)
cluster_df.head()

(539, 4)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,42.0,42
404,Scrypt,PoW/PoS,1379950.0,532000000
1337,X13,PoW/PoS,29482600000.0,314159265359
BTCD,SHA-256,PoW/PoS,1288862.0,22000000
XPY,SHA-256,PoS,11995300.0,12500000


In [18]:
# Create dummy variables for text features

cluster_df_dummy = pd.get_dummies(cluster_df[["Algorithm", "ProofType"]])
# cluster_df_dummy = pd.get_dummies(cluster_df["ProofType"])
cluster_df_dummy.head()

Unnamed: 0,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_C31,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTCD,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XPY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Standardize data

cluster_df_scaled = StandardScaler().fit_transform(cluster_df_dummy)
cluster_df_scaled[0:5]

array([[-0.04311306, -0.04311306, -0.04311306, -0.06102782, -0.04311306,
        -0.0748132 , -0.04311306, -0.06102782, -0.06102782, -0.04311306,
        -0.04311306, -0.04311306, -0.18046343, -0.04311306, -0.04311306,
        -0.08646754, -0.04311306, -0.04311306, -0.13031167, -0.08646754,
        -0.04311306, -0.04311306, -0.04311306, -0.04311306, -0.15720951,
        -0.04311306, -0.04311306, -0.13031167, -0.04311306, -0.04311306,
        -0.08646754, -0.04311306, -0.04311306, -0.04311306, -0.04311306,
        -0.04311306, -0.04311306, -0.04311306, -0.04311306, -0.0748132 ,
        -0.09676412, -0.06102782, -0.04311306, -0.13749033, -0.13031167,
        -0.13031167, -0.04311306, -0.04311306, -0.04311306, -0.0748132 ,
        -0.18587346, -0.04311306, -0.04311306, -0.04311306, -0.04311306,
        -0.0748132 , -0.19115036, -0.04311306, -0.31622777, -0.04311306,
        -0.08646754, -0.0748132 , -0.06102782, -0.04311306,  1.45451261,
        -0.04311306, -0.04311306, -0.06102782, -0.0

### Reducing Dimensions Using PCA

In [20]:
# Use PCA to reduce dimensions to 3 principal components

pca = PCA(n_components=3)

cluster_pca = pca.fit_transform(cluster_df_scaled)

In [21]:
# Create a DataFrame with the principal components data

df_cluster_pca = pd.DataFrame(data = cluster_pca, 
                              columns=["PC 1", "PC 2", "PC 3"], index = cluster_df.index
                              )
                    
print(cluster_pca.shape)
df_cluster_pca.head()

(539, 3)


Unnamed: 0,PC 1,PC 2,PC 3
42,0.862548,-0.851033,0.02252
404,0.862548,-0.851033,0.02252
1337,1.847576,-0.832858,0.119968
BTCD,0.885688,-0.463606,0.071518
XPY,0.568357,0.017943,0.049537


### Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [22]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_cluster_pca)
    inertia.append(km.inertia_)


# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=4`

In [23]:
# Initialize the K-Means model

model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(df_cluster_pca)

# Predict clusters
predictions = model.predict(df_cluster_pca)

# Add the predicted class columns
df_cluster_pca["class"] = model.labels_
df_cluster_pca.head()
# df_cluster_pca.shape

# Create a new DataFrame including predicted clusters and cryptocurrencies features
df_clustered = pd.DataFrame({
    "Algorithm" : cluster_df.Algorithm,
    "ProofType": cluster_df.ProofType,
    "TotalCoinsMined": cluster_df.TotalCoinsMined,
    "TotalCoinSupply": cluster_df.TotalCoinSupply,
    "principal component 1": df_cluster_pca["PC 1"],
    "principal component 2": df_cluster_pca["PC 2"], 
    "principal component 3": df_cluster_pca["PC 3"],
    "CoinName": crypto_name,
    "Class": df_cluster_pca["class"]
    }, index = cluster_df.index
    )

df_clustered.head(20)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,principal component 1,principal component 2,principal component 3,CoinName,Class
42,Scrypt,PoW/PoS,42.0,42,0.862548,-0.851033,0.02252,42 Coin,0
404,Scrypt,PoW/PoS,1379950.0,532000000,0.862548,-0.851033,0.02252,404Coin,0
1337,X13,PoW/PoS,29482600000.0,314159265359,1.847576,-0.832858,0.119968,EliteCoin,0
BTCD,SHA-256,PoW/PoS,1288862.0,22000000,0.885688,-0.463606,0.071518,BitcoinDark,0
XPY,SHA-256,PoS,11995300.0,12500000,0.568357,0.017943,0.049537,PayCoin,0
PRC,Scrypt,PoW,6268245.0,21000000,-1.270415,-0.050764,-0.040578,ProsperCoin,2
KOBO,X15,PoW/PoS,25600400.0,350000000,1.90457,-0.907186,0.099671,KoboCoin,0
ARG,Scrypt,PoW,13833000.0,64000000,-1.270415,-0.050764,-0.040578,Argentum,2
BLU,Scrypt,PoW/PoS,647272555.0,0,0.862548,-0.851033,0.02252,BlueCoin,0
XMY,Multiple,PoW,1750253250.0,2000000000,-1.700987,0.498611,-0.053748,MyriadCoin,2


### Visualizing Results

#### 3D-Scatter with Clusters

In [24]:
# Create a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    df_cluster_pca,
    x="principal component 1",
    y="principal component 2",
    z="principal component 3",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['PC 1', 'PC 2', 'PC 3', 'class'] but received: principal component 1

#### Table of Tradable Cryptocurrencies

In [25]:
# Table with tradable cryptos

columns = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class']
df_clustered.hvplot.table(columns,colormap="viridis")

In [26]:
# Print the total number of tradable cryptocurrencies
print("The number of tradable cryptocurrencies is") 
df_clustered["CoinName"].count()

The number of tradable cryptocurrencies is


539

#### Scatter Plot with Tradable Cryptocurrencies

In [27]:
# Scale data to create the scatter plot
df_clustered['TotalCoinsMined'] = df_clustered['TotalCoinsMined'].astype(float) / 100000000

df_clustered['TotalCoinSupply'] = df_clustered['TotalCoinSupply'].astype(float) / 100000000


In [28]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"

df_clustered.hvplot(
    kind="scatter", 
    x="TotalCoinsMined", 
    y="TotalCoinSupply", 
    c='Class',
    rot= 90, 
    colormap="inferno", 
    hover_cols=['CoinName']
)