# Clustering Crypto

In [9]:
# Initial imports
import altair as alt
from vega_datasets import data

import requests
import pandas as pd
import matplotlib.pyplot as plt
#import hvplot.pandas
#import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import json

### Fetching Cryptocurrency Data

In [10]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
crypto_data=requests.get(url).json()
# crypto_data

In [11]:
crypto_url_df=pd.DataFrame(crypto_data["Data"]).T
crypto_url_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,1.80026,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [12]:
crypto_url_df.columns

Index(['Id', 'Url', 'ImageUrl', 'ContentCreatedOn', 'Name', 'Symbol',
       'CoinName', 'FullName', 'Description', 'AssetTokenStatus', 'Algorithm',
       'ProofType', 'SortOrder', 'Sponsored', 'Taxonomy', 'Rating',
       'IsTrading', 'TotalCoinsMined', 'CirculatingSupply', 'BlockNumber',
       'NetHashesPerSecond', 'BlockReward', 'BlockTime', 'AssetLaunchDate',
       'AssetWhitepaperUrl', 'AssetWebsiteUrl', 'MaxSupply', 'MktCapPenalty',
       'IsUsedInDefi', 'IsUsedInNft', 'PlatformType', 'AlgorithmType',
       'Difficulty', 'BuiltOn', 'SmartContractAddress', 'DecimalPoints'],
      dtype='object')

In [13]:
crypto_url_df = crypto_url_df[['CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'MaxSupply']]

In [14]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.



In [15]:
# Alternatively, use the provided csv file:
#file_path = Path("Resources/crypto_data.csv")

In [16]:
# Create a DataFrame
crypto_dff=pd.read_csv("crypto_data.csv")
crypto_dff.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [17]:
crypto_dff.columns[1:].values

array(['CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'TotalCoinSupply'], dtype=object)

### Data Preprocessing

In [18]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df=crypto_dff.drop(columns="Unnamed: 0")


In [19]:
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [20]:
crypto_df = crypto_url_df.copy()

In [21]:
# Keep only cryptocurrencies that are trading
is_Trding = crypto_df['IsTrading']==True


In [22]:
crypto_trading = crypto_df[is_Trding]
crypto_trading.tail()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
LAMB,Lambda,,True,,6000000000.0,-1.0
LEMO,LemoChain,,True,,1600000000.0,-1.0
ANONCOIN,Anoncoin,,True,PoW,,
BZNT,Bezant,,True,,999999820.0,-1.0
BEZ,Bezop,,True,,89267250.0,-1.0


In [23]:
[i for i in crypto_df.Algorithm.unique() if i.startswith("N")]

['N/A', 'NIST5', 'NeoScrypt', 'NEP-5', 'NRC20 Token']

In [24]:
# Keep only cryptocurrencies with a working algorithm

crypto_df = crypto_df[crypto_df['Algorithm'] != 'N/A']
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,42,42
365,365Coin,X11,True,PoW/PoS,0,-1
404,404Coin,Scrypt,True,PoW/PoS,0,-1
611,SixEleven,SHA-256,True,PoW,0,0
808,808,SHA-256,True,PoW/PoS,0,0


In [25]:
# Remove the 'IsTrading' column
# crypto_df=crypto_df.drop(columns="IsTrading")
# crypto_df.head()

In [26]:
# Remove rows with at least 1 null value
crypto_df.dropna(inplace=True,how="any",axis=0)


In [27]:
crypto_df.isnull().sum()

CoinName           0
Algorithm          0
IsTrading          0
ProofType          0
TotalCoinsMined    0
MaxSupply          0
dtype: int64

In [28]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df['TotalCoinsMined']!=0]
crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 343 entries, 42 to KMD
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CoinName         343 non-null    object
 1   Algorithm        343 non-null    object
 2   IsTrading        343 non-null    object
 3   ProofType        343 non-null    object
 4   TotalCoinsMined  343 non-null    object
 5   MaxSupply        343 non-null    object
dtypes: object(6)
memory usage: 18.8+ KB


In [29]:
# Drop rows where there are 'N/A' text values
# crypto_df[crypto_df.iloc[:] != "N/A"]


In [30]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df 
coin_name=pd.DataFrame(crypto_df["CoinName"], index = crypto_df.index)
coin_name.head()

Unnamed: 0,CoinName
42,42 Coin
NSR,NuShares
TRI,Triangles Coin
CMTC,CometCoin
CHAT,OpenChat


In [31]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm

crypto_df=crypto_df.drop(columns="CoinName")


In [32]:
crypto_df

Unnamed: 0,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,True,PoW/PoS,42,42
NSR,PoS,True,PoS,6.17316e+09,0
TRI,X13,True,PoW/PoS,191621,0
CMTC,Scrypt,True,PoW,872830,0
CHAT,Scrypt,True,PoW/PoS,1000000000,-1
...,...,...,...,...,...
HPB,DPoS,True,,101119950,100000000
ZEN,Equihash,True,PoW,1.20837e+07,21000000
ICX,Loopchain,True,PoS,9.22773e+08,-1
KCASH,SHA-512,True,Zero-Knowledge Proof,1000000000,-1


In [33]:
# Create dummy variables for text features
features_df=crypto_df.copy()
X = pd.get_dummies(features_df)
X.head()

Unnamed: 0,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-2 Token,Algorithm_BEP-20 Token,Algorithm_BEP2 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,Algorithm_C31,...,MaxSupply_16555000000.0,MaxSupply_21000000000.0,MaxSupply_40000000000.0,MaxSupply_45000000000.0,MaxSupply_70000000000.0,MaxSupply_86712634466.0,MaxSupply_100000000000.0,MaxSupply_184470000000.0,MaxSupply_1000016730264.435,MaxSupply_21000000000000.0
42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Standardize data
scaler=StandardScaler().fit(X)


In [35]:
X_scaled=scaler.transform(X)
X_scaled

array([[-0.05407381, -0.09393364, -0.05407381, ..., -0.05407381,
        -0.05407381, -0.05407381],
       [-0.05407381, -0.09393364, -0.05407381, ..., -0.05407381,
        -0.05407381, -0.05407381],
       [-0.05407381, -0.09393364, -0.05407381, ..., -0.05407381,
        -0.05407381, -0.05407381],
       ...,
       [-0.05407381, -0.09393364, -0.05407381, ..., -0.05407381,
        -0.05407381, -0.05407381],
       [-0.05407381, -0.09393364, -0.05407381, ..., -0.05407381,
        -0.05407381, -0.05407381],
       [-0.05407381, -0.09393364, -0.05407381, ..., -0.05407381,
        -0.05407381, -0.05407381]])

### Reducing Dimensions Using PCA

In [36]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)
print(X_pca)

[[ 3.1086133  -0.348436   -0.18684218]
 [ 3.05948581 -0.28290458  0.02631284]
 [ 3.02225906 -0.3114768  -0.13119661]
 ...
 [ 1.06267432 -0.16065133  0.1907608 ]
 [ 0.51945796 -0.07702667  0.13200202]
 [ 4.41531411 -0.65925473 -1.2130241 ]]


In [37]:
# Create a DataFrame with the principal components data
pcs_df=pd.DataFrame(data=X_pca,columns=["PC 1", "PC 2","Pc 3"],index=crypto_df.index)
pcs_df.head()

Unnamed: 0,PC 1,PC 2,Pc 3
42,3.108613,-0.348436,-0.186842
NSR,3.059486,-0.282905,0.026313
TRI,3.022259,-0.311477,-0.131197
CMTC,3.54307,-0.44868,-0.44504
CHAT,0.985523,-0.192809,-0.075442


In [38]:
pca.explained_variance_ratio_

array([0.01213543, 0.00930482, 0.0085606 ])

### Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [39]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

In [40]:
# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [43]:
#df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [44]:
import altair as alt
alt.Chart(df_elbow).mark_line().encode(
    x='k',
    y='inertia'
)

Running K-Means with `k=<4>`

In [53]:
# Initialize the K-Means model
model=KMeans(n_clusters=4,random_state=0)
# Fit the model
model.fit(pcs_df)
# Predict clusters
predictions=model.predict(pcs_df)
print(predictions)

[1 1 1 1 1 1 1 1 1 3 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1
 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1
 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 0 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 2 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1]


In [54]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features
pcs_df["class"]=model.labels_
pcs_df.head()

Unnamed: 0,PC 1,PC 2,Pc 3,class
42,3.108613,-0.348436,-0.186842,1
NSR,3.059486,-0.282905,0.026313,1
TRI,3.022259,-0.311477,-0.131197,1
CMTC,3.54307,-0.44868,-0.44504,1
CHAT,0.985523,-0.192809,-0.075442,1


In [45]:
clustered_df=pd.concat([crypto_df,coin_name,pcs_df],axis="columns", join="inner")


In [46]:
clustered_df = clustered_df.rename(columns={'MaxSupply': 'TotalCoinSupply'})

In [58]:
clustered_df.tail(20)

Unnamed: 0,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply,CoinName,PC 1,PC 2,Pc 3
NAV,X13,True,PoW/PoS,72875200.0,-1,NavCoin,1.238951,-0.208669,-0.026886
BCN,CryptoNight,True,PoW,184467000000.0,184470000000,ByteCoin,3.548518,-0.48156,-0.568095
CLO,Ethash,True,PoW,3113700000.0,6500000000,Callisto Network,3.468381,-0.44833,-0.545638
ADA,Ouroboros,True,PoS,33034300000.0,45000000000,Cardano,3.477194,-0.268461,0.437442
NXS,SHA3,True,PoW/nPoS,72770000.0,78000000,Nexus,4.535646,36.801916,-0.347686
CLOAK,X13,True,PoW/PoS,5767520.0,-1,CloakCoin,1.238951,-0.208669,-0.026886
DASH,X11,True,PoW/PoSe,10591100.0,18900000,Dash,3.58714,-0.357071,0.140012
DERO,CryptoNight,True,PoW,18400000.0,-1,Dero,1.220593,-0.279202,-0.317436
DGB,Multiple,True,PoW,15108100000.0,21000000000,DigiByte,3.356841,-0.454492,-0.641857
DOGE,Scrypt,True,PoW,133283000000.0,-1,Dogecoin,1.759762,-0.345872,-0.34073


### Visualizing Results

#### 3D-Scatter with Clusters

In [57]:
import altair as alt
from vega_datasets import data

source = clustered_df

alt.Chart(source).mark_circle(size=60).encode(
     x="PC 1",
    y="PC 2",
    #color='class',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()

In [51]:
#%conda install -c conda-forge altair vega_datasets

#### Table of Tradable Cryptocurrencies

In [None]:
# Table with tradable cryptos


In [None]:
# Print the total number of tradable cryptocurrencies


#### Scatter Plot with Tradable Cryptocurrencies

In [None]:
# Scale data to create the scatter plot



In [281]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
source = clustered_df

alt.Chart(source).mark_circle(size=60).encode(
     x="TotalCoinsMined",
    y="TotalCoinSupply",
   
).interactive()
