# Clustering Crypto

In [34]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pathlib import Path

### Fetching Cryptocurrency Data

In [35]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [36]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

In [37]:
# Alternatively, use the provided csv file:
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
df = pd.read_csv(file_path)
df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


### Data Preprocessing

In [38]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
df = df.drop(['Unnamed: 0'], axis=1)    # drop the index column. 
                                        # Got key error when trying to drop the index column without specifying axis=1 
                                        # because it is a multi-index.
df


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1247,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [39]:
# Keep only cryptocurrencies that are trading
df = df[df['IsTrading'] == True]    # set df equal all coins that are True in the IsTrading column. 
                                    #This is done by setting the df equal to df[df['IsTrading'] == True] 
                                    # or call df then specifically 'IsTrading' column in df and set it equal to True.

In [40]:
# Keep only cryptocurrencies with a working algorithm
df.dropna(inplace=True) # drop all rows with NaN values and set inplace=True to keep the changes.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [41]:
# Remove the "IsTrading" column
df.drop(columns=['IsTrading'], inplace=True) 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [42]:
# Remove rows with at least 1 null value
df.dropna(axis=0, inplace=True) # drop all rows with NaN values by setting axis = 0 and set inplace=True to keep the changes.

In [43]:
df = df[df['TotalCoinsMined'] !=0] # remove all rows with TotalCoinsMined = 0

In [44]:
for i in df.columns:
    df = df[df[i] != "N/A"] # remove all rows with N/A in the column

df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [45]:
# Remove rows with cryptocurrencies having no coins mined
# df.dropna(axis=0, subset=['TotalCoinsMined'], inplace=True)     # drop all rows where the TotalCoinsMined column has NaN values 
                                                                # by setting axis = 0, subset = ['TotalCoinsMined'] and set inplace=True 
                                                                # to keep the changes.


In [46]:
# Drop rows where there are 'N/A' text values
# df.dropna(axis=0, ) # drop all rows where the TotalCoinsMined column has NaN values
                    # axis = 0 tells the fn to drop all rows not columns with NaN values.

In [47]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coindf = df['CoinName'] 


In [48]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
df.drop('CoinName', axis=1, inplace=True) 


In [49]:
df = df.dropna()

In [50]:
df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [51]:

import pandas as pd
 
con = pd.Series(list('abcba'))
print(con)
print(pd.get_dummies(con))

0    a
1    b
2    c
3    b
4    a
dtype: object
   a  b  c
0  1  0  0
1  0  1  0
2  0  0  1
3  0  1  0
4  1  0  0


In [52]:
import pandas as pd
import numpy as np
 
 
# list
li = ['s', 'a', 't', np.nan]
print(li)
print(pd.get_dummies(li))

['s', 'a', 't', nan]
   a  s  t
0  0  1  0
1  1  0  0
2  0  0  1
3  0  0  0


In [53]:
df.tail()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
1238,SHA-256,DPoS,2000000000.0,2000000000
1242,Scrypt,PoW/PoS,14931050.0,250000000
1245,CryptoNight,PoW,980222600.0,1400222610
1246,Equihash,PoW,7296538.0,21000000
1247,Scrypt,PoS,128327.0,1000000


In [54]:
# Create dummy variables for text features
dummy_df = pd.get_dummies(df, columns=['Algorithm', 'ProofType']) # create dummy variables for the Algorithm and ProofType columns because they are text features.
# dummy_df = pd.get_dummies(df)
dummy_df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Standardize data
standardized_df = StandardScaler().fit_transform(dummy_df)  # StandardScaler() is a class that standardizes the data. A class is a blueprint for an object.
                                                            # fit_transform() is a method that takes in a dataframe and returns a dataframe with the data standardized.

In [56]:
standardized_df

array([[-0.11674788, -0.15286468, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.09358885, -0.14499604, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [ 0.52587231,  4.4937636 , -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       ...,
       [-0.09523411, -0.13215444, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11658774, -0.15255408, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11674507, -0.15284989, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ]])

### Reducing Dimensions Using PCA

In [57]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)   # n_components = 3 is the number of principal components to keep.
                            # pca stands for Principal Component Analysis. Principal components are the main features of the data.
                            # Principal component analysis takes the data and analyzes it to find the main features.


In [58]:
# Create a DataFrame with the principal components data
standardized_df = pca.fit_transform(standardized_df)    # set new variable equal to the standardized_df dataframe and then use the fit_transform() method to transform the data.
                                                        # pca.fit_transform() tells python to transform the data using the pca object.
standardized_df

array([[-0.3301885 ,  0.86821062, -0.62032688],
       [-0.31354605,  0.86864147, -0.6207914 ],
       [ 2.28331595,  1.52606327, -0.66042016],
       ...,
       [ 0.33830179, -2.07223414,  0.44228372],
       [-0.12940705, -1.91059979,  0.39063454],
       [-0.29103721,  0.69507567, -0.34389502]])

In [59]:
principal_df = pd.DataFrame(standardized_df, columns=['PCA1', 'PCA2', 'PCA3']).dropna()  # create a new DataFrame with the principal components data and drop all rows with NaN values.
                                                                                         # creating columns = ['PCA1', 'PCA2', 'PCA3'] for the new DataFrame. PCA1, PCA2, PCA3 are the names of the principal components.

### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [60]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k: 
    kmodel = KMeans(n_clusters=i, random_state=0)
    kmodel.fit(principal_df)
    inertia.append(kmodel.inertia_)

elbow_data = {'k': k, 'inertia': inertia}

elbow_df = pd.DataFrame(elbow_data)

elbow_df.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Curve',
    xticks=k
)

# Create the Elbow Curve using hvPlot


  f"KMeans is known to have a memory leak on Windows "


In [61]:
df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

Running K-Means with `k=<your best value for k here>`

In [72]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)
# Fit the model
model.fit(principal_df)
# Predict clusters
cluster_predict = model.predict(principal_df)
cluster_predict = pd.DataFrame(cluster_predict)
cluster_predict
# Create a new DataFrame including predicted clusters and cryptocurrencies features
crypto_predicted = pd.concat([df, principal_df, coindf, cluster_predict], axis=1)

In [73]:
#dropping the n/a and renaming the 0 
crypto_predicted = pd.DataFrame(crypto_predicted.dropna())
crypto_predicted.rename(columns={0:'Predictions'}, inplace=True)

### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [None]:
# Scale data to create the scatter plot


In [74]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
crypto_predicted.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by= "Predictions"
)

#### Table of Tradable Cryptocurrencies

In [75]:
# Table with tradable cryptos
crypto_predicted.hvplot.table(columns=['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'CoinName', 'Predictions'], sortable=True, selectable=True)
crypto_predicted

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PCA1,PCA2,PCA3,CoinName,Predictions
0,Scrypt,PoW/PoS,4.199995e+01,42,-0.330189,0.868211,-0.620327,42 Coin,0.0
2,Scrypt,PoW/PoS,1.055185e+09,532000000,2.283316,1.526063,-0.660420,404Coin,0.0
5,X13,PoW/PoS,2.927942e+10,314159265359,-0.142660,-1.067416,-0.037894,EliteCoin,1.0
7,SHA-256,PoW,1.792718e+07,21000000,-0.127450,-2.028255,0.517812,Bitcoin,1.0
8,Ethash,PoW,1.076842e+08,0,-0.130451,-1.811873,0.470847,Ethereum,1.0
...,...,...,...,...,...,...,...,...,...
516,X13,PoS,2.500124e+06,2500124,-0.393787,6.602570,14.940973,RoyalCoin,3.0
522,X14,PoW/PoS,1.000000e+08,100000000,3.780251,1.432579,0.232492,GanjaCoin V2,0.0
523,PoS,PoS,1.781868e+07,301000000,-0.269869,0.566589,-0.060254,TeamUP,0.0
525,SHA-256D,PoW/PoS,1.082163e+09,7506000000,-0.383773,1.422669,-0.239343,LanaCoin,0.0


In [76]:
# Print the total number of tradable cryptocurrencies
print(len(crypto_predicted['Algorithm']))

205
