In [46]:
# Import dependencies
import pandas as pd
import hvplot.pandas
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

In [49]:
# Load the CryptoCompare dataset with index set
file_path = 'Resources/crypto_data.csv'
crypto_df = pd.read_csv(file_path,index_col=0)
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


# Data Preprocessing

In [50]:
# Remove all cryptocurrencies that aren’t trading
NotTrading = crypto_df[crypto_df['IsTrading'] == False].index
crypto_df.drop(NotTrading, inplace=True)
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [52]:
# Remove all cryptocurrencies that don’t have an algorithm defined

# Find null values
for column in crypto_df.columns:
    print(f'Column {column} has {crypto_df[column].isnull().sum()}\
    null values')
crypto_df.to_csv('algorithm_defined.csv')    
# There are no columns in the dataset that don't have an algorithm defined

Column CoinName has 0    null values
Column Algorithm has 0    null values
Column IsTrading has 0    null values
Column ProofType has 0    null values
Column TotalCoinsMined has 459    null values
Column TotalCoinSupply has 0    null values


In [53]:
# 3. Remove the IsTrading column
crypto_df = crypto_df.drop(columns=['IsTrading'])

In [54]:
# 4. Remove all cryptocurrencies with at least one null value

# Find null values
for column in crypto_df.columns:
    print(f'Column {column} has {crypto_df[column].isnull().sum()}\
    null values')

# Drop null rows
crypto_df = crypto_df.dropna()

Column CoinName has 0    null values
Column Algorithm has 0    null values
Column ProofType has 0    null values
Column TotalCoinsMined has 459    null values
Column TotalCoinSupply has 0    null values


In [55]:
# Validate null values have been removed
for column in crypto_df.columns:
    print(f'Column {column} has {crypto_df[column].isnull().sum()}\
    null values')

Column CoinName has 0    null values
Column Algorithm has 0    null values
Column ProofType has 0    null values
Column TotalCoinsMined has 0    null values
Column TotalCoinSupply has 0    null values


In [56]:
# 5. Remove all cryptocurrencies without coins mined
crypto_df = crypto_df[~(crypto_df['TotalCoinsMined'] <=0)]

In [57]:
# Sort TotalCoinsMined from low to high to ensure no values under 0
crypto_df.sort_values('TotalCoinsMined')

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
MOON,MoonCoin,Scrypt,PoW,8.800000e+01,384000000000
SYNC,SyncCoin,X11,PoW/PoS,1.177000e+03,1000
BTB,BitBar,Scrypt,PoW/PoS,4.257948e+04,500000
PLTC,PlatinCoin,CryptoNight,PoW,8.430000e+04,600000518
...,...,...,...,...,...
QWC,Qwertycoin,CryptoNight Heavy,PoW,9.955311e+10,184470000000
NYC,NewYorkCoin,Scrypt,PoW,1.430067e+11,0
GCN,gCn Coin,Scrypt,PoW,1.630551e+11,200000000000
BCN,ByteCoin,CryptoNight,PoW,1.840668e+11,184467440735


In [58]:
# Store the names of all cryptocurrencies on a DataFrame named coins_name, 
# and use the crypto_df.index as the index for this new DataFrame
coins_name = crypto_df[['CoinName']]
coins_name.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [59]:
# Check crypto_df to see if the index is the same
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0


In [60]:
# Remove the CoinName column
crypto_df=crypto_df.drop(columns=['CoinName'])
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [61]:
# Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X
X = pd.get_dummies(crypto_df)
X.describe()

Unnamed: 0,TotalCoinsMined,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,TotalCoinSupply_91388946,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999
count,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,...,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0
mean,5340456000.0,0.00188,0.00188,0.00188,0.003759,0.005639,0.00188,0.003759,0.003759,0.00188,...,0.00188,0.003759,0.00188,0.00188,0.00188,0.00188,0.00188,0.00188,0.00188,0.00188
std,45645680000.0,0.043355,0.043355,0.043355,0.061256,0.074952,0.043355,0.061256,0.061256,0.043355,...,0.043355,0.061256,0.043355,0.043355,0.043355,0.043355,0.043355,0.043355,0.043355,0.043355
min,41.99995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8359849.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,24743970.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,186725000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,989988700000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [62]:
# Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame. 
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])

[[-0.11710817 -0.0433963  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.0433963  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561 -0.0433963  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11671506 -0.0433963  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11474682 -0.0433963  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


# Reducing Data Dimensions Using PCA

In [63]:
# Use the PCA algorithm from sklearn to reduce the dimensions of the X DataFrame
# down to three principal components
# Initialize PCA model
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(X_scaled)

In [64]:
# Once you have reduced the data dimensions, create a DataFrame named “pcs_df” 
# that includes the following columns: PC 1, PC 2, and PC 3. 
# Use the crypto_df.index as the index for this new DataFrame.

pcs_df = pd.DataFrame(data=crypto_pca, columns=['PC 1', 'PC 2', 'PC 3'],index=crypto_df.index
)
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.344296,-0.180519,0.111838
404,-0.323347,-0.178939,0.106118
1337,0.194188,-0.018816,0.238971
BTC,-0.196011,-0.197971,-0.283254
ETH,-0.162160,-0.146047,-0.198312
...,...,...,...
ZEPH,2.550026,0.023909,-0.465110
GAP,-0.333150,-0.225318,0.040271
BDX,0.035008,-0.198514,-0.378970
ZEN,-0.312114,-0.306152,-0.289720


# Clustering Cryptocurrencies Using K-means

### You’ll use the KMeans algorithm from sklearn to cluster the cryptocurrencies using the PCA data.

Complete the following tasks:

In [65]:
# Create an elbow curve to find the best value for K, and use the pcs_df DataFrame

inertia = []
k = list(range(1,11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {'k':k, 'inertia':inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x='k', y='inertia', xticks=k, title='Elbow Curve')    

In [67]:
# Once you define the best value for K, run the K-means algorithm to predict the K clusters 
# for the cryptocurrencies’ data. Use the pcs_df to run the K-means algorithm

def get_clusters(k, data):
    # Create a copy of the DataFrame   
    data = data.copy()       
    # Initialize the K-Means model   
    model = KMeans(n_clusters=k, random_state=0)   
    # Fit the model   
    model.fit(data)   
    # Predict clusters   
    predictions = model.predict(data)   
    # Create return DataFrame with predicted clusters   
    data["class"] = model.labels_   
    return data

four_clusters = get_clusters(4, pcs_df)
four_clusters.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
42,-0.344296,-0.180519,0.111838,0
404,-0.323347,-0.178939,0.106118,0
1337,0.194188,-0.018816,0.238971,0
BTC,-0.196011,-0.197971,-0.283254,0
ETH,-0.16216,-0.146047,-0.198312,0


In [68]:
# Create a new DataFrame named “clustered_df,” that includes the following columns: 
# Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply, PC 1, PC 2, PC 3, CoinName, and Class

first_merge = four_clusters.merge(crypto_df, left_index=True, right_index=True)
clustered_df = first_merge.merge(coins_name, left_index=True, right_index=True)
clustered_df = clustered_df[['Algorithm',
                   'ProofType',
                   'TotalCoinsMined',
                   'TotalCoinSupply',
                   'PC 1',
                   'PC 2',
                   'PC 3',
                   'CoinName',
                   'class']]
clustered_df.rename(columns = {'class':'Class'}, inplace=True)
clustered_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,4.199995e+01,42,-0.344296,-0.180519,0.111838,42 Coin,0
404,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.323347,-0.178939,0.106118,404Coin,0
1337,X13,PoW/PoS,2.927942e+10,314159265359,0.194188,-0.018816,0.238971,EliteCoin,0
BTC,SHA-256,PoW,1.792718e+07,21000000,-0.196011,-0.197971,-0.283254,Bitcoin,0
ETH,Ethash,PoW,1.076842e+08,0,-0.162160,-0.146047,-0.198312,Ethereum,0
...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000,2.550026,0.023909,-0.465110,ZEPHYR,0
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.333150,-0.225318,0.040271,Gapcoin,0
BDX,CryptoNight,PoW,9.802226e+08,1400222610,0.035008,-0.198514,-0.378970,Beldex,0
ZEN,Equihash,PoW,7.296538e+06,21000000,-0.312114,-0.306152,-0.289720,Horizen,0


# Visualizing Results

In [69]:
# Create a 3D scatter plot using Plotly Express to plot the clusters using the clustered_df DataFrame. 
# You should include the following parameters on the plot: hover_name="CoinName" and hover_data=["Algorithm"] 
# to show this additional info on each data point

fig = px.scatter_3d(clustered_df,
                    x="PC 1",
                    y="PC 2",
                    z="PC 3",
                    hover_name="CoinName",
                    hover_data=['Algorithm'],
                    color='Class',
                    symbol='Class',
                    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [70]:
# Use hvplot.table to create a data table with all the current tradable cryptocurrencies. 
# The table should have the following columns: CoinName, Algorithm, ProofType, TotalCoinSupply, 
# TotalCoinsMined, and Class
data_table = clustered_df[['CoinName',
                           'Algorithm',
                           'ProofType',
                           'TotalCoinSupply',
                           'TotalCoinsMined',
                           'Class']]
data_table.hvplot.table()

In [71]:
# Create a scatter plot using hvplot.scatter to present the clustered data about cryptocurrencies 
# having x="TotalCoinsMined" and y="TotalCoinSupply" to contrast the number of available coins versus 
# the total number of mined coins. Use the hover_cols=["CoinName"] parameter to include the cryptocurrency 
# name on each data point

# Plotting the clusters
clustered_df.hvplot.scatter(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    hover_cols=['CoinName'],
    )