In [48]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

import plotly.express as px
import hvplot.pandas
import plotly.figure_factory as ff #ff.create_dendogram

In [3]:
# Read the raw cryptocurrency CSV; its first column becomes the row index.
file_path = "./Resources/crypto_data.csv"
crypto_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
# Preview the first five rows.
crypto_df.head(5)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [4]:
# (rows, columns) of the raw data as loaded, before any filtering.
crypto_df.shape

(1252, 6)

In [5]:
# Column dtypes of the raw data; useful for spotting numeric columns that were parsed as text.
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

### Data Preprocessing

In [6]:
# 1. Keep only the cryptocurrencies that are currently trading.
# IsTrading is a boolean column, so it can serve directly as the row mask.
is_trading = crypto_df["IsTrading"]
df1 = crypto_df[is_trading]
df1

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [7]:
# Count the missing values per column once, then report each count.
null_counts = df1.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values.")

Column CoinName has 0 null values.
Column Algorithm has 0 null values.
Column IsTrading has 0 null values.
Column ProofType has 0 null values.
Column TotalCoinsMined has 459 null values.
Column TotalCoinSupply has 0 null values.


In [8]:
# 2. Remove all cryptocurrencies that don't have an algorithm defined.
# pd.read_csv parses the literal text "N/A" as NaN by default, so comparing
# against the string "N/A" alone can never match a missing algorithm (and
# NaN != "N/A" evaluates to True, which would keep such rows). Require a
# non-null value as well as a value other than the "N/A" placeholder.
has_algorithm = df1["Algorithm"].notna() & (df1["Algorithm"] != "N/A")
df2 = df1[has_algorithm]
df2

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [9]:
# 3. Drop the IsTrading column; every remaining row was selected as trading.
df3 = df2.drop(columns="IsTrading")
df3

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [10]:
# Count the missing values per column once, then report each count.
null_counts = df3.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values.")

Column CoinName has 0 null values.
Column Algorithm has 0 null values.
Column ProofType has 0 null values.
Column TotalCoinsMined has 459 null values.
Column TotalCoinSupply has 0 null values.


In [11]:
# 4. Drop every cryptocurrency (row) that has at least one null value.
# dropna() defaults to axis=0 and how="any", i.e. row-wise, any missing value.
df4 = df3.dropna()
df4

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [12]:
# Count the missing values per column once, then report each count.
null_counts = df4.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values.")

Column CoinName has 0 null values.
Column Algorithm has 0 null values.
Column ProofType has 0 null values.
Column TotalCoinsMined has 0 null values.
Column TotalCoinSupply has 0 null values.


In [13]:
# Count the rows that are exact duplicates of an earlier row.
duplicate_count = df4.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [14]:
# 5. Keep only the cryptocurrencies with a positive number of coins mined.
has_mined_coins = df4["TotalCoinsMined"].gt(0)
df5 = df4.loc[has_mined_coins]
df5.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,ZCash,Equihash,PoW,7383056.0,21000000


In [15]:
# 6. Store the names of all remaining cryptocurrencies in a one-column
# DataFrame named coins_name, indexed like the filtered data (df5.index).
coins_name = df5["CoinName"].to_frame()
coins_name.head(10)

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
LTC,Litecoin
DASH,Dash
XMR,Monero
ETC,Ethereum Classic
ZEC,ZCash


In [16]:
# 7. Drop the CoinName column; the names are kept separately in coins_name.
df7 = df5.drop("CoinName", axis="columns")
df7

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [17]:
# 8. Create dummy variables for all of the text features and store the
# resulting data in a DataFrame named X.
# TotalCoinSupply was parsed from the CSV as text (object dtype). Cast it to
# float64 here so X is fully numeric, rather than relying on the scaler to
# convert numeric strings implicitly.
numeric_df = df7.astype({"TotalCoinSupply": "float64"})
X = pd.get_dummies(numeric_df, columns=["Algorithm", "ProofType"])
X.head(10)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DASH,9031294.0,22000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XMR,17201140.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETC,113359700.0,210000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEC,7383056.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# 9. Standardize every column of X with sklearn's StandardScaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

### Reducing Data Dimensions Using PCA

In [19]:
# Configure PCA to keep three principal components. The model is only
# constructed here; it is fitted on the scaled data in the next cell.
pca = PCA(n_components = 3)

In [20]:
# Fit PCA on the standardized data and project it onto the principal components.
crypto_pca = pca.fit_transform(X_scaled)
crypto_pca

array([[-0.3411887 ,  1.02253428, -0.57844505],
       [-0.32453153,  1.02264112, -0.5788864 ],
       [ 2.29467445,  1.51593569, -0.69572985],
       ...,
       [ 0.32503512, -2.3495667 ,  0.43947389],
       [-0.15808173, -1.94929565,  0.42130427],
       [-0.29356019,  0.81950099, -0.2480123 ]])

In [21]:
# Wrap the PCA output in a DataFrame named pcs_df with columns PC 1, PC 2 and
# PC 3, indexed like the preprocessed data (df7.index).
pc_labels = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(crypto_pca, index=df7.index, columns=pc_labels)
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.341189,1.022534,-0.578445
404,-0.324532,1.022641,-0.578886
1337,2.294674,1.515936,-0.695730
BTC,-0.144239,-1.323780,0.157568
ETH,-0.151458,-2.020234,0.386821
...,...,...,...
ZEPH,2.476701,0.879164,-0.162472
GAP,-0.339236,1.022404,-0.578468
BDX,0.325035,-2.349567,0.439474
ZEN,-0.158082,-1.949296,0.421304


### Clustering Cryptocurrencies Using K-means

In [22]:
# 1. Build an elbow curve from pcs_df to choose a value for K.

# Candidate numbers of clusters to try
k = list(range(1, 11))

# Fit one K-means model per candidate K and record its inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Collect the results in a DataFrame and plot the elbow curve with hvplot
elbow_data = {"k": k, "inertia": inertia}
elbow_df = pd.DataFrame(elbow_data)
elbow_df.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)



In [59]:
# 2. Run the K-means algorithm on the principal components in pcs_df to
# assign each cryptocurrency to one of K clusters.

# Select the feature columns by name. This cell adds a "Class" column to
# pcs_df, so fitting on the whole frame would feed the previous labels back
# into the model as a feature whenever the cell is re-run.
pc_columns = ["PC 1", "PC 2", "PC 3"]

# Initialize the model with 4 clusters
model = KMeans(n_clusters = 4, random_state = 5)

# Fit the model and get the cluster label assigned to each row
predictions = model.fit_predict(pcs_df[pc_columns])

# Add the labels as a "Class" column
pcs_df["Class"] = predictions
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3,Class
42,-0.341189,1.022534,-0.578445,3
404,-0.324532,1.022641,-0.578886,3
1337,2.294674,1.515936,-0.695730,3
BTC,-0.144239,-1.323780,0.157568,0
ETH,-0.151458,-2.020234,0.386821,0
...,...,...,...,...
ZEPH,2.476701,0.879164,-0.162472,3
GAP,-0.339236,1.022404,-0.578468,3
BDX,0.325035,-2.349567,0.439474,0
ZEN,-0.158082,-1.949296,0.421304,0


In [60]:
# 3. Build clustered_df by placing the preprocessed features (df7) side by
# side with the principal components and cluster labels (pcs_df).
frames = [df7, pcs_df]
clustered_df = pd.concat(frames, axis="columns")
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.341189,1.022534,-0.578445,3
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.324532,1.022641,-0.578886,3
1337,X13,PoW/PoS,29279420000.0,314159265359,2.294674,1.515936,-0.69573,3
BTC,SHA-256,PoW,17927180.0,21000000,-0.144239,-1.32378,0.157568,0
ETH,Ethash,PoW,107684200.0,0,-0.151458,-2.020234,0.386821,0


In [61]:
# Re-attach the coin names that were set aside in coins_name (aligned on the index).
clustered_df = clustered_df.assign(CoinName=coins_name["CoinName"])
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,Class,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.341189,1.022534,-0.578445,3,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.324532,1.022641,-0.578886,3,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.294674,1.515936,-0.69573,3,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.144239,-1.32378,0.157568,0,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.151458,-2.020234,0.386821,0,Ethereum


In [62]:
# Reorder the columns so that CoinName comes before the Class label.
column_order = [
    "Algorithm",
    "ProofType",
    "TotalCoinsMined",
    "TotalCoinSupply",
    "PC 1",
    "PC 2",
    "PC 3",
    "CoinName",
    "Class",
]
clustered_df = clustered_df.loc[:, column_order]
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.341189,1.022534,-0.578445,42 Coin,3
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.324532,1.022641,-0.578886,404Coin,3
1337,X13,PoW/PoS,29279420000.0,314159265359,2.294674,1.515936,-0.69573,EliteCoin,3
BTC,SHA-256,PoW,17927180.0,21000000,-0.144239,-1.32378,0.157568,Bitcoin,0
ETH,Ethash,PoW,107684200.0,0,-0.151458,-2.020234,0.386821,Ethereum,0


### Visualizing Results

In [63]:
# 1. Plot the clusters in 3D: one axis per principal component, with both
# color and marker symbol taken from the Class label.
scatter_options = {
    "x": "PC 1",
    "y": "PC 2",
    "z": "PC 3",
    "hover_name": "CoinName",
    "hover_data": ["Algorithm"],
    "color": "Class",
    "symbol": "Class",
    "width": 800,
}
fig = px.scatter_3d(clustered_df, **scatter_options)
# Place the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [64]:
# 2. Show the tradable cryptocurrencies and their assigned class in an hvplot table.
table_columns = [
    'CoinName',
    'Algorithm',
    'ProofType',
    'TotalCoinSupply',
    'TotalCoinsMined',
    'Class',
]
clustered_df.hvplot.table(columns=table_columns, width=800)

In [65]:
# Rescale the two coin-count columns with MinMaxScaler for the 2D plot.
# NOTE: this rebinds X_scaled, replacing the StandardScaler output assigned
# earlier in the notebook; cells that need the standardized matrix must be
# re-run from the StandardScaler cell.
coin_columns = ['TotalCoinsMined', 'TotalCoinSupply']
min_max_scaler = MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(clustered_df[coin_columns])
X_scaled

array([[0.00000000e+00, 4.20000000e-11],
       [1.06585544e-03, 5.32000000e-04],
       [2.95755135e-02, 3.14159265e-01],
       ...,
       [9.90135079e-04, 1.40022261e-03],
       [7.37028150e-06, 2.10000000e-05],
       [1.29582282e-07, 1.00000000e-06]])

In [66]:
# Wrap the min-max-scaled values in a DataFrame indexed like clustered_df.
scaled_columns = ['TotalCoinsMined', 'TotalCoinSupply']
df_y = pd.DataFrame(X_scaled, index=clustered_df.index, columns=scaled_columns)
df_y

Unnamed: 0,TotalCoinsMined,TotalCoinSupply
42,0.000000e+00,4.200000e-11
404,1.065855e-03,5.320000e-04
1337,2.957551e-02,3.141593e-01
BTC,1.810842e-05,2.100000e-05
ETH,1.087731e-04,0.000000e+00
...,...,...
ZEPH,2.020225e-03,2.000000e-03
GAP,1.508199e-05,2.500000e-04
BDX,9.901351e-04,1.400223e-03
ZEN,7.370282e-06,2.100000e-05


In [67]:
# Copy the coin name and cluster label over from clustered_df (aligned on the index).
for label_column in ('CoinName', 'Class'):
    df_y[label_column] = clustered_df[label_column]
df_y.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,CoinName,Class
42,0.0,4.2e-11,42 Coin,3
404,0.001066,0.000532,404Coin,3
1337,0.029576,0.3141593,EliteCoin,3
BTC,1.8e-05,2.1e-05,Bitcoin,0
ETH,0.000109,0.0,Ethereum,0


In [68]:
# Scatter the scaled coins mined against the scaled coin supply, one series per class.
scatter_options = {
    "x": "TotalCoinsMined",
    "y": "TotalCoinSupply",
    "hover_cols": ["CoinName"],
    "by": "Class",
}
df_y.hvplot.scatter(**scatter_options)

In [71]:
# Number of non-null entries per column within each cluster, i.e. the cluster sizes.
df_y.groupby("Class").count()

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,CoinName
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,238,238,238
1,6,6,6
2,1,1,1
3,287,287,287


Observations:
* The green outlier, "BitTorrent", should be removed from the dataset.
* We recommend removing the outliers because they are products built on a blockchain rather than cryptocurrencies.