In [11]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by pandas and scikit-learn
# in the cells below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [124]:
import hvplot.pandas  # imported for its side effect: registers the .hvplot accessor
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [101]:
# Load the raw cryptocurrency data from CSV and preview its first five rows
file_path = "crypto_data.csv"
crypto_df = pd.read_csv(filepath_or_buffer=file_path)
crypto_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [102]:
# Display the loaded frame (the rendered output elides the middle rows)
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


## Data Preprocessing

In [103]:
# Keep only the cryptocurrencies that are currently trading
crypto_df = crypto_df.loc[crypto_df["IsTrading"].eq(True)]
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,UOS,SHA-256,True,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [104]:
# Count the cryptocurrencies that don't have an algorithm defined.
# Nothing is removed by this cell; a count of 0 means there are no such rows to drop.
crypto_df["Algorithm"].isnull().sum()

0

In [105]:
# Remove the IsTrading column.
# Rebind the result instead of dropping in place: crypto_df was produced by boolean
# filtering, and mutating such a selection in place can raise pandas'
# SettingWithCopyWarning (hidden in this notebook by the blanket warnings filter).
crypto_df = crypto_df.drop(columns=["IsTrading"])
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,PoW,,1000000000
1244,UOS,UOS,SHA-256,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [106]:
# Remove all cryptocurrencies with at least one null value.
# dropna() returns a new frame; rebinding the name avoids mutating a filtered
# selection in place and keeps the cell free of SettingWithCopyWarning.
crypto_df = crypto_df.dropna()
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
5,1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [107]:
# Keep only the cryptocurrencies with a positive number of mined coins
crypto_df = crypto_df.loc[crypto_df["TotalCoinsMined"].gt(0)]
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [108]:
# Store the coin names in a DataFrame named coins_name, indexed by the values of the
# "Unnamed: 0" column, with the index label cleared
coins_name = (
    crypto_df.set_index("Unnamed: 0")[["CoinName"]]
    .rename_axis(None)
)
coins_name

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [109]:
# Build crypto1_df from crypto_df: index by the "Unnamed: 0" values, drop the
# CoinName column, and clear the index label
crypto1_df = (
    crypto_df.set_index("Unnamed: 0")
    .drop(columns=["CoinName"])
    .rename_axis(None)
)
crypto1_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [110]:
# Encode the text features as integer codes with LabelEncoder and store the result in X.
# NOTE: this is label encoding, not dummy (one-hot) variables — each category becomes a
# single integer code, which has no real ordinal meaning for these nominal features.
# LabelEncoder is imported with the other dependencies in the import cell at the top.
le = LabelEncoder()
X = crypto1_df.copy()
for text_col in ("Algorithm", "ProofType"):
    # fit_transform refits the shared encoder on each column in turn
    X[text_col] = le.fit_transform(X[text_col])
X

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,52,15,4.199995e+01,42
404,52,15,1.055185e+09,532000000
1337,66,15,2.927942e+10,314159265359
BTC,47,12,1.792718e+07,21000000
ETH,20,12,1.076842e+08,0
...,...,...,...,...
ZEPH,47,1,2.000000e+09,2000000000
GAP,52,15,1.493105e+07,250000000
BDX,10,12,9.802226e+08,1400222610
ZEN,18,12,7.296538e+06,21000000


In [111]:
# One-hot encode the text features into dummy columns and store the result in X1
X1 = pd.get_dummies(data=crypto1_df, columns=["Algorithm", "ProofType"])
X1

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# Standardize every column of the X DataFrame with sklearn's StandardScaler
X_scaled = StandardScaler().fit(X).transform(X)
X_scaled

array([[ 3.76459118e-01,  8.91356555e-01, -1.17108170e-01,
        -1.52870298e-01],
       [ 3.76459118e-01,  8.91356555e-01, -9.39695522e-02,
        -1.45008997e-01],
       [ 1.21543803e+00,  8.91356555e-01,  5.24945609e-01,
         4.48942416e+00],
       ...,
       [-2.14047761e+00,  1.67233875e-03, -9.56133629e-02,
        -1.32179374e-01],
       [-1.66106109e+00,  1.67233875e-03, -1.16948169e-01,
        -1.52559984e-01],
       [ 3.76459118e-01, -1.48113469e+00, -1.17105357e-01,
        -1.52855521e-01]])

In [113]:
# Standardize every column of the one-hot encoded X1 DataFrame
X1_scaled = StandardScaler().fit(X1).transform(X1)
X1_scaled

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

## Reducing Data Dimensions Using PCA

In [114]:
# Reduce the scaled X features to three principal components and label the rows
# with X's index
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)
pcs_df = pd.DataFrame(X_pca, index=X.index, columns=["PC1", "PC2", "PC3"])
pcs_df.head(10)

Unnamed: 0,PC1,PC2,PC3
42,-0.417875,0.810296,0.372138
404,-0.396564,0.815135,0.373256
1337,3.124076,2.20978,0.504335
BTC,-0.192083,0.016266,-0.07291
ETH,-0.044116,-1.167492,1.012525
LTC,-0.217954,0.235824,-0.273789
DASH,-0.482771,1.336609,-0.110253
XMR,-0.001979,-1.518631,1.334156
ETC,-0.041907,-1.167016,1.012991
ZEC,-0.034542,-1.255502,1.092984


In [119]:
# Reduce the scaled one-hot features (X1_scaled) to three principal components and
# label the rows with X1's index
pca = PCA(n_components=3)
X1_pca = pca.fit_transform(X1_scaled)
pcs1_df = pd.DataFrame(X1_pca, index=X1.index, columns=["PC1", "PC2", "PC3"])
pcs1_df.head(10)

Unnamed: 0,PC1,PC2,PC3
42,-0.333437,1.076995,-0.507005
404,-0.316786,1.077155,-0.507448
1337,2.315273,1.692614,-0.555636
BTC,-0.149094,-1.28812,0.17803
ETH,-0.16128,-2.038474,0.364704
LTC,-0.170028,-1.090194,-0.034373
DASH,-0.391155,1.276606,-0.430337
XMR,-0.158948,-2.15869,0.369498
ETC,-0.159723,-2.038567,0.364674
ZEC,-0.129433,-2.218993,0.413702


## Clustering Cryptocurrencies Using K-means

In [120]:
# Create an elbow curve to find the best value for K, using the pcs1_df DataFrame
k = list(range(1, 11))

# Fit one K-means model per candidate K and record its inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs1_df).inertia_
    for n_clusters in k
]

# Plot inertia against K
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [121]:
# Fit K-means with four clusters on pcs1_df, then predict a cluster for every row
# NOTE(review): n_clusters=4 is presumably read off the elbow curve — confirm.
model = KMeans(n_clusters=4, random_state=0).fit(pcs1_df)
predictions = model.predict(pcs1_df)

In [123]:
# Create a new DataFrame named "clustered_df" with the columns Algorithm, ProofType,
# TotalCoinsMined, TotalCoinSupply, PC1, PC2, PC3, CoinName and class.
# model.labels_ is in the row order of pcs1_df (the frame the model was fitted on), so
# give the labels that index and join them by label instead of assigning them
# positionally after the joins, which would silently rely on the joins preserving
# both row order and row count.
cluster_labels = pd.Series(model.labels_, index=pcs1_df.index, name="class")
clustered_df = (
    crypto1_df
    .join(pcs1_df, how="inner")
    .join(coins_name, how="inner")
    .join(cluster_labels, how="inner")
)

clustered_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,class
42,Scrypt,PoW/PoS,41.99995,42,-0.333437,1.076995,-0.507005,42 Coin,1
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.316786,1.077155,-0.507448,404Coin,1
1337,X13,PoW/PoS,29279420000.0,314159265359,2.315273,1.692614,-0.555636,EliteCoin,1
BTC,SHA-256,PoW,17927180.0,21000000,-0.149094,-1.28812,0.17803,Bitcoin,0
ETH,Ethash,PoW,107684200.0,0,-0.16128,-2.038474,0.364704,Ethereum,0
LTC,Scrypt,PoW,63039240.0,84000000,-0.170028,-1.090194,-0.034373,Litecoin,0
DASH,X11,PoW/PoS,9031294.0,22000000,-0.391155,1.276606,-0.430337,Dash,1
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.158948,-2.15869,0.369498,Monero,0
ETC,Ethash,PoW,113359700.0,210000000,-0.159723,-2.038567,0.364674,Ethereum Classic,0
ZEC,Equihash,PoW,7383056.0,21000000,-0.129433,-2.218993,0.413702,ZCash,0


## Visualizing Results

In [129]:
# Create a 3D scatter plot of the clusters with Plotly Express from clustered_df,
# showing CoinName as the hover title and Algorithm as extra hover data.
# Cluster ids are categorical: cast them to strings on a plotting copy so Plotly Express
# assigns one discrete color (and legend entry) per cluster instead of treating the
# integer ids as a continuous color scale.
plot_df = clustered_df.copy()
plot_df["class"] = plot_df["class"].astype(str)
fig = px.scatter_3d(
    plot_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="class",
    symbol="class",
    # keep the legend in numeric cluster order rather than order of first appearance
    category_orders={"class": sorted(plot_df["class"].unique(), key=int)},
    width=1000,
    hover_name="CoinName",
    hover_data=["Algorithm"],
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [126]:
# TODO: not implemented yet.
# Use hvplot.table to create a data table with all the current tradable cryptocurrencies.
# The table should have the following columns: CoinName, Algorithm, ProofType, TotalCoinSupply, TotalCoinsMined and class.


In [None]:
# TODO: not implemented yet.
# Create a scatter plot using hvplot.scatter to present the clustered data about cryptocurrencies,
# with x="TotalCoinsMined" and y="TotalCoinSupply", to contrast the number of mined coins
# with the total coin supply.
# Use the hover_cols=["CoinName"] parameter to include the cryptocurrency name on each data point.