## Objectives
 - prepare data for dimesion reduction using PCA and clustering via K-Means
 - predict clusters using K-Means
 - create plots and data tables to present results 
 

In [236]:
#imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px 
from path import Path

In [237]:
#load data
file_path = Path("../Resources/crypto_data.csv")
crypto_df = pd.read_csv(file_path)
crypto_df.head(5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


- remove all currencies that aren't trading
- remove all currencies that don't have a defined algo
- remove the IsTrading column
- remove all currencies with at least one null value 
- remove all currencies without coins mined
- store the names of all currencies on a dataframe named coins_name, use cryptoDf.index as index 
- remove coinName column
- create dummies variables for all text features, store resulting data on dataframe named X
use standardScaler to standardize all data on X. do this before PCA and K-Means 

In [238]:
#remove all currencies that aren't trading
crypto_df = crypto_df[crypto_df["IsTrading"]==True]
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [239]:
#drop IsTrading column
crypto_df = crypto_df.drop(["IsTrading"], axis=1)
#check for remaining null values
for column in crypto_df.columns:
    print(f"Column{column} has {crypto_df[column].isnull().sum()} null values")

ColumnUnnamed: 0 has 0 null values
ColumnCoinName has 0 null values
ColumnAlgorithm has 0 null values
ColumnProofType has 0 null values
ColumnTotalCoinsMined has 459 null values
ColumnTotalCoinSupply has 0 null values


In [240]:
#drop currencies with null values
crypto_df = crypto_df.dropna()
for column in crypto_df.columns:
    print(f"Column{column} has {crypto_df[column].isnull().sum()} null values")

ColumnUnnamed: 0 has 0 null values
ColumnCoinName has 0 null values
ColumnAlgorithm has 0 null values
ColumnProofType has 0 null values
ColumnTotalCoinsMined has 0 null values
ColumnTotalCoinSupply has 0 null values


In [241]:
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [242]:
#reset index
crypto_df = crypto_df.set_index("Unnamed: 0")
crypto_df.index.name = None
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [243]:
#create coinName DF
coins_name = pd.DataFrame(crypto_df["CoinName"])
coins_name.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
808,808
1337,EliteCoin
BTC,Bitcoin


In [244]:
#drop coinName column from cryptoDf
crypto_df = crypto_df.drop(columns=["CoinName"], axis=1)
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
808,SHA-256,PoW/PoS,0.0,0
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000


In [245]:
#create dummie variables for algorithm and proof type
X = pd.get_dummies(crypto_df, columns=["Algorithm", "ProofType"])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
808,0.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [246]:
#standardized data with StandardScaler
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:1])

[[-0.10282804 -0.03823841 -0.03823596 -0.03823596 -0.03823596 -0.03823596
  -0.05411338 -0.07664017 -0.03823596 -0.05411338 -0.05411338 -0.03823596
  -0.03823596 -0.18216065 -0.05411338 -0.03823596 -0.03823596 -0.08574929
  -0.03823596 -0.10160947 -0.06632365 -0.03823596 -0.03823596 -0.1642757
  -0.03823596 -0.03823596 -0.13908716 -0.03823596 -0.03823596 -0.07664017
  -0.03823596 -0.03823596 -0.03823596 -0.03823596 -0.06632365 -0.03823596
  -0.07664017 -0.08574929 -0.07664017 -0.03823596 -0.03823596 -0.12775161
  -0.1335313  -0.13908716 -0.03823596 -0.05411338 -0.03823596 -0.06632365
  -0.1689039  -0.03823596 -0.03823596 -0.03823596 -0.07664017 -0.17342199
  -0.33468341 -0.03823596 -0.08574929 -0.06632365 -0.05411338 -0.03823596
   1.42042992 -0.06632365 -0.03823596 -0.03823596 -0.06632365 -0.06632365
  -0.03823596 -0.03823596 -0.03823596 -0.03823596 -0.03823596 -0.03823596
  -0.03823596 -0.41586681 -0.03823596 -0.19054822 -0.03823596 -0.10870529
  -0.07664017 -0.09400279 -0.03823596 -

use PCA to reduce dimensions

In [247]:
#initialized PCA model and get principal components
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(X_scaled)

In [248]:
#transform PCA data to dataframe
crypto_pca_df = pd.DataFrame(data=crypto_pca, columns=["PC 1", "PC 2", "PC 3"], index=crypto_df.index)
crypto_pca_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.208992,1.184306,-0.46
404,-0.19445,1.182384,-0.460585
808,-0.176358,0.818071,-0.345441
1337,0.301864,1.879153,-0.554624
BTC,-0.237339,-1.4359,0.099833


- create an elbow curve to find best value for K using crypto_pca_df
- run K-Means algo to predict clusters for crypto data, using crypto_pca_df
- create a new dataframe named clustered_df that includes following columns: Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply, PC 1, PC 2, PC 3, CoinName, Class

In [249]:
#create elbow curve
inertia = []
k = list(range(1,11))
#loop through k values and find inertia
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(crypto_pca_df)
    inertia.append(km.inertia_)
#use dataframe to plot elbow curve
elbow_data = {"k":k, "inertia": inertia}
elbow_df = pd.DataFrame(elbow_data)
elbow_df.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

based on the elbow curve 4 clusters appears to be the best choice

In [250]:
#function to cluster and plot dataset
def testClusterAmount(df, clusters):
    df = df.copy()
    #initialize model
    model = KMeans(n_clusters=clusters, random_state=5)
    #fit the model
    model.fit(df)
    #make predictions
    predictions = model.predict(df)
    df["class"] = model.labels_
    return df

In [251]:
#use testClusterAmount for 4 clusters
clustered_df = testClusterAmount(crypto_pca_df, 4)
clustered_df.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
42,-0.208992,1.184306,-0.46,0
404,-0.19445,1.182384,-0.460585,0
808,-0.176358,0.818071,-0.345441,0
1337,0.301864,1.879153,-0.554624,0
BTC,-0.237339,-1.4359,0.099833,3


In [252]:
#merge dataframes
clustered_df = pd.concat([crypto_df, clustered_df, coins_name], axis=1)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.208992,1.184306,-0.46,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.19445,1.182384,-0.460585,0,404Coin
808,SHA-256,PoW/PoS,0.0,0,-0.176358,0.818071,-0.345441,0,808
1337,X13,PoW/PoS,29279420000.0,314159265359,0.301864,1.879153,-0.554624,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.237339,-1.4359,0.099833,3,Bitcoin


## visualizing the results

In [253]:
#plot in 3d
fig = px.scatter_3d(clustered_df, x="PC 1", y="PC 2", z="PC 3", color="class", symbol="class", width=800, hover_name="CoinName", hover_data=["Algorithm"])
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [254]:
clustered_df.hvplot.table(columns=["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "class"])

In [272]:
#need to standardize data before graphing. divide everything by 1000? 10000? 
clustered_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", hover_cols=["CoinName"], by="class", size="TotalCoinsMined")

In [275]:
#plot in 3d
fig = px.scatter(clustered_df, x="TotalCoinsMined", y="TotalCoinSupply", color="class", width=800, hover_name="CoinName", hover_data=["Algorithm"])
fig.show()