In [129]:
# Initial imports
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
import hvplot.pandas
import re
from sklearn.preprocessing import StandardScaler

In [130]:
# Load data
file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.dtypes, type(crypto_df)

(CoinName            object
 Algorithm           object
 IsTrading             bool
 ProofType           object
 TotalCoinsMined    float64
 TotalCoinSupply     object
 dtype: object,
 pandas.core.frame.DataFrame)

In [131]:
# Removing characters and spacing
# Code Template: https://stackoverflow.com/questions/50444346/fast-punctuation-removal-with-pandas
# https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].str.replace(r'[^\w\s]+', '')
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].str.replace(' ', '')
print(type(crypto_df))

<class 'pandas.core.frame.DataFrame'>


In [132]:
# Converting TotalCoinSupply to float64
crypto_df[['TotalCoinSupply']] = crypto_df[['TotalCoinSupply']].astype(np.float64)

In [133]:
# Remove all cryptocurrencies that aren’t trading
# Code Template: https://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
crypto_df.drop(crypto_df[crypto_df.IsTrading == False].index, inplace=True)

In [134]:
# Remove all cryptocurrencies that don’t have an algorithm defined
crypto_df.drop(crypto_df[(crypto_df.Algorithm == '')].index, inplace=True)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0


In [135]:
# Remove the IsTrading column
crypto_df.drop(columns=['IsTrading'],axis=1, inplace=True)
# crypto_df.head()
# crypto_df.isna().sum()
# print(len(crypto_df))
crypto_df.dtypes

CoinName            object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object

In [136]:
# Remove all cryptocurrencies with at least one null value
# Code Template: https://stackoverflow.com/questions/21800169/python-pandas-get-index-of-rows-which-column-matches-certain-value

# crypto_df.isna().sum()

null_coins = crypto_df.index[crypto_df['TotalCoinsMined'].isna() == True].tolist()
# print(null_coins)

crypto_df.drop(null_coins, inplace=True)

# crypto_df.isna().sum()

In [137]:
# Remove all cryptocurrencies without coins mined
not_mined = crypto_df.index[crypto_df['TotalCoinsMined'] == 0].tolist()

crypto_df.drop(not_mined, inplace=True)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,31400000000000.0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,PoW,107684200.0,0.0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000.0
DASH,Dash,X11,PoW/PoS,9031294.0,22000000.0
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0.0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000.0
ZEC,ZCash,Equihash,PoW,7383056.0,21000000.0


In [138]:
# Store the names of all cryptocurrencies on a DataFramed named coins_name,
# and use the crypto_df.index as the index for this new DataFrame
coin_names_df = pd.DataFrame(crypto_df['CoinName'])
# type(coin_names_df), coin_names_df.head(10)

In [139]:
# Remove the CoinName column
crypto_df.drop(columns=['CoinName'], axis=1, inplace=True)
# crypto_df.tail()
crypto_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object

In [140]:
# Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X
X = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])
X.head()
# X.dtypes

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,31400000000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [141]:
# Use the StandardScaler to standardize all of the data from the X DataFrame
X_scaled = StandardScaler().fit_transform(X)

In [142]:
# Initialize PCA model
pca = PCA(n_components=3)

In [143]:
# Get two principal components for the iris data.
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[-0.32757709,  1.02667949, -0.49470308],
       [-0.31465367,  1.02711768, -0.49498984],
       [ 2.39299623,  1.62487258, -0.65030489],
       ...,
       [ 0.35016252, -2.29614552,  0.38075237],
       [-0.08086531, -1.99026272,  0.25004283],
       [-0.28034656,  0.78212137, -0.23981325]])

In [144]:
# Transform PCA data
pcs_df = pd.DataFrame(
    data=X_pca, columns=['PC 1', 'PC 2', 'PC 3']
)
pcs_df= pcs_df.set_index(crypto_df.index)
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.327577,1.026679,-0.494703
404,-0.314654,1.027118,-0.49499
1337,2.392996,1.624873,-0.650305
BTC,-0.111546,-1.30156,0.197254
ETH,-0.107607,-2.014399,0.365648


In [145]:
# Find the best value for K
inertia = []
k = list(range(1, 533))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve", width=400)

In [146]:
def get_clusters(k, data):
    
    # Create a copy of the DataFrame
    data = data.copy()
    
    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)
    
    # Fit the model
    model.fit(data)
    
    # Predict clusters
    predictions = model.predict(data)
    
    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_
    
    return data

In [147]:
clustered_df = get_clusters(25, pcs_df)
clustered_df.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
42,-0.327577,1.026679,-0.494703,7
404,-0.314654,1.027118,-0.49499,7
1337,2.392996,1.624873,-0.650305,16
BTC,-0.111546,-1.30156,0.197254,21
ETH,-0.107607,-2.014399,0.365648,9


In [148]:
# Ensure required columns are present
# Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply, PC 1, PC 2, PC 3, CoinName, and Class
req_data = crypto_df['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply']

clustered_df = clustered_df.join(req_data)

KeyError: ('Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply')