In [15]:
# Import dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [16]:
# Import dataset
file_path = "crypto_data.csv"
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


## Exploratory Data Analysis, Data Pre-Processing & Cleaning

In [17]:
# Check Data types
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [18]:
# Check the number of rows
crypto_df.count()

CoinName           1252
Algorithm          1252
IsTrading          1252
ProofType          1252
TotalCoinsMined     744
TotalCoinSupply    1252
dtype: int64

In [19]:
# Convert TotalCoinSupply to float64
crypto_df['TotalCoinSupply'] = pd.to_numeric(crypto_df['TotalCoinSupply'],errors='coerce')
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object

In [20]:
# Remove all cryptocurrencies that aren't trading
# Remove all cryptocurrencies that aren’t trading.

crypto_df = crypto_df[crypto_df['IsTrading']== True]

crypto_df.count()

CoinName           1144
Algorithm          1144
IsTrading          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1141
dtype: int64

In [21]:
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0


In [22]:
# Remove all cryptocurrencies that don't have an algorithm defined
crypto_df['Algorithm'].groupby(crypto_df['Algorithm']) \
                             .count() \
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=True) \
                             .head()

Unnamed: 0,Algorithm,count
0,1GB AES Pattern Search,1
33,Jump Consistent Hash,1
39,M7 POW,1
40,Momentum,1
87,Zhash,1


In [23]:
# Remove the "IsTrading" column
crypto_df.drop(columns=['IsTrading'], inplace=True)

crypto_df.count()

CoinName           1144
Algorithm          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1141
dtype: int64

In [24]:
# Remove all cryptocurrencies with at least one null value.
crypto_df = crypto_df.dropna()
crypto_df.count()

CoinName           685
Algorithm          685
ProofType          685
TotalCoinsMined    685
TotalCoinSupply    685
dtype: int64

In [25]:
# Remove all cryptocurrencies without coins mined.
crypto_df = crypto_df[crypto_df['TotalCoinsMined'] > 0]
crypto_df.count()

CoinName           532
Algorithm          532
ProofType          532
TotalCoinsMined    532
TotalCoinSupply    532
dtype: int64

In [26]:
# Store the names of all cryptocurrencies on a DataFrame named "coins_name", and use the crypto_df.index as the index for this new DataFrame
coins_name = pd.DataFrame(crypto_df['CoinName'])
coins_name.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [27]:
# Remove the coin name column
crypto_df.drop(columns=['CoinName'], inplace=True)
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159300000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


In [28]:
# Create dummy variables for all of the text features, and store the resulting data on a DataFrame named X

# Copy the current DataFrame to a new one called "X"
X = crypto_df.copy()

# Encode the text columns
le = LabelEncoder()

# Transform the text columns
X['Algorithm'] = le.fit_transform(X['Algorithm'])
X['ProofType'] = le.fit_transform(X['ProofType'])

X.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,52,15,41.99995,42.0
404,52,15,1055185000.0,532000000.0
1337,66,15,29279420000.0,314159300000.0
BTC,47,12,17927180.0,21000000.0
ETH,20,12,107684200.0,0.0


In [29]:
# Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame.

data_scaler = StandardScaler()

# Fit
X_scaled = data_scaler.fit_transform(X)

## Reducing Data Dimensions Using PCA

In [30]:
# Reduce the dimensions of the X DataFrame down to three principal components

# Initialize the PCA model
pca = PCA(n_components=3)

# Get three principal components
X_pca = pca.fit_transform(X_scaled)

print(pca.explained_variance_ratio_)

[0.4350467  0.28670411 0.20538956]


In [31]:
# Create a new DataFrame that uses crypto_df.index as the index for this new DataFrame.
pcs_df = pd.DataFrame(data=X_pca, index=X.index, columns=["PC1", "PC2", "PC3"])
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.417875,0.810296,0.372138
404,-0.396564,0.815135,0.373256
1337,3.124076,2.20978,0.504335
BTC,-0.192083,0.016266,-0.07291
ETH,-0.044116,-1.167492,1.012525


## Clustering Cryptocurrencies using K-Means

In [32]:
# Create an elbow curve to find the best value for K and use the pcs_df DataFrame

# Create an empty list to hold inertia values
inertia = []

# Store a range of K values that want to test
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
   km = KMeans(n_clusters=i, random_state=0)
   km.fit(pcs_df)
   inertia.append(km.inertia_)

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k)

In [34]:
# Once you define the best value for K, run the K-means algorithm to predict the K clusters for the cryptocurrencies' data.

# Create a new DataFrame called "clustered_df"
clustered_df = X.merge(pcs_df, left_index=True, right_index=True)         \
              .merge(coins_name, left_index=True, right_index=True)

# Create a KMeans model with 5 clusters
model = KMeans(n_clusters=5, random_state=5).fit(pcs_df)

# Calculate predicted values.
y_pred = model.predict(pcs_df)

# Join to original
clustered_df['Class'] = model.labels_
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
42,52,15,41.99995,42.0,-0.417875,0.810296,0.372138,42 Coin,0
404,52,15,1055185000.0,532000000.0,-0.396564,0.815135,0.373256,404Coin,0
1337,66,15,29279420000.0,314159300000.0,3.124076,2.20978,0.504335,EliteCoin,4
BTC,47,12,17927180.0,21000000.0,-0.192083,0.016266,-0.07291,Bitcoin,0
ETH,20,12,107684200.0,0.0,-0.044116,-1.167492,1.012525,Ethereum,3


## Visualizing Results

In [40]:
# Create a 3D Scatter Plot using Plotly Express to plot the clusters using the clustered_df DataFrame.

# Plotting the clusters with three features
fig = px.scatter_3d(clustered_df, x="PC1", y="PC2", z="PC3", color="Class", symbol="Class", hover_name="CoinName", hover_data=["Algorithm"], width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [41]:
# Use hvplot.table to create a data table with all the current tradable crypto currencies.

cols = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class']

clustered_table = clustered_df.hvplot.table(cols, width=600)

clustered_table

In [42]:
# Create a scatter plot using hvplot.scatter to present the clustered data about cryptocurrencies
cluster_plot = clustered_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by="Class", hover_cols=["CoinName"], rot=45, title="Cryptocurrencies Mined vs. Supply")

cluster_plot