In [1]:
#Import Dependencies

import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Data Preparation

In [2]:
#Read in CSV file

file_path = Path("Resources/crypto_data.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1252 non-null   object 
 1   CoinName         1252 non-null   object 
 2   Algorithm        1252 non-null   object 
 3   IsTrading        1252 non-null   bool   
 4   ProofType        1252 non-null   object 
 5   TotalCoinsMined  744 non-null    float64
 6   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 60.0+ KB


In [4]:
# Discard all cryptocurrencies that are not being traded. In other words, filter for currencies that are currently being traded. Once you have done this, drop the IsTrading column from the dataframe
df = df.drop(df.index[df['IsTrading'] == False]).dropna()
df = df.drop(columns=['IsTrading'])
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 685 entries, 0 to 1247
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       685 non-null    object 
 1   CoinName         685 non-null    object 
 2   Algorithm        685 non-null    object 
 3   ProofType        685 non-null    object 
 4   TotalCoinsMined  685 non-null    float64
 5   TotalCoinSupply  685 non-null    object 
dtypes: float64(1), object(5)
memory usage: 37.5+ KB


In [6]:
# Remove all rows that have at least one null value
for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 0 null values
Column TotalCoinSupply has 0 null values


In [7]:
#Filter cryptocurrencies that have been mined. That is, the totatl coins mined should be greater than 0
df = df.loc[df["TotalCoinsMined"] >= 0]
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 1247
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       684 non-null    object 
 1   CoinName         684 non-null    object 
 2   Algorithm        684 non-null    object 
 3   ProofType        684 non-null    object 
 4   TotalCoinsMined  684 non-null    float64
 5   TotalCoinSupply  684 non-null    object 
dtypes: float64(1), object(5)
memory usage: 37.4+ KB


In [9]:
# create a new DF for the cryptocurrencies names
coin_names = df['CoinName']
coin_names_df = pd.DataFrame(coin_names)
coin_names_df

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
4,808
5,EliteCoin
7,Bitcoin
...,...
1238,ZEPHYR
1242,Gapcoin
1245,Beldex
1246,Horizen


In [10]:
# Delete the CoinName from the original dataframe.
df.drop(columns = ['CoinName'], inplace=True)
df

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,4.199995e+01,42
2,404,Scrypt,PoW/PoS,1.055185e+09,532000000
4,808,SHA-256,PoW/PoS,0.000000e+00,0
5,1337,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
1238,ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Equihash,PoW,7.296538e+06,21000000


In [11]:
# Index 
df.set_index('Unnamed: 0')

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
808,SHA-256,PoW/PoS,0.000000e+00,0
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 1247
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       684 non-null    object 
 1   Algorithm        684 non-null    object 
 2   ProofType        684 non-null    object 
 3   TotalCoinsMined  684 non-null    float64
 4   TotalCoinSupply  684 non-null    object 
dtypes: float64(1), object(4)
memory usage: 32.1+ KB


In [13]:
# Your next step in data preparation is to convert the remaining features with text values, Algorithm and ProofType, into numerical data. To accomplish this task, use Pandas to create dummy variables. 
df["TotalCoinSupply"] = df["TotalCoinSupply"].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 1247
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       684 non-null    object 
 1   Algorithm        684 non-null    object 
 2   ProofType        684 non-null    object 
 3   TotalCoinsMined  684 non-null    float64
 4   TotalCoinSupply  684 non-null    float64
dtypes: float64(2), object(3)
memory usage: 32.1+ KB


In [14]:
# Drop Column "Unnamed: 0"
df = df.drop(columns=['Unnamed: 0'])

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 1247
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        684 non-null    object 
 1   ProofType        684 non-null    object 
 2   TotalCoinsMined  684 non-null    float64
 3   TotalCoinSupply  684 non-null    float64
dtypes: float64(2), object(2)
memory usage: 26.7+ KB


In [16]:
# Create Dummy Variables
df =pd.get_dummies(df, columns=["Algorithm", "ProofType"])
print(df.shape)
df.head()

(684, 107)


Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159300000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 1247
Columns: 107 entries, TotalCoinsMined to ProofType_dPoW/PoW
dtypes: float64(2), uint8(105)
memory usage: 86.2 KB


# Examine the number of rows and columns of your dataset now. How did they change?
The starting dataframe began with 1251 rows and 7 columns.  With droping the nulls and the unnecessary columns, as well as adding in the dummy variables, we end up with a dataframe that has 684 rows and 107 columns.

In [18]:
# Standardize your dataset so that columns that contain larger values do not unduly influence the outcome.
df_scaled = StandardScaler().fit_transform(df)
df_scaled

array([[-0.10312268, -0.03826639, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       [-0.07692584, -0.03826624, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       [-0.10312268, -0.03826639, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       ...,
       [-0.07878691, -0.03826599, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       [-0.10294153, -0.03826638, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394],
       [-0.10311949, -0.03826639, -0.03826394, ..., -0.03826394,
        -0.03826394, -0.03826394]])

# Dimensionality Reduction

In [None]:
# Perform dimensionality reduction with PCA. Rather than specify the number of principal components when you instantiate the PCA model, it is possible to state the desired explained variance. 

In [None]:
#How did the number of the features change?

In [None]:
# Next, further reduce the dataset dimensions with t-SNE and visually inspect the results. In order to accomplish this task, run t-SNE on the principal components: the output of the PCA transformation. 

In [None]:
#Then create a scatter plot of the t-SNE output. 

In [None]:
# Observe whether there are distinct clusters or not.

# Cluster Analysis with k-Means

In [None]:
# Create an elbow plot to identify the best number of clusters. Use a for-loop to determine the inertia for each k between 1 through 10. 

In [None]:
# Determine, if possible, where the elbow of the plot is, and at which value of k it appears.

# Recommedation

In [None]:
# Based on your findings, make a brief (1-2 sentences) recommendation to your clients. Can the cryptocurrencies be clustered together? If so, into how many clusters?