In [1]:
#Import Dependencies

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cryptocurrency dataset (obtained from CryptoCompare) from crypto_data.csv
# into a pandas DataFrame and preview the first rows.

crypto_csv = Path('crypto_data.csv')
df = pd.read_csv(crypto_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
# Convert boolean cells (the IsTrading flags) to the strings 'TRUE' / 'FALSE'.
# The trading filter that follows compares IsTrading against the string 'TRUE'.
#
# Each column is mapped element-wise rather than going through DataFrame.applymap
# (deprecated since pandas 2.1) and a frame-wide replace keyed on True/False.
# Only genuine Python bool cells are rewritten; every other value (numbers, text,
# NaN) is returned unchanged.

bool_labels = {True: 'TRUE', False: 'FALSE'}


def _label_bool(value):
    """Return 'TRUE'/'FALSE' for a bool cell; return any other value unchanged."""
    return bool_labels[value] if isinstance(value, bool) else value


df = df.apply(lambda column: column.map(_label_bool))

In [4]:
# Keep only the cryptocurrencies that are being traded (IsTrading equals the string 'TRUE').

is_traded = df['IsTrading'].eq('TRUE')
df = df[is_traded]

In [5]:
# Remove the IsTrading column; after the trading filter every remaining row holds 'TRUE'.

df = df.drop(columns=['IsTrading'])

In [6]:
# Drop every row that has a null value in any column.

df = df.dropna(axis=0, how='any')

In [7]:
# Keep only cryptocurrencies that have been mined, i.e. TotalCoinsMined strictly greater than zero.
has_mined_coins = df['TotalCoinsMined'].gt(0)
df = df[has_mined_coins]

In [8]:
# Remove the columns that are not kept as features: CoinName and 'Unnamed: 0'.

df = df.drop(columns=['CoinName', 'Unnamed: 0'])

In [9]:
# One-hot encode the remaining text features, Algorithm and ProofType, into indicator columns.
# NOTE(review): the displayed result has both 'ProofType_PoW/PoS' and 'ProofType_PoW/PoS.1',
# which suggests two near-identical ProofType labels (e.g. stray whitespace) — confirm
# against the raw data and normalise the labels before encoding if so.

text_features = ['Algorithm', 'ProofType']
df = pd.get_dummies(df, columns=text_features)
df.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Standardize the dataset so that columns with larger values do not unduly influence the outcome.
# The scaler is fitted on df and then applied to the same frame.
scaler = StandardScaler()
scaler.fit(df)
scaled_array = scaler.transform(df)
scaled_array

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

In [11]:
# Reduce dimensionality with PCA. A fractional n_components asks PCA to keep as many
# components as are needed to explain that share (here 90%) of the variance.

from sklearn.decomposition import PCA
variance_to_keep = 0.9
pca = PCA(n_components=variance_to_keep)
df_pca = pca.fit_transform(scaled_array)

In [12]:
# Shape of the PCA output: (rows, principal components kept).
df_pca.shape

(532, 74)

In [None]:
# Further reduce the PCA output with t-SNE.
# t-SNE is stochastic, so the seed is fixed to make the embedding — and every cluster
# derived from it — reproducible across runs. The seed matches the random_state = 0
# used for KMeans in this notebook.

from sklearn.manifold import TSNE
tsne = TSNE(learning_rate = 250, random_state = 0)
tsne_features = tsne.fit_transform(df_pca)



In [None]:
# Shape of the t-SNE output array.
tsne_features.shape

In [None]:
# Convert the t-SNE numpy array into a DataFrame with one named column per output dimension.

embedding_columns = ['Feature1', 'Feature2']
tsne_df = pd.DataFrame(tsne_features, columns=embedding_columns)
tsne_df

In [None]:
# Visualize the t-SNE output as a scatter plot.
# The title and axis labels let the figure be read on its own.

fig, ax = plt.subplots()
ax.scatter(tsne_df['Feature1'], tsne_df['Feature2'])
ax.set(title='t-SNE embedding of the cryptocurrency features', xlabel='Feature1', ylabel='Feature2')
plt.show()

In [None]:
# Build the data for an elbow plot: KMeans inertia for k = 1..10.
#
# KMeans is fitted on the two t-SNE feature columns explicitly. A later cell adds a
# 'class' column to tsne_df; fitting on the whole frame would treat those cluster
# labels as a feature if this cell were re-run after that one.
# KMeans is already imported in the notebook's import cell, so it is not re-imported here.

feature_columns = ['Feature1', 'Feature2']
k = list(range(1, 11))
inertia = []

for i in k:
    km = KMeans(n_clusters = i, random_state = 0)
    km.fit(tsne_df[feature_columns])
    inertia.append(km.inertia_)
elbow_df = pd.DataFrame({'k':k, 'inertia':inertia})
elbow_df

In [None]:
# Visualize the elbow curve: KMeans inertia against the number of clusters k.
# Author's reading of the curve: the best number of clusters looks like 4.
# Markers and one tick per k make the bend easy to locate; matplotlib.pyplot is
# already imported in the notebook's import cell, so it is not re-imported here.

fig, ax = plt.subplots()
ax.plot(k, inertia, marker='o')
ax.set(title='KMeans elbow curve', xlabel='Number of clusters (k)', ylabel='Inertia', xticks=k)
plt.show()

In [None]:
# Extra work: with 4 chosen as the number of clusters, run KMeans again and store each
# row's cluster label in a 'class' column.
#
# The model is fitted and applied on the two t-SNE feature columns only, so re-running
# this cell (when 'class' already exists) does not feed the previous labels back in
# as a feature. The preview sample is seeded so the displayed rows are reproducible.

feature_columns = ['Feature1', 'Feature2']
features = tsne_df[feature_columns]
model = KMeans(n_clusters = 4, random_state = 0)
model.fit(features)
predictions = model.predict(features)
tsne_df['class'] = predictions
tsne_df.sample(5, random_state = 0)

In [None]:
# Plot the t-SNE output again, colored by 'class', to better visualize the four clusters.
# The legend maps each color to its cluster label; title and axis labels let the
# figure be read on its own.

fig, ax = plt.subplots()
points = ax.scatter(tsne_df['Feature1'], tsne_df['Feature2'], c = tsne_df['class'])
ax.legend(*points.legend_elements(), title='class')
ax.set(title='KMeans clusters on the t-SNE embedding', xlabel='Feature1', ylabel='Feature2')
plt.show()

Findings: After creating the elbow plot with KMeans, I think the best number of clusters is 4. I then ran KMeans again with n_clusters = 4, added the resulting 'class' column to the t-SNE DataFrame (tsne_df), and visualized the data with a scatter plot colored by class.

As a result, I believe the cryptocurrencies can be grouped into 4 clusters.