In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE



# Data Preparation

In [2]:
df=pd.read_csv('crypto_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
df.info()
# df.isnull().sum().sum() # 500 null rows that will have to be dealt with
# df.duplicated().sum() #confirm no duplictaes
#contains nulls and some floats are objects


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1252 non-null   object 
 1   CoinName         1252 non-null   object 
 2   Algorithm        1252 non-null   object 
 3   IsTrading        1252 non-null   bool   
 4   ProofType        1252 non-null   object 
 5   TotalCoinsMined  744 non-null    float64
 6   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 60.0+ KB


In [4]:
#correct data type of "TotalCoinSupply"
df["TotalCoinSupply"] = pd.to_numeric(df.TotalCoinSupply, errors='coerce')

In [5]:
#only take types that are trading and drop unnecessary columns
trading = df[df['IsTrading'] == True].drop(columns={"IsTrading", 'CoinName', 'Unnamed: 0'}).dropna()
trading

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,4.200000e+01
2,Scrypt,PoW/PoS,1.055185e+09,5.320000e+08
4,SHA-256,PoW/PoS,0.000000e+00,0.000000e+00
5,X13,PoW/PoS,2.927942e+10,3.141593e+11
7,SHA-256,PoW,1.792718e+07,2.100000e+07
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2.000000e+09
1242,Scrypt,PoW/PoS,1.493105e+07,2.500000e+08
1245,CryptoNight,PoW,9.802226e+08,1.400223e+09
1246,Equihash,PoW,7.296538e+06,2.100000e+07


In [6]:
# reduce down to types that have actually been mined
mined=trading[trading['TotalCoinsMined']>0]
mined

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,4.200000e+01
2,Scrypt,PoW/PoS,1.055185e+09,5.320000e+08
5,X13,PoW/PoS,2.927942e+10,3.141593e+11
7,SHA-256,PoW,1.792718e+07,2.100000e+07
8,Ethash,PoW,1.076842e+08,0.000000e+00
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2.000000e+09
1242,Scrypt,PoW/PoS,1.493105e+07,2.500000e+08
1245,CryptoNight,PoW,9.802226e+08,1.400223e+09
1246,Equihash,PoW,7.296538e+06,2.100000e+07


In [7]:
mined= mined.reset_index(drop=True)

In [8]:
numeric=pd.get_dummies(mined)
# print(numeric.columns)
numeric

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,4.200000e+01,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1.055185e+09,5.320000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2.927942e+10,3.141593e+11,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1.792718e+07,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.076842e+08,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,2.000000e+09,2.000000e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
528,1.493105e+07,2.500000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
529,9.802226e+08,1.400223e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
530,7.296538e+06,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


DF grew from 4 columns to 98, meaning there are a lot of different options in Algorithm(+/- 70) and ProofType(+/- 25).

In [9]:
#scale rows outside of the 0-1 scale
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric[['TotalCoinsMined', 'TotalCoinSupply']])

In [16]:
scaled_data2 = scaler.fit_transform(numeric)

In [17]:
new_df = pd.DataFrame(scaled_data)
new_df

Unnamed: 0,0,1
0,-0.117108,-0.152870
1,-0.093970,-0.145009
2,0.524946,4.489424
3,-0.116715,-0.152560
4,-0.114747,-0.152870
...,...,...
527,-0.073251,-0.123317
528,-0.116781,-0.149176
529,-0.095613,-0.132179
530,-0.116948,-0.152560


In [18]:
#add these rows to the original df with updated names
numeric['TotalCoinsMinedAdj']=new_df[0] 
numeric['TotalCoinSupplyAdj']=new_df[1]

In [19]:
numeric

Unnamed: 0,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,...,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW,TotalCoinsMinedAdj,TotalCoinSupplyAdj
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.117108,-0.152870
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.093970,-0.145009
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.524946,4.489424
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.116715,-0.152560
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.114747,-0.152870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.073251,-0.123317
528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.116781,-0.149176
529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.095613,-0.132179
530,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.116948,-0.152560


In [20]:
#clean up the stuff we dont need
numeric=numeric.drop(columns={'TotalCoinsMined', "TotalCoinSupply"})
numeric

KeyError: "['TotalCoinSupply', 'TotalCoinsMined'] not found in axis"

# Dimensionality Reduction

### PCA Model

In [14]:
# Initialize PCA model
 
pca = PCA(n_components=.90)
reduced_pca = pca.fit_transform(numeric)

In [15]:
#confirm variace
pca.explained_variance_ratio_.sum()

0.90488890007667

In [None]:
# Transform PCA data to a DataFrame
reduced_df = pd.DataFrame(
    data=reduced_pca
)
reduced_df.head()

###  t-SNE

In [None]:
# Initialize t-SNE model
tsne = TSNE(n_components=2, verbose=1, perplexity=25, n_iter=2000, learning_rate=35)
# Reduce dimensions
tsne_features = tsne.fit_transform(reduced_df)
# tsne_features = tsne.transform(reduced_df)

In [None]:
tsne.n_features_in_
# tsne.n_iter_
# tsne.kl_divergence_
# tsne.embedding_

In [None]:
tsne_df= pd.DataFrame(tsne_features)
# tsne_df["class"] = tsne.n_features_in_
tsne_df

In [None]:
# The first column of transformed features
reduced_df['x'] = tsne_df[0]

# The second column of transformed features
reduced_df['y'] = tsne_df[1]

In [None]:
reduced_df

In [None]:


plt.figure(figsize=(16,10))
plt.scatter(reduced_df['x'], reduced_df['y'], c=reduced_df['y'])
plt.show()



# Cluster Analysis with k-Means

In [None]:
model = KMeans()
model.fit(reduced_df)

In [None]:
inertia = []
k = list(range(1,20))


# Looking for the best k
for i in k:
    km = KMeans(n_clusters=i, random_state=42)
    km.fit(reduced_df)
    inertia.append(km.inertia_)

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow['Change Rate']= df_elbow["inertia"].pct_change().map('{:,.2f}'.format)

plt.figure(figsize=(16,10))
plt.plot(df_elbow['k'], df_elbow['inertia'])
plt.xticks(range(1,20))
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
df_elbow.head(20)

Based on all available information, it looks like the Crypto's will be very difficult to clump effectivly.  (If they do clump, it's into +/- 14 groups at best before the rate of change begins to normalize and the curve hits its "elbow".)