In [14]:
import pandas as pd
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

In [2]:
# Load the raw cryptocurrency table and preview its first rows.
# The path is a raw string so the Windows backslashes are taken literally:
# in a normal string "\D", "\C", "\R" and "\c" are invalid escape sequences,
# which trigger a warning on current Python versions.
# NOTE(review): this path is specific to one machine's folder layout — adjust it
# (or move it to a config cell) before running elsewhere.
file_path = r"~\Documents\Data Bootcamp\Cryptocurrencies\Resources\crypto_data.csv"
df_crypto = pd.read_csv(file_path)
df_crypto.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
# Clean the raw table in a single pipeline. Each step mirrors one requirement:
#   1. use the 'Unnamed: 0' column as the row index
#   2. keep only cryptocurrencies that are trading
#   3. drop cryptocurrencies with no algorithm defined
#   4. drop the now-constant IsTrading column
#   5. drop rows containing at least one null value
#   6. drop cryptocurrencies whose TotalCoinsMined is exactly 0
#
# Step 3 used to be a bare `df_crypto.Algorithm.unique()` whose result was
# discarded, so it filtered nothing; it is now an explicit dropna on Algorithm.
# (Step 5 would remove those rows anyway, so the final result is unchanged.)
#
# The cell rebinds df_crypto, so re-running it without first re-running the
# load cell raises a KeyError: 'Unnamed: 0' is no longer a column by then.
df_crypto = (
    df_crypto
    .set_index("Unnamed: 0")
    .loc[lambda frame: frame["IsTrading"].eq(True)]
    .dropna(subset=["Algorithm"])
    .drop(columns=["IsTrading"])
    .dropna(axis=0, how="any")
    .loc[lambda frame: frame["TotalCoinsMined"].ne(0)]
)

In [4]:
# Keep the coin names in their own DataFrame (coins_name), indexed by the same
# values as df_crypto but with the index labelled 'index_col'.
coins_name = df_crypto[["CoinName"]].rename_axis("index_col")

# CoinName is a label, not a feature, so it is removed before modelling.
df_crypto = df_crypto.drop(columns=["CoinName"])

# Build the feature matrix X: the text columns Algorithm and ProofType are
# replaced by pandas category codes (one integer per distinct string).
# NOTE(review): the exercise asks for dummy variables (e.g. pd.get_dummies);
# integer codes impose an arbitrary ordering on these categories. Switching
# would change every PCA / clustering result below — confirm before changing.
X = df_crypto.assign(
    Algorithm=df_crypto["Algorithm"].astype("category").cat.codes.values,
    ProofType=df_crypto["ProofType"].astype("category").cat.codes.values,
)

# Standardize every column of X with sklearn's StandardScaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)







# Reduce Dimensionality using Principal Component Analysis

In [5]:
# Create the PCA estimator; the data will be reduced to this many components.
N_PRINCIPAL_COMPONENTS = 3
pca = PCA(n_components=N_PRINCIPAL_COMPONENTS)


In [6]:
#Fit the PCA model on the standardized feature matrix (X_scaled) and project
#the same data onto the fitted principal components. The result is a NumPy
#array with one column per component; it is displayed as the cell output.
crypto_pca = pca.fit_transform(X_scaled)
crypto_pca

array([[-0.41266543,  0.82315023,  0.35722699],
       [-0.39130558,  0.82787259,  0.35833556],
       [ 3.14315232,  2.18674766,  0.487273  ],
       ...,
       [ 0.02754817, -1.58427704,  1.41762995],
       [-0.04208592, -1.24744988,  1.09935354],
       [ 0.09716371, -0.69628935, -1.34263979]])

In [7]:
# Show the share of the total variance explained by each fitted principal component.
pca.explained_variance_ratio_

array([0.43456239, 0.28670335, 0.20580514])

In [8]:
# Wrap the PCA output in a DataFrame with one named column per component.
pc_column_names = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(crypto_pca, columns=pc_column_names)

pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.412665,0.823150,0.357227
1,-0.391306,0.827873,0.358336
2,3.143152,2.186748,0.487273
3,-0.190301,-0.003829,-0.043688
4,-0.051013,-1.161387,1.020473
...,...,...,...
528,0.560750,-2.082122,-2.376795
529,-0.409905,0.823729,0.357770
530,0.027548,-1.584277,1.417630
531,-0.042086,-1.247450,1.099354


In [9]:
# Re-label the rows of pcs_df with the df_crypto index values (the index is
# named 'index_col'), so the components can later be joined back by index.
pcs_df = pcs_df.assign(index_col=df_crypto.index.values).set_index("index_col")
pcs_df

Unnamed: 0_level_0,PC 1,PC 2,PC 3
index_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.412665,0.823150,0.357227
404,-0.391306,0.827873,0.358336
1337,3.143152,2.186748,0.487273
BTC,-0.190301,-0.003829,-0.043688
ETH,-0.051013,-1.161387,1.020473
...,...,...,...
ZEPH,0.560750,-2.082122,-2.376795
GAP,-0.409905,0.823729,0.357770
BDX,0.027548,-1.584277,1.417630
ZEN,-0.042086,-1.247450,1.099354


# K Means Clustering

In [10]:
# Elbow curve: fit K-means on pcs_df for each candidate number of clusters and
# record the inertia of each fit, then plot inertia against k to pick the best K.
k = list(range(1, 10))

# n_init is pinned to 10 (the historical scikit-learn default) because the
# default changed to 'auto' in newer releases; leaving it implicit makes the
# curve depend on the installed version and raises a FutureWarning on some.
inertia = [
    KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Create the elbow curve
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="K Means Elbow Curve")


# K = 5 is chosen as the number of clusters, based on the elbow curve above.

In [11]:
# Run K-means on pcs_df with the chosen number of clusters (K = 5) and predict
# the cluster of every cryptocurrency.
# n_init is pinned to 10 (the historical scikit-learn default) so the clustering
# does not depend on the installed version, whose default is now 'auto'.
model = KMeans(n_clusters=5, n_init=10, random_state=0)

# Fit the model on the principal components.
model.fit(pcs_df)

# Predict the cluster label of each row and print the label array.
predictions = model.predict(pcs_df)
print(predictions)

[0 0 4 0 3 0 0 3 3 3 1 3 0 1 0 0 0 0 0 0 3 4 0 0 0 0 0 3 0 0 0 0 0 0 1 3 3
 0 0 0 0 0 1 0 0 3 0 0 0 0 1 0 0 0 0 0 0 0 0 3 1 0 0 1 0 0 0 0 0 3 0 0 0 0
 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 3 0 0 1 0 0 0 4 3 1 3 0 0 3 0 1 0 0 0 1
 0 0 0 0 0 1 3 0 0 0 0 3 0 0 0 0 0 0 3 0 0 0 1 0 0 1 3 0 3 0 0 0 0 3 0 3 0
 0 0 0 0 3 0 1 0 3 0 0 0 1 0 1 1 0 0 0 0 0 3 0 0 1 0 0 0 1 0 1 3 0 0 3 0 0
 0 0 3 0 0 1 0 0 0 0 1 0 0 3 1 1 0 1 0 3 3 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
 0 0 3 0 1 0 0 1 0 0 3 3 0 0 0 0 0 0 3 1 0 0 1 0 3 0 0 3 1 3 0 3 0 0 1 0 3
 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 3 3 3 3 3 3 3 0 3 0 0 0 0 0 0 0 3 0 0 4 3
 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 3 1 1 0 3 3 0 0 3 1 3 0 0 3 0 3 0 0 0 1 0
 1 0 3 3 3 0 1 3 0 3 0 3 3 3 1 1 0 3 0 0 0 0 3 0 3 1 3 0 0 0 1 0 0 3 1 3 0
 4 0 0 3 0 3 3 1 1 0 0 0 0 3 1 0 0 1 1 3 3 0 3 3 0 0 3 1 3 1 1 0 1 0 0 1 3
 1 0 3 3 1 0 1 3 3 3 3 0 1 3 0 3 0 1 1 1 1 3 0 3 0 0 1 0 1 3 1 0 0 3 3 0 4
 1 0 1 1 0 3 1 0 0 0 0 1 3 1 3 0 0 1 3 0 3 3 3 3 0 0 3 0 3 1 0 0 1 0 0 0 0
 0 0 0 0 1 0 0 0 3 0 3 0 

In [12]:
# Assemble clustered_df by joining, on the shared index, the cleaned features
# (Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply), the principal
# components (PC 1..PC 3) and the coin names, then append the K-means labels
# as the 'class' column. 'class' is a Python keyword, hence the dict unpacking.
clustered_df = (
    df_crypto
    .join(pcs_df, how="inner")
    .join(coins_name, how="inner")
    .assign(**{"class": model.labels_})
)

clustered_df


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,class
42,Scrypt,PoW/PoS,4.199995e+01,42,-0.412665,0.823150,0.357227,42 Coin,0
404,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.391306,0.827873,0.358336,404Coin,0
1337,X13,PoW/PoS,2.927942e+10,314159265359,3.143152,2.186748,0.487273,EliteCoin,4
BTC,SHA-256,PoW,1.792718e+07,21000000,-0.190301,-0.003829,-0.043688,Bitcoin,0
ETH,Ethash,PoW,1.076842e+08,0,-0.051013,-1.161387,1.020473,Ethereum,3
...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000,0.560750,-2.082122,-2.376795,ZEPHYR,1
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.409905,0.823729,0.357770,Gapcoin,0
BDX,CryptoNight,PoW,9.802226e+08,1400222610,0.027548,-1.584277,1.417630,Beldex,3
ZEN,Equihash,PoW,7.296538e+06,21000000,-0.042086,-1.247450,1.099354,Horizen,3


In [18]:
# 3D scatter plot (Plotly Express) of the clusters in principal-component space.
# Both the colour and the marker symbol encode the 'class' column; hovering a
# point shows the coin name plus its Algorithm and ProofType.
scatter_3d_options = {
    "x": "PC 1",
    "y": "PC 2",
    "z": "PC 3",
    "color": "class",
    "symbol": "class",
    "hover_name": "CoinName",
    "hover_data": ["Algorithm", "ProofType"],
    "width": 800,
}
fig = px.scatter_3d(clustered_df, **scatter_3d_options)

# Place the legend at layout position x=0, y=1.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [21]:
# Data table (hvplot.table) of the cryptocurrencies in clustered_df, limited to
# the descriptive columns and the assigned cluster.
table_columns = [
    "CoinName",
    "Algorithm",
    "ProofType",
    "TotalCoinSupply",
    "TotalCoinsMined",
    "class",
]
clustered_df.hvplot.table(columns=table_columns)


In [22]:
# Scatter plot (hvplot.scatter) contrasting mined coins (x) with total coin
# supply (y), one series per cluster, with the coin name shown on hover.
# NOTE(review): assumes TotalCoinSupply was read as a numeric column — confirm
# its dtype, otherwise the y axis will not be a numeric scale.
scatter_options = {
    "x": "TotalCoinsMined",
    "y": "TotalCoinSupply",
    "hover_cols": ["CoinName"],
    "by": "class",
}
clustered_df.hvplot.scatter(**scatter_options)