# Data Processing

In [32]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

pd.set_option('display.max_rows', None)

In [33]:
file_to_load = ("Resource/crypto_data.csv")

In [34]:
df_crypto = pd.read_csv(file_to_load)
df_crypto

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [37]:
df_crypto.dtypes

Unnamed: 0          object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [40]:
#Remove the IsTrading column and CoinName
df_crypto.drop(columns = ["IsTrading","CoinName"], inplace =True)
df_crypto

KeyError: "['IsTrading' 'CoinName'] not found in axis"

In [41]:
# Find Null values
for column in df_crypto.columns:
    print(f"Column {column} has {df_crypto[column].isnull().sum()}null values")

Column Unnamed: 0 has 0null values
Column Algorithm has 0null values
Column ProofType has 0null values
Column TotalCoinsMined has 508null values
Column TotalCoinSupply has 0null values


In [44]:
# Drop null rows
df_crypto = df_crypto.dropna()
df_crypto

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42.0
2,404,Scrypt,PoW/PoS,1055185000.0,532000000.0
4,808,SHA-256,PoW/PoS,0.0,0.0
5,1337,X13,PoW/PoS,29279420000.0,314159265359.0
7,BTC,SHA-256,PoW,17927180.0,21000000.0
8,ETH,Ethash,PoW,107684200.0,0.0
9,LTC,Scrypt,PoW,63039240.0,84000000.0
10,DASH,X11,PoW/PoS,9031294.0,22000000.0
11,XMR,CryptoNight-V7,PoW,17201140.0,0.0
12,ETC,Ethash,PoW,113359700.0,210000000.0


In [45]:
#Remove all cryto that dont have algorithm defined
df_crypto["Algorithm"].isnull().sum()

0

In [46]:
# Remove all cryptocurrencies without coins mined.
coins_mined = df_crypto['TotalCoinsMined'] > 0

df_crypto = df_crypto.loc[coins_mined]
df_crypto

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42.0
2,404,Scrypt,PoW/PoS,1055185000.0,532000000.0
5,1337,X13,PoW/PoS,29279420000.0,314159265359.0
7,BTC,SHA-256,PoW,17927180.0,21000000.0
8,ETH,Ethash,PoW,107684200.0,0.0
9,LTC,Scrypt,PoW,63039240.0,84000000.0
10,DASH,X11,PoW/PoS,9031294.0,22000000.0
11,XMR,CryptoNight-V7,PoW,17201140.0,0.0
12,ETC,Ethash,PoW,113359700.0,210000000.0
13,ZEC,Equihash,PoW,7383056.0,21000000.0


In [47]:
# Convert 'TotalCoinSupply' to float
df_crypto['TotalCoinSupply'] = df_crypto['TotalCoinSupply'].astype(float)
df_crypto.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42.0
2,404,Scrypt,PoW/PoS,1055185000.0,532000000.0
5,1337,X13,PoW/PoS,29279420000.0,314159300000.0
7,BTC,SHA-256,PoW,17927180.0,21000000.0
8,ETH,Ethash,PoW,107684200.0,0.0


In [48]:
# Create dummies variables for all of the text features, 
        
X = pd.get_dummies(df_crypto, columns=['Algorithm', 'ProofType'])

X.head()

Unnamed: 0.1,Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,42,41.99995,42.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,404,1055185000.0,532000000.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,1337,29279420000.0,314159300000.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,ETH,107684200.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Scale all the data from the X DataFrame
from sklearn.preprocessing import StandardScaler


In [57]:
crypto_scaled = StandardScaler().fit_transform(X)
print(crypto_scaled[:2])

ValueError: could not convert string to float: 'BTC'

# Reducing Data Dimensions Using PCA

In [53]:
# Initialize PCA model and reduce DF down to 3 components
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

In [54]:
# Get two principal components for the iris data
crypto_pca = pca.fit_transform(crypto_scaled)

NameError: name 'crypto_scaled' is not defined

In [52]:
# Transform PCA data to a DF
pcs_df = pd.DataFrame(
    data=crypto_pca, 
    index=crypto_df.index,
    columns = ['PC 1', 'PC 2', 'PC 3'])

pcs_df.head()

NameError: name 'crypto_pca' is not defined

# Clustering Cryptocurrencies Using K-means

In [58]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [59]:
# Store values of K to plot
inertia = []
k = list(range(1, 11))

In [60]:
# Loop through K values and find inertia
# Looking for the best K
for j in k:
    km = KMeans(n_clusters=j, random_state=0)
    
    # Fitting the data
    km.fit(pcs_df)
    
    inertia.append(km.inertia_)

NameError: name 'pcs_df' is not defined

In [61]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {'k': k, 'inertia': inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x='k', y='inertia', title='Elbow Curve', xticks=k)

ValueError: arrays must all be same length

# K = 4

In [62]:
# Initializing model with K = 4
model = KMeans(n_clusters = 4, random_state = 0)
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [63]:
# Initializing model with K = 4
model = KMeans(n_clusters = 4, random_state = 0)
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [64]:
# Get predictions
predictions = model.predict(pcs_df)
print(predictions)

NameError: name 'pcs_df' is not defined

In [65]:

# Add a new class column to pcs_df
pcs_df['class'] = model.labels_
pcs_df.head()

AttributeError: 'KMeans' object has no attribute 'labels_'

In [66]:
# Create clustered_df
clustered_df = pd.concat([crypto_df, pcs_df, coins_name], axis = 1, join='inner')
clustered_df = clustered_df[['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'PC 1', 'PC 2', 'PC 3', 'CoinName', 'class']]
clustered_df.head()

NameError: name 'crypto_df' is not defined

# Visualizing

In [67]:
# Create 3D Scatter Plot
fig = px.scatter_3d(clustered_df, 
                    x='PC 1', 
                    y='PC 2', 
                    z='PC 3',
                    hover_name='CoinName', 
                    hover_data=['Algorithm'], 
                    color="class",
                    symbol="class",
                    width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

NameError: name 'clustered_df' is not defined

In [68]:
# Create a hvplot.table
clustered_df.hvplot.table(columns=['CoinName', 
                      'Algorithm', 
                      'ProofType', 
                      'TotalCoinSupply', 
                      'TotalCoinsMined', 
                      'class'], width=800)

NameError: name 'clustered_df' is not defined

In [None]:
# Create 2D plot
clustered_df.hvplot.scatter(x='TotalCoinsMined', 
                           y='TotalCoinSupply', 
                           hover_cols=['CoinName'], 
                           by='class')