In [13]:
#initial imports
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## Data Preparation

Read crypto_data.csv into Pandas

In [14]:
file_path = Path("crypto_data.csv")
crypto_df = pd.read_csv(file_path)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


Discard all cryptocurrencies that are not being traded.

In other words, filter for currencies that are currently being traded.

In [15]:
crypto_df = crypto_df.loc[crypto_df["IsTrading"] != False]
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


Once you have done this, drop the IsTrading column from the dataframe

In [16]:
# Remove the IsTrading Column
crypto_df = crypto_df.drop(columns=["IsTrading"])
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


Remove all rows that have at least one null value

In [17]:
# Remove all rows with `null` values if any.
# Find null values
for column in crypto_df.columns:
    print(f"Column '{column}' has {crypto_df[column].isnull().sum()} null values.")

Column 'Unnamed: 0' has 0 null values.
Column 'CoinName' has 0 null values.
Column 'Algorithm' has 0 null values.
Column 'ProofType' has 0 null values.
Column 'TotalCoinsMined' has 459 null values.
Column 'TotalCoinSupply' has 0 null values.


In [18]:
crypto_df = crypto_df.dropna(axis=0, how='any')
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [19]:
# Run again just as a check
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values.")

Column Unnamed: 0 has 0 null values.
Column CoinName has 0 null values.
Column Algorithm has 0 null values.
Column ProofType has 0 null values.
Column TotalCoinsMined has 0 null values.
Column TotalCoinSupply has 0 null values.


In [20]:
# Remove duplicate entries if any.
# Find duplicate entries
print(f"Duplicate entries: {crypto_df.duplicated().sum()}")

Duplicate entries: 0


Filter for cryptocurrencies that have been mined. 

That is, the total coins mined should be greater than zero.

In [21]:
crypto_df = crypto_df[crypto_df['TotalCoinsMined'] != 0]
print(crypto_df)

     Unnamed: 0     CoinName    Algorithm ProofType  TotalCoinsMined  \
0            42      42 Coin       Scrypt   PoW/PoS     4.199995e+01   
2           404      404Coin       Scrypt   PoW/PoS     1.055185e+09   
5          1337    EliteCoin          X13   PoW/PoS     2.927942e+10   
7           BTC      Bitcoin      SHA-256       PoW     1.792718e+07   
8           ETH     Ethereum       Ethash       PoW     1.076842e+08   
...         ...          ...          ...       ...              ...   
1238       ZEPH       ZEPHYR      SHA-256      DPoS     2.000000e+09   
1242        GAP      Gapcoin       Scrypt   PoW/PoS     1.493105e+07   
1245        BDX       Beldex  CryptoNight       PoW     9.802226e+08   
1246        ZEN      Horizen     Equihash       PoW     7.296538e+06   
1247        XBC  BitcoinPlus       Scrypt       PoS     1.283270e+05   

     TotalCoinSupply  
0                 42  
2          532000000  
5       314159265359  
7           21000000  
8                  0

In order for your dataset to be comprehensible to a machine learning algorithm, its data should be numeric. 

Since the coin names do not contribute to the analysis of the data, delete the CoinName from the original dataframe.

In [22]:
crypto_df = crypto_df.drop(columns=["CoinName"])
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [23]:
# Well nothing was said about the unnamed column, its got non-numeric info in it, but hasn't been told to  be dropped. 
#So its going to be the index name now that it is cleaned

crypto_df = crypto_df.set_index('Unnamed: 0')
crypto_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [24]:
clean_crypto_df = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])
clean_crypto_df.head()

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Your next step in data preparation is to convert the remaining features with text values, Algorithm and ProofType, into numerical data.

To accomplish this task, use Pandas to create dummy variables. 

Examine the number of rows and columns of your dataset now. How did they change?

In [25]:
# Save the cleaned DataFrame as a `CSV` file, name it as `shopping_data_cleaned.csv`.
# Saving cleaned data
file_path = Path("cleaned_crypto_data.csv.csv")
clean_crypto_df.to_csv(file_path, index=False)


*Examine the number of rows and columns of your dataset now. How did they change?* 

**Well the number of columns grew to 100, and every column is numeric as intended. The number of rows didn't change, but they wouldn't as we are done preprocessing.**

Standardize your dataset so that columns that contain larger values do not unduly influence the outcome.

In [27]:
# Standarize data with StandarScaler
X_scaled = StandardScaler().fit_transform(clean_crypto_df)
print(X_scaled[0:5])

[[-0.11674788 -0.15286468 -0.0433555  -0.0433555  -0.0433555  -0.06137164
  -0.07523548 -0.0433555  -0.06137164 -0.06137164 -0.0433555  -0.0433555
  -0.19226279 -0.06137164 -0.09731237 -0.0433555  -0.11536024 -0.07523548
  -0.0433555  -0.0433555  -0.15176505 -0.0433555  -0.13105561 -0.0433555
  -0.0433555  -0.08695652 -0.0433555  -0.0433555  -0.0433555  -0.0433555
  -0.06137164 -0.0433555  -0.08695652 -0.08695652 -0.08695652 -0.0433555
  -0.13105561 -0.13827675 -0.13827675 -0.0433555  -0.06137164 -0.0433555
  -0.07523548 -0.1815096  -0.0433555  -0.0433555  -0.0433555  -0.07523548
  -0.15811388 -0.3145935  -0.0433555  -0.08695652 -0.07523548 -0.06137164
  -0.0433555   1.38873015 -0.0433555  -0.0433555  -0.06137164 -0.0433555
  -0.0433555  -0.0433555  -0.0433555  -0.0433555  -0.0433555  -0.0433555
  -0.0433555  -0.39836623 -0.0433555  -0.1815096  -0.0433555  -0.08695652
  -0.08695652 -0.10670145 -0.0433555  -0.0433555  -0.13105561 -0.0433555
  -0.0433555  -0.0433555  -0.0433555  -0.07523

## Dimensionality Reduction

Creating dummy variables above dramatically increased the number of features in your dataset. 

Perform dimensionality reduction with PCA.

In [28]:
# Applying PCA to reduce dimensions from 100 to 3 as a test

# Initialize PCA model
pca = PCA(n_components=3)

# Get three principal components for the crypto data.
coin_pca = pca.fit_transform(X_scaled)

In [29]:
# Transform PCA data to a DataFrame
df_coin_pca = pd.DataFrame(
    data=coin_pca, columns=["principal component 1", "principal component 2", "principal component 3"]
)
df_coin_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.340058,1.063755,-0.621328
1,-0.323369,1.063874,-0.621873
2,2.312,1.584727,-0.700179
3,-0.149067,-1.312655,0.20211
4,-0.149731,-2.007486,0.494903


In [30]:
# Fetch the explained variance
pca.explained_variance_ratio_

array([0.02737203, 0.02093415, 0.02005807])

### **Initial Sample Analysis**

**According to the explained variance, the first principal component contains approximately `3%` of the variance and the second principal component contains `2%` of the variance, and the third principal component contains `2%` as well. We have approximately `5%` of the information in the original dataset, so not good, not good at all. We will set the pca(n_components=0.9) and then determine the number of features were required.**

Rather than specify the number of principal components when you instantiate the PCA model, it is possible to state the desired explained variance. 

For example, say that a dataset has 100 features. Using PCA(n_components=0.99) creates a model that will preserve approximately 99% of the explained variance, whether that means reducing the dataset to 80 principal components or 3.

In [31]:
# New initialization params
pca = PCA(n_components=0.99)

# Get 99% explained variance for the crypto data.
coin_pca_99 = pca.fit_transform(X_scaled)

In [32]:
# Transform PCA data to a DataFrame
df_coin_pca_99 = pd.DataFrame(
    data=coin_pca_99, columns=[]
)
df_coin_pca_99.head()

ValueError: Shape of passed values is (533, 87), indices imply (533, 0)

In [33]:
# Okay well we know from the output error that we need 87 items to get 99%
# Fetch the explained variance
a = pca.explained_variance_ratio_
round(sum(a), 87)

0.9916837551800773

In [34]:
# Final initialization params we need to get to 90%
pca = PCA(n_components=0.90)

# Get 90% explained variance for the crypto data.
coin_pca_90 = pca.fit_transform(X_scaled)

In [35]:
# Fetch the explained variance
b = pca.explained_variance_ratio_
round(sum(b), 5)

0.90307

*For this project, preserve 90% of the explained variance in dimensionality reduction. How did the number of the features change?*

**So with 100 features one would think that keeping 99% wouldn't require 99 of the 100 variables, they are weighed differently, albeit each one seems to be pretty low, so requiring 87 variables to explain 99% of the data makes sense. It also made sense that accounting for 90% of the data would require a low number, we have been working with low-order numbers for the principal component so needing 4 or 5 when we typically use 2 or 3 wasn't too much of a change that would make it suspect outright.**

Next, further reduce the dataset dimensions with t-SNE and visually inspect the results.

In order to accomplish this task, run t-SNE on the principal components: the output of the PCA transformation. 


In [36]:
file_path = Path("cleaned_crypto_data.csv.csv")
new_df = pd.read_csv(file_path)

In [37]:
new_df.sample(10)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
507,3821246.0,9000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,163055100000.0,200000000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,30227750.0,78835200.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
518,140777.8,120000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
173,813092300.0,2000000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
163,20000000.0,20000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
532,128327.0,1000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
369,9399343.0,30000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216,20000000.0,20000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
225,8377873.0,144000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
# Create a new dataframe for t-SNE


In [None]:
# Okay well we know from the output error that we need 75
# Fetch the explained variance
a = pca.explained_variance_ratio_
round(sum(a), 75)

In [None]:
# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head(10)

In [None]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(dataframe_crypto)
    inertia.append(km.inertia_)

In [None]:
# Initializing model with K = 3 (since we already know there are three classes of iris plants)
model = KMeans(n_clusters=3, random_state=5)

In [None]:
#df_t_SNE = pd.read_csv(file_path)

In [None]:
#df_t_SNE.sample(10)

In [None]:
# Fitting model
model.fit(df)

In [None]:
# Initialize t-SNE model
tsne = TSNE(learning_rate=35)

In [None]:
# Reduce dimensions
tsne_features = tsne.fit_transform(df_t_SNE)

Then create a scatter plot of the t-SNE output. Observe whether there are distinct clusters or not.

## Cluster Analysis with k-Means

Create an elbow plot to identify the best number of clusters. Use a for-loop to determine the inertia for each k between 1 through 10.

In [None]:
# Initializing model with K = 3 (since we already know there are three classes of iris plants)
model = KMeans(n_clusters=4, random_state=5)

In [None]:
TotalCoinsMined

Determine, if possible, where the elbow of the plot is, and at which value of k it appears.

In [None]:
inertia = []
# Same as k = list(range(1, 11))
k = [1,2,3,4,5,6,7,8,9,10]


# Looking for the best k
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_coin_pca)
    inertia.append(km.inertia_)

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow['k'], df_elbow['inertia'])
plt.xticks(range(1,11))
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head(10)

In [None]:
# Plot the elbow curve to find the best candidate(s) for k
plt.plot(df_elbow['k'], df_elbow['inertia'])
plt.xticks(range(1,11))
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow curve for customer data')
plt.show()

In [None]:
def get_clusters(k, data):
    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Train the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_

    return data

In [None]:
clusters = get_clusters(4, df_coin_pca)

In [None]:
clusters

In [None]:
def show_clusters(df_coin_pca):
    plt.scatter(df['principal component 2'], df['principal component 3'], c=df['class'])
    plt.xlabel('principal component 2')
    plt.ylabel('principal component 3')
    plt.show()

## Recommendation

Based on your findings, make a brief (1-2 sentences) recommendation to your clients. 

Can the cryptocurrencies be clustered together? 

If so, into how many clusters?