In [72]:
# Initial imports
import pandas as pd
from pathlib import Path
import numpy as np

# Data Preparation

Read crypto_data.csv into Pandas. The dataset was obtained from CryptoCompare.

Discard all cryptocurrencies that are not being traded. In other words, filter for currencies that are currently being traded. Once you have done this, drop the IsTrading column from the dataframe.

Remove all rows that have at least one null value.

Filter for cryptocurrencies that have been mined. That is, the total coins mined should be greater than zero.

In order for your dataset to be comprehensible to a machine learning algorithm, its data should be numeric. Since the coin names do not contribute to the analysis of the data, delete the CoinName from the original dataframe.

Your next step in data preparation is to convert the remaining features with text values, Algorithm and ProofType, into numerical data. To accomplish this task, use Pandas to create dummy variables. Examine the number of rows and columns of your dataset now. How did they change?

Standardize your dataset so that columns that contain larger values do not unduly influence the outcome.

In [73]:
#loading data
file_path = Path("crypto_data.csv")
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [74]:
len(df)

1252

In [75]:
only_true_df = df.loc[df["IsTrading"] == True,:]
only_true_df.head(5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [76]:
len(only_true_df)

1144

In [77]:
istrading_drop_df = only_true_df.drop(["IsTrading"], axis='columns')
istrading_drop_df.head(5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [78]:
len(istrading_drop_df)

1144

In [79]:
#Remove all rows that have at least one null value.
istrading_drop_df.dropna(inplace=True)
istrading_drop_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [80]:
len(istrading_drop_df)

685

In [81]:
# Filter for cryptocurrencies that have been mined.
#delete the CoinName
mined_df = istrading_drop_df.loc[istrading_drop_df["TotalCoinsMined"] > 0, ["Unnamed: 0","CoinName","Algorithm","ProofType", "TotalCoinsMined","TotalCoinSupply"]]
mined_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
10,DASH,Dash,X11,PoW/PoS,9031294.0,22000000
11,XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
12,ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000
13,ZEC,ZCash,Equihash,PoW,7383056.0,21000000


In [82]:
len(mined_df)

532

In [83]:
mined_df.drop('CoinName', axis=1, inplace=True)
mined_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [46]:
mined_df["Unnamed: 0"].unique()

array(['42', '404', '1337', 'BTC', 'ETH', 'LTC', 'DASH', 'XMR', 'ETC',
       'ZEC', 'BTS', 'DGB', 'BTCD', 'XPY', 'PRC', 'KOBO', 'SPR', 'ARG',
       'AUR', 'BLU', 'XMY', 'MOON', 'ZET', 'SXC', 'QTL', 'ENRG', 'QRK',
       'RIC', 'DGC', 'BTB', 'CAT', 'CBX', 'CCN', 'CRYPT', 'CSC', 'DMD',
       'XVG', 'DVC', 'EAC', 'EFL', 'EMC2', 'EMD', 'EXCL', 'FLT', 'FRK',
       'FTC', 'GDC', 'GLC', 'GLD', 'HBN', 'HYP', 'IFC', 'IOC', 'IXC',
       'KGC', 'LKY', 'LTB', 'MAX', 'MEC', 'MED', 'MINT', 'MINC', 'MZC',
       'NAUT', 'NAV', 'NOBL', 'NMC', 'NYAN', 'OPAL', 'ORB', 'POT', 'PXC',
       'RDD', 'RPC', 'SBC', 'SMC', 'SUPER', 'SYNC', 'SYS', 'TES', 'TGC',
       'TIT', 'TOR', 'TRC', 'UNB', 'UNO', 'URO', 'USDE', 'UTC', 'VIA',
       'VRC', 'VTC', 'WDC', 'XC', 'XCR', 'XJO', 'XST', 'ZCC', 'BCN',
       'XDN', 'BURST', 'SJCX', 'MONA', 'NTRN', 'FAIR', 'NLG', 'RBY',
       'PTC', 'KORE', 'WBB', 'NOTE', 'FLO', '8BIT', 'STV', 'ABY', 'FLDC',
       'U', 'UIS', 'CYP', 'OMC', 'VTR', 'GRE', 'XCN', 'MSC', 'SOON',


In [47]:
mined_df["Algorithm"].unique()

array(['Scrypt', 'X13', 'SHA-256', 'Ethash', 'X11', 'CryptoNight-V7',
       'Equihash', 'SHA-512', 'Multiple', 'X15', 'Quark', 'Groestl',
       'PoS', 'NeoScrypt', 'SHA3', 'HybridScryptHash256', 'PHI1612',
       'Lyra2REv2', 'CryptoNight', 'Shabal256', 'Counterparty',
       'Stanford Folding', 'QuBit', 'M7 POW', 'Lyra2RE', 'QUAIT',
       'Blake2b', 'BLAKE256', '1GB AES Pattern Search', 'NIST5', 'Dagger',
       'X11GOST', 'POS 3.0', 'SHA-256D', 'Lyra2Z', 'X14', 'DPoS',
       'Dagger-Hashimoto', 'Blake2S', '536', 'Argon2d', 'Cloverhash',
       'Skein', 'Time Travel', 'Keccak', 'SkunkHash v2 Raptor',
       'VeChainThor Authority', 'Ouroboros', 'POS 2.0', 'C11',
       'SkunkHash', 'Proof-of-BibleHash', 'SHA-256 + Hive',
       'Proof-of-Authority', 'XEVAN', 'VBFT', 'IMesh', 'Green Protocol',
       'Semux BFT consensus', 'X16R', 'Tribus', 'CryptoNight Heavy',
       'Jump Consistent Hash', 'HMQ1725', 'Cryptonight-GPU', 'ECC 256K1',
       'Blake', 'Exosis', 'Equihash+Scrypt', 'Le

In [48]:
mined_df["ProofType"].unique()

array(['PoW/PoS', 'PoW', 'PoS', 'PoC', 'PoS/PoW/PoT', 'PoST', 'PoW/nPoS',
       'PoS/PoW', 'dPoW/PoW', 'PoW/PoW', 'DPoS', 'TPoS', 'PoW/PoS ',
       'Proof of Authority', 'PoW and PoS', 'POBh', 'PoW + Hive', 'PoA',
       'HPoW', 'Zero-Knowledge Proof', 'DPOS', 'Pos', 'Proof of Trust',
       'LPoS', 'PoS/LPoS'], dtype=object)

In [57]:
new_mined_df = pd.get_dummies(mined_df, columns=['Unnamed: 0', 'Algorithm','ProofType'])

In [58]:
new_mined_df.head(5)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Unnamed: 0_1337,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
len(new_mined_df)

532

In [59]:
# # Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(new_mined_df[['TotalCoinsMined', 'TotalCoinSupply']])

In [66]:
new_mined_df.columns
new_mined_df.head(5)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Unnamed: 0_1337,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# # Create a DataFrame with the transformed data
new_df= pd.DataFrame(scaled_data, columns=new_mined_df[1:])
new_df['TotalCoinsMined'] = new_mined_df['TotalCoinsMined']
new_df.head()

In [68]:
file_path = Path("new_crypto_data.csv")
new_mined_df.to_csv(file_path, index=False)

# Dimensionality Reduction

Creating dummy variables above dramatically increased the number of features in your dataset. Perform dimensionality reduction with PCA. Rather than specify the number of principal components when you instantiate the PCA model, it is possible to state the desired explained variance. For example, say that a dataset has 100 features. Using PCA(n_components=0.99) creates a model that will preserve approximately 99% of the explained variance, whether that means reducing the dataset to 80 principal components or 3. For this project, preserve 90% of the explained variance in dimensionality reduction. How did the number of the features change?

Next, further reduce the dataset dimensions with t-SNE and visually inspect the results. In order to accomplish this task, run t-SNE on the principal components: the output of the PCA transformation. Then create a scatter plot of the t-SNE output. Observe whether there are distinct clusters or not.

# Cluster Analysis with k-Means

Create an elbow plot to identify the best number of clusters. Use a for-loop to determine the inertia for each k between 1 through 10. Determine, if possible, where the elbow of the plot is, and at which value of k it appears.


# Recommendation

Based on your findings, make a brief (1-2 sentences) recommendation to your clients. Can the cryptocurrencies be clustered together? If so, into how many clusters?
