In [114]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas
import re
from sklearn.preprocessing import StandardScaler

In [115]:
# Load data
file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.dtypes, type(crypto_df)

(Unnamed: 0          object
 CoinName            object
 Algorithm           object
 IsTrading             bool
 ProofType           object
 TotalCoinsMined    float64
 TotalCoinSupply     object
 dtype: object,
 pandas.core.frame.DataFrame)

In [116]:
# Convert TotalCoinSupply to float64

# Code Template: https://stackoverflow.com/questions/50444346/fast-punctuation-removal-with-pandas
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].str.replace(r'[^\w\s]+', '')

# crypto_df.astype({'TotalCoinSupply': 'float64'}).dtypes

# Remove all cryptocurrencies that aren’t trading
# Code Template: https://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
crypto_df.drop(crypto_df[crypto_df.IsTrading == False].index, inplace=True)

In [117]:
# Remove all cryptocurrencies that don’t have an algorithm defined
crypto_df.drop(crypto_df[(crypto_df.Algorithm == '')].index, inplace=True)

In [118]:
# Remove the IsTrading column
crypto_df.drop(columns=['IsTrading'],axis=1, inplace=True)
crypto_df.head()
# crypto_df.isna().sum()
# print(len(crypto_df))

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [119]:
# Remove all cryptocurrencies with at least one null value
# Code Template: https://stackoverflow.com/questions/21800169/python-pandas-get-index-of-rows-which-column-matches-certain-value

# crypto_df.isna().sum()

null_coins = crypto_df.index[crypto_df['TotalCoinsMined'].isna() == True].tolist()
# print(null_coins)

crypto_df.drop(null_coins, inplace=True)

# crypto_df.isna().sum()

In [120]:
# Remove all cryptocurrencies without coins mined
not_mined = crypto_df.index[crypto_df['TotalCoinsMined'] == 0].tolist()

crypto_df.drop(not_mined, inplace=True)
crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,3.14159e+16
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0
8,ETH,Ethereum,Ethash,PoW,107684200.0,0.0
9,LTC,Litecoin,Scrypt,PoW,63039240.0,84000000.0
10,DASH,Dash,X11,PoW/PoS,9031294.0,22000000.0
11,XMR,Monero,CryptoNight-V7,PoW,17201140.0,0.0
12,ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000.0
13,ZEC,ZCash,Equihash,PoW,7383056.0,21000000.0


In [121]:
# Store the names of all cryptocurrencies on a DataFramed named coins_name,
# and use the crypto_df.index as the index for this new DataFrame
coin_names_df = pd.DataFrame(crypto_df['CoinName'])
type(coin_names_df), coin_names_df.head(10)

(pandas.core.frame.DataFrame,
             CoinName
 0            42 Coin
 2            404Coin
 5          EliteCoin
 7            Bitcoin
 8           Ethereum
 9           Litecoin
 10              Dash
 11            Monero
 12  Ethereum Classic
 13             ZCash)

In [122]:
# Remove the CoinName column
crypto_df.drop(columns=['CoinName'], axis=1, inplace=True)
crypto_df.tail()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
1238,ZEPH,SHA-256,DPoS,2000000000.0,2000000000
1242,GAP,Scrypt,PoW/PoS,14931050.0,250000000
1245,BDX,CryptoNight,PoW,980222600.0,1400222610
1246,ZEN,Equihash,PoW,7296538.0,21000000
1247,XBC,Scrypt,PoS,128327.0,1000000


In [123]:
# Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X
X = pd.get_dummies(crypto_df, columns=['Unnamed: 0', 'Algorithm', 'ProofType'])
# type(X)
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Unnamed: 0_1337,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42.0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000.0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,3.14159e+16,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
# Use the StandardScaler to standardize all of the data from the X DataFrame
