In [23]:
# standard libraries
import pandas as pd
import numpy as np
import os
from typing import Callable

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tabulate import tabulate

In [24]:
from os import listdir
from os.path import isfile, join

class FilePathManager:
    """Resolve a file name against the current working directory.

    Parameters
    ----------
    local_dir : str
        File (or sub-path) name relative to the directory the notebook is
        running in, e.g. ``'5days_30min.csv'``.
    """

    def __init__(self, local_dir: str):
        self.local_dir = local_dir

    def retrieve_full_path(self) -> str:
        """Return ``local_dir`` joined onto ``os.getcwd()``.

        ``os.path.join`` inserts the separator of the running platform, so the
        same call works on Windows and on POSIX systems.
        """
        return os.path.join(os.getcwd(), self.local_dir)

    def retrieve_full_path_andrew(self) -> str:
        """Backward-compatible alias of :meth:`retrieve_full_path`.

        Previously hard-coded a backslash separator, which only produced a
        usable path on Windows.
        """
        return self.retrieve_full_path()

    def retrieve_full_path_jorge(self) -> str:
        """Backward-compatible alias of :meth:`retrieve_full_path`.

        Previously hard-coded a forward-slash separator.
        """
        return self.retrieve_full_path()

In [25]:
class Loader:
    """Base class for objects that load a table and expose it as ``df``.

    Subclasses override :meth:`load_data` to populate ``self.df``.
    """

    # Class-level empty frame: lets get_df()/size() work before anything has
    # been loaded. Subclasses rebind ``self.df`` on the instance rather than
    # mutating this shared object.
    df = pd.DataFrame()

    def load_data(self, file_name):
        """Hook for subclasses; the base implementation loads nothing."""
        pass

    def get_df(self):
        """Return the currently held DataFrame (empty until data is loaded).

        Returns ``self.df`` so that it agrees with :meth:`size`, which reads
        the same attribute.
        """
        return self.df

    def size(self):
        """Return the number of rows in the held DataFrame."""
        return len(self.df)

In [26]:
class CSVLoader(Loader):
    """Loader that reads a CSV file located via a ``FilePathManager``.

    Parameters
    ----------
    file_path_manager : FilePathManager
        Supplies ``local_dir``, the file name relative to the current working
        directory.
    """

    def __init__(self, file_path_manager: FilePathManager):
        self.file_path_manager = file_path_manager

    def load_data(self, _prepare_data: Callable[[pd.DataFrame], pd.DataFrame] = None):
        """Read the CSV into ``self.df`` and optionally post-process it.

        Parameters
        ----------
        _prepare_data : callable, optional
            Function taking the freshly read DataFrame and returning the frame
            to keep. Skipped when not given.
        """
        # Build the path with the running platform's separator instead of the
        # Windows-only helper, so the notebook loads on any OS.
        csv_path = os.path.join(os.getcwd(), self.file_path_manager.local_dir)
        self.df = pd.read_csv(csv_path)
        if _prepare_data:
            self.df = _prepare_data(self.df)

    def get_df(self):
        """Return the DataFrame produced by the last :meth:`load_data` call."""
        return self.df

    def size(self):
        """Return the number of rows in the loaded DataFrame."""
        return len(self.df)

In [27]:
# Read '5days_30min.csv' (resolved against the current working directory)
# through CSVLoader and keep the resulting DataFrame in `df` for the cells below.
loader = CSVLoader(FilePathManager('5days_30min.csv'))
loader.load_data()
df = loader.get_df()

In [28]:
# Preview the first rows of the raw frame as read from the CSV.
df.head()

Unnamed: 0.1,Unnamed: 0,Datetime_updated_seconds,Price_USD,Price_Crypto,density,vertex_count,edge_count,max_diameter,max_radius,max_peripher,volume,collection,blacklisted,whitelisted
0,0,2020-12-02 12:00:00,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,12345rainbow,1,0
1,1,2020-12-02 12:30:00,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,12345rainbow,1,0
2,2,2020-12-02 13:00:00,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,12345rainbow,1,0
3,3,2020-12-02 13:30:00,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,12345rainbow,1,0
4,4,2020-12-02 14:00:00,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,12345rainbow,1,0


In [29]:
# Drop the datetime column first: `drop` returns a new frame, so the column
# assignment below no longer writes into the DataFrame cached inside `loader`
# (before this step `df` and `loader.df` are the same object).
df = df.drop('Datetime_updated_seconds', axis=1)
# Keep the values of 'Unnamed: 0' under the clearer name 'Timestep'
# (appended as the last column), then remove the original column.
df['Timestep'] = df['Unnamed: 0']
df = df.drop('Unnamed: 0', axis=1)

In [30]:
# Check the last rows: 'Timestep' should now be present as the final column.
df.tail()

Unnamed: 0,Price_USD,Price_Crypto,density,vertex_count,edge_count,max_diameter,max_radius,max_peripher,volume,collection,blacklisted,whitelisted,Timestep
236395,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1,235
236396,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1,236
236397,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1,237
236398,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1,238
236399,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1,239


In [31]:
# Reorder Timestep to the 0th column.
# Selecting by name (instead of hard-coded positions) keeps every other column
# in its current order and makes the cell safe to re-run: if Timestep is
# already first, the frame is left unchanged.
df = df[['Timestep'] + [col for col in df.columns if col != 'Timestep']]
df.tail()

Unnamed: 0,Timestep,Price_USD,Price_Crypto,density,vertex_count,edge_count,max_diameter,max_radius,max_peripher,volume,collection,blacklisted,whitelisted
236395,235,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
236396,236,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
236397,237,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
236398,238,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
236399,239,0.10662,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1


In [32]:
# Use the train-test split already established in JMP
# Note: `loader` is rebound here, so it no longer refers to the loader that
# read '5days_30min.csv'. The split table is kept in `split`.
loader = CSVLoader(FilePathManager('NFT_Kmeans_Train_Val.csv'))
loader.load_data()
split = loader.get_df()

In [33]:
# Preview the split table (one row per collection with its train/validation label).
split.head()

Unnamed: 0,collection,blacklisted,train_val_set,kmeans_clusters
0,1amazingbook,0,Validation,4
1,1bitcoinlive,0,Training,1
2,1bodyinmove1,0,Training,1
3,1coolartnft1,0,Training,4
4,1forthebirds,0,Validation,4


In [34]:
# Index labels of the rows in `split` marked as 'Training' (vectorized
# comparison instead of one .loc lookup per row).
train_idx = split.index[split['train_val_set'] == 'Training'].tolist()
train_collect = split.loc[train_idx,'collection']

# A time-series row belongs to the training set when its collection is a
# training collection; every other row goes to the test set. The membership
# mask is computed once and reused for both partitions.
train_mask = df['collection'].isin(train_collect)
X_train = df[train_mask]
X_test = df[~train_mask]

In [35]:
# Percentage of all rows in `df` that ended up in the training partition.
X_train.shape[0]/df.shape[0]*100

75.02538071065989

In [36]:
# Percentage of all rows in `df` that ended up in the test partition.
X_test.shape[0]/df.shape[0]*100

24.9746192893401

In [37]:
# Preview the first five training rows for the columns at positions 1-9
# (position 0, Timestep, is left out of this preview only).
X_train.iloc[0:5,1:10]

Unnamed: 0,Price_USD,Price_Crypto,density,vertex_count,edge_count,max_diameter,max_radius,max_peripher,volume
0,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0
1,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0
2,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0
3,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0
4,0.04015,1.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0


In [135]:
# Standardize the first ten columns (Timestep plus the nine numeric features).
# The scaler is fitted on the training rows only and then applied unchanged to
# both partitions, so no test-set statistics leak into the scaling.
sc = StandardScaler()

X_train_sc = sc.fit(X_train.iloc[:, 0:10]).transform(X_train.iloc[:, 0:10])
X_test_sc = sc.transform(X_test.iloc[:, 0:10])

In [81]:
# Fit a PCA with default settings on the standardized training features.
# `pca.fit` returns the estimator, so its repr is shown as the cell output.
pca = PCA()
pca.fit(X_train_sc)

PCA()

In [82]:
# Fraction of total variance explained by each principal component, in order.
print(pca.explained_variance_ratio_)

[0.36003862 0.17310609 0.11536595 0.10996923 0.0879347  0.06619331
 0.0516013  0.02705391 0.00737171 0.00136519]


In [97]:
# Cumulative explained_variance_ratio
# Check - same as JMP Eigenvalues output

# Running total of the per-component ratios; `.tolist()` keeps `pca_sum` a
# plain list of floats, as it was when built with an explicit loop.
pca_sum = np.cumsum(pca.explained_variance_ratio_).tolist()

# Shown as percentages rounded to three decimals.
[round(x*100,3) for x in pca_sum]

[36.004, 53.314, 64.851, 75.848, 84.641, 91.261, 96.421, 99.126, 99.863, 100.0]

In [100]:
# Check - same as JMP Eigenvectors output

# Three rows (feature names, PC1 weights, PC2 weights) transposed so that each
# output row is one feature with its weights on the first two components.
pd.DataFrame([X_train.columns[0:10], 
              pca.components_[0], pca.components_[1]]).T

Unnamed: 0,0,1,2
0,Timestep,0.024414,0.029654
1,Price_USD,-0.002287,0.705947
2,Price_Crypto,0.000872,0.706031
3,density,-0.306717,0.003456
4,vertex_count,0.445076,0.024543
5,edge_count,0.415261,0.027371
6,max_diameter,0.468026,-0.018182
7,max_radius,0.458463,-0.017921
8,max_peripher,0.264189,-0.016264
9,volume,0.189093,0.00036


Reference for the loading-matrix computation in the cells below:
https://scentellegher.github.io/machine-learning/2020/01/27/pca-loadings-sklearn.html#:~:text=To%20compute%20the%20Loading%20matrix%2C%20namely%20the%20correlations,and%20Y%20contains%20the%20standardized%20principal%20components%2C%20so

In [112]:
# Column labels for the ten principal components: 'PC1' through 'PC10'.
pc_labels = [f'PC{k}' for k in range(1, 11)]
pc_labels

['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']

In [113]:
# Check - same as JMP Loading Matrix

# Loadings: each component's weight vector scaled by the square root of that
# component's explained variance. Rows are features, columns are components.
loadings = np.sqrt(pca.explained_variance_) * pca.components_.T
loading_matrix = pd.DataFrame(
    loadings,
    index=X_train.columns[0:10],
    columns=pc_labels,
)
loading_matrix

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
Timestep,0.046325,0.039016,0.756383,-0.545939,0.076347,0.142259,-0.316344,-0.003676,0.004113,-0.001521
Price_USD,-0.004339,0.928816,-0.041991,0.002106,-0.005348,0.020932,0.002402,-0.367463,0.00419,0.000811
Price_Crypto,0.001654,0.928926,-0.039641,-0.009617,0.013741,-0.00645,-0.009137,0.367568,-0.003986,-0.000598
density,-0.581988,0.004547,0.040276,0.575115,-0.171682,-0.16974,-0.520072,-0.003199,-0.010505,0.003688
vertex_count,0.84452,0.032291,0.34362,0.208056,-0.156931,-0.224156,0.099601,-0.009489,-0.198745,-0.00202
edge_count,0.787946,0.036011,0.393632,0.286204,-0.142277,-0.274591,0.107251,0.005693,0.183881,9.8e-05
max_diameter,0.888067,-0.023922,-0.261599,-0.039905,-0.186143,0.257652,-0.180438,0.006925,0.007741,0.083937
max_radius,0.869922,-0.023579,-0.29136,-0.010767,-0.200339,0.261105,-0.206445,0.000957,0.012705,-0.08115
max_peripher,0.501291,-0.021398,-0.35811,-0.336532,0.435824,-0.51783,-0.220128,-0.014501,0.000911,-3.9e-05
volume,0.358798,0.000473,0.148341,0.480243,0.730913,0.289919,-0.017944,-0.000223,-0.004759,-0.000216


In [130]:
# Project the standardized training rows onto the principal components, keep
# the original row index, and attach the columns at positions 0, 10 and 11 of
# X_train (Timestep, collection, blacklisted) for later grouping.
X_train_pca = pd.DataFrame(
    pca.transform(X_train_sc),
    index=X_train.index,
    columns=pc_labels,
).join(X_train.iloc[:, [0, 10, 11]])
X_train_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,Timestep,collection,blacklisted
0,-0.562589,-0.378483,-1.283322,1.730460,-0.124614,-0.170998,-0.323182,0.011651,-0.012775,-0.208861,0,12345rainbow,1
1,-0.646935,-0.378216,-1.335019,1.517818,-0.472567,-0.328087,-0.318349,0.011741,-0.004705,-0.208222,1,12345rainbow,1
2,-0.646583,-0.377788,-1.324855,1.510304,-0.471392,-0.325563,-0.324705,0.011639,-0.004487,-0.208410,2,12345rainbow,1
3,-0.646230,-0.377360,-1.314691,1.502789,-0.470216,-0.323039,-0.331062,0.011537,-0.004268,-0.208598,3,12345rainbow,1
4,-0.645878,-0.376932,-1.304526,1.495275,-0.469041,-0.320516,-0.337418,0.011435,-0.004049,-0.208786,4,12345rainbow,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
236395,-0.237848,-0.231468,1.698509,-1.288002,0.286621,0.287900,0.592172,0.000990,-0.025569,-0.049118,235,zombieartist,0
236396,-0.237495,-0.231040,1.708673,-1.295517,0.287796,0.290424,0.585816,0.000888,-0.025351,-0.049306,236,zombieartist,0
236397,-0.237143,-0.230612,1.718838,-1.303031,0.288972,0.292947,0.579459,0.000786,-0.025132,-0.049493,237,zombieartist,0
236398,-0.236790,-0.230184,1.729002,-1.310545,0.290147,0.295471,0.573103,0.000684,-0.024913,-0.049681,238,zombieartist,0


In [136]:
# Same projection for the test rows, using the PCA fitted on the training
# data, with Timestep / collection / blacklisted joined back on by row index.
X_test_pca = pd.DataFrame(
    pca.transform(X_test_sc),
    index=X_test.index,
    columns=pc_labels,
).join(X_test.iloc[:, [0, 10, 11]])
X_test_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,Timestep,collection,blacklisted
240,0.061642,-0.389169,-1.366670,1.423865,0.283198,0.329647,0.205109,0.021595,0.023507,0.130318,0,1amazingbook,0
241,-0.107402,-0.389063,-1.480229,1.006095,-0.413882,0.012945,0.221131,0.021877,0.039428,0.131784,1,1amazingbook,0
242,-0.107050,-0.388635,-1.470065,0.998581,-0.412707,0.015469,0.214774,0.021775,0.039646,0.131596,2,1amazingbook,0
243,-0.106697,-0.388207,-1.459900,0.991067,-0.411532,0.017992,0.208418,0.021673,0.039865,0.131408,3,1amazingbook,0
244,-0.106345,-0.387779,-1.449736,0.983552,-0.410357,0.020516,0.202061,0.021571,0.040084,0.131220,4,1amazingbook,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
234475,-0.690220,0.741114,1.469383,-1.080625,0.276412,0.335475,0.032698,0.398944,0.025329,-0.025291,235,zamuraionwax,1
234476,-0.689867,0.741542,1.479548,-1.088139,0.277587,0.337999,0.026341,0.398842,0.025547,-0.025479,236,zamuraionwax,1
234477,-0.689515,0.741970,1.489712,-1.095653,0.278762,0.340523,0.019985,0.398740,0.025766,-0.025666,237,zamuraionwax,1
234478,-0.689162,0.742398,1.499877,-1.103168,0.279937,0.343046,0.013629,0.398638,0.025985,-0.025854,238,zamuraionwax,1


In [194]:
# Reshape PC1 to wide form: one row per collection, one column per Timestep,
# so that each collection becomes a single PC1 time series.
X_train_pc1_T = X_train_pca[['PC1', 'Timestep', 'collection', 'blacklisted']].pivot(
    index='collection', columns='Timestep', values='PC1'
)
X_train_pc1_T

Timestep,0,1,2,3,4,5,6,7,8,9,...,230,231,232,233,234,235,236,237,238,239
collection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12345rainbow,-0.562589,-0.646935,-0.646583,-0.646230,-0.645878,-0.645526,-0.645173,-0.644821,-0.644468,-0.644116,...,-0.735777,-0.735424,-0.735072,-0.734720,-0.734367,-0.734015,-0.733662,-0.733310,-0.732958,-0.732605
1bitcoinlive,-0.567567,-1.412009,-1.411656,-1.411304,-1.410952,-1.410599,-1.410247,-1.409895,-1.409542,-1.409190,...,-1.331312,-1.330960,-1.330607,-1.330255,-1.329902,-1.329550,-1.329198,-1.328845,-1.328493,-1.328140
1bodyinmove1,-0.562600,-1.407042,-1.406690,-1.406337,-1.405985,-1.405633,-1.405280,-1.404928,-1.404575,-1.404223,...,-1.326345,-1.325993,-1.325640,-1.325288,-1.324936,-1.324583,-1.324231,-1.323878,-1.323526,-1.323174
1coolartnft1,-0.562598,-0.646944,-0.646591,-0.646239,-0.645887,-0.645534,-0.645182,-0.644830,-0.644477,-0.644125,...,0.432659,0.433011,0.433364,0.433716,0.591395,-0.493631,-0.493279,-0.492927,-0.492574,-0.492222
1neuroworlds,0.779349,2.543510,-0.493206,-0.492853,-0.492501,-0.492149,-0.491796,-0.491444,-0.491092,-0.490739,...,-0.412861,-0.412509,-0.412157,-0.411804,-0.411452,-0.411099,-0.410747,-0.410395,-0.410042,-0.409690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zeugencorona,-0.562589,-0.646935,-0.646583,-0.646230,0.029378,-0.054968,0.171548,0.087202,0.087555,0.087907,...,-0.414875,-0.414522,-0.414170,-0.413818,-0.413465,-0.413113,-0.412760,-0.412408,-0.412056,-0.411703
zippergirls1,-0.562544,-1.406986,-1.406634,-1.406281,-1.405929,-1.405576,-1.405224,-1.404872,-1.404519,-1.404167,...,-1.326289,-1.325937,-1.325584,-1.325232,-1.324880,-1.324527,-1.324175,-1.323822,-1.323470,-1.323118
zlfhomedecor,1.310283,0.548350,0.548702,0.549055,0.549407,0.549759,0.550112,0.550464,0.550817,0.551169,...,-0.585170,-0.584817,-0.584465,-0.584113,-0.583760,-0.583408,-0.583056,-0.582703,-0.582351,-0.581998
zombaeseries,-0.562892,-1.407334,-1.406982,-1.406630,-1.406277,-1.405925,-1.405572,-1.405220,-1.404868,-1.404515,...,-1.326637,-1.326285,-1.325933,-1.325580,-1.325228,-1.324876,-1.324523,-1.324171,-1.323818,-1.323466


In [195]:
# Standardize each Timestep column of the training pivot.
# A later cell writes a 'cluster' column into X_train_pc1_T in place; it is
# excluded here (when present) so that re-running this cell never fits the
# scaler on cluster labels. On a first run the frame is used unchanged.
sc1 = StandardScaler()
sc1.fit(X_train_pc1_T.drop(columns='cluster', errors='ignore'))
X_train_pc1_sc = sc1.transform(X_train_pc1_T.drop(columns='cluster', errors='ignore'))

In [259]:
# Cluster the scaled per-collection PC1 series into four groups; the fixed
# random_state makes the assignment repeatable.
kmeans = KMeans(n_clusters=4, n_init=2, random_state=1)

# Fit on the training series, then label those same series.
train_clusters = kmeans.fit(X_train_pc1_sc).predict(X_train_pc1_sc)

# Store the label next to each collection and show the cluster sizes.
X_train_pc1_T['cluster'] = train_clusters
X_train_pc1_T[['cluster']].value_counts()

cluster
1          388
0          281
3           61
2            9
dtype: int64

In [261]:
# Wide-form PC1 series for the test collections: one row per collection,
# one column per Timestep.
X_test_pc1_T = X_test_pca[['PC1', 'Timestep', 'collection', 'blacklisted']].pivot(
    index='collection', columns='Timestep', values='PC1'
)
X_test_pc1_T

Timestep,0,1,2,3,4,5,6,7,8,9,...,230,231,232,233,234,235,236,237,238,239
collection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1amazingbook,0.061642,-0.107402,-0.107050,-0.106697,-0.106345,-0.105993,-0.105640,-0.105288,-0.104936,-0.104583,...,-0.527477,-0.527124,-0.526772,-0.526420,-0.526067,-0.525715,-0.525363,-0.525010,-0.524658,-0.524305
1forthebirds,-0.562605,-0.646951,-0.646599,-0.646246,-0.645894,0.029716,-0.054630,-0.054277,-0.053925,-0.053572,...,-0.420298,-0.419945,-0.419593,-0.419241,-0.418888,-0.418536,-0.418184,-0.417831,-0.417479,-0.417126
1fungidents1,-0.562598,-0.646944,-0.477196,-0.646240,-0.645888,-0.645536,-0.645183,0.064109,-0.104935,-0.104582,...,-0.600770,-0.600418,-0.600066,-0.599713,-0.599361,-0.599009,-0.598656,-0.598304,-0.597951,-0.597599
2cryptokingg,-0.562588,-0.646934,0.028677,-0.055668,-0.055316,-0.054964,-0.054611,-0.054259,-0.053907,-0.053554,...,-0.357012,-0.356660,-0.356307,-0.355955,-0.355603,-0.355250,-0.354898,-0.354546,-0.354193,-0.353841
3dnanoocards,0.061645,-0.107399,-0.107047,-0.106694,-0.106342,-0.105990,1.651188,0.550462,0.550814,0.551167,...,-0.543653,-0.543301,-0.542949,-0.542596,-0.542244,-0.541891,-0.541539,-0.541187,-0.540834,-0.540482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wpcwrarecard,-0.562576,5.285574,5.830307,5.757367,4.356944,3.995636,6.891600,4.012231,3.250299,3.165953,...,0.607448,0.607800,0.608153,0.608505,0.608857,0.609210,0.609562,0.609915,0.610267,0.610619
wvmnftsonwax,3.241069,3.443093,2.469736,2.702847,1.856216,1.856568,1.941539,1.941891,1.857545,1.857898,...,0.547350,0.547702,0.548055,0.548407,0.548759,0.549112,0.549464,0.549817,0.550169,0.550521
xthingscards,-0.562634,-1.407077,-1.406724,-1.406372,-1.406020,-1.405667,-1.405315,-1.404962,-1.404610,-1.404258,...,-1.326380,-1.326027,-1.325675,-1.325323,-1.324970,-1.324618,-1.324265,-1.323913,-1.323561,-1.323208
xxbleetcolxx,-0.562637,-1.407079,-1.406727,-1.406374,-1.406022,-1.405670,-1.405317,-1.404965,-1.404612,-1.404260,...,-1.326382,-1.326030,-1.325677,-1.325325,-1.324973,-1.324620,-1.324268,-1.323916,-1.323563,-1.323211


In [262]:
# Scale the test pivot with the scaler fitted on the training pivot.
# A 'cluster' column left behind by a previous run of this cell is excluded,
# so the number of features still matches what `sc1` was fitted on and the
# cell can be re-run without a shape error.
X_test_pc1_sc = sc1.transform(X_test_pc1_T.drop(columns='cluster', errors='ignore'))

# Label each test collection with the already-fitted k-means model, store the
# label next to the collection, and show the cluster sizes.
test_clusters = kmeans.predict(X_test_pc1_sc)
X_test_pc1_T['cluster'] = test_clusters
X_test_pc1_T.loc[:,['cluster']].value_counts()

cluster
1          126
0           88
3           26
2            6
dtype: int64