In [1]:
# core libraries (third-party data handling plus standard-library utilities)
import pandas as pd
import numpy as np
import os
from IPython.display import Image
#from IPython.display import clear_output
import time
#import re
import json

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tabulate import tabulate

In [2]:
from os import listdir
from os.path import isfile, join

class FilePathManager:
    """Resolve a file location against the current working directory.

    Parameters
    ----------
    local_dir : str
        Path of the target file. A relative path is resolved against the
        working directory at the time ``retrieve_full_path`` is called; an
        absolute path is returned unchanged.
    """

    def __init__(self, local_dir: str):
        self.local_dir = local_dir

    def retrieve_full_path(self):
        """Return the working directory joined with ``local_dir``.

        ``os.path.join`` is used instead of concatenating ``'/'`` by hand so
        that the platform separator is respected, no doubled separator is
        produced when the working directory is the filesystem root, and an
        absolute ``local_dir`` is not mangled by prefixing the working
        directory to it.
        """
        return os.path.join(os.getcwd(), self.local_dir)

In [3]:
class Loader:
    """Minimal base for loaders that keep their result in a DataFrame.

    ``load_data`` and ``get_df`` are placeholders here: they do nothing and
    return ``None``. The commented-out ``@abstractmethod`` markers suggest
    they are meant to be overridden by subclasses.
    """

    # Class-level default: an empty frame that is visible through every
    # instance which has not assigned its own ``df``.
    df = pd.DataFrame()

    #@abstractmethod
    def load_data(self, file_name):
        """Placeholder; performs no work and returns ``None``."""
        return None

    #@abstractmethod
    def get_df(self):
        """Placeholder; returns ``None`` in this base class."""
        return None

    def size(self):
        """Return ``len(self.df)``, i.e. the row count of the stored frame."""
        row_count = len(self.df)
        return row_count

In [4]:
from typing import Callable
 
class CSVLoader(Loader):
    """Loader that reads a (by default gzip-compressed) CSV into a DataFrame.

    Parameters
    ----------
    file_path_manager : FilePathManager
        Object whose ``retrieve_full_path()`` yields the path handed to
        ``pandas.read_csv``.
    """

    def __init__(self, file_path_manager: FilePathManager):
        self.file_path_manager = file_path_manager
        # Give every loader its own empty frame so that an instance which has
        # not loaded anything yet does not expose the class-level ``df``
        # shared through ``Loader``.
        self.df = pd.DataFrame()

    def load_data(self, _prepare_data: Callable[[pd.DataFrame], pd.DataFrame] = None,
                  compression='gzip'):
        """Read the CSV file and optionally post-process it.

        Parameters
        ----------
        _prepare_data : callable, optional
            Called with the freshly read DataFrame; its return value replaces
            the stored frame. Skipped when ``None``.
        compression : optional
            Forwarded to ``pandas.read_csv``. Defaults to ``'gzip'``, which is
            what this loader previously hard-coded.

        Returns
        -------
        None
            The result is stored on ``self.df``; use ``get_df`` to read it.
        """
        self.df = pd.read_csv(
            self.file_path_manager.retrieve_full_path(),
            compression=compression,
            low_memory=False,
        )
        # Explicit None check: a callable should not be skipped merely
        # because it happens to evaluate as falsy.
        if _prepare_data is not None:
            self.df = _prepare_data(self.df)

    def get_df(self):
        """Return the DataFrame currently held by this loader."""
        return self.df

    def size(self):
        """Return the number of rows in the held DataFrame."""
        return len(self.df)

In [5]:
def clean_data(df):
    """Parse the two timestamp columns of ``df`` into datetimes.

    ``Datetime_updated`` and ``Datetime_updated_seconds`` are overwritten in
    place with the result of ``pd.to_datetime``; the same DataFrame object
    that was passed in is returned.
    """
    for timestamp_column in ('Datetime_updated', 'Datetime_updated_seconds'):
        parsed = pd.to_datetime(df[timestamp_column], infer_datetime_format=True)
        df[timestamp_column] = parsed
    return df

In [8]:
# Point a CSV loader at the compressed dataset file.
data_path = FilePathManager('Data_API.csv.gz')
loader = CSVLoader(data_path)

# Read the file and run clean_data on the freshly loaded frame.
loader.load_data(_prepare_data=clean_data)
df = loader.get_df()

In [11]:
# Read the JSON config. The context manager closes the file handle as soon
# as parsing finishes (or fails) instead of leaving it open for the rest of
# the kernel session.
with open('atomichub.config') as f:
    data = json.load(f)

# Both lists sit under data -> global, so look that section up once.
global_section = data.get('data').get('global')
# Presumably the blacklisted / whitelisted collection names - verify against
# the config file.
blist = global_section.get('col.blist')
wlist = global_section.get('col.wlist')

In [10]:
# Number of rows per value of the 'Collection' column (most frequent first,
# which is value_counts' default ordering).
collections = df.loc[:, 'Collection'].value_counts()
collections

alien.worlds      1288810
Cryptokitties     1092177
Gods-unchained     383965
stf.capcom         318737
kogsofficial       300854
                   ...   
xangrysnailx            1
21pipedreams            1
Payusnomind             1
Nft-machine             1
dogecoinmeme            1
Name: Collection, Length: 6283, dtype: int64

In [18]:
# Number of distinct collection names found in the data.
collections.shape[0]

6283

In [13]:
# Build three parallel 0/1 indicator lists, one entry per collection and in
# the same order as collections.index.
# Sets give constant-time membership tests instead of re-scanning the config
# lists once for every collection (assumes the list entries are hashable
# collection names - TODO confirm against the config file).
blocked_names = set(blist)
allowed_names = set(wlist)
collection_names = collections.index.tolist()

blacklist = [int(name in blocked_names) for name in collection_names]
whitelist = [int(name in allowed_names) for name in collection_names]
# A collection counts as "neither" only when both of its flags are 0.
neither = [
    int(not (is_black or is_white))
    for is_black, is_white in zip(blacklist, whitelist)
]

In [26]:
# Share of collections flagged as blacklisted, printed as a percentage
# rounded to two decimals.
blacklisted_share = sum(blacklist) / len(collections)
print('# blacklisted collections: ', round(blacklisted_share * 100, 2), '%')

# blacklisted collections:  4.38 %


In [27]:
# Share of collections flagged as whitelisted, printed as a percentage
# rounded to two decimals.
whitelisted_share = sum(whitelist) / len(collections)
print('# whitelisted collections: ', round(whitelisted_share * 100, 2), '%')

# whitelisted collections:  11.46 %


In [28]:
# Share of collections on neither list, printed as a percentage rounded to
# two decimals.
unlisted_share = sum(neither) / len(collections)
print('# collections not white or black listed: ', round(unlisted_share * 100, 2), '%')

# collections not white or black listed:  84.16 %


In [22]:
# One row per collection: its name followed by the three 0/1 indicator flags.
flag_columns = {
    'collection': collections.index.tolist(),
    'blacklist': blacklist,
    'whitelist': whitelist,
    'neither': neither,
}
col_lists = pd.DataFrame(flag_columns)
col_lists

Unnamed: 0,collection,blacklist,whitelist,neither
0,alien.worlds,0,1,0
1,Cryptokitties,0,0,1
2,Gods-unchained,0,0,1
3,stf.capcom,0,1,0
4,kogsofficial,0,1,0
...,...,...,...,...
6278,xangrysnailx,0,0,1
6279,21pipedreams,0,0,1
6280,Payusnomind,0,0,1
6281,Nft-machine,0,0,1


In [23]:
# Keep only the collections that carry a label, i.e. those whose 'neither'
# flag is 0 (they appear on the blacklist or the whitelist).
is_labelled = col_lists['neither'].eq(0)
col_train = col_lists.loc[is_labelled]
col_train

Unnamed: 0,collection,blacklist,whitelist,neither
0,alien.worlds,0,1,0
3,stf.capcom,0,1,0
4,kogsofficial,0,1,0
6,mlb.topps,0,1,0
10,officialhero,0,1,0
...,...,...,...,...
6215,alleniworlds,1,0,0
6219,coolasset123,1,0,0
6257,3doddities33,0,1,0
6260,alieenworlld,1,0,0
