In [29]:
# standard libraries
import pandas as pd
import numpy as np
import os
from IPython.display import Image
#from IPython.display import clear_output
import time
import json
#import re

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tabulate import tabulate
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
#from community import community_louvain
from networkx.algorithms.community import greedy_modularity_communities
import community
import matplotlib.cm as cm
import matplotlib.pyplot as plt

In [4]:
from os import listdir
from os.path import isfile, join

class FilePathManager:
    """Resolve a file or directory name against the current working directory."""

    def __init__(self, local_dir: str):
        """Remember ``local_dir``, the path to resolve (normally relative)."""
        self.local_dir = local_dir

    def retrieve_full_path(self):
        """Return ``local_dir`` joined onto ``os.getcwd()``.

        ``os.path.join`` inserts the separator of the running OS instead of a
        hard-coded one, and returns ``local_dir`` unchanged when it is already
        absolute rather than gluing the working directory in front of it.
        """
        return os.path.join(os.getcwd(), self.local_dir)

In [9]:
from os import listdir
from os.path import isfile, join

class FilePathManager:
    """Resolve a file or directory name against the current working directory.

    NOTE(review): this class is defined twice in the notebook; this later
    definition is the one in effect after both cells have run.
    """

    def __init__(self, local_dir: str):
        """Remember ``local_dir``, the path to resolve (normally relative)."""
        self.local_dir = local_dir

    def retrieve_full_path(self):
        """Return ``local_dir`` joined onto ``os.getcwd()``.

        The separator comes from ``os.path.join`` so the result is valid on
        every OS; a hard-coded backslash only yields a usable path on Windows.
        An already-absolute ``local_dir`` is returned unchanged.
        """
        return os.path.join(os.getcwd(), self.local_dir)

In [6]:
class Loader:
    """Base class for objects that load data and expose it as a DataFrame."""

    # Class-level default so get_df()/size() work before anything is loaded.
    # Subclasses shadow it per instance by assigning ``self.df``.
    df = pd.DataFrame()

    def load_data(self, file_name):
        """Hook for subclasses; the base implementation loads nothing and returns None."""
        return None

    def get_df(self):
        """Return the held DataFrame (the empty class default until data is loaded).

        Previously this returned None even though ``size()`` already read
        ``self.df``; both accessors now agree on the same frame.
        """
        return self.df

    def size(self):
        """Return the number of rows in the held DataFrame."""
        return len(self.df)

In [7]:
from typing import Callable
 
class CSVLoader(Loader):
    """Loader that reads a CSV file whose location comes from a FilePathManager."""

    def __init__(self, file_path_manager: FilePathManager):
        """Keep the path manager; nothing is read until load_data() is called."""
        self.file_path_manager = file_path_manager

    def load_data(self, _prepare_data: Callable[[pd.DataFrame], pd.DataFrame] = None):
        """Read the CSV into ``self.df`` and optionally post-process it.

        _prepare_data: optional callable that receives the freshly read frame;
        whatever it returns replaces ``self.df``. Skipped when falsy.
        """
        csv_path = self.file_path_manager.retrieve_full_path()
        # Store the raw frame first so it is kept even if the hook raises.
        self.df = pd.read_csv(csv_path, low_memory=False)
        if not _prepare_data:
            return
        self.df = _prepare_data(self.df)

    def get_df(self):
        """Return the frame produced by the last load_data() call."""
        return self.df

    def size(self):
        """Return the number of rows currently held."""
        return len(self.df)

In [8]:
def clean_data(df):
    """Convert the two timestamp columns of ``df`` to datetime values.

    The columns 'Datetime_updated' and 'Datetime_updated_seconds' are parsed
    with ``pd.to_datetime`` and written back into ``df`` itself (no copy is
    made), and the same frame is returned so the function can be used as a
    load hook.

    Raises KeyError if either column is missing.
    """
    # `infer_datetime_format=True` was dropped: pandas deprecated the argument
    # in 2.0 (format inference is the default there), so passing it only
    # triggers a warning.
    for column in ('Datetime_updated', 'Datetime_updated_seconds'):
        df[column] = pd.to_datetime(df[column])
    return df

In [12]:
# Read the gzipped CSV from the working directory and parse its timestamp
# columns via the clean_data hook.
loader = CSVLoader(file_path_manager=FilePathManager(local_dir='Data_API.csv.gz'))
loader.load_data(_prepare_data=clean_data)
df = loader.get_df()

In [30]:
# get target variable labels
# The config JSON keeps the collection blacklist / whitelist under
# data -> global -> 'col.blist' / 'col.wlist'.
with open('atomichub.config') as f:  # `with` closes the handle; it used to stay open
    data = json.load(f)
global_cfg = data.get('data').get('global')
blist = global_cfg.get('col.blist')
wlist = global_cfg.get('col.wlist')

In [219]:
# Size of each label list as read from the config.
print(f"# blacklisted collections on AtomicHub: {len(blist)}")
print(f"# whitelisted collections on AtomicHub: {len(wlist)}")

# blacklisted collections on AtomicHub: 5941
# whitelisted collections on AtomicHub: 2352


In [34]:
# match target variable labels with collection names in dataset
# `gb` is otherwise only assigned in a later cell, so on a fresh kernel
# (Restart & Run All) this cell used to fail with a NameError.
gb = df.groupby('Collection')

# Sets make each membership test O(1) instead of scanning the whole list.
# assumes the config lists contain hashable entries (collection-name strings) — TODO confirm
blist_set = set(blist)
wlist_set = set(wlist)

# One 0/1 flag per collection, in the order of gb.groups.
collection_names = list(gb.groups.keys())
blacklist = [int(name in blist_set) for name in collection_names]
whitelist = [int(name in wlist_set) for name in collection_names]
neither = [int(not (b or w)) for b, w in zip(blacklist, whitelist)]

In [98]:
# One row per collection with its three 0/1 label flags.
coll_labels = pd.DataFrame(
    {
        'collection': gb.groups.keys(),
        'blacklist': blacklist,
        'whitelist': whitelist,
        'neither': neither,
    }
)
coll_labels

Unnamed: 0,collection,blacklist,whitelist,neither
0,0-domains,0,0,1
1,0x-meets-imtoken,0,0,1
2,0x001ba7dc2ddfa893c17b6b478c9853b83fc8594c,0,0,1
3,0xbitcoinmemes,0,0,1
4,0xearth,0,0,1
...,...,...,...,...
6278,zodiacstars1,0,0,1
6279,zombaeseries,0,1,0
6280,zombieartist,0,1,0
6281,zombiesperks,0,0,1


In [218]:
# Count and percentage share of each label among all collections in the dataset.
n_collections = coll_labels.shape[0]
for message, flag_column in (
    ('blacklisted collections in dataset:', 'blacklist'),
    ('\nwhitelisted collections in dataset:', 'whitelist'),
    ('\ncollections not white or black listed in dataset:', 'neither'),
):
    flagged = sum(coll_labels[flag_column])
    print(message, flagged, ",", round(flagged/n_collections*100, 1), '%')

blacklisted collections in dataset: 275 , 4.4 %

whitelisted collections in dataset: 720 , 11.5 %

collections not white or black listed in dataset: 5288 , 84.2 %


In [100]:
# Collections for model training: keep only rows that carry a label
# (blacklisted or whitelisted), i.e. whose 'neither' flag is 0.
has_label = coll_labels['neither'] == 0
coll_train = coll_labels.loc[has_label]
coll_train

Unnamed: 0,collection,blacklist,whitelist,neither
31,12345rainbow,1,0,0
40,1amazingbook,0,1,0
43,1bitcoinlive,0,1,0
46,1bodyinmove1,0,1,0
50,1coolartnft1,0,1,0
...,...,...,...,...
6272,zeugencorona,1,0,0
6274,zippergirls1,0,1,0
6275,zlfhomedecor,0,1,0
6279,zombaeseries,0,1,0


In [13]:
# Group all trades by collection name. Other cells read the collection names
# from gb.groups and fetch per-collection frames with gb.get_group().
gb = df.groupby('Collection')    

In [217]:
# len() of the groups mapping equals the number of distinct collection names.
print("total # of unique collections in dataset:", len(gb.groups))

total # of unique collections in dataset: 6283


In [64]:
# create a list of (collection name, trades DataFrame) pairs for all
# collections that are whitelisted or blacklisted
collection_dfs = []
for coll_name in coll_train['collection']:
    collection_dfs.append((coll_name, gb.get_group(coll_name)))

In [216]:
# Number of labelled collections that actually have trades in the dataset.
print(f"total # of black/whitelisted collections in dataset: {len(collection_dfs)}")

total # of black/whitelisted collections in dataset: 995


In [86]:
# Peek at the raw trades of one labelled collection: element 100 of
# collection_dfs, whose items are (name, DataFrame) tuples.
collection_dfs[100][1].head()

Unnamed: 0,Smart_contract,ID_token,Transaction_hash,Seller_address,Seller_username,Buyer_address,Buyer_username,Image_url_1,Image_url_2,Image_url_3,...,Name,Description,Collection,Market,Datetime_updated,Datetime_updated_seconds,Permanent_link,Unique_id_collection,Collection_cleaned,Category
3137139,atomicassets,1099524000000.0,,sg54o.wam,,enmbe.wam,,QmaaYjo6Nq48mZfmy53ZVqcvxZHXWUPjpduAKzQvf3gu27,https://ipfs.io/ipfs/QmaaYjo6Nq48mZfmy53ZVqcvx...,https://ipfs.atomichub.io/ipfs/QmaaYjo6Nq48mZf...,...,,,allenvvorlds,Atomic,2021-04-14,2021-04-14 22:03:14,,"('allenvvorlds', '1099523870996')",All,Other
3137154,atomicassets,1099524000000.0,,sg54o.wam,,t4mra.wam,,QmaaYjo6Nq48mZfmy53ZVqcvxZHXWUPjpduAKzQvf3gu27,https://ipfs.io/ipfs/QmaaYjo6Nq48mZfmy53ZVqcvx...,https://ipfs.atomichub.io/ipfs/QmaaYjo6Nq48mZf...,...,,,allenvvorlds,Atomic,2021-04-14,2021-04-14 22:02:58,,"('allenvvorlds', '1099523870997')",All,Other
3137224,atomicassets,1099524000000.0,,sg54o.wam,,gd5ri.wam,,QmaaYjo6Nq48mZfmy53ZVqcvxZHXWUPjpduAKzQvf3gu27,https://ipfs.io/ipfs/QmaaYjo6Nq48mZfmy53ZVqcvx...,https://ipfs.atomichub.io/ipfs/QmaaYjo6Nq48mZf...,...,,,allenvvorlds,Atomic,2021-04-14,2021-04-14 22:01:42,,"('allenvvorlds', '1099523870998')",All,Other
3139677,atomicassets,1099524000000.0,,aw4bi.wam,,tsqra.wam,,QmaaYjo6Nq48mZfmy53ZVqcvxZHXWUPjpduAKzQvf3gu27,https://ipfs.io/ipfs/QmaaYjo6Nq48mZfmy53ZVqcvx...,https://ipfs.atomichub.io/ipfs/QmaaYjo6Nq48mZf...,...,,,allenvvorlds,Atomic,2021-04-14,2021-04-14 21:07:15,,"('allenvvorlds', '1099523870989')",All,Other
3139774,atomicassets,1099524000000.0,,aw4bi.wam,,t4mra.wam,,QmaaYjo6Nq48mZfmy53ZVqcvxZHXWUPjpduAKzQvf3gu27,https://ipfs.io/ipfs/QmaaYjo6Nq48mZfmy53ZVqcvx...,https://ipfs.atomichub.io/ipfs/QmaaYjo6Nq48mZf...,...,,,allenvvorlds,Atomic,2021-04-14,2021-04-14 21:04:15,,"('allenvvorlds', '1099523870993')",All,Other


In [97]:
# Timestamp and price columns of the same sample collection.
collection_dfs[100][1].loc[:, ['Datetime_updated_seconds', 'Price_USD', 'Price_Crypto']]

Unnamed: 0,Datetime_updated_seconds,Price_USD,Price_Crypto
3137139,2021-04-14 22:03:14,5.8056,24.0
3137154,2021-04-14 22:02:58,5.8056,24.0
3137224,2021-04-14 22:01:42,5.8056,24.0
3139677,2021-04-14 21:07:15,6.2894,26.0
3139774,2021-04-14 21:04:15,5.8056,24.0
3139941,2021-04-14 20:59:59,5.8056,24.0
3139977,2021-04-14 20:59:04,5.8056,24.0
3140033,2021-04-14 20:58:04,5.8056,24.0
3140069,2021-04-14 20:57:09,5.8056,24.0
3141036,2021-04-14 20:35:32,6.0475,25.0


In [None]:
# Sale price over time for the sample collection (element 100 of collection_dfs).
# The bare `sns.lineplot()` call received no data and would only draw empty axes.
ax = sns.lineplot(data=collection_dfs[100][1], x='Datetime_updated_seconds', y='Price_USD')
ax.set(title=f"{collection_dfs[100][0]}: sale price over time",
       xlabel='Sale time', ylabel='Price (USD)');

In [67]:
def convert_to_ts(collection, time_range):
    """Aggregate one collection's trades into a regular time series.

    Parameters
    ----------
    collection : pd.DataFrame
        Trades with the columns 'Datetime_updated_seconds' (datetime values),
        'Price_USD' and 'Price_Crypto'. The frame is not modified.
    time_range : str
        Resampling frequency passed to ``DataFrame.resample`` (e.g. '1D').

    Returns
    -------
    pd.DataFrame
        Indexed by time bin, with the mean 'Price_USD' and 'Price_Crypto' per
        bin (bins without trades repeat the previous bin's prices) and
        'volume', the number of trades that fall into the bin (0 if none).
    """
    prices = collection[['Datetime_updated_seconds', 'Price_USD', 'Price_Crypto']]
    # Resampling accepts repeated timestamps, so trades sharing a timestamp are
    # binned as they are. The earlier version added one second to every
    # repeated timestamp, which could move a trade made at the very end of a
    # bin into the next one and distort both bins' volume and mean price.
    bins = prices.set_index('Datetime_updated_seconds').resample(time_range)

    ts_collection = bins.mean().ffill()
    ts_collection['volume'] = bins.size()
    return ts_collection

In [69]:
# Build one long frame holding the daily time series of every labelled collection.
timeseries = []

for raw_name, collection_df in collection_dfs:
    name = str(raw_name)
    # Look the labels up in `coll_train`; the loop used to read `col_train`,
    # a name that is never assigned in this notebook (NameError on a fresh kernel).
    labels = coll_train.loc[coll_train['collection'] == name, ['blacklist', 'whitelist']].values
    is_blacklisted, is_whitelisted = labels[0] == 1
    if not (is_blacklisted or is_whitelisted):
        continue

    # '1D' is the canonical upper-case daily alias (same bins as '1d').
    a_ts_collection = convert_to_ts(collection_df, '1D')
    a_ts_collection['collection'] = name
    # Labels stay the strings '1'/'0'. If both flags were set the whitelist
    # wins, matching the order of the two branches this replaces.
    a_ts_collection['blacklisted'] = '0' if is_whitelisted else '1'
    a_ts_collection['whitelisted'] = '1' if is_whitelisted else '0'
    timeseries.append(a_ts_collection)

# Concatenate once at the end instead of re-copying the growing frame on every
# iteration; stays None when no collection qualified.
full_dataset = pd.concat(timeseries) if timeseries else None

In [74]:
# Preview the combined time series: one row per collection and day, with mean
# prices, trade count ('volume') and the two label columns.
full_dataset.head()

Unnamed: 0_level_0,Price_USD,Price_Crypto,volume,collection,blacklisted,whitelisted
Datetime_updated_seconds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-02,0.04015,1.0,1,12345rainbow,1,0
2020-12-03,0.04015,1.0,0,12345rainbow,1,0
2020-12-04,0.036259,0.99,1,12345rainbow,1,0
2021-03-31,0.13144,0.5025,2,1amazingbook,0,1
2021-04-01,0.26135,1.0,1,1amazingbook,0,1


In [143]:
# Rows per collection in the daily series = length of its time series in days.
trading_days = full_dataset['collection'].value_counts().to_frame(name='days')
trading_days

Unnamed: 0,days
anyo.b1,302
niftywizards,300
badges.b1,299
dopestickers,299
pepe.hero,298
...,...
beautywifeee,1
leftarmover1,1
lastdaysfiat,1
aliensworldc,1


In [154]:
# Share of the white/blacklisted collections for each series length:
# index = number of days in the collection's daily series,
# value = fraction of collections with that many days.
trading_days.value_counts(normalize=True)

days
1       0.235176
2       0.052261
3       0.038191
4       0.022111
11      0.018090
          ...   
150     0.001005
148     0.001005
146     0.001005
145     0.001005
302     0.001005
Length: 187, dtype: float64

In [157]:
# Names of the collections whose daily series has exactly one row.
only_one_day = trading_days['days'] == 1
one_day_idx = trading_days.index[only_one_day]

In [158]:
# Daily rows that belong to those single-day collections.
in_one_day = full_dataset['collection'].isin(one_day_idx)
one_day_coll = full_dataset[in_one_day]
one_day_coll

Unnamed: 0_level_0,Price_USD,Price_Crypto,volume,collection,blacklisted,whitelisted
Datetime_updated_seconds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-29,0.838200,20.000000,1,1pokemenbolg,1,0
2021-04-06,0.242300,1.000000,1,3doddities33,0,1
2021-03-29,26.650000,100.000000,4,515toysinthe,0,1
2021-04-15,3.672956,15.037692,26,a11ianworlds,1,0
2021-04-14,3.345426,13.829787,47,a1ieneworlds,1,0
...,...,...,...,...,...,...
2021-03-12,2.346000,10.000000,1,wickedsketch,1,0
2020-10-17,0.038345,1.000000,1,xnxxdotcom11,1,0
2021-04-09,1.159250,5.000000,1,xthingscards,0,1
2021-03-27,2.361487,9.501812,4,yetigraffiti,0,1


In [220]:
# Each single-day collection contributes exactly one row to one_day_coll, so
# summing the '1'/'0' label strings as ints counts collections.
sum_one_day_blist = one_day_coll['blacklisted'].astype(int).sum()
sum_one_day_wlist = one_day_coll['whitelisted'].astype(int).sum()

# Denominators: all blacklisted / whitelisted collections found in the dataset.
sum_all_blist = sum(coll_labels['blacklist'])
sum_all_wlist = sum(coll_labels['whitelist'])

print(f"# blacklisted collections with only 1 day of trades: {sum_one_day_blist}")
print(f"% of all blacklisted collections with only 1 day of trades: {round(sum_one_day_blist/sum_all_blist*100,1)} %")

print(f"\n# whitelisted collections with only 1 day of trades: {sum_one_day_wlist}")
print(f"% of all whitelisted collections with only 1 day of trades: {round(sum_one_day_wlist/sum_all_wlist*100,1)} %")

# blacklisted collections with only 1 day of trades: 141
% of all blacklisted collections with only 1 day of trades: 51.3 %

# whitelisted collections with only 1 day of trades: 93
% of all whitelisted collections with only 1 day of trades: 12.9 %


In [208]:
# many of these are fraudulent: their names are knock-offs of other collection names
is_blacklisted_row = one_day_coll['blacklisted'] == '1'
one_day_coll.loc[is_blacklisted_row, 'collection'].to_numpy()

array(['1pokemenbolg', 'a11ianworlds', 'a1ieneworlds', 'a1ienlwor1ds',
       'a1ienoworlds', 'a1ienswor1ds', 'a1ienvvor1de', 'a1ienvvordds',
       'a1ienvvorlds', 'a1ienzworlds', 'aalienworlds', 'aallienworld',
       'aiiencworlds', 'aiienvvorlds', 'aiienworldss', 'aillenworlds',
       'ailllenworld', 'al1ienworl1d', 'ali1enworild', 'alieenworlds',
       'alieenworlld', 'aliemmworlds', 'aliemswworld', 'alieneworlds',
       'alienfworlds', 'aliengworlds', 'alienlworlds', 'aliennwiorld',
       'alienpaworld', 'aliensvvorld', 'alienswor1ds', 'aliensworldc',
       'aliensworlds', 'alienswworld', 'alienvvor1ds', 'alienvvorld2',
       'alienvvorlds', 'alienvvworld', 'alienwords23', 'alienworilds',
       'alienworldio', 'alienworrlds', 'alienwvorlds', 'alienwvvorld',
       'alienwworlld', 'alienxworlds', 'aliienworids', 'alleenworlds',
       'alleniworlds', 'allensworlbs', 'allenvvorlds', 'allienwulrds',
       'allienwurlds', 'alllenworlks', 'alllianworld', 'alonworldscs',
      

In [159]:
# Trade counts (volume) of collections with only 1 day of data:
# index = number of trades on that single day,
# value = how many single-day collections had that many trades.
one_day_coll['volume'].value_counts()

1       107
2        34
3        21
4        18
5         8
7         6
12        4
11        4
10        4
8         4
6         3
16        2
9         2
26        2
25        2
17        2
61        1
31        1
14        1
46        1
56        1
24        1
21        1
55        1
47        1
20        1
3354      1
Name: volume, dtype: int64

In [209]:
# Inspect the outlier: 3354 is the largest single-day trade count and occurs
# once in the saved value counts of one_day_coll['volume'].
# NOTE(review): the value is hard-coded from that output; it must be updated if the data changes.
one_day_coll[one_day_coll['volume']==3354]

Unnamed: 0_level_0,Price_USD,Price_Crypto,volume,collection,blacklisted,whitelisted
Datetime_updated_seconds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-27,3.174432,16.626586,3354,weezer,0,1
