In [594]:
import pandas as pd
import numpy as np
from numpy.random import seed
from numpy.random import normal

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import pytorch_lightning as pl
import seaborn as sns
from pylab import rcParams


import matplotlib.pyplot as plt
from matplotlib import rc
from matplotlib.ticker import MaxNLocator



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from multiprocessing import cpu_count
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics.functional import accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, auc, average_precision_score
from sklearn.preprocessing import StandardScaler



from keras.layers import Conv1D, Dense, Dropout, Input, Concatenate, GlobalMaxPooling1D, MaxPooling1D
from keras.models import Model
from tensorflow import keras
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit

# from keras.optimizers import RMSprop, adam

## Importing Data

In [3]:
day1_dataset_1min = pd.read_csv('../Data Slices/5_days_timeseries_data/1min.csv')
day1_dataset_10min = pd.read_csv('../Data Slices/5_days_timeseries_data/10min.csv')
day1_dataset_30min = pd.read_csv('../Data Slices/5_days_timeseries_data/30min.csv')
day1_dataset_60min = pd.read_csv('../Data Slices/5_days_timeseries_data/60min.csv')

In [87]:
day1_dataset_60min[day1_dataset_60min['collection'] == 'zombieartist']

Unnamed: 0.1,Unnamed: 0,Datetime_updated_seconds,Price_USD,Price_Crypto,density,vertex_count,edge_count,max_diameter,max_radius,max_peripher,volume,collection,blacklisted,whitelisted
118080,0,2020-10-11 22:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,zombieartist,0,1
118081,1,2020-10-11 23:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,zombieartist,0,1
118082,2,2020-10-12 00:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,zombieartist,0,1
118083,3,2020-10-12 01:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,zombieartist,0,1
118084,4,2020-10-12 02:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,zombieartist,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118195,115,2020-10-16 17:00:00,0.106620,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
118196,116,2020-10-16 18:00:00,0.106620,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
118197,117,2020-10-16 19:00:00,0.106620,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
118198,118,2020-10-16 20:00:00,0.106620,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1


In [11]:
clustered_collections = pd.read_csv('NFT_Kmeans_Train_Val.csv')

In [581]:
clustered_collections[clustered_collections['train_val_set'] == 'Training']['blacklisted'].sum()/clustered_collections[clustered_collections['train_val_set'] == 'Training']['blacklisted'].count()

0.2787550744248985

In [701]:
cluster_4= clustered_collections[clustered_collections['kmeans_clusters'] == 4]
cluster_5= clustered_collections[clustered_collections['kmeans_clusters'] == 5]
cluster_1= clustered_collections[clustered_collections['kmeans_clusters'] == 1]
# clustered_collections[clustered_collections['train_val_set'] == 'Training']['blacklisted'].sum()/clustered_collections[clustered_collections['train_val_set'] == 'Training']['blacklisted'].count()

0.2787550744248985

In [20]:
clustered_collections.columns

Index(['collection', 'blacklisted', 'train_val_set', 'kmeans_clusters'], dtype='object')

In [23]:
training_df = clustered_collections[clustered_collections['train_val_set']=='Training']
validation_df = clustered_collections[clustered_collections['train_val_set']=='Validation']

In [35]:
validation_df

Unnamed: 0,collection,blacklisted,train_val_set,kmeans_clusters
0,1amazingbook,0,Validation,4
4,1forthebirds,0,Validation,4
5,1fungidents1,0,Validation,4
10,2cryptokingg,0,Validation,4
12,3dnanoocards,0,Validation,4
...,...,...,...,...
968,wpcwrarecard,0,Validation,5
970,wvmnftsonwax,0,Validation,5
972,xthingscards,0,Validation,1
973,xxbleetcolxx,0,Validation,1


## Model

In [429]:
from typing import Dict, List, Tuple
from enum import Enum


class DatasetType(Enum):
    TRAINING = 'Training'
    VALIDATION = 'Validation'
    
class Aggregation(Enum):
    ONE_MIN = 1
    TEN_MIN = 2
    THIRTHY_MIN = 3
    SIXTY_MIN = 4

class Collection:
    def __init__(self, name, aggregations:  Dict[Aggregation, pd.DataFrame] = dict(), blacklisted=0):
        self.name = name
        self.aggregations: Dict[Aggregation, pd.Dataframe] = aggregations
        self.blacklisted = blacklisted
    
    def get_aggregation(self, aggregation: Aggregation):
        return self.aggregations.get(aggregation)
    
    def add_aggregation(self,aggregation: Aggregation, a_df: pd.DataFrame):
        self.aggregations[aggregation] = a_df.copy()

In [None]:
class Dataset:
    def __init__(self, ds_type: DatasetType, cluster,  collections: List[Collection] = None, columns = []):
        self.collections = []
        self.ds_type = ds_type
        self.columns = columns
        
    def add(self, collection: Collection):
        self.collections.append(collection)
        
    def concat(self, aggregation):
        return pd.concat(
            [collection.get_aggregation(aggregation) for collection in self.collections], ignore_index=True)
    
    def fit(self, aggregation, scaler):
        scaler.fit(self.concat(aggregation)[self.columns])
    
    def transform(self, aggregation, scaler: StandardScaler):
        for collection in self.collections:
            all_columns = collection.get_aggregation(aggregation).copy()
            internal_df = all_columns[self.columns].copy()
            internal_df = scaler.transform(internal_df)
            collection.add_aggregation(
                aggregation, internal_df.copy())
    
    @property
    def length(self):
        return len(self.collections)
            
    def format(self):
        x_arr = []
        for agg in Aggregation:
            x = [collection.get_aggregation(agg) for collection in self.collections]
            shape = x[0].shape
            x =  np.stack(x)
            x = x.reshape(self.length, shape[0], shape[1])
            x_arr.append(x)
        return x_arr, [collection.blacklisted for collection in self.collections]
        

In [290]:
import copy

In [700]:
models = []
for cluster in list(clustered_collections.groupby(['kmeans_clusters'])):
    print(f'processing cluster {cluster[0]} with shape {cluster[1].shape}')
    cluster_number = cluster[0]
    columns = ['Price_USD', 
                                    'Price_Crypto', 
                                    'volume', 
                                    'density', 
                                    'vertex_count', 
                                    'edge_count', 
                                    'max_diameter', 
                                    'max_radius', 
                                    'max_peripher']
    training = Dataset(ds_type=DatasetType.TRAINING, cluster = cluster_number,columns=columns )
    validation = Dataset(ds_type=DatasetType.VALIDATION, 
                         cluster = cluster_number, 
                         columns = columns)
    for row in cluster[1].itertuples(index=False, name=None):
        collection = Collection(name=row[0], blacklisted=row[1])
        ds_type = row[2]
        
        for aggregation in [(Aggregation.ONE_MIN, day1_dataset_1min), 
                   (Aggregation.TEN_MIN, day1_dataset_10min), 
                   (Aggregation.THIRTHY_MIN, day1_dataset_30min),
                   (Aggregation.SIXTY_MIN, day1_dataset_60min) ]:
            collection.add_aggregation(aggregation[0], aggregation[1].loc[aggregation[1]['collection'] == collection.name].copy())
        
        
        if ds_type == DatasetType.TRAINING.value:
            training.add(copy.deepcopy(collection))
        elif ds_type == DatasetType.VALIDATION.value:
            validation.add(copy.deepcopy(collection))
    models.append(MCNNModel(training = training, validation = validation, cluster = cluster_number))

processing cluster 1 with shape (513, 4)
processing cluster 4 with shape (369, 4)
processing cluster 5 with shape (103, 4)


In [695]:
models[0].training.columns

['Price_USD',
 'Price_Crypto',
 'volume',
 'density',
 'vertex_count',
 'edge_count',
 'max_diameter',
 'max_radius',
 'max_peripher']

In [699]:
class MCNNModel:

    def __init__(self,  training: Dataset, validation: Dataset, cluster,  
                 filters = [200,200,200,200], 
                 k_sizes= [500,50,30,20],
                 batch_size = 50
                ):
        self.training = training
        self.validation = validation
        self.cluster = cluster
        self.scalers =   {agg: StandardScaler() for agg in Aggregation}
        self.filters = filters
        self.k_sizes = k_sizes
        self.batch_size = 50
        self.model = None
        self.model_hist = None
        self.scaled = False
        
    def scale(self):
        if not self.scaled:
            for agg in Aggregation:
                print('fitting aggregation', agg)
                self.training.fit(agg, self.scalers.get(agg))
                print('transforming aggregation', agg)
                self.training.transform(agg, self.scalers.get(agg))
                self.validation.transform(agg, self.scalers.get(agg))
            self.scaled = True
            
            
    def retrieve_tensor_datasets(self):
        train_x, train_y = self.training.format()
        validation_x, validation_y = self.validation.format()
        formatted_train = ({f'input{n}': data for n, data in enumerate(train_x) }, train_y)
        formatted_test = ({f'input{n}': data for n, data in enumerate(validation_x) }, validation_y)
        train_dataset = tf.data.Dataset.from_tensor_slices(formatted_train).batch(200)
        test_dataset = tf.data.Dataset.from_tensor_slices(formatted_test).batch(200)
        return train_dataset, test_dataset
    
    @property
    def shapes(self):
        return [aggregation_type.shape 
                for aggregation_type in list(self.training.collections[0].aggregations.values())]
    
    @staticmethod
    def get_base_model(shape, k_size = k_sizes[0], num_filters = filters[0]):
        print("base model shape", shape)
        input_seq = Input(shape=shape)
        nb_filters = num_filters
        convolved = Conv1D(num_filters, k_size, padding="same", activation="relu")(input_seq)
        processed = GlobalMaxPooling1D()(convolved)
        #todo: fix maxpooling
    #     processed = MaxPooling1D(pool_size=2, strides=1, padding='same')(convolved)
        compressed = Dense(100, activation="relu")(processed)
        compressed = Dropout(0.3)(compressed)
        model = Model(inputs=input_seq, outputs=compressed)
        model.summary()
        return model
    
    @staticmethod
    def main_model(shapes, filters, k_sizes):
        inputs = [Input(shape=shape, name=f'input{n}')for n, shape in enumerate(shapes)]
        sub_models = [ MCNNModel.get_base_model(shape, k_size = k_sizes[n], num_filters=filters[n]) 
                      for n, shape in enumerate(shapes)]
        print(sub_models[0].output)
        embeddings = [ model(inputs[n]) for n, model in enumerate(sub_models)]
        merged = Concatenate()(embeddings)
        #todo: tweak dense be modifiable 
        layer1 = Dense(100, activation='relu', name ='hidden_layer1')(merged)
        layer2 = Dense(50, activation='relu', name ='hidden_layer2')(layer1)
        layer3 = Dense(25, activation='relu', name ='hidden_layer3')(layer2)
        out = Dense(1, activation='sigmoid')(layer3)
        model = Model(inputs=inputs, outputs=out)
        return model
    
    def run_model(self):
        train, validation = self.retrieve_tensor_datasets()
        model_shapes = self.shapes
        filters = self.filters
        k_sizes = self.k_sizes
        es = keras.callbacks.EarlyStopping(min_delta=0.1, patience=5)
        model = MCNNModel.main_model(model_shapes, filters, k_sizes)
        model.compile(loss='binary_crossentropy', # categorical_crossentropy
                              optimizer='adam', #sgd, nadam, adam, rmsprop
                              metrics=['binary_accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),
                                       tf.keras.metrics.AUC(curve='PR')])
        model.summary()
        model_hist = model.fit(train,
                                   validation_data=validation,
                                   batch_size=self.batch_size, epochs=2000, 
                               callbacks=[es]
                              )
        self.model = model
        self.model_hist = model_hist
        return self
    
    def predict(self):
        val_x, val_y = self.validation.format()
        if self.model:
            result = self.model.predict(val_x)
            return [result[0] for result in results], val_y
        else:
            return [], []

In [600]:
def pr_auc(actual, scores):
    precision, recall, thresholds = precision_recall_curve(actual, scores)
    # Use AUC function to calculate the area under the curve of precision recall curve
    auc_precision_recall = auc(recall, precision)
    return auc_precision_recall


In [718]:
from concurrent.futures import ThreadPoolExecutor, as_completed

class CustomClusteredModel:
    def __init__(self, models: List[MCNNModel], thresholds=None):
        self.models = models
        self.scores = dict()
        if thresholds:
            self.thresholds = thresholds
        else:
            self.thresholds = {
                1:0.37209302325581395,
                4:0.20284697508896798,
                5:0.07042253521126761
            }
    
    def scale(self):
        for model in self.models:
            model.scale()
        
    def train_model(self):
        futures = []
        with ThreadPoolExecutor() as executor:
            for model in self.models:
                an_executor = executor.submit(model.run_model)
                futures.append(an_executor)
        
        for a_future in as_completed(futures):
            model = a_future.result()
            print(f'Modeling for cluster {model.cluster} finished')
            
    def metrics(self):
        all_predictions = []
        all_actuals = []
        all_scores = []
        
        for model in self.models:
            predictions, actuals = CustomClusteredModel.predict(model)
            print('prediction and actuals', len(predictions), len(actuals))
            all_predictions.extend([1 if prediction > self.thresholds.get(model.cluster) else 0 for prediction in predictions])
            all_scores.extend(predictions)
            all_actuals.extend(actuals)
        print(len(all_predictions))
        print(len(all_actuals))
        self.scores['accuracy'] = accuracy_score(all_actuals, all_predictions)
        self.scores['recall'] = recall_score(all_actuals, all_predictions)
        self.scores['precision'] = precision_score(all_actuals, all_predictions)
        self.scores['f1_score'] = f1_score(all_actuals, all_predictions)
        self.scores['pr_auc'] = average_precision_score(all_actuals, all_scores) #pr_auc
        self.scores['auc'] = roc_auc_score(all_actuals, all_scores)
        return self.scores
        
    @staticmethod    
    def predict(model):
        #add choose model (Andrew's clustering/classifcation) 
        val_x, val_y = model.validation.format()
#         print('val x shape', val_x)
        if model:
            results = model.model.predict(val_x)
            print('inside predict', len(result))
            return [result[0] for result in results], val_y
        else:
            return [], []
            
    

In [721]:
ccm = CustomClusteredModel(models)
ccm2 = CustomClusteredModel(models, thresholds={
            1:0.2787550744248985,
            4:0.2787550744248985,
            5:0.2787550744248985
        })

In [704]:
ccm.scale()

fitting aggregation Aggregation.ONE_MIN
transforming aggregation Aggregation.ONE_MIN
fitting aggregation Aggregation.TEN_MIN
transforming aggregation Aggregation.TEN_MIN
fitting aggregation Aggregation.THIRTHY_MIN
transforming aggregation Aggregation.THIRTHY_MIN
fitting aggregation Aggregation.SIXTY_MIN
transforming aggregation Aggregation.SIXTY_MIN
fitting aggregation Aggregation.ONE_MIN
transforming aggregation Aggregation.ONE_MIN
fitting aggregation Aggregation.TEN_MIN
transforming aggregation Aggregation.TEN_MIN
fitting aggregation Aggregation.THIRTHY_MIN
transforming aggregation Aggregation.THIRTHY_MIN
fitting aggregation Aggregation.SIXTY_MIN
transforming aggregation Aggregation.SIXTY_MIN
fitting aggregation Aggregation.ONE_MIN
transforming aggregation Aggregation.ONE_MIN
fitting aggregation Aggregation.TEN_MIN
transforming aggregation Aggregation.TEN_MIN
fitting aggregation Aggregation.THIRTHY_MIN
transforming aggregation Aggregation.THIRTHY_MIN
fitting aggregation Aggregation.S

In [705]:
ccm.train_model()

base model shape (7200, 9)
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 7200, 9)]         0         
_________________________________________________________________
conv1d (Conv1D)              (None, 7200, 200)         900200    
_________________________________________________________________
global_max_pooling1d (Global (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               20100     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
Total params: 920,300
Trainable params: 920,300
Non-trainable params: 0
_________________________________________________________________
base model shape (720, 9)
Model: "model_1"
___________________________________________________

base model shape (7200, 9)
Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 120, 9)]          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 120, 200)          36200     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
Total params: 56,300
Trainable params: 56,300
Non-trainable params: 0
_________________________________________________________________
KerasTensor(type_spec=TensorSpec(shape=(None, 100), dtype=tf.float32, name=None), name='dropou

Epoch 1/2000
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input0 (InputLayer)             [(None, 7200, 9)]    0                                            
__________________________________________________________________________________________________
input1 (InputLayer)             [(None, 720, 9)]     0                                            
__________________________________________________________________________________________________
input2 (InputLayer)             [(None, 240, 9)]     0                                            
__________________________________________________________________________________________________
input3 (InputLayer)             [(None, 120, 9)]     0                                            
_______________________________________________________________________________

Epoch 5/2000
Epoch 4/2000
Epoch 6/2000
Epoch 5/2000
Epoch 7/2000
Epoch 6/2000
Epoch 8/2000
Epoch 9/2000
Epoch 7/2000
Epoch 10/2000
Epoch 8/2000
Epoch 11/2000
Epoch 9/2000
Epoch 12/2000
Epoch 13/2000
Epoch 10/2000
Epoch 11/2000
Epoch 14/2000
Epoch 15/2000
Epoch 12/2000
Epoch 16/2000
Epoch 13/2000
Epoch 17/2000
Epoch 18/2000
Epoch 14/2000
Epoch 19/2000
Epoch 15/2000
Epoch 16/2000


Epoch 21/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Modeling for cluster 4 finished
Modeling for cluster 5 finished
Modeling for cluster 1 finished


In [712]:
ccm.metrics()

inside predict 1
prediction and actuals 126 126
inside predict 1
prediction and actuals 88 88
inside predict 1
prediction and actuals 32 32
246
246


{'accuracy': 0.6666666666666666,
 'recall': 0.6521739130434783,
 'precision': 0.4368932038834951,
 'f1_score': 0.5232558139534883,
 'pr_auc': 0.534653237728536,
 'auc': 0.7460902317202981}

In [722]:
ccm2.metrics()

inside predict 1
prediction and actuals 126 126
inside predict 1
prediction and actuals 88 88
inside predict 1
prediction and actuals 32 32
246
246


{'accuracy': 0.6504065040650406,
 'recall': 0.782608695652174,
 'precision': 0.432,
 'f1_score': 0.5567010309278351,
 'pr_auc': 0.534653237728536,
 'auc': 0.7460902317202981}

# Adding adding cluster type to model

In [657]:
input0 =  ccm.models[0].model.get_layer(name='input0').input
input1 =  ccm.models[0].model.get_layer(name='input1').input
input2 =  ccm.models[0].model.get_layer(name='input2').input
input3 =  ccm.models[0].model.get_layer(name='input3').input
input4 =  ccm.models[1].model.get_layer(name='input0').input
input5 =  ccm.models[1].model.get_layer(name='input1').input
input6 =  ccm.models[1].model.get_layer(name='input2').input
input7 =  ccm.models[1].model.get_layer(name='input3').input
input8 =  ccm.models[2].model.get_layer(name='input0').input
input9 =  ccm.models[2].model.get_layer(name='input1').input
input10 =  ccm.models[2].model.get_layer(name='input2').input
input11 =  ccm.models[2].model.get_layer(name='input3').input
cluster1 = ccm.models[0].model.get_layer(name='hidden_layer3').output
cluster2 = ccm.models[1].model.get_layer(name='hidden_layer3').output
cluster3 = ccm.models[2].model.get_layer(name='hidden_layer3').output

In [639]:
[input0, input1, input2, input3, input4, input5, input6, input7, input8, input9, input10, input11, cluster_type_input]

<keras.layers.core.Dense at 0x7fe28cf797c0>

In [655]:
input0

<KerasTensor: shape=(None, 7200, 6) dtype=float32 (created by layer 'input0')>

In [646]:
clusters_concat

<keras.layers.merge.Concatenate at 0x7fe250444520>

In [706]:
cluster_type_input = Input(shape=(1, ))
clusters_concat = tf.keras.layers.Concatenate()([cluster1, cluster2, cluster3])
final_concat = tf.keras.layers.Concatenate()([clusters_concat, cluster_type_input])
layer1 = Dense(50, activation='relu', name ='hidden_layer1')(final_concat)
layer2 = Dense(25, activation='relu', name ='hidden_layer2')(layer1)
layer3 = Dense(5, activation='relu', name ='hidden_layer3')(layer2)
out = Dense(1, activation='sigmoid')(layer3)
Model(inputs=[input0, input1, input2, input3, input4, input5, input6, input7, input8, input9, input10, input11, cluster_type_input], outputs=out )

ValueError: The name "input0" is used 3 times in the model. All layer names should be unique.

inside predict 1
prediction and actuals 126 126
inside predict 1
prediction and actuals 88 88
inside predict 1
prediction and actuals 32 32
246
246


{'accuracy': 0.6056910569105691,
 'recall': 0.7681159420289855,
 'precision': 0.39552238805970147,
 'f1_score': 0.5221674876847291,
 'pr_auc': 0.4387294881266916,
 'auc': 0.7037582903463523}

In [614]:
cluster1

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'hidden_layer3')>

In [616]:
cluster2

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'hidden_layer3')>

In [622]:
inputA = Input(shape=(25,))
inputB = Input(shape=(25,))
inputC = Input(shape=(25,))
inputD = Input(shape=(1,))

In [618]:
inputA

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'input_9')>

In [619]:
inputB

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'input_10')>

In [620]:
inputC

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'input_11')>