In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
from numpy.random import normal

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import pytorch_lightning as pl
import seaborn as sns
from pylab import rcParams


import matplotlib.pyplot as plt
from matplotlib import rc
from matplotlib.ticker import MaxNLocator



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from multiprocessing import cpu_count
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics.functional import accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, auc, average_precision_score
from sklearn.preprocessing import StandardScaler



from keras.layers import Conv1D, Dense, Dropout, Input, Concatenate, GlobalMaxPooling1D, MaxPooling1D
from keras.models import Model
from tensorflow import keras
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit

# from keras.optimizers import RMSprop, adam

## Importing Data

In [2]:
day1_dataset_1min = pd.read_csv('../Data Slices/5_days_timeseries_data/1min.csv')
day1_dataset_10min = pd.read_csv('../Data Slices/5_days_timeseries_data/10min.csv')
day1_dataset_30min = pd.read_csv('../Data Slices/5_days_timeseries_data/30min.csv')
day1_dataset_60min = pd.read_csv('../Data Slices/5_days_timeseries_data/60min.csv')

In [3]:
day1_dataset_60min[day1_dataset_60min['collection'] == 'zombieartist']

Unnamed: 0.1,Unnamed: 0,Datetime_updated_seconds,Price_USD,Price_Crypto,density,vertex_count,edge_count,max_diameter,max_radius,max_peripher,volume,collection,blacklisted,whitelisted
118080,0,2020-10-11 22:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,zombieartist,0,1
118081,1,2020-10-11 23:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,zombieartist,0,1
118082,2,2020-10-12 00:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,zombieartist,0,1
118083,3,2020-10-12 01:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,zombieartist,0,1
118084,4,2020-10-12 02:00:00,0.720015,23.0,1.0,2.0,1.0,1.0,1.0,2.0,0.0,zombieartist,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118195,115,2020-10-16 17:00:00,0.106620,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
118196,116,2020-10-16 18:00:00,0.106620,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
118197,117,2020-10-16 19:00:00,0.106620,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1
118198,118,2020-10-16 20:00:00,0.106620,3.0,0.1,20.0,19.0,0.0,0.0,0.0,0.0,zombieartist,0,1


In [4]:
clustered_collections = pd.read_csv('NFT_Kmeans_Train_Val.csv')

In [5]:
clustered_collections[clustered_collections['train_val_set'] == 'Training']['blacklisted'].sum()/clustered_collections[clustered_collections['train_val_set'] == 'Training']['blacklisted'].count()

0.2787550744248985

In [6]:
cluster_4= clustered_collections[clustered_collections['kmeans_clusters'] == 4]
cluster_5= clustered_collections[clustered_collections['kmeans_clusters'] == 5]
cluster_1= clustered_collections[clustered_collections['kmeans_clusters'] == 1]
# clustered_collections[clustered_collections['train_val_set'] == 'Training']['blacklisted'].sum()/clustered_collections[clustered_collections['train_val_set'] == 'Training']['blacklisted'].count()

In [20]:
clustered_collections.columns

Index(['collection', 'blacklisted', 'train_val_set', 'kmeans_clusters'], dtype='object')

In [23]:
training_df = clustered_collections[clustered_collections['train_val_set']=='Training']
validation_df = clustered_collections[clustered_collections['train_val_set']=='Validation']

In [35]:
validation_df

Unnamed: 0,collection,blacklisted,train_val_set,kmeans_clusters
0,1amazingbook,0,Validation,4
4,1forthebirds,0,Validation,4
5,1fungidents1,0,Validation,4
10,2cryptokingg,0,Validation,4
12,3dnanoocards,0,Validation,4
...,...,...,...,...
968,wpcwrarecard,0,Validation,5
970,wvmnftsonwax,0,Validation,5
972,xthingscards,0,Validation,1
973,xxbleetcolxx,0,Validation,1


## Model

In [7]:
from typing import Dict, List, Tuple
from enum import Enum


class DatasetType(Enum):
    TRAINING = 'Training'
    VALIDATION = 'Validation'
    
class Aggregation(Enum):
    ONE_MIN = 1
    TEN_MIN = 2
    THIRTHY_MIN = 3
    SIXTY_MIN = 4

class Collection:
    def __init__(self, name, aggregations:  Dict[Aggregation, pd.DataFrame] = dict(), blacklisted=0):
        self.name = name
        self.aggregations: Dict[Aggregation, pd.Dataframe] = aggregations
        self.blacklisted = blacklisted
    
    def get_aggregation(self, aggregation: Aggregation):
        return self.aggregations.get(aggregation)
    
    def add_aggregation(self,aggregation: Aggregation, a_df: pd.DataFrame):
        self.aggregations[aggregation] = a_df.copy()

In [8]:
class Dataset:
    def __init__(self, ds_type: DatasetType, cluster,  collections: List[Collection] = None, columns = []):
        self.collections = []
        self.ds_type = ds_type
        self.columns = columns
        
    def add(self, collection: Collection):
        self.collections.append(collection)
        
    def concat(self, aggregation):
        return pd.concat(
            [collection.get_aggregation(aggregation) for collection in self.collections], ignore_index=True)
    
    def fit(self, aggregation, scaler):
        scaler.fit(self.concat(aggregation)[self.columns])
    
    def transform(self, aggregation, scaler: StandardScaler):
        for collection in self.collections:
            all_columns = collection.get_aggregation(aggregation).copy()
            internal_df = all_columns[self.columns].copy()
            internal_df = scaler.transform(internal_df)
            collection.add_aggregation(
                aggregation, internal_df.copy())
    
    @property
    def length(self):
        return len(self.collections)
            
    def format(self):
        x_arr = []
        for agg in Aggregation:
            x = [collection.get_aggregation(agg) for collection in self.collections]
            shape = x[0].shape
            x =  np.stack(x)
            x = x.reshape(self.length, shape[0], shape[1])
            x_arr.append(x)
        return x_arr, [collection.blacklisted for collection in self.collections]
        

In [9]:
import copy

In [137]:
clustered_collections

Unnamed: 0,collection,blacklisted,train_val_set,kmeans_clusters
0,1amazingbook,0,Validation,4
1,1bitcoinlive,0,Training,1
2,1bodyinmove1,0,Training,1
3,1coolartnft1,0,Training,4
4,1forthebirds,0,Validation,4
...,...,...,...,...
980,zeugencorona,1,Training,4
981,zippergirls1,0,Training,1
982,zlfhomedecor,0,Training,4
983,zombaeseries,0,Training,1


In [150]:
def create_singular_mcnn():
    training = Dataset(ds_type=DatasetType.TRAINING, cluster = None,columns=columns )
    validation = Dataset(ds_type=DatasetType.VALIDATION, 
                             cluster = None, 
                             columns = columns)
    for row in clustered_collections.itertuples(index=False, name=None):
        collection = Collection(name=row[0], blacklisted=row[1])
        ds_type = row[2]
        for aggregation in [(Aggregation.ONE_MIN, day1_dataset_1min), (Aggregation.TEN_MIN, day1_dataset_10min), 
                       (Aggregation.THIRTHY_MIN, day1_dataset_30min),(Aggregation.SIXTY_MIN, day1_dataset_60min) ]:
                collection.add_aggregation(aggregation[0], aggregation[1].loc[aggregation[1]['collection'] == collection.name].copy())
        if ds_type == DatasetType.TRAINING.value:
            training.add(copy.deepcopy(collection))
        elif ds_type == DatasetType.VALIDATION.value:
            validation.add(copy.deepcopy(collection))
    return MCNNModel(training = training, validation = validation, cluster = cluster_number)

In [151]:
mcnn_model = create_singular_mcnn()

In [155]:
mcnn_model.validation.length

246

In [204]:
models = []
for cluster in list(clustered_collections.groupby(['kmeans_clusters'])):
    print(f'processing cluster {cluster[0]} with shape {cluster[1].shape}')
    cluster_number = cluster[0]
    columns = ['Price_USD', 
                                    'Price_Crypto', 
                                    'volume', 
                                    'density', 
                                    'vertex_count', 
                                    'edge_count', 
                                    'max_diameter', 
                                    'max_radius', 
                                    'max_peripher']
    training = Dataset(ds_type=DatasetType.TRAINING, cluster = cluster_number,columns=columns )
    validation = Dataset(ds_type=DatasetType.VALIDATION, 
                         cluster = cluster_number, 
                         columns = columns)
    for row in cluster[1].itertuples(index=False, name=None):
        collection = Collection(name=row[0], blacklisted=row[1])
        ds_type = row[2]
        
        for aggregation in [(Aggregation.ONE_MIN, day1_dataset_1min), 
                   (Aggregation.TEN_MIN, day1_dataset_10min), 
                   (Aggregation.THIRTHY_MIN, day1_dataset_30min),
                   (Aggregation.SIXTY_MIN, day1_dataset_60min) ]:
            collection.add_aggregation(aggregation[0], aggregation[1].loc[aggregation[1]['collection'] == collection.name].copy())
        
        
        if ds_type == DatasetType.TRAINING.value:
            training.add(copy.deepcopy(collection))
        elif ds_type == DatasetType.VALIDATION.value:
            validation.add(copy.deepcopy(collection))
    models.append(MCNNModel(training = training, validation = validation, cluster = cluster_number))

processing cluster 1 with shape (513, 4)
processing cluster 4 with shape (369, 4)
processing cluster 5 with shape (103, 4)


In [19]:
models[0].training.columns

['Price_USD',
 'Price_Crypto',
 'volume',
 'density',
 'vertex_count',
 'edge_count',
 'max_diameter',
 'max_radius',
 'max_peripher']

In [206]:
class MCNNModel:

    def __init__(self,  training: Dataset, validation: Dataset, cluster,  
                 filters = [200,200,200,200], 
                 k_sizes= [500,50,30,20],
                 batch_size = 50
                ):
        self.training = training
        self.validation = validation
        self.cluster = cluster
        self.scalers =   {agg: StandardScaler() for agg in Aggregation}
        self.filters = filters
        self.k_sizes = k_sizes
        self.batch_size = 50
        self.model = None
        self.model_hist = None
        self.scaled = False
        
    def scale(self):
        if not self.scaled:
            for agg in Aggregation:
                print('fitting aggregation', agg)
                self.training.fit(agg, self.scalers.get(agg))
                print('transforming aggregation', agg)
                self.training.transform(agg, self.scalers.get(agg))
                self.validation.transform(agg, self.scalers.get(agg))
            self.scaled = True
            
            
    def retrieve_tensor_datasets(self):
        train_x, train_y = self.training.format()
        validation_x, validation_y = self.validation.format()
        formatted_train = ({f'input{n}': data for n, data in enumerate(train_x) }, train_y)
        formatted_test = ({f'input{n}': data for n, data in enumerate(validation_x) }, validation_y)
        train_dataset = tf.data.Dataset.from_tensor_slices(formatted_train).batch(200)
        test_dataset = tf.data.Dataset.from_tensor_slices(formatted_test).batch(200)
        return train_dataset, test_dataset
    
    @property
    def shapes(self):
        return [aggregation_type.shape 
                for aggregation_type in list(self.training.collections[0].aggregations.values())]
    
    @staticmethod
    def get_base_model(shape, k_size, num_filters):
        print("base model shape", shape)
        input_seq = Input(shape=shape)
        nb_filters = num_filters
        convolved = Conv1D(num_filters, k_size, padding="same", activation="relu")(input_seq)
        processed = GlobalMaxPooling1D()(convolved)
        #todo: fix maxpooling
    #     processed = MaxPooling1D(pool_size=2, strides=1, padding='same')(convolved)
        compressed = Dense(100, activation="relu")(processed)
        compressed = Dropout(0.3)(compressed)
        model = Model(inputs=input_seq, outputs=compressed)
        model.summary()
        return model
    
    @staticmethod
    def main_model(shapes, filters, k_sizes):
        inputs = [Input(shape=shape, name=f'input{n}')for n, shape in enumerate(shapes)]
        sub_models = [ MCNNModel.get_base_model(shape, k_size = k_sizes[n], num_filters=filters[n]) 
                      for n, shape in enumerate(shapes)]
        print(sub_models[0].output)
        embeddings = [ model(inputs[n]) for n, model in enumerate(sub_models)]
        merged = Concatenate()(embeddings)
        #todo: tweak dense be modifiable 
        layer1 = Dense(100, activation='relu', name ='hidden_layer1')(merged)
        layer2 = Dense(50, activation='relu', name ='hidden_layer2')(layer1)
        layer3 = Dense(25, activation='relu', name ='hidden_layer3')(layer2)
        out = Dense(1, activation='sigmoid')(layer3)
        model = Model(inputs=inputs, outputs=out)
        return model
    
    def run_model(self):
        train, validation = self.retrieve_tensor_datasets()
        model_shapes = self.shapes
        filters = self.filters
        k_sizes = self.k_sizes
        es = keras.callbacks.EarlyStopping(min_delta=0.0001, patience=10)
        model = MCNNModel.main_model(model_shapes, filters, k_sizes)
        model.compile(loss='binary_crossentropy', # categorical_crossentropy
                              optimizer='adam', #sgd, nadam, adam, rmsprop
                              metrics=['binary_accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),
                                       tf.keras.metrics.AUC(curve='PR')])
        model.summary()
        model_hist = model.fit(train,
                                   validation_data=validation,
                                   batch_size=self.batch_size, epochs=2000, 
                               callbacks=[es]
                              )
        self.model = model
        self.model_hist = model_hist
        return self
    
    def predict(self):
        val_x, val_y = self.validation.format()
        if self.model:
            results = self.model.predict(val_x)
            return [result[0] for result in results], val_y
        else:
            return [], []

In [20]:
def pr_auc(actual, scores):
    precision, recall, thresholds = precision_recall_curve(actual, scores)
    # Use AUC function to calculate the area under the curve of precision recall curve
    auc_precision_recall = auc(recall, precision)
    return auc_precision_recall


In [212]:
from concurrent.futures import ThreadPoolExecutor, as_completed

class CustomClusteredModel:
    def __init__(self, models: List[MCNNModel], thresholds=None):
        self.models = models
        self.scores = dict()
        if thresholds:
            self.thresholds = thresholds
        else:
            self.thresholds = {
                1:0.37209302325581395,
                4:0.20284697508896798,
                5:0.07042253521126761
            }
    
    def scale(self):
        for model in self.models:
            model.scale()
        
    def train_model(self):
        futures = []
        with ThreadPoolExecutor() as executor:
            for model in self.models:
                an_executor = executor.submit(model.run_model)
                futures.append(an_executor)
        
        for a_future in as_completed(futures):
            model = a_future.result()
            print(f'Modeling for cluster {model.cluster} finished')
            
    def metrics(self):
        all_predictions = []
        all_actuals = []
        all_scores = []
        
        model_scores = {}
        for model in self.models:
            predictions, actuals = CustomClusteredModel.predict(model)
            print('prediction and actuals', len(predictions), len(actuals))
            model_scores[model] = (predictions,actuals)
            all_predictions.extend([1 if prediction > self.thresholds.get(model.cluster) else 0 for prediction in predictions])
            all_scores.extend(predictions)
            all_actuals.extend(actuals)
        print(len(all_predictions))
        print(len(all_actuals))
        self.scores['accuracy'] = accuracy_score(all_actuals, all_predictions)
        self.scores['recall'] = recall_score(all_actuals, all_predictions)
        self.scores['precision'] = precision_score(all_actuals, all_predictions)
        self.scores['f1_score'] = f1_score(all_actuals, all_predictions)
        self.scores['pr_auc'] = average_precision_score(all_actuals, all_scores) #pr_auc
        self.scores['auc'] = roc_auc_score(all_actuals, all_scores)
        return self.scores, all_scores, all_actuals, model_scores
        
    @staticmethod    
    def predict(model):
        #add choose model (Andrew's clustering/classifcation) 
        val_x, val_y = model.validation.format()
#         print('val x shape', val_x)
        if model:
            results = model.model.predict(val_x)
            return [r[0] for r in results], val_y
        else:
            return [], []
            
    

In [213]:
ccm = CustomClusteredModel(models)
ccm2 = CustomClusteredModel(models, thresholds={
                1:0.2558139534883721,
                4:0.594306049822064,
                5:0.8591549295774648
        })

In [None]:
[1]

In [209]:
ccm2.scale()

fitting aggregation Aggregation.ONE_MIN
transforming aggregation Aggregation.ONE_MIN
fitting aggregation Aggregation.TEN_MIN
transforming aggregation Aggregation.TEN_MIN
fitting aggregation Aggregation.THIRTHY_MIN
transforming aggregation Aggregation.THIRTHY_MIN
fitting aggregation Aggregation.SIXTY_MIN
transforming aggregation Aggregation.SIXTY_MIN
fitting aggregation Aggregation.ONE_MIN
transforming aggregation Aggregation.ONE_MIN
fitting aggregation Aggregation.TEN_MIN
transforming aggregation Aggregation.TEN_MIN
fitting aggregation Aggregation.THIRTHY_MIN
transforming aggregation Aggregation.THIRTHY_MIN
fitting aggregation Aggregation.SIXTY_MIN
transforming aggregation Aggregation.SIXTY_MIN
fitting aggregation Aggregation.ONE_MIN
transforming aggregation Aggregation.ONE_MIN
fitting aggregation Aggregation.TEN_MIN
transforming aggregation Aggregation.TEN_MIN
fitting aggregation Aggregation.THIRTHY_MIN
transforming aggregation Aggregation.THIRTHY_MIN
fitting aggregation Aggregation.S

In [210]:
ccm2.train_model()

base model shape (7200, 9)
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 7200, 9)]         0         
_________________________________________________________________
conv1d (Conv1D)              (None, 7200, 200)         900200    
_________________________________________________________________
global_max_pooling1d (Global (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               20100     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
Total params: 920,300
Trainable params: 920,300
Non-trainable params: 0
_________________________________________________________________
base model shape (720, 9)
Model: "model_1"
___________________________________________________

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 240, 9)]          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 240, 200)          54200     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
Total params: 74,300
Trainable params: 74,300
Non-trainable params: 0
_________________________________________________________________
base model shape (120, 9)
Model: "model_3"
_________________________________________________________________
Layer (type)

Epoch 1/2000
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input0 (InputLayer)             [(None, 7200, 9)]    0                                            
__________________________________________________________________________________________________
input1 (InputLayer)             [(None, 720, 9)]     0                                            
__________________________________________________________________________________________________
input2 (InputLayer)             [(None, 240, 9)]     0                                            
__________________________________________________________________________________________________
input3 (InputLayer)             [(None, 120, 9)]     0                                            
_______________________________________________________________________________

Epoch 14/2000
Epoch 15/2000
Epoch 5/2000
Epoch 16/2000
Epoch 4/2000
Epoch 17/2000
Epoch 18/2000
Epoch 6/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 5/2000
Epoch 7/2000
Epoch 6/2000
Epoch 8/2000
Epoch 9/2000
Epoch 7/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 9/2000
Epoch 13/2000
Epoch 10/2000
Epoch 14/2000
Epoch 11/2000
Epoch 15/2000
Epoch 16/2000


Epoch 12/2000
Epoch 17/2000
Epoch 13/2000
Epoch 18/2000
Epoch 14/2000
Epoch 19/2000
Epoch 20/2000
Epoch 15/2000
Epoch 21/2000
Epoch 16/2000
Epoch 22/2000
Epoch 17/2000
Epoch 23/2000
Epoch 24/2000
Epoch 18/2000
Epoch 25/2000
Epoch 19/2000
Epoch 26/2000
Epoch 27/2000
Epoch 20/2000
Epoch 28/2000
Epoch 21/2000
Epoch 29/2000
Epoch 22/2000
Epoch 30/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000


Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Modeling for cluster 4 finished
Modeling for cluster 1 finished
Modeling for cluster 5 finished


In [214]:
ccm2.metrics()

prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246


({'accuracy': 0.7073170731707317,
  'recall': 0.7681159420289855,
  'precision': 0.48623853211009177,
  'f1_score': 0.5955056179775281,
  'pr_auc': 0.5555525061596606,
  'auc': 0.7609105051993776},
 [0.61231315,
  0.46982074,
  0.4481584,
  0.30700344,
  0.6662127,
  0.7796637,
  0.90517277,
  0.36214733,
  0.603771,
  0.863167,
  0.8232345,
  0.4232804,
  0.65359735,
  0.9474193,
  0.3039164,
  0.91275424,
  0.42196465,
  0.5641688,
  0.59019744,
  0.7526735,
  0.9426371,
  0.99890184,
  0.13326666,
  0.41544598,
  0.70453864,
  0.09954885,
  0.3207967,
  0.3239746,
  0.9377845,
  0.4975063,
  0.5059294,
  0.20002624,
  0.4806873,
  0.38144466,
  0.38805905,
  0.5589733,
  0.18100089,
  0.30330837,
  0.7995721,
  0.29057914,
  0.6763704,
  0.12336916,
  0.30968213,
  0.06904635,
  0.0013312101,
  0.3299384,
  0.20356485,
  0.9446499,
  0.19500864,
  0.32308364,
  0.3054132,
  0.31121463,
  0.36172962,
  0.304227,
  0.043332636,
  0.43311402,
  0.44141328,
  0.40956205,
  0.26854867,
 

In [115]:
import itertools
def get_permutations():
    tuning1 = [i*.02 for i in range(int(1/0.02))]
    param_comb = []
    for subset in itertools.permutations(tuning1, 3):
        param_comb.append(subset)
    return param_comb

In [116]:
permutations = get_permutations()


In [118]:
permutations

(0.98, 0.96, 0.9400000000000001)

In [74]:
len(permutations)

720

In [215]:
_,_,_,a = ccm2.metrics()

prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246


In [183]:
a

{<__main__.MCNNModel at 0x7f9fc956e070>: ([0.2834437,
   0.4197714,
   0.4456398,
   0.33118904,
   0.25454122,
   0.6164682,
   0.29436156,
   0.333063,
   0.20764238,
   0.48848838,
   0.15442693,
   0.5819849,
   0.60199434,
   0.5908707,
   0.0921191,
   0.5605314,
   0.49317726,
   0.09617284,
   0.56882054,
   0.65288985,
   0.89689124,
   0.9805275,
   0.13848403,
   0.6048446,
   0.63964033,
   0.07996616,
   0.1356155,
   0.37464005,
   0.74741495,
   0.4002385,
   0.49612054,
   0.11510727,
   0.4319935,
   0.6212717,
   0.008309811,
   0.19647992,
   0.04822889,
   0.33007085,
   0.56192535,
   0.07135308,
   0.22680154,
   0.040951997,
   0.3321486,
   0.035749614,
   0.018501222,
   0.35068056,
   0.14094749,
   0.5492408,
   0.045867205,
   0.3414599,
   0.33066714,
   0.33273333,
   0.34415752,
   0.33018896,
   0.14340073,
   0.492446,
   0.54715216,
   0.4506877,
   0.03819129,
   0.3295639,
   0.012631685,
   0.5165477,
   0.07380307,
   0.4470405,
   0.33095238,
   0

In [68]:
# all_scores

[{'accuracy': 0.3780487804878049,
  'recall': 0.9420289855072463,
  'precision': 0.3037383177570093,
  'f1_score': 0.4593639575971731,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.3780487804878049,
  'recall': 0.9420289855072463,
  'precision': 0.3037383177570093,
  'f1_score': 0.4593639575971731,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.3780487804878049,
  'recall': 0.9420289855072463,
  'precision': 0.3037383177570093,
  'f1_score': 0.4593639575971731,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.3780487804878049,
  'recall': 0.9420289855072463,
  'precision': 0.3037383177570093,
  'f1_score': 0.4593639575971731,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.3780487804878049,
  'recall': 0.9420289855072463,
  'precision': 0.3037383177570093,
  'f1_score': 0.4593639575971731,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0

In [93]:
tunningCCM = CustomClusteredModel(ccm2.models, thresholds={
                1:0.2558139534883721,
                4:0.594306049822064,
                5:0.8591549295774648
        })
_,_,_,a= tunningCCM.metrics()

prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246


In [216]:
all_scores = []
for params in get_permutations():
    all_model_predictions = []
    all_model_raw_scores = []
    all_actuals = []
    for index, model in enumerate(a.items()):
        model_prediction= [1 if prediction > params[index] else 0 for prediction in model[1][0]]
        all_model_predictions.extend(model_prediction)
        all_model_raw_scores.extend(model[1][0])
        all_actuals.extend(model[1][1])
    scores = {}
    scores['accuracy'] = accuracy_score(all_actuals, all_model_predictions)
    scores['recall'] = recall_score(all_actuals, all_model_predictions)
    scores['precision'] = precision_score(all_actuals, all_model_predictions)
    scores['f1_score'] = f1_score(all_actuals, all_model_predictions)
    scores['pr_auc'] = average_precision_score(all_actuals, all_model_raw_scores) #pr_auc
    scores['auc'] = roc_auc_score(all_actuals, all_model_raw_scores)
    all_scores.append((scores,params))

In [217]:
max_f1 = all_scores[0] 
for idx, score in enumerate(all_scores):
    if score[0]['f1_score'] > max_f1[0]['f1_score']:
        max_f1 = score

In [203]:
all_scores[22056]

({'accuracy': 0.7113821138211383,
  'recall': 0.6956521739130435,
  'precision': 0.4897959183673469,
  'f1_score': 0.5748502994011976,
  'pr_auc': 0.5156985725622069,
  'auc': 0.729632358961762},
 (0.18, 0.38, 0.52))

In [218]:
max_f1

({'accuracy': 0.7113821138211383,
  'recall': 0.8115942028985508,
  'precision': 0.49122807017543857,
  'f1_score': 0.6120218579234973,
  'pr_auc': 0.5555525061596606,
  'auc': 0.7609105051993776},
 (0.26, 0.3, 0.32))

In [84]:
tunningCCM = CustomClusteredModel(ccm2.models, thresholds={
                1:0.2558139534883721,
                4:0.594306049822064,
                5:0.8591549295774648
        })
all_scores = [] 
for params in permutations:
    print(params)
    tunningCCM.thresholds = {
        1: params[0],
        4: params[1],
        5: params[2]
    }
    all_scores.append(tunningCCM.metrics())


(0.0, 0.1, 0.2)
prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246
(0.0, 0.1, 0.30000000000000004)
prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246
(0.0, 0.1, 0.4)
prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246
(0.0, 0.1, 0.5)
prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246
(0.0, 0.1, 0.6000000000000001)
prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246
(0.0, 0.1, 0.7000000000000001)
prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246
(0.0, 0.1, 0.8)
prediction and actuals 126 126
prediction and actuals 88 88
prediction and actuals 32 32
246
246
(0.0, 0.1, 0.9)


KeyboardInterrupt: 

In [83]:
len(all_scores)

720

In [82]:
[x for x in all_scores]

[{'accuracy': 0.7195121951219512,
  'recall': 0.0,
  'precision': 0.0,
  'f1_score': 0.0,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.7195121951219512,
  'recall': 0.0,
  'precision': 0.0,
  'f1_score': 0.0,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.7195121951219512,
  'recall': 0.0,
  'precision': 0.0,
  'f1_score': 0.0,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.7195121951219512,
  'recall': 0.0,
  'precision': 0.0,
  'f1_score': 0.0,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.7195121951219512,
  'recall': 0.0,
  'precision': 0.0,
  'f1_score': 0.0,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.7195121951219512,
  'recall': 0.0,
  'precision': 0.0,
  'f1_score': 0.0,
  'pr_auc': 0.5267216771984266,
  'auc': 0.7315155981331368},
 {'accuracy': 0.7195121951219512,
  'recall': 0.0,
  'precision': 0.0,
  'f1_score': 0.0

# Adding adding cluster type to model

In [30]:
input0 =  ccm.models[0].model.get_layer(name='input0').input
input1 =  ccm.models[0].model.get_layer(name='input1').input
input2 =  ccm.models[0].model.get_layer(name='input2').input
input3 =  ccm.models[0].model.get_layer(name='input3').input
input4 =  ccm.models[1].model.get_layer(name='input0').input
input5 =  ccm.models[1].model.get_layer(name='input1').input
input6 =  ccm.models[1].model.get_layer(name='input2').input
input7 =  ccm.models[1].model.get_layer(name='input3').input
input8 =  ccm.models[2].model.get_layer(name='input0').input
input9 =  ccm.models[2].model.get_layer(name='input1').input
input10 =  ccm.models[2].model.get_layer(name='input2').input
input11 =  ccm.models[2].model.get_layer(name='input3').input
cluster1 = ccm.models[0].model.get_layer(name='hidden_layer3').output
cluster2 = ccm.models[1].model.get_layer(name='hidden_layer3').output
cluster3 = ccm.models[2].model.get_layer(name='hidden_layer3').output

In [639]:
[input0, input1, input2, input3, input4, input5, input6, input7, input8, input9, input10, input11, cluster_type_input]

<keras.layers.core.Dense at 0x7fe28cf797c0>

In [655]:
input0

<KerasTensor: shape=(None, 7200, 6) dtype=float32 (created by layer 'input0')>

In [646]:
clusters_concat

<keras.layers.merge.Concatenate at 0x7fe250444520>

In [32]:
cluster_type_input = Input(shape=(1, ))
clusters_concat = tf.keras.layers.Concatenate()([cluster1, cluster_type_input])
layer3 = Dense(5, activation='relu', name ='hidden_layer4')(clusters_concat)
out = Dense(1, activation='sigmoid')(layer3)
Model(inputs=[input0, input1, input2, input3, cluster_type_input], outputs=out)

<keras.engine.functional.Functional at 0x7f9fb13fad30>

In [34]:
ccm2.models[0].retrieve_tensor_datasets()

(<BatchDataset shapes: ({input0: (None, 7200, 9), input1: (None, 720, 9), input2: (None, 240, 9), input3: (None, 120, 9)}, (None,)), types: ({input0: tf.float64, input1: tf.float64, input2: tf.float64, input3: tf.float64}, tf.int32)>,
 <BatchDataset shapes: ({input0: (None, 7200, 9), input1: (None, 720, 9), input2: (None, 240, 9), input3: (None, 120, 9)}, (None,)), types: ({input0: tf.float64, input1: tf.float64, input2: tf.float64, input3: tf.float64}, tf.int32)>)

In [50]:
len(train_x1)

12

In [54]:
len(formatted_train[0])

12

In [51]:
train_x1, train_y1, = ccm2.models[0].training.format()
train_x2, train_y2, = ccm2.models[1].training.format()
train_x3, train_y3, = ccm2.models[2].training.format()
cluster_1 = [1 for _ in train_y]
cluster_2 = [2 for _ in train_y2]
cluster_3 = [3 for _ in train_y3]

train_x1.extend(train_x2)
train_x1.extend(train_x3)
train_y1.extend(train_y2)
train_y1.extend(train_y3)
cluster_1.extend(cluster_2)
cluster_1.extend(cluster_3)

# validation_x, validation_y = ccm2.models[0].validation.format()
formatted_train = ({f'input{n}': data for n, data in enumerate(train_x1) }, train_y1, cluster_1)
# formatted_test = ({f'input{n}': data for n, data in enumerate(validation_x) }, validation_y)
train_dataset = tf.data.Dataset.from_tensor_slices(formatted_train).batch(200)
# test_dataset = tf.data.Dataset.from_tensor_slices(formatted_test).batch(200)
# return train_dataset, test_dataset

ValueError: Dimensions 387 and 71 are not compatible

In [41]:
[1 for _ in train_y]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [38]:
train_x

[array([[[ 3.16932185e+00,  1.07767017e+00,  2.86599418e+01, ...,
           1.62129982e+00,  2.30014276e+00,  2.06701132e+00],
         [ 3.16932185e+00,  1.07767017e+00, -1.91749909e-02, ...,
          -3.95215782e-01, -4.23877217e-01, -4.09637161e-01],
         [ 3.16932185e+00,  1.07767017e+00, -1.91749909e-02, ...,
          -3.95215782e-01, -4.23877217e-01, -4.09637161e-01],
         ...,
         [ 3.16932185e+00,  1.07767017e+00, -1.91749909e-02, ...,
          -3.95215782e-01, -4.23877217e-01, -4.09637161e-01],
         [ 3.16932185e+00,  1.07767017e+00, -1.91749909e-02, ...,
          -3.95215782e-01, -4.23877217e-01, -4.09637161e-01],
         [ 3.16932185e+00,  1.07767017e+00, -1.91749909e-02, ...,
          -3.95215782e-01, -4.23877217e-01, -4.09637161e-01]],
 
        [[-2.39906705e-01, -1.06176253e-01,  2.86599418e+01, ...,
           1.62129982e+00,  2.30014276e+00,  2.06701132e+00],
         [-2.39906705e-01, -1.06176253e-01, -1.91749909e-02, ...,
          -3.95215782

In [36]:
len(train_y)

387

In [706]:
cluster_type_input = Input(shape=(1, ))
clusters_concat = tf.keras.layers.Concatenate()([cluster1, cluster2, cluster3])
final_concat = tf.keras.layers.Concatenate()([clusters_concat, cluster_type_input])
layer1 = Dense(50, activation='relu', name ='hidden_layer1')(final_concat)
layer2 = Dense(25, activation='relu', name ='hidden_layer2')(layer1)
layer3 = Dense(5, activation='relu', name ='hidden_layer3')(layer2)
out = Dense(1, activation='sigmoid')(layer3)
Model(inputs=[input0, input1, input2, input3, input4, input5, input6, input7, input8, input9, input10, input11, cluster_type_input], outputs=out )

ValueError: The name "input0" is used 3 times in the model. All layer names should be unique.

inside predict 1
prediction and actuals 126 126
inside predict 1
prediction and actuals 88 88
inside predict 1
prediction and actuals 32 32
246
246


{'accuracy': 0.6056910569105691,
 'recall': 0.7681159420289855,
 'precision': 0.39552238805970147,
 'f1_score': 0.5221674876847291,
 'pr_auc': 0.4387294881266916,
 'auc': 0.7037582903463523}

In [614]:
cluster1

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'hidden_layer3')>

In [616]:
cluster2

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'hidden_layer3')>

In [622]:
inputA = Input(shape=(25,))
inputB = Input(shape=(25,))
inputC = Input(shape=(25,))
inputD = Input(shape=(1,))

In [618]:
inputA

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'input_9')>

In [619]:
inputB

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'input_10')>

In [620]:
inputC

<KerasTensor: shape=(None, 25) dtype=float32 (created by layer 'input_11')>