In [50]:
import pandas as pd
import numpy as np

#import torch
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import pytorch_lightning as pl
import seaborn as sns
from pylab import rcParams


import matplotlib.pyplot as plt
from matplotlib import rc
from matplotlib.ticker import MaxNLocator



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from multiprocessing import cpu_count
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics.functional import accuracy
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler



from keras.layers import Conv1D, Dense, Dropout, Input, Concatenate, GlobalMaxPooling1D
from keras.models import Model
from tensorflow import keras
import tensorflow as tf
# from keras.optimizers import RMSprop, adam

ImportError: cannot import name 'RMSprop' from 'keras.optimizers' (/opt/anaconda3/lib/python3.8/site-packages/keras/optimizers.py)

## Importing Data

In [2]:
day1_dataset_10min = pd.read_csv('Data Slices/day1_dataset_10min_10Aug2022.csv')
day1_dataset_30min = pd.read_csv('Data Slices/day1_dataset_30min_10Aug2022.csv')
day1_dataset_60min = pd.read_csv('Data Slices/day1_dataset_60min_10Aug2022.csv')

In [4]:
def get_X_and_y(df, x_columns, y_columns):
    X = df[x_columns]
    y = df[y_columns]
    return X, y

In [5]:
def get_daily_sequences(X, y, feature_columns, sequence_dict = {}):
    for collection, group in X.groupby("collection"):
        features = group[group.columns.intersection(feature_columns)]
        blacklist = y[y.collection == collection].iloc[0].blacklisted
        if sequence_dict.get(collection):
            sequence_dict[collection][0].append(features)
        else:
            sequence_dict[collection] = ([features], blacklist)
    return sequence_dict

In [6]:
def get_daily_sequences(X, y, feature_columns, sequence_dict = {}):
    for collection, group in X.groupby("collection"):
        features = group[group.columns.intersection(feature_columns)]
        blacklist = y[y.collection == collection].iloc[0].blacklisted
        if sequence_dict.get(collection):
            sequence_dict[collection][0].append(features)
        else:
            sequence_dict[collection] = ([features], blacklist)
    return sequence_dict

In [7]:
def scale_dataset(sequences, columns = ['Price_USD', 'Price_Crypto', 'volume', 'densities', 'vertex_count', 
                              'edge_count','vertext_edge_ratio'], fitted_scaler = None):
    collection_blacklist_dict = {sequence[0]:sequence[2] for sequence in sequences}
    concat_df = pd.concat([sequence[1] for sequence in sequences])
    if fitted_scaler is None:
        fitted_scaler = StandardScaler()
        fitted_scaler.fit(concat_df[columns])
    concat_df[columns] = fitted_scaler.transform(concat_df[columns])
    return  [(collection, group[columns], collection_blacklist_dict[collection]) 
             for collection, group in concat_df.groupby("collection")] , fitted_scaler

In [8]:
def get_all_sequences(all_df):
    sequence_dict = {}
    for df in all_df: 
        df = df.drop(['Unnamed: 0'], axis=1)
        x_columns =  ['Datetime_updated_seconds','Price_USD','Price_Crypto','volume','densities',
                   'vertex_count','edge_count','collection']
        y_columns = ['blacklisted','collection']
        feature_columns=['Price_USD','Price_Crypto','volume',
                         'densities', 'vertex_count','edge_count', 'collection', 'vertext_edge_ratio']
        X, y= get_X_and_y(df, x_columns, y_columns)
        X['vertext_edge_ratio'] = X['vertex_count']/X['edge_count']
        sequence_dict = get_daily_sequences(X, y, feature_columns, sequence_dict)
    return sequence_dict

In [10]:
def retrieve_all_train_test_sequences(all_df):
    train_test_sequences = []
    all_sequence_dict = get_all_sequences(all_df)
        
    train_sequences, test_sequences = train_test_split(list(all_sequence_dict.items()), test_size = 0.3)
#     for train_s
#         scaled_train_sequences, scaler = scale_dataset(train_sequences)
#         scaled_test_sequences ,  _ = scale_dataset(test_sequences, fitted_scale r= scaler)
    return train_sequences, test_sequences

In [11]:
train_sequences, test_sequences = retrieve_all_train_test_sequences([day1_dataset_10min, day1_dataset_30min, day1_dataset_60min])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['vertext_edge_ratio'] = X['vertex_count']/X['edge_count']


In [12]:
scaled_train_dataset_dict = {}
scaled_test_dataset_dict = {}
scaled_train_dataset= []
scaled_test_dataset= []
length = 3
for n in range(length):
    train_scaled, scaler = scale_dataset([(sequence[0], sequence[1][0][n], sequence[1][1]) for sequence in
                                          train_sequences])
    test_scaled, _ = scale_dataset([(sequence[0], sequence[1][0][n], sequence[1][1]) for sequence in
                                    test_sequences],
                  fitted_scaler= scaler)
    scaled_train_dataset.append(train_scaled)
    scaled_test_dataset.append(test_scaled)
    
for scale_sequence in scaled_train_dataset:
    for collection in scale_sequence:
        if scaled_train_dataset_dict.get(collection[0]):
            scaled_train_dataset_dict[collection[0]][0].append(collection[1])
        else:
            scaled_train_dataset_dict[collection[0]] = ([collection[1]], collection[2])

    
for scale_sequence in scaled_test_dataset:
    for collection in scale_sequence:
        if scaled_test_dataset_dict.get(collection[0]):
            scaled_test_dataset_dict[collection[0]][0].append(collection[1])
        else:
            scaled_test_dataset_dict[collection[0]] = ([collection[1]], collection[2])

In [None]:
len(list(scaled_train_dataset_dict.keys()))

In [None]:
len(list(scaled_test_dataset_dict.keys()))

In [None]:
list(scaled_test_dataset_dict.values())[1][0][2]

In [None]:
list(scaled_train_dataset_dict.values())[1][0][2]

## Model

In [13]:
train_values = list(scaled_train_dataset_dict.values())

In [14]:
test_values = list(scaled_test_dataset_dict.values())

In [15]:
MODEL_SHAPE1  = train_values[0][0][0].shape
MODEL_SHAPE1

(144, 7)

In [16]:
MODEL_SHAPE2  = train_values[0][0][1].shape
MODEL_SHAPE2

(48, 7)

In [17]:
MODEL_SHAPE3 = train_values[0][0][2].shape
MODEL_SHAPE3

(24, 7)

In [18]:
NUM_FILTERS1 = 10
NUM_FILTERS2 = 5
NUM_FILTERS3 = 3
K_SIZE1 = 8
K_SIZE2 = 6
K_SIZE3 = 3

In [81]:
def get_base_model(shape, k_size = K_SIZE1, num_filters = NUM_FILTERS1):
    print("base model shape", shape)
    input_seq = Input(shape=shape)
    nb_filters = num_filters
    convolved = Conv1D(num_filters, k_size, padding="same", activation="relu")(input_seq)
    processed = GlobalMaxPooling1D()(convolved)
    compressed = Dense(50, activation="relu")(processed)
    compressed = Dropout(0.3)(compressed)
    model = Model(inputs=input_seq, outputs=compressed)
    return model

In [80]:
def main_model(shape1, shape2, shape3):
    shape1_input = Input(shape=shape1, name='input1')
    shape2_input = Input(shape=shape2, name='input2')
    shape3_input = Input(shape=shape3, name='input3')
    shape1_model = get_base_model(shape1, k_size = K_SIZE1, num_filters=NUM_FILTERS1)
    shape2_model = get_base_model(shape2, k_size = K_SIZE2, num_filters=NUM_FILTERS2)
    shape3_model = get_base_model(shape3, k_size = K_SIZE3, num_filters=NUM_FILTERS3)
    
    embedding1 = shape1_model(shape1_input)
    embedding2 = shape2_model(shape2_input)
    embedding3 = shape3_model(shape3_input)
    merged = Concatenate()([embedding1, embedding2, embedding3])
    # todo: add hidden layers
    out = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs=[shape1_input, shape2_input, shape3_input], outputs=out)
    return model

In [95]:
def run_model(x_train, y_train, x_test, y_test):
    es = keras.callbacks.EarlyStopping(min_delta=0.00001, patience=10)
    #data_set = tf.data.Dataset.from_tensor_slices(  (x_train ,y_train) )
    model = main_model(MODEL_SHAPE1, MODEL_SHAPE2, MODEL_SHAPE3)
    model.compile(loss='binary_crossentropy', # categorical_crossentropy
                          optimizer='rmsprop', #sgd, nadam, adam, rmsprop
                          metrics=['binary_accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),
                                   tf.keras.metrics.AUC(curve='PR')])
    model.summary()
    print('SIZE OF TRAINING DATA', len(x_train))
    model_hist = model.fit(x_train, y_train,
                               validation_data=(x_test, y_test),
                               batch_size=100, epochs=1000, 
                           callbacks=[es]
                          )
    return model_hist, model

In [22]:
x_train_1 = [sequence[0][0] for sequence in train_values]
x_train_2 = [sequence[0][1] for sequence in train_values]
x_train_3 = [sequence[0][2] for sequence in train_values]
y_train = [sequence[1] for sequence in train_values]

In [23]:
X_train1 = np.stack(x_train_1)
X_train2 = np.stack(x_train_2)
X_train3 = np.stack(x_train_3)

In [24]:
X_train1 = X_train1.reshape(696,144,7)
X_train2 = X_train2.reshape(696,48,7)
X_train3 = X_train3.reshape(696,24,7)

In [25]:
x_test_1 = [sequence[0][0] for sequence in test_values]
x_test_2 = [sequence[0][1] for sequence in test_values]
x_test_3 = [sequence[0][2] for sequence in test_values]
y_test = [sequence[1] for sequence in train_values]

In [26]:
X_test1 = np.stack(x_test_1)
X_test2 = np.stack(x_test_2)
X_test3 = np.stack(x_test_3)

In [27]:
X_test1 = X_test1.reshape(299,144,7)
X_test2 = X_test2.reshape(299,48,7)
X_test3 = X_test3.reshape(299,24,7)

In [None]:
# X_train = [sequence[0].to_numpy() for sequence in scaled_train_sequences]
# y_train = [sequence[1] for sequence in scaled_train_sequences]
# X_test = [sequence[0].to_numpy() for sequence in scaled_test_sequences]
# y_test = [sequence[1] for sequence in scaled_test_sequences]

In [96]:
model_hist, model = run_model([X_train1, X_train2, X_train3], y_train, [X_test1, X_test2, X_test3],  y_test)

base model shape (144, 7)
base model shape (48, 7)
base model shape (24, 7)
Model: "model_91"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_139 (InputLayer)          [(None, 144, 7)]     0                                            
__________________________________________________________________________________________________
input_140 (InputLayer)          [(None, 48, 7)]      0                                            
__________________________________________________________________________________________________
input_141 (InputLayer)          [(None, 24, 7)]      0                                            
__________________________________________________________________________________________________
model_88 (Functional)           (None, 50)           1120        input_139[0][0]                  
_______________

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'numpy.ndarray'>"}), (<class 'list'> containing values of types {"<class 'numpy.int64'>"})

In [56]:
a_model = run_model([X_train1, X_train2, X_train3], y_train, [X_test1, X_test2, X_test3],  y_test)

base model shape (144, 7)
base model shape (48, 7)
base model shape (24, 7)
(696, 144, 7)


In [None]:
a_model.fit()

In [57]:
es = keras.callbacks.EarlyStopping(min_delta=0.00001, patience=10)
model_hist = a_model.fit([X_train1, X_train2, X_train3], y_train,batch_size=50, epochs=1000, callbacks=[es])

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'numpy.ndarray'>"}), (<class 'list'> containing values of types {"<class 'numpy.int64'>"})

In [None]:
prediction = model.predict(all_x_test)

In [None]:
prediction[prediction <=0.5] = 0
prediction[prediction >0.5] = 1

In [None]:
[pred[0] for pred in prediction.tolist()]

In [None]:
def plot_training_curves(history, title=None):
    ''' Plot the training curves for loss and accuracy given a model history
    '''
    # find the minimum loss epoch
    minimum = np.min(history.history['val_loss'])
    min_loc = np.where(minimum == history.history['val_loss'])[0]
    # get the vline y-min and y-max
    loss_min, loss_max = (min(history.history['val_loss'] + history.history['loss']),
                          max(history.history['val_loss'] + history.history['loss']))
    acc_min, acc_max = (min(history.history['val_binary_accuracy'] + history.history['binary_accuracy']),
                        max(history.history['val_binary_accuracy'] + history.history['binary_accuracy']))
    # create figure
    fig, ax = plt.subplots(ncols=2, figsize = (15,7))
    fig.suptitle(title)
    index = np.arange(1, len(history.history['binary_accuracy']) + 1)
    # plot the loss and validation loss
    ax[0].plot(index, history.history['loss'], label = 'loss')
    ax[0].plot(index, history.history['val_loss'], label = 'val_loss')
    ax[0].vlines(min_loc + 1, loss_min, loss_max, label = 'min_loss_location')
    ax[0].set_title('Loss')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epochs')
    ax[0].legend()
    # plot the accuracy and validation accuracy
    ax[1].plot(index, history.history['binary_accuracy'], label = 'accuracy')
    ax[1].plot(index, history.history['val_binary_accuracy'], label = 'val_accuracy')
    ax[1].vlines(min_loc + 1, acc_min, acc_max, label = 'min_loss_location')
    ax[1].set_title('Accuracy')
    ax[1].set_ylabel('Accuracy')
    ax[1].set_xlabel('Epochs')
    ax[1].legend()
    plt.show()

In [None]:
def plot_training_curves_auc(history, title=None):
    ''' Plot the training curves for loss and accuracy given a model history
    '''
    # find the minimum loss epoch
    minimum = np.min(history.history['val_loss'])
    min_loc = np.where(minimum == history.history['val_loss'])[0]
    # get the vline y-min and y-max
    loss_min, loss_max = (min(history.history['val_loss'] + history.history['loss']),
                          max(history.history['val_loss'] + history.history['loss']))
    acc_min, acc_max = (min(history.history['val_auc'] + history.history['auc']),
                        max(history.history['val_auc'] + history.history['auc']))
    # create figure
    fig, ax = plt.subplots(ncols=2, figsize = (15,7))
    fig.suptitle(title)
    index = np.arange(1, len(history.history['auc']) + 1)
    # plot the loss and validation loss
    ax[0].plot(index, history.history['loss'], label = 'loss')
    ax[0].plot(index, history.history['val_loss'], label = 'val_loss')
    ax[0].vlines(min_loc + 1, loss_min, loss_max, label = 'min_loss_location')
    ax[0].set_title('Loss')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epochs')
    ax[0].legend()
    # plot the accuracy and validation accuracy
    ax[1].plot(index, history.history['auc'], label = 'auc')
    ax[1].plot(index, history.history['val_auc'], label = 'val_auc')
    ax[1].vlines(min_loc + 1, acc_min, acc_max, label = 'min_loss_location')
    ax[1].set_title('AUC')
    ax[1].set_ylabel('auc')
    ax[1].set_xlabel('Epochs')
    ax[1].legend()
    plt.show()

In [None]:
plot_training_curves(history=model_hist)

In [None]:
plot_training_curves_auc(history=model_hist)