In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dropout, LSTM, Flatten, Dense, BatchNormalization
from keras.layers.convolutional import Conv1D, MaxPooling1D
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from scipy.stats import bootstrap
from sklearn.model_selection import KFold

In [2]:
#Print a confusion matrix for the model's thresholded predictions
def confusionMatrix(model, testX, testY):
    """Print the confusion matrix of ``model`` on ``(testX, testY)``.

    Every output of ``model.predict(testX)`` is turned into class 1 when it
    is at least 0.5 and class 0 otherwise; those labels are then compared
    with ``testY`` through ``confusion_matrix``. Nothing is returned.
    """
    scores = model.predict(testX)
    # A score of exactly 0.5 is counted as the positive class.
    hard_labels = [1 if scores[row] >= .5 else 0 for row in range(len(scores))]

    print(confusion_matrix(testY, hard_labels))
    
#Compute and report precision-recall metrics for a fitted model
def modelMetrics(testX, testY, model):
    """Print and return recall at 80% precision and the PR-curve area.

    The precision/recall curve is built from ``model.predict(testX)``
    against ``testY``. Returns the tuple ``(rec80, aucPR)``.
    """
    predicted = model.predict(testX)
    precision, recall, _ = precision_recall_curve(testY, predicted)

    # Highest recall among the curve points whose precision is at least 0.8.
    meets_target = precision >= .8
    rec80 = np.max(recall[meets_target])
    print("Recall at 80% Precision: " + str(rec80))

    # Area under the curve with recall on x and precision on y.
    aucPR = auc(recall, precision)
    print("PR-AUC: " + str(aucPR))

    return (rec80, aucPR)

#Draw the precision-recall curve of a fitted model
def PRplot(testX, testY, model):
    """Plot precision against recall for ``model`` on ``(testX, testY)``.

    The curve is drawn through ``plt`` on the current axes; nothing is
    returned.
    """
    predicted = model.predict(testX)
    precision, recall, _ = precision_recall_curve(testY, predicted)

    plt.plot(recall, precision)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("PR curve")

In [3]:
def normalize(data):
    """Standardise every column of ``data`` without modifying ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table. It is only read; the caller's frame keeps
        its original values and dtypes.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data`` holding
        ``(x - column mean) / column std`` for each column, where the
        standard deviation is the population one (``ddof=0``). NaN results
        (for example 0/0 in a constant column) are replaced by 0 and
        infinities by large finite numbers via ``np.nan_to_num``.
    """
    column_mean = data.mean(axis=0)
    # ddof=0 matches what np.std computes for each column.
    column_std = data.std(axis=0, ddof=0)
    # Frame-minus-Series arithmetic aligns on column labels and builds a new
    # frame, so nothing is written back into the caller's object.
    scaled = (data - column_mean) / column_std
    return np.nan_to_num(scaled.to_numpy(dtype=float))

In [4]:
def newModel(numConvFilters = 242, dropout = .215, numDenseNodes = 190):
    """Build and compile the 1D-convolutional binary classifier.

    Parameters
    ----------
    numConvFilters : int
        Number of filters in the second Conv1D layer (the first one is
        fixed at 32).
    dropout : float
        Rate passed to the Dropout layer that follows max pooling.
    numDenseNodes : int
        Width of the fully connected layer before the output node.

    Returns
    -------
    A compiled ``Sequential`` model using Adam (learning rate 0.001) and
    binary cross-entropy, tracking PR-curve AUC and recall at 0.8 precision,
    in that order.
    """
    model = Sequential()

    # Two stacked 1D convolutions, then pooling and dropout.
    model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
    model.add(Conv1D(filters=numConvFilters, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(dropout))

    # Flatten the feature maps and classify with a dense head.
    model.add(Flatten())
    model.add(Dense(numDenseNodes, activation='sigmoid'))
    # Single sigmoid output node for the binary label.
    model.add(Dense(1, activation='sigmoid'))

    # epsilon and decay are deliberately left at the library defaults:
    # passing `decay=` / `epsilon=None` explicitly is not accepted by newer
    # Keras optimizer implementations.
    adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

    # The metric order here fixes the order of the values returned by
    # model.evaluate and of the entries recorded in the training history.
    model.compile(
        optimizer=adam,
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(curve='PR'),
                 tf.keras.metrics.RecallAtPrecision(0.8)],
    )
    return model

# Bootstrapping

In [None]:
scores = []
steps = 15

# Load the preprocessed splits once; every bootstrap round resamples from them.
trainFull = pd.read_csv("PreprocessedData/trainFull.csv")
testFull = pd.read_csv("PreprocessedData/testFull.csv")

for i in range(0,10):
    # Draw row positions (np.random.choice samples with replacement by
    # default), keeping each resampled set at its original size.
    trainResample = np.random.choice(len(trainFull), size = len(trainFull))
    testResample = np.random.choice(len(testFull), size = len(testFull))

    trainResampleFull = trainFull.iloc[trainResample]
    testResampleFull = testFull.iloc[testResample]

    # Features are every column except the last two; the label is the last
    # column. The feature frames are explicit copies so that any column
    # assignment done while normalising cannot write into, or warn about, a
    # slice of the resampled frame.
    trainX = trainResampleFull.iloc[:,:-2].copy()
    trainY = trainResampleFull.iloc[:,-1]
    testX = testResampleFull.iloc[:,:-2].copy()
    testY = testResampleFull.iloc[:,-1]

    # NOTE(review): each split is standardised with its own statistics;
    # confirm that this is intended rather than reusing the training ones.
    trainX = normalize(trainX)
    testX = normalize(testX)

    # Append a trailing axis of length 1 to features and labels for the CNN.
    trainX = trainX[..., None]
    trainY = trainY.to_numpy()[..., None]
    testX = testX[..., None]
    testY = testY.to_numpy()[..., None]

    model = newModel()
    history = model.fit(trainX, trainY, validation_data = (testX, testY), epochs = steps, batch_size=20, verbose = 0)
    confusionMatrix(model, testX, testY)
    modelMetrics(testX, testY, model)
    scores.append(model.evaluate(testX, testY, verbose=0))


[[1490  422]
 [ 333 1545]]
Recall at 80% Precision: 0.7976570820021299
PR-AUC: 0.8880738927769967
[[1467  451]
 [ 330 1542]]
Recall at 80% Precision: 0.7932692307692307
PR-AUC: 0.8806311106547259
[[1573  315]
 [ 469 1433]]
Recall at 80% Precision: 0.7802313354363828
PR-AUC: 0.884320557462374
[[1626  282]
 [ 526 1356]]
Recall at 80% Precision: 0.7821466524973433
PR-AUC: 0.8869215324748826
[[1600  315]
 [ 369 1506]]
Recall at 80% Precision: 0.8421333333333333
PR-AUC: 0.896757971350418
[[1527  357]
 [ 385 1521]]
Recall at 80% Precision: 0.8174186778593914
PR-AUC: 0.8859854642298088
[[1492  390]
 [ 348 1560]]
Recall at 80% Precision: 0.8218029350104822
PR-AUC: 0.8972468558600044
[[1495  425]
 [ 343 1527]]
Recall at 80% Precision: 0.7807486631016043
PR-AUC: 0.8883010850364794
[[1673  276]
 [ 484 1357]]
Recall at 80% Precision: 0.8126018468223791
PR-AUC: 0.8905118710400105


In [None]:
# Each entry of `scores` is one model.evaluate result: the loss followed by
# the compiled metrics in compile order (PR-AUC, then recall at 0.8
# precision). After transposing, every row is one metric across all samples.
transpose = np.array(scores).T
fold = np.arange(len(scores))
panel_titles = ["Loss", "PR-AUC", "Recall @ 80% Precision"]

plt.figure(figsize = (16,4))
for panel, (values, title) in enumerate(zip(transpose, panel_titles), start=1):
    plt.subplot(1,3,panel)
    plt.plot(fold, values)
    # Flat reference line at the mean over all samples.
    plt.plot(fold, np.full(len(fold), np.mean(values)))
    plt.title(title)
    plt.xlabel("Sample")


In [None]:

keys = list(history.history.keys())
# Training entries are the keys without the "val_" prefix; each one has a
# validation counterpart stored under "val_" + key.
train_keys = [key for key in keys if not key.startswith("val_")]
# Titles follow the recorded order: loss first, then the metrics in the
# order they were given to model.compile (PR AUC, recall at 0.8 precision).
panel_titles = ["Loss", "PR AUC", "Recall @ 80% Precision"]
epoch = np.arange(len(history.history[train_keys[0]]))

plt.figure(figsize = (16,4))
for panel, (key, title) in enumerate(zip(train_keys, panel_titles), start=1):
    plt.subplot(1,3,panel)
    plt.plot(epoch, history.history[key], label = "Training")
    plt.plot(epoch, history.history["val_" + key], label = "Validation")
    plt.title(title)
    plt.xlabel("Epoch")
    plt.legend()



# Cross Validation

In [None]:
trainFull = pd.read_csv("PreprocessedData/trainFull.csv")
testFull = pd.read_csv("PreprocessedData/testFull.csv")

# Features are every column except the last two; the label is the last
# column. The feature frames are explicit copies so that any column
# assignment done while normalising cannot write into, or warn about, a
# slice of the loaded frames.
trainX = trainFull.iloc[:,:-2].copy()
trainY = trainFull.iloc[:,-1]
testX = testFull.iloc[:,:-2].copy()
testY = testFull.iloc[:,-1]

# NOTE(review): both files are standardised here, each with its own
# statistics, before they are pooled and split into folds, so the scaling
# has already seen every row that later serves as a held-out fold.
trainX = normalize(trainX)
testX = normalize(testX)

# Append a trailing axis of length 1 to features and labels for the CNN.
trainX = trainX[..., None]
trainY = trainY.to_numpy()[..., None]
testX = testX[..., None]
testY = testY.to_numpy()[..., None]

# Pool both files into a single data set for K-fold splitting.
X = np.concatenate([trainX, testX])
Y = np.concatenate([trainY, testY])

In [None]:
kfold = KFold(n_splits=10, shuffle=True)

# One model.evaluate result (loss followed by the compiled metrics) per fold.
scores = []

for fold_no, (train, test) in enumerate(kfold.split(X, Y), start=1):
    # `train` and `test` are the row positions of this fold's split; a fresh
    # model is trained for every fold.
    model = newModel()
    history = model.fit(X[train], Y[train], epochs = 15, verbose = 0)
    scores.append(model.evaluate(X[test], Y[test], verbose=0))

    # Label the output so each confusion matrix and metric line can be
    # attributed to its fold.
    print(f'fold {fold_no}')
    confusionMatrix(model, X[test], Y[test])
    modelMetrics(X[test], Y[test], model)
    

In [None]:
# Each entry of `scores` is one model.evaluate result: the loss followed by
# the compiled metrics in compile order (PR-AUC, then recall at 0.8
# precision). After transposing, every row is one metric across all folds.
transpose = np.array(scores).T
fold = np.arange(len(scores))
panel_titles = ["Loss", "PR-AUC", "Recall @ 80% Precision"]

plt.figure(figsize = (16,4))
for panel, (values, title) in enumerate(zip(transpose, panel_titles), start=1):
    plt.subplot(1,3,panel)
    plt.plot(fold, values)
    # Flat reference line at the mean over all folds.
    plt.plot(fold, np.full(len(fold), np.mean(values)))
    plt.title(title)
    plt.xlabel("Fold")

