In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so that the CSV
# files under "/content/drive/My Drive/..." are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Data Processing

In [None]:
# Extract transaction data
# The CSV is streamed in fixed-size pieces and re-assembled into one frame.
# Despite its name, `num_chunks` is the number of ROWS per piece (it is passed
# as `chunksize`), not the number of pieces.
num_chunks = 10000
transaction_reader = pd.read_csv("/content/drive/My Drive/CS230_Folder/train_transaction.csv",
                                 chunksize=num_chunks, engine='python')
chunks = [piece for piece in transaction_reader]
train_trans_df = pd.concat(chunks)

In [None]:
# Extract identity data
# Same streaming pattern as for the transaction table: read `num_chunks` rows
# at a time (it is a chunk SIZE, passed as `chunksize`) and concatenate.
num_chunks = 10000
identity_reader = pd.read_csv("/content/drive/My Drive/CS230_Folder/train_identity.csv",
                              chunksize=num_chunks, engine='python')
chunks = [piece for piece in identity_reader]
train_identity_df = pd.concat(chunks)

In [None]:
# Merge transaction and identity datasets
# No `how=` is given, so pandas performs its default inner join: only
# TransactionIDs present in BOTH tables survive.
whole_df = pd.merge(train_trans_df, train_identity_df, on='TransactionID')

In [None]:
# Drop features with too many null values
# `null_percentage` holds, per column, the FRACTION (0..1) of missing entries;
# every column whose fraction exceeds `minimum` is removed.
minimum = 0.5
null_percentage = whole_df.isna().mean()
too_sparse = null_percentage.index[null_percentage > minimum]
whole_df = whole_df.drop(columns=too_sparse)

In [None]:
# Define and encode categorical features
# 'isFraud' (the target) is listed with the categorical columns, which also
# keeps it out of any column list derived as "everything not in `categories`".
trans_category_features = ['isFraud', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

identity_category_features = [f'id_{i}' for i in range(12,39)] + ['DeviceType', 'DeviceInfo']

categories = trans_category_features + identity_category_features

# Columns already removed by the null-ratio filter are skipped. Casting to str
# first turns missing values into the literal string 'nan', so they receive
# their own integer code instead of breaking the encoder.
for feature in categories:
  if feature in whole_df.columns:
    le = LabelEncoder()
    whole_df[feature] = le.fit_transform(whole_df[feature].astype(str))

# Fill null values with -999
# DataFrame.fillna returns a NEW frame; the result must be assigned back,
# otherwise the fill is silently discarded and the NaNs remain.
whole_df = whole_df.fillna(-999)

In [None]:
# Extract numerical features
# Everything that was not declared categorical (in frame column order).
declared_categorical = set(categories)
numerical_features = [column for column in whole_df.columns
                      if column not in declared_categorical]

In [None]:
# Feature Scaling
# Standardise the numerical columns (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/validation/test split — confirm this leakage is acceptable.
scaler = StandardScaler()
numeric_block = whole_df[numerical_features]
scaler.fit(numeric_block)
whole_df[numerical_features] = scaler.transform(numeric_block)

In [None]:
# Remove low variance features
# Columns whose variance does not exceed 0.05 are dropped.
selector = VarianceThreshold(threshold=0.05)
selector.fit(whole_df)
keep_mask = selector.get_support().copy()

# 'isFraud' is the label, not a feature: it must survive regardless of its
# variance (a 0/1 column with a rare positive class has variance p*(1-p),
# which can fall below the threshold and would break the X/y split).
keep_mask[whole_df.columns.get_loc('isFraud')] = True

columns_kept = whole_df.columns[keep_mask]
# Rebuild the frame as an all-float matrix with a fresh 0..n-1 index, matching
# what wrapping the selector's transformed output in a DataFrame produces.
matrix_whole_df = whole_df[columns_kept].astype(float).values
whole_df = pd.DataFrame(matrix_whole_df, columns=columns_kept)

In [None]:
# Get X and Y
# y is the fraud label; X is every remaining column.
y = whole_df['isFraud']
X = whole_df.drop(columns=['isFraud'])

In [None]:
# Get train, validation, and test sets
# Step 1: hold out 2% of the rows. Step 2: halve that hold-out into a
# validation and a test set. Both splits use the same seed (20).
X_train, X_test_valid, y_train, y_test_valid = train_test_split(
    X, y,
    test_size=0.02,
    random_state=20,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test_valid, y_test_valid,
    test_size=0.5,
    random_state=20,
)

In [None]:
# Remove any rows with null values
# For each split: drop feature rows containing NaN, then keep only the labels
# whose index label survived, so X and y stay aligned.
# Note that the validation/test labels are overwritten in place (y_valid,
# y_test) while the training labels get a new name (y_train_dropped).
X_train_dropped = X_train.dropna()
y_train_dropped = y_train.loc[X_train_dropped.index]

X_valid_dropped = X_valid.dropna()
y_valid = y_valid.loc[X_valid_dropped.index]

X_test_dropped = X_test.dropna()
y_test = y_test.loc[X_test_dropped.index]

In [None]:
# Resampling for unbalanced dataset
# Only the training split is oversampled; validation and test keep their
# original class balance.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=20)
resampled_pair = smote.fit_resample(X_train_dropped, y_train_dropped)
X_train_resampled, y_train_resampled = resampled_pair

Logistic Regression

In [None]:
import tensorflow as tf

class log_reg_model(tf.keras.Model):
  """Logistic regression expressed as a single Dense unit.

  The layer has no activation, so `call` returns raw logits; apply a sigmoid
  (or use a loss with `from_logits=True`) to obtain probabilities.

  Args:
    input_size: accepted for interface compatibility but not used — the Dense
      layer infers its input width on first call.
  """

  def __init__(self, input_size):
    super().__init__()
    glorot_init = tf.keras.initializers.GlorotNormal()
    self.linear = tf.keras.layers.Dense(1, kernel_initializer=glorot_init)

  def call(self, x):
    """Return the logit produced by the linear layer for each row of `x`."""
    return self.linear(x)

In [None]:
# Model Training
# Full-batch training: every epoch runs one forward/backward pass over the
# entire resampled training set.
input_dim = X_train_resampled.shape[1]
model_log_reg = log_reg_model(input_dim)

# The model outputs logits, hence from_logits=True.
criterion = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

X_train_tensor = tf.convert_to_tensor(X_train_resampled.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

num_epochs = 4000
for epoch in range(num_epochs):
  # Only the forward pass and the loss need to be recorded by the tape.
  with tf.GradientTape() as tape:
    outputs = model_log_reg(X_train_tensor)
    loss = criterion(y_train_tensor, outputs)

  weights = model_log_reg.trainable_variables
  gradients = tape.gradient(loss, weights)
  optimizer.apply_gradients(zip(gradients, weights))

  # Report progress every 20th epoch.
  if epoch % 20 == 0:
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.numpy():.4f}')

In [None]:
# Model Evaluation on Validation Set to test learning rate
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_curve, auc

X_valid_tensor = tf.convert_to_tensor(X_valid_dropped.values, dtype=tf.float32)
y_valid_tensor = tf.convert_to_tensor(y_valid.values, dtype=tf.float32)

# Keep the model output in its own variable: storing the logits in
# `y_valid_tensor` would overwrite the ground-truth labels, and every metric
# below would then compare predictions against logits.
valid_logits = model_log_reg(X_valid_tensor)

# The model returns logits; the sigmoid turns them into probabilities.
y_pred = tf.math.sigmoid(valid_logits)

threshold = 0.5
predicted = tf.squeeze(tf.cast(y_pred > threshold, tf.float32))

y_valid_tensor = tf.squeeze(y_valid_tensor)

correct = tf.equal(predicted, y_valid_tensor)
accuracy = tf.reduce_mean(tf.cast((correct), tf.float32)).numpy()
print(f'accuracy: {accuracy}')

# Calculate AUC
# NOTE: from here on the name `auc` refers to the score, shadowing the
# sklearn.metrics.auc function imported above.
probs_np = y_pred.numpy().squeeze()
y_valid_np = y_valid_tensor.numpy()
auc = roc_auc_score(y_valid_np, probs_np)
print(f'AUC: {auc}')

# Confusion matrix
# Layout: [[TN, FP], [FN, TP]]
confuse_matrix = confusion_matrix(y_valid_np, predicted.numpy())
print(f'Confusion Matrix : {confuse_matrix}')

# F1
precision = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[0,1])
recall = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[1,0])
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

In [None]:
# Evaluating logistic regression model on the test set
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_curve, auc

X_test_tensor = tf.convert_to_tensor(X_test_dropped.values, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

# Logits for the test set and for the (resampled) training set.
y_pred_tensor = model_log_reg(X_test_tensor)
y_pred_train_tensor = model_log_reg(X_train_tensor)

# Sigmoid maps logits to probabilities.
y_pred = tf.math.sigmoid(y_pred_tensor)
y_pred_train = tf.math.sigmoid(y_pred_train_tensor)

threshold = 0.5
above_threshold = tf.cast(y_pred > threshold, tf.float32)
predicted = tf.squeeze(above_threshold)

y_test_tensor = tf.squeeze(y_test_tensor)

correct = tf.equal(predicted, y_test_tensor)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)).numpy()
print(f'accuracy: {accuracy}')

# Calculate AUC
# `auc` now names the test score and shadows the imported sklearn function.
probs_np = y_pred.numpy().squeeze()
y_test_np = y_test_tensor.numpy()
auc = roc_auc_score(y_test_np, probs_np)
train_auc = roc_auc_score(y_train_tensor.numpy(), y_pred_train.numpy())
print(f'AUC: {auc}')
print(f'Training AUC: {train_auc}')

# Confusion matrix
# Row-major flattening of [[TN, FP], [FN, TP]].
confuse_matrix = confusion_matrix(y_test_np, predicted.numpy())
print(f'Confusion Matrix : {confuse_matrix}')
tn, fp, fn, tp = confuse_matrix.ravel()

# F1
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

In [None]:
# Plot ROC curve
# Uses the test labels and probabilities produced by the logistic regression
# test-evaluation cell (y_test_np, probs_np).
import matplotlib.pyplot as plt

false_positive, true_positive, _ = roc_curve(y_test_np, probs_np)

fig, ax = plt.subplots(figsize=(12,9))
ax.plot(false_positive, true_positive, color='darkorange', lw=2, label='ROC curve')
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
plt.show()

XGBoost

In [None]:
# Fitting and training XGB Model of Depth 10
import xgboost as xgb
import matplotlib.pyplot as plt

# Sets scored after every boosting round: index 0 = training, index 1 = validation.
eval_list = [(X_train_resampled, y_train_resampled), (X_valid_dropped, y_valid)]

# Ratio of negative to positive training labels, used as the positive-class weight.
negative_count = np.sum(y_train_resampled == 0)
positive_count = np.sum(y_train_resampled == 1)

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=negative_count/positive_count,
    max_depth=10,
    n_estimators=500,
    learning_rate=0.00001,
)

xgb_model.fit(X_train_resampled, y_train_resampled, eval_set=eval_list)

# Per-round log loss for each entry of eval_list.
results = xgb_model.evals_result()

In [None]:
# Plot Binary Cross Entropy Loss
# validation_0 / validation_1 are the first / second entries of the eval_set
# passed to fit (training and validation respectively).
train_logloss = results['validation_0']['logloss']
valid_logloss = results['validation_1']['logloss']
epochs = range(len(train_logloss))

fig, ax = plt.subplots(figsize=(12,9))
ax.plot(epochs, train_logloss, label='Training')
ax.plot(epochs, valid_logloss, label='Validation')
ax.set_xlabel('Epochs')
ax.set_ylabel('Log Loss')
ax.legend()
plt.show()

In [None]:
# Evaluating XGB model of depth 10 on the test set
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Hard class labels for accuracy / confusion matrix, positive-class
# probabilities (column 1 of predict_proba) for ROC AUC.
y_pred_xgb = xgb_model.predict(X_test_dropped)
test_fraud_proba = xgb_model.predict_proba(X_test_dropped)[:,1]
train_fraud_proba = xgb_model.predict_proba(X_train_resampled)[:,1]

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
roc_xgb = roc_auc_score(y_test, test_fraud_proba)
roc_train = roc_auc_score(y_train_resampled, train_fraud_proba)
confuse_matrix = confusion_matrix(y_test, y_pred_xgb)

# Confuse matrix: [[TN, FP] [FN, TP]]
print(f"accuracy xgb depth 10: {accuracy_xgb}")
print(f"roc_xgb depth 10: {roc_xgb}")
print(f"roc_xgb training depth 10: {roc_train}")
print(f"confuse_matrix depth 10: {confuse_matrix}")

# F1
tn, fp, fn, tp = confuse_matrix.ravel()
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision_depth10:{precision}')
print(f'Recall_depth10:{recall}')
print(f'F1_depth10: {f1}')

In [None]:
# Hyperparameter search for XGBoost
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
# Every key must appear exactly once: a dict literal silently keeps only the
# LAST value of a repeated key, so a second 'learning_rate' entry would
# discard the first one without any warning.
params = {
    'learning_rate': [0.01, 0.001, 0.0001, 0.00001],
    'max_depth': [20, 25, 30],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.3,0.4,0.5,0.6,0.7, 0.8, 0.9],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_alpha': [0, 0.1, 0.2, 0.3],
    'reg_lambda': [0, 0.1, 0.2, 0.3],
}

# 30 random draws, each scored by 5-fold cross-validated ROC AUC.
# random_state pins the draws so the reported best_params_ are reproducible
# (20 matches the seed used for the splits and SMOTE).
param_search = RandomizedSearchCV(estimator=xgb_model, param_distributions = params, scoring='roc_auc',
                                  cv=5, n_iter = 30, n_jobs=-1, random_state=20)
# NOTE(review): the search is fitted on the validation split only, not on the
# training data — confirm this is intentional (e.g. to keep it cheap).
param_search.fit(X_valid_dropped, y_valid)

best_params = param_search.best_params_
best_model = param_search.best_estimator_
print(best_params)

In [None]:
# Optimized XGBoost Model following hyperparameter optimization

import xgboost as xgb

# Index 0 = training set, index 1 = validation set.
eval_list = [(X_train_resampled, y_train_resampled), (X_valid_dropped, y_valid)]

# Negative-to-positive label ratio of the training set.
negative_count = np.sum(y_train_resampled == 0)
positive_count = np.sum(y_train_resampled == 1)

# Hyperparameters are written out explicitly rather than read from best_params.
xgb_model_depth25 = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=negative_count/positive_count,
    max_depth=25,
    subsample=0.9,
    min_child_weight=1,
    colsample_bytree=0.5,
    gamma=0,
    reg_lambda=0.2,
    reg_alpha=0.2,
    learning_rate=0.00001,
    n_estimators=500,
    early_stopping_rounds=100,
)

xgb_model_depth25.fit(X_train_resampled, y_train_resampled, eval_set=eval_list)

# Per-round log loss for each entry of eval_list.
results = xgb_model_depth25.evals_result()

In [None]:
# Log-loss curves of the most recently fitted XGBoost model (`results`):
# validation_0 is the training set, validation_1 the validation set.
train_logloss = results['validation_0']['logloss']
valid_logloss = results['validation_1']['logloss']
epochs = range(len(train_logloss))

fig, ax = plt.subplots(figsize=(12,9))
ax.plot(epochs, train_logloss, label='Training')
ax.plot(epochs, valid_logloss, label='Validation')
ax.set_xlabel('Epochs')
ax.set_ylabel('Log Loss')
ax.legend()
plt.show()

In [None]:
# Evaluating optimized XGBoost Model on the test set
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

y_pred_xgb = xgb_model_depth25.predict(X_test_dropped)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
roc_xgb = roc_auc_score(y_test, xgb_model_depth25.predict_proba(X_test_dropped)[:,1])
# Training AUC must score the TRAINING features against the training labels;
# pairing y_train_resampled with test-set probabilities mismatches both the
# lengths and the rows being compared.
train_roc = roc_auc_score(y_train_resampled, xgb_model_depth25.predict_proba(X_train_resampled)[:,1])
confuse_matrix = confusion_matrix(y_test, y_pred_xgb)

# Confuse matrix: [[TN, FP] [FN, TP]]
print(f"accuracy xgb depth 25: {accuracy_xgb}")
print(f"roc_xgb depth 25: {roc_xgb}")
print(f"Training roc depth 25: {train_roc}")
print(f"confuse_matrix depth 25: {confuse_matrix}")

# F1
precision = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[0,1])
recall = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[1,0])
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision_depth 25:{precision}')
print(f'Recall_depth 25:{recall}')
print(f'F1_depth25: {f1}')

In [None]:
# Threshold tuning for XGBoost on Validation Set
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Positive-class probability for every validation row.
y_pred_valid_thres = xgb_model_depth25.predict_proba(X_valid_dropped)[:,1]

# Rows strictly above the cut-off are labelled 1.
threshold = 0.499
is_flagged = y_pred_valid_thres > threshold
y_pred_xgb = is_flagged.astype(int)

accuracy_xgb = accuracy_score(y_valid, y_pred_xgb)
roc_xgb = roc_auc_score(y_valid, y_pred_valid_thres)
confuse_matrix = confusion_matrix(y_valid, y_pred_xgb)

# Confuse matrix: [[TN, FP] [FN, TP]]
print(f"accuracy xgb depth 25: {accuracy_xgb}")
print(f"roc_xgb depth 25: {roc_xgb}")
print(f"confuse_matrix depth 25: {confuse_matrix}")

# F1
tn, fp, fn, tp = confuse_matrix.ravel()
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision_depth25:{precision}')
print(f'Recall_depth25:{recall}')
print(f'F1_depth25: {f1}')

In [None]:
# Evaluation of tuned XGBoost on Test Set
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Positive-class probability for every test row.
y_pred_test_thres = xgb_model_depth25.predict_proba(X_test_dropped)[:,1]

# Same cut-off as used on the validation set; strictly-greater comparison.
threshold = 0.499
is_flagged = y_pred_test_thres > threshold
y_pred_xgb = is_flagged.astype(int)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
roc_xgb = roc_auc_score(y_test, y_pred_test_thres)
confuse_matrix = confusion_matrix(y_test, y_pred_xgb)

# Confuse matrix: [[TN, FP] [FN, TP]]
print(f"accuracy xgb depth 25: {accuracy_xgb}")
print(f"roc_xgb depth 25: {roc_xgb}")
print(f"confuse_matrix depth 25: {confuse_matrix}")

# F1
tn, fp, fn, tp = confuse_matrix.ravel()
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision_depth25:{precision}')
print(f'Recall_depth25:{recall}')
print(f'F1_depth25: {f1}')

In [None]:
# Plot ROC curve

import matplotlib.pyplot as plt
# Imported here so the cell does not rely on an import made in another cell.
from sklearn.metrics import roc_curve

# Test labels must be paired with TEST probabilities (y_pred_test_thres);
# the validation probabilities belong to different rows.
false_positive, true_positive, _ = roc_curve(y_test, y_pred_test_thres)

plt.figure(figsize=(12,9))
plt.plot(false_positive, true_positive, color='darkorange', lw=2, label='ROC curve')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.show()

Neural Networks

In [None]:
# Fitting and Training a single-layer RNN Model with ReLU activation in Hidden Layers
import tensorflow as tf

# Each row is presented as a sequence of length 1 whose single time step
# carries all feature columns: (rows, 1, features).
n_train_rows, n_features = X_train_resampled.shape
n_valid_rows, n_valid_features = X_valid_dropped.shape

recurrent_layer = tf.keras.layers.SimpleRNN(units=128, activation='relu', input_shape=(1, n_features))
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
model_relu = tf.keras.Sequential([recurrent_layer, output_layer])

model_relu.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

X_train_resampled_rnn = tf.convert_to_tensor(
    X_train_resampled.values.reshape((n_train_rows, 1, n_features)), dtype=tf.float32)
y_train_resampled_rnn = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

# Despite the "resampled" in the names, the validation tensors come from the
# un-resampled validation split.
X_val_resampled_rnn = tf.convert_to_tensor(
    X_valid_dropped.values.reshape((n_valid_rows, 1, n_valid_features)), dtype=tf.float32)
y_val_resampled_rnn = tf.convert_to_tensor(y_valid.values, dtype=tf.float32)

eval_set = (X_val_resampled_rnn, y_val_resampled_rnn)

history = model_relu.fit(
    X_train_resampled_rnn,
    y_train_resampled_rnn,
    epochs=15,
    batch_size=32,
    validation_data=eval_set,
)

In [None]:
# Plot binary cross entropy loss
# Per-epoch training and validation loss recorded while fitting the ReLU model.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training')
ax.plot(history.history['val_loss'], label='Validation')
ax.set_title('ReLU Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()

In [None]:
# Evaluation of single-layer RNN Model with ReLU activation in Hidden Layers

# Reshape the test features to (rows, 1, features) to match the model input.
n_test_rows, n_test_features = X_test_dropped.shape
X_test_dropped_rnn = tf.convert_to_tensor(
    X_test_dropped.values.reshape((n_test_rows, 1, n_test_features)), dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

# evaluate() yields the loss followed by the compiled metrics (accuracy, AUC).
loss, accuracy, auc = model_relu.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)

y_pred_rnn = model_relu.predict(X_test_dropped_rnn)

# Probabilities at or above the threshold are labelled 1 (note the >=).
threshold = 0.5
predicted = tf.cast(y_pred_rnn >= threshold, tf.float32).numpy()

y_test_rnn = y_test_rnn.numpy()

# Row-major flattening of [[TN, FP], [FN, TP]].
confuse_matrix = confusion_matrix(y_test_rnn, predicted)
tn, fp, fn, tp = confuse_matrix.ravel()
recall = tp/(tp + fn)
precision = tp/(tp + fp)
f1 = (2*precision*recall)/(precision + recall)

print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')
print(f'Confusion Matrix : {confuse_matrix}')
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

In [None]:
# Fitting and Training RNN Model with Leaky ReLU activation in Hidden Layers
import tensorflow as tf

# Inputs are shaped (rows, 1, features): one time step per row.
n_train_rows, n_features = X_train_resampled.shape
n_valid_rows, n_valid_features = X_valid_dropped.shape

# The recurrent layer is linear (activation=None); the non-linearity is the
# separate LeakyReLU layer applied to its output.
recurrent_layer = tf.keras.layers.SimpleRNN(units=128, activation=None, input_shape=(1, n_features))
leaky_activation = tf.keras.layers.LeakyReLU(alpha=0.01)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
model_leaky = tf.keras.Sequential([recurrent_layer, leaky_activation, output_layer])

model_leaky.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

X_train_resampled_rnn = tf.convert_to_tensor(
    X_train_resampled.values.reshape((n_train_rows, 1, n_features)), dtype=tf.float32)
y_train_resampled_rnn = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

# Validation tensors come from the un-resampled validation split.
X_val_resampled_rnn = tf.convert_to_tensor(
    X_valid_dropped.values.reshape((n_valid_rows, 1, n_valid_features)), dtype=tf.float32)
y_val_resampled_rnn = tf.convert_to_tensor(y_valid.values, dtype=tf.float32)

eval_set = (X_val_resampled_rnn, y_val_resampled_rnn)

history_leaky = model_leaky.fit(
    X_train_resampled_rnn,
    y_train_resampled_rnn,
    epochs=15,
    batch_size=32,
    validation_data=eval_set,
)

In [None]:
# Plot binary cross entropy loss

# Per-epoch training and validation loss recorded while fitting the Leaky ReLU model.
fig, ax = plt.subplots()
ax.plot(history_leaky.history['loss'], label='Training')
ax.plot(history_leaky.history['val_loss'], label='Validation')
ax.set_title('Leaky ReLU Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()

In [None]:
# Evaluation of single-layer RNN Model with Leaky ReLU activation in hidden layers

# Reshape the test features to (rows, 1, features) to match the model input.
n_test_rows, n_test_features = X_test_dropped.shape
X_test_dropped_rnn = tf.convert_to_tensor(
    X_test_dropped.values.reshape((n_test_rows, 1, n_test_features)), dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

# evaluate() yields the loss followed by the compiled metrics (accuracy, AUC).
loss, accuracy, auc = model_leaky.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)

y_pred_rnn = model_leaky.predict(X_test_dropped_rnn)

# Probabilities strictly above the threshold are labelled 1.
threshold = 0.5
predicted = tf.cast(y_pred_rnn > threshold, tf.float32).numpy()

y_test_rnn = y_test_rnn.numpy()

# Row-major flattening of [[TN, FP], [FN, TP]].
confuse_matrix = confusion_matrix(y_test_rnn, predicted)
tn, fp, fn, tp = confuse_matrix.ravel()
recall = tp/(tp + fn)
precision = tp/(tp + fp)
f1 = (2*precision*recall)/(precision + recall)

print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')
print(f'Confusion Matrix : {confuse_matrix}')
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

Models with LSTM Layers

In [None]:
# Fitting and Training RNN Model with LSTM Hidden Layers

import tensorflow as tf

# One LSTM layer (128 units, ReLU) followed by a sigmoid output unit. Inputs
# are shaped (rows, 1, features): a single time step per row.
model_LSTM_relu = tf.keras.Sequential([tf.keras.layers.LSTM(units=128, activation='relu', input_shape=(1, X_train_resampled.shape[1])),
                             tf.keras.layers.Dense(1, activation='sigmoid')])

model_LSTM_relu.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

X_train_resampled_lstm = X_train_resampled.values.reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
# Convert the array reshaped on the line above. Converting
# `X_train_resampled_rnn` instead would make this cell depend on a variable
# left behind by a different cell (NameError on a fresh kernel, or stale data).
X_train_resampled_lstm = tf.convert_to_tensor(X_train_resampled_lstm, dtype=tf.float32)
y_train_resampled_lstm = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

# Validation tensors come from the un-resampled validation split.
X_val_resampled_rnn = X_valid_dropped.values.reshape((X_valid_dropped.shape[0], 1, X_valid_dropped.shape[1]))
X_val_resampled_rnn = tf.convert_to_tensor(X_val_resampled_rnn, dtype=tf.float32)
y_val_resampled_rnn = tf.convert_to_tensor(y_valid.values, dtype=tf.float32)

eval_set = (X_val_resampled_rnn, y_val_resampled_rnn)

history_lstm = model_LSTM_relu.fit(X_train_resampled_lstm, y_train_resampled_lstm, epochs=15, batch_size=32, validation_data=eval_set)

In [None]:
# Plot binary cross entropy loss

# Per-epoch training and validation loss recorded while fitting the LSTM model.
fig, ax = plt.subplots()
ax.plot(history_lstm.history['loss'], label='Training')
ax.plot(history_lstm.history['val_loss'], label='Validation')
ax.set_title('LSTM Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()

In [None]:
# Evaluation of RNN Model with LSTM Hidden Layers on Test Set

# Reshape the test features to (rows, 1, features) to match the model input.
n_test_rows, n_test_features = X_test_dropped.shape
X_test_dropped_rnn = tf.convert_to_tensor(
    X_test_dropped.values.reshape((n_test_rows, 1, n_test_features)), dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

# evaluate() yields the loss followed by the compiled metrics (accuracy, AUC).
loss, accuracy, auc = model_LSTM_relu.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)

y_pred_rnn = model_LSTM_relu.predict(X_test_dropped_rnn)

# Probabilities strictly above the threshold are labelled 1.
threshold = 0.5
predicted = tf.cast(y_pred_rnn > threshold, tf.float32).numpy()

y_test_rnn = y_test_rnn.numpy()

# Row-major flattening of [[TN, FP], [FN, TP]].
confuse_matrix = confusion_matrix(y_test_rnn, predicted)
tn, fp, fn, tp = confuse_matrix.ravel()
recall = tp/(tp + fn)
precision = tp/(tp + fp)
f1 = (2*precision*recall)/(precision + recall)

print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')
print(f'Confusion Matrix : {confuse_matrix}')
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

Increased Model Complexity

In [None]:
# Fitting and Training RNN Model with 2 LSTM and 1 Dense Hidden Layers

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping

# Derive the feature count from the data instead of hard-coding it: the number
# of columns depends on how many survive the null-ratio and variance filters,
# and a fixed literal makes reshape fail as soon as that count changes.
n_train_rows, n_features = X_train_resampled.shape
X_train_resampled_lstm = X_train_resampled.values.reshape((n_train_rows, 1, n_features))
X_train_resampled_lstm = tf.convert_to_tensor(X_train_resampled_lstm, dtype=tf.float32)
y_train_resampled_lstm = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

# Two stacked LSTM layers (the first returns the full sequence so the second
# can consume it), one dense hidden layer, and a sigmoid output unit.
model_LSTM_complex = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1, X_train_resampled_lstm.shape[2])),
    tf.keras.layers.LSTM(units=256, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])

model_LSTM_complex.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

# NOTE(review): this callback is created but NOT passed to fit() below, so no
# early stopping happens for this model — confirm whether that is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Validation tensors come from the un-resampled validation split.
X_val_resampled_rnn = X_valid_dropped.values.reshape((X_valid_dropped.shape[0], 1, X_valid_dropped.shape[1]))
X_val_resampled_rnn = tf.convert_to_tensor(X_val_resampled_rnn, dtype=tf.float32)
y_val_resampled_rnn = tf.convert_to_tensor(y_valid.values, dtype=tf.float32)

eval_set = (X_val_resampled_rnn, y_val_resampled_rnn)

history_complex = model_LSTM_complex.fit(X_train_resampled_lstm, y_train_resampled_lstm, epochs=15, batch_size=32, validation_data=eval_set)

In [None]:
# Plot binary cross entropy loss

import matplotlib.pyplot as plt

# Per-epoch training and validation loss recorded while fitting the stacked LSTM model.
fig, ax = plt.subplots()
ax.plot(history_complex.history['loss'], label='Training')
ax.plot(history_complex.history['val_loss'], label='Validation')
ax.set_title('More Complex LSTM Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()

In [None]:
# Evaluation of RNN Model with 2 LSTM Hidden Layers and 1 Hidden Dense Layer on Test Set

from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Reshape the test features to (rows, 1, features) to match the model input.
n_test_rows, n_test_features = X_test_dropped.shape
X_test_dropped_rnn = tf.convert_to_tensor(
    X_test_dropped.values.reshape((n_test_rows, 1, n_test_features)), dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

# evaluate() yields the loss followed by the compiled metrics (accuracy, AUC).
loss, accuracy, auc = model_LSTM_complex.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)

y_pred_rnn = model_LSTM_complex.predict(X_test_dropped_rnn)

# Probabilities strictly above the threshold are labelled 1.
threshold = 0.5
predicted = tf.cast(y_pred_rnn > threshold, tf.float32).numpy()

y_test_rnn = y_test_rnn.numpy()

# Row-major flattening of [[TN, FP], [FN, TP]].
confuse_matrix = confusion_matrix(y_test_rnn, predicted)
tn, fp, fn, tp = confuse_matrix.ravel()
recall = tp/(tp + fn)
precision = tp/(tp + fp)
f1 = (2*precision*recall)/(precision + recall)

print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')
print(f'Confusion Matrix : {confuse_matrix}')
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

In [None]:
# Fitting and Training RNN Model with 2 LSTM and 1 Dense Hidden Layers
# (dropout, L2 weight regularization and early stopping added)

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Derive the feature count from the data instead of hard-coding it: the number
# of columns depends on how many survive the null-ratio and variance filters.
n_train_rows, n_features = X_train_resampled.shape
X_train_resampled_lstm = X_train_resampled.values.reshape((n_train_rows, 1, n_features))
X_train_resampled_lstm = tf.convert_to_tensor(X_train_resampled_lstm, dtype=tf.float32)
y_train_resampled_lstm = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

# The monitored key must match the name under which the metric is logged.
# The metric is therefore created with an explicit name below ('auc'), and
# mode='max' states that a HIGHER AUC is better. A monitor name that is not
# in the logs is only warned about and the callback never fires.
# NOTE(review): no validation data is passed to fit(), so this monitors the
# TRAINING AUC — consider validation_data + monitor='val_auc'.
early_stopping = EarlyStopping(monitor='auc', mode='max', patience=5, restore_best_weights=True)

model_LSTM_complex_reg = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1, X_train_resampled_lstm.shape[2])),
    tf.keras.layers.LSTM(units=256,
                         activation='relu',
                         dropout=0.05,
                         recurrent_dropout=0.05,
                         kernel_regularizer=tf.keras.regularizers.l2(0.0001),
                         return_sequences=True),
    tf.keras.layers.LSTM(units=256,
                         activation='relu',
                         dropout=0.05,
                         recurrent_dropout=0.05,
                         kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
    tf.keras.layers.Dense(units=256,
                          activation='relu',
                          kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
    tf.keras.layers.Dense(1, activation='sigmoid')])

adam_optimizer = Adam(learning_rate=0.00001)

# Explicitly named metric: string shortcuts get auto-generated names that can
# carry a numeric suffix once several models were built in the same session.
model_LSTM_complex_reg.compile(optimizer=adam_optimizer, loss='binary_crossentropy',
                               metrics=[tf.keras.metrics.AUC(name='auc')])

history_reg = model_LSTM_complex_reg.fit(X_train_resampled_lstm, y_train_resampled_lstm, epochs=40, batch_size=32,callbacks=[early_stopping])

In [None]:
# Hyperparameter tuning for multi-layer LSTM Model on Validation Set
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score

# Reshape the validation features to (rows, 1, features) to match the model input.
X_valid_dropped_rnn = X_valid_dropped.values.reshape((X_valid_dropped.shape[0], 1, X_valid_dropped.shape[1]))
X_valid_dropped_rnn = tf.convert_to_tensor(X_valid_dropped_rnn, dtype=tf.float32)
y_valid_rnn = tf.convert_to_tensor(y_valid.values, dtype=tf.float32)

# Everything in this cell uses the VALIDATION features and labels. Mixing in
# the test tensors pairs rows from different splits and depends on variables
# left behind by other cells.
loss, auc = model_LSTM_complex_reg.evaluate(X_valid_dropped_rnn, y_valid_rnn, batch_size=32)

# Probabilities strictly above the threshold are labelled 1.
threshold = 0.5
y_pred_rnn = model_LSTM_complex_reg.predict(X_valid_dropped_rnn)
predicted = tf.cast(y_pred_rnn > threshold, tf.float32).numpy()

y_valid_rnn = y_valid_rnn.numpy()

# Layout: [[TN, FP], [FN, TP]]
confuse_matrix = confusion_matrix(y_valid_rnn, predicted)
recall = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[1,0])
precision = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[0,1])
f1 = (2*precision*recall)/(precision + recall)

print(f'Validation Loss: {loss}, Validation AUC: {auc}')
print(f'Confusion Matrix : {confuse_matrix}')
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

In [None]:
# Evaluation of RNN Model with 2 LSTM Hidden Layers and 1 Hidden Dense Layer (along with regularization, early stopping, and dropout) on Test Set
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Reshape the test features to (rows, 1, features) to match the model input.
n_test_rows, n_test_features = X_test_dropped.shape
X_test_dropped_rnn = tf.convert_to_tensor(
    X_test_dropped.values.reshape((n_test_rows, 1, n_test_features)), dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

# This model is compiled with a single metric, so evaluate() yields (loss, AUC).
loss, auc = model_LSTM_complex_reg.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)

# Probabilities strictly above the threshold are labelled 1.
threshold = 0.5
y_pred_rnn = model_LSTM_complex_reg.predict(X_test_dropped_rnn)
predicted = tf.cast(y_pred_rnn > threshold, tf.float32).numpy()

y_test_rnn = y_test_rnn.numpy()

# Row-major flattening of [[TN, FP], [FN, TP]].
confuse_matrix = confusion_matrix(y_test_rnn, predicted)
tn, fp, fn, tp = confuse_matrix.ravel()
recall = tp/(tp + fn)
precision = tp/(tp + fp)
f1 = (2*precision*recall)/(precision + recall)

print(f'Test Loss: {loss}, Test AUC: {auc}')
print(f'Confusion Matrix : {confuse_matrix}')
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

In [None]:
# Plot ROC curve
# Uses whatever `y_test_rnn` / `y_pred_rnn` currently hold, i.e. the labels
# and probabilities left by the most recently run RNN evaluation cell.

import matplotlib.pyplot as plt

false_positive, true_positive, _ = roc_curve(y_test_rnn, y_pred_rnn)

fig, ax = plt.subplots(figsize=(12,9))
ax.plot(false_positive, true_positive, color='darkorange', lw=2, label='ROC curve')
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
plt.show()