In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Data Processing:

In [3]:
# Extract transaction data
num_chunks = 10000
chunks = []
for chunk in pd.read_csv("/content/drive/My Drive/CS230_Folder/train_transaction.csv",
                         chunksize=num_chunks, engine='python'):
    chunks.append(chunk)
train_trans_df = pd.concat(chunks)

In [4]:
# Extract identity data
num_chunks = 10000
chunks = []
for chunk in pd.read_csv("/content/drive/My Drive/CS230_Folder/train_identity.csv",
                         chunksize=num_chunks, engine='python'):
    chunks.append(chunk)
train_identity_df = pd.concat(chunks)

In [5]:
# Merge transaction and identity datasets
whole_df = train_trans_df.merge(train_identity_df, on='TransactionID')

In [6]:
# Drop features with too many null values
minimum = 0.5
null_percentage = whole_df.isnull().mean()
whole_df = whole_df.drop(columns=null_percentage[null_percentage > minimum].index)

In [7]:
# Define and encode categorical features
trans_category_features = ['isFraud', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

identity_category_features = [f'id_{i}' for i in range(12,39)] + ['DeviceType', 'DeviceInfo']

categories = trans_category_features + identity_category_features

for feature in categories:
  if feature in whole_df.columns:
    le = LabelEncoder()
    whole_df[feature] = le.fit_transform(whole_df[feature].astype(str))

# Fill null values with -999
whole_df.fillna(-999)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0,86506,50.000,1,5786,399,39,2,2,...,124,3,164,3,1,0,1,1,1,954
1,2987008,0,86535,15.000,1,4968,0,39,4,97,...,98,3,48,2,1,0,0,1,1,1727
2,2987010,0,86549,75.887,0,3487,243,10,2,28,...,44,4,260,4,0,0,1,1,0,1598
3,2987011,0,86555,16.495,0,5766,266,66,2,95,...,44,4,260,4,0,0,1,1,0,1742
4,2987016,0,86620,30.000,1,4247,437,39,4,97,...,44,2,40,3,1,0,1,1,0,723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144228,3577521,0,15810802,48.877,0,1104,198,4,2,95,...,57,4,260,4,0,0,1,0,1,276
144229,3577526,1,15810876,250.000,2,1171,71,39,4,97,...,31,3,255,3,1,0,1,0,1,65
144230,3577529,0,15810912,73.838,0,6080,437,66,2,31,...,54,4,260,4,0,0,1,0,1,748
144231,3577531,0,15810935,400.000,2,6523,464,39,4,97,...,56,2,183,3,1,0,1,0,0,723


In [8]:
# Extract numerical features
numerical_features = [feature for feature in whole_df.columns if feature not in categories]

In [9]:
# Feature Scaling
scaler = StandardScaler()
whole_df[numerical_features] = scaler.fit_transform(whole_df[numerical_features])

In [10]:
# Remove low variance features
selector = VarianceThreshold(threshold=0.05)
matrix_whole_df = selector.fit_transform(whole_df)
columns_kept = whole_df.columns[selector.get_support()]
whole_df = pd.DataFrame(matrix_whole_df, columns=columns_kept)

In [11]:
# Get X and Y
X = whole_df.drop('isFraud', axis=1)
y = whole_df['isFraud']

In [12]:
# Get train, validation, and test sets
X_train, X_test_valid, y_train, y_test_valid = train_test_split(X, y, test_size=0.02, random_state=20)
X_valid, X_test, y_valid, y_test = train_test_split(X_test_valid, y_test_valid, test_size=0.5, random_state=20)

In [14]:
# Remove any rows with null values
X_train_dropped = X_train.dropna()
X_valid_dropped = X_valid.dropna()
X_test_dropped = X_test.dropna()
y_train_dropped = y_train[X_train_dropped.index]
y_valid = y_valid[X_valid_dropped.index]
y_test = y_test[X_test_dropped.index]

In [15]:
# Resampling for unbalanced dataset
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=20)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_dropped, y_train_dropped)

Logistic Regression

In [22]:
import tensorflow as tf

class log_reg_model(tf.keras.Model):
  def __init__(self, input_size):
    super(log_reg_model, self).__init__()
    self.linear = tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.GlorotNormal())


  def call(self, x):
    z = self.linear(x)
    return z

In [23]:
# Model Training
input_dim = X_train_resampled.shape[1]
model_log_reg = log_reg_model(input_dim)

criterion = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

X_train_tensor = tf.convert_to_tensor(X_train_resampled.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

num_epochs = 4000
for epoch in range(num_epochs):
  with tf.GradientTape() as tape:
    outputs = model_log_reg(X_train_tensor)
    loss = criterion(y_train_tensor, outputs)

    gradients = tape.gradient(loss, model_log_reg.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model_log_reg.trainable_variables))

    if epoch % 20 == 0:
      print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.numpy():.4f}')

Epoch [1/4000], Loss: 205.9255
Epoch [21/4000], Loss: 198.7369
Epoch [41/4000], Loss: 191.5482
Epoch [61/4000], Loss: 184.3596
Epoch [81/4000], Loss: 177.1709
Epoch [101/4000], Loss: 169.9823
Epoch [121/4000], Loss: 162.7936
Epoch [141/4000], Loss: 155.6050
Epoch [161/4000], Loss: 148.4165
Epoch [181/4000], Loss: 141.2281
Epoch [201/4000], Loss: 134.0397
Epoch [221/4000], Loss: 126.8528
Epoch [241/4000], Loss: 119.6711
Epoch [261/4000], Loss: 112.5051
Epoch [281/4000], Loss: 105.3767
Epoch [301/4000], Loss: 98.3251
Epoch [321/4000], Loss: 91.3648
Epoch [341/4000], Loss: 84.5269
Epoch [361/4000], Loss: 77.8567
Epoch [381/4000], Loss: 71.3472
Epoch [401/4000], Loss: 65.0100
Epoch [421/4000], Loss: 58.8900
Epoch [441/4000], Loss: 52.9866
Epoch [461/4000], Loss: 47.3109
Epoch [481/4000], Loss: 41.9340
Epoch [501/4000], Loss: 36.9044
Epoch [521/4000], Loss: 32.2286
Epoch [541/4000], Loss: 27.9560
Epoch [561/4000], Loss: 24.1443
Epoch [581/4000], Loss: 20.8902
Epoch [601/4000], Loss: 18.3044

In [30]:
# Model Evaluation
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

X_test_tensor = tf.convert_to_tensor(X_test_dropped.values, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

y_pred_tensor = model_log_reg(X_test_tensor)
y_pred = tf.math.sigmoid(y_pred_tensor)

threshold = 0.5
predicted = tf.squeeze(tf.cast(y_pred > threshold, tf.float32))

y_test_tensor = tf.squeeze(y_test_tensor)

correct = tf.equal(predicted, y_test_tensor)
accuracy = tf.reduce_mean(tf.cast((correct), tf.float32)).numpy()
print(f'accuracy: {accuracy}')

# Calculate AUC
probs_np = y_pred.numpy().squeeze()
y_test_np = y_test_tensor.numpy()
auc = roc_auc_score(y_test_np, probs_np)
print(f'AUC: {auc}')

# Confusion matrix
confuse_matrix = confusion_matrix(y_test_np, predicted.numpy())
print(f'Confusion Matrix : {confuse_matrix}')

# F1
precision = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[0,1])
recall = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[1,0])
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

accuracy: 0.8536585569381714
AUC: 0.7687719298245614
Confusion Matrix : [[333  47]
 [ 13  17]]
Precision:0.265625
Recall:0.5666666666666667
F1: 0.36170212765957444


XGBoost

In [26]:
# XGBoost
import xgboost as xgb

dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dtest = xgb.DMatrix(X_test_dropped, label=y_test)

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight= np.sum(y_train_resampled == 0)/np.sum(y_train_resampled == 1),
    max_depth = 10,
    learning_rate= 0.00001)

xgb_model_depth20 = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight= np.sum(y_train_resampled == 0)/np.sum(y_train_resampled == 1),
    max_depth = 20,
    learning_rate= 0.00001)

xgb_model.fit(X_train_resampled, y_train_resampled)
xgb_model_depth20.fit(X_train_resampled, y_train_resampled)

In [None]:
# Hyperparameter search

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

params = {
    'learning_rate': np.logspace(np.log10(0.00001), np.log10(0.1), num=5),
    'max_depth': [10, 20]
}

param_search = RandomizedSearchCV(estimator=xgb_model, param_distributions = params, scoring='roc_auc', cv=5, n_iter = 30, n_jobs=-1)
param_search.fit(X_valid_dropped, y_valid)

best_params = param_search.best_params_
best_model = param_search.best_estimator_

In [44]:
# Model Evaluation

y_pred_xgb = xgb_model.predict(X_test_dropped)
y_pred_xgb_depth20 = xgb_model_depth20.predict(X_test_dropped)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
accuracy_xgb_depth20 = accuracy_score(y_test, y_pred_xgb_depth20)

roc_xgb = roc_auc_score(y_test, y_pred_xgb)
roc_xgb_depth20 = roc_auc_score(y_test, y_pred_xgb_depth20)

confuse_matrix = confusion_matrix(y_test, y_pred_xgb)
confuse_matrix_depth20 = confusion_matrix(y_test, y_pred_xgb_depth20)

# Confuse matrix: [[TN, FP] [FN, TP]]
print(f"accuracy xgb depth 10: {accuracy_xgb}")
print(f"roc_xgb depth 10: {roc_xgb}")
print(f"confuse_matrix depth 10: {confuse_matrix}")
print(f"accuracy_xgb_depth20: {accuracy_xgb_depth20}")
print(f"roc_xgb_depth20: {roc_xgb_depth20}")
print(f"confuse_matrix_depth20: {confuse_matrix_depth20}")

# F1
precision = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[0,1])
recall = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[1,0])
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision_depth10:{precision}')
print(f'Recall_depth10:{recall}')
print(f'F1_depth10: {f1}')

precision = confuse_matrix_depth20[1,1]/(confuse_matrix_depth20[1,1] + confuse_matrix_depth20[0,1])
recall = confuse_matrix_depth20[1,1]/(confuse_matrix_depth20[1,1] + confuse_matrix_depth20[1,0])
f1 = (2*precision*recall)/(precision + recall)
print(f'Precision_depth20:{precision}')
print(f'Recall_depth20:{recall}')
print(f'F1_depth20: {f1}')

accuracy xgb depth 10: 0.9414634146341463
roc_xgb depth 10: 0.768859649122807
confuse_matrix depth 10: [[369  11]
 [ 13  17]]
accuracy_xgb_depth20: 0.9560975609756097
roc_xgb_depth20: 0.7921052631578948
confuse_matrix_depth20: [[374   6]
 [ 12  18]]
Precision_depth10:0.6071428571428571
Recall_depth10:0.5666666666666667
F1_depth10: 0.5862068965517241
Precision_depth20:0.75
Recall_depth20:0.6
F1_depth20: 0.6666666666666665


Recurrent Neural Network

In [47]:
# Fitting and Training RNN Model with ReLU activation in Hidden Layers
import tensorflow as tf

model_relu = tf.keras.Sequential([tf.keras.layers.SimpleRNN(units=128, activation='relu', input_shape=(1, X_train_resampled.shape[1])),
                             tf.keras.layers.Dense(1, activation='sigmoid')])

model_relu.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

X_train_resampled_rnn = X_train_resampled.values.reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_train_resampled_rnn = tf.convert_to_tensor(X_train_resampled_rnn, dtype=tf.float32)
y_train_resampled_rnn = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

model_relu.fit(X_train_resampled_rnn, y_train_resampled_rnn, epochs=10, batch_size=32)

Epoch 1/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - AUC: 0.7700 - accuracy: 0.7229 - loss: 6.0159
Epoch 2/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.8801 - accuracy: 0.8197 - loss: 0.9810
Epoch 3/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - AUC: 0.8828 - accuracy: 0.8256 - loss: 1.0542
Epoch 4/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - AUC: 0.8733 - accuracy: 0.8185 - loss: 1.2786
Epoch 5/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.8979 - accuracy: 0.8359 - loss: 0.8399
Epoch 6/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - AUC: 0.9072 - accuracy: 0.8417 - loss: 0.7295
Epoch 7/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - AUC: 0.9163 - accuracy: 0.8489 - loss: 0.6000
Epoch 8/10
[1m2421/2421[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7a5a51e32dd0>

In [63]:
# Model Evaluation
X_test_dropped_rnn = X_test_dropped.values.reshape((X_test_dropped.shape[0], 1, X_test_dropped.shape[1]))
X_test_dropped_rnn = tf.convert_to_tensor(X_test_dropped_rnn, dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

loss, accuracy, auc = model_relu.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)

y_pred_rnn = model_relu.predict(X_test_dropped_rnn)

threshold = 0.5
predicted = tf.cast(y_pred_rnn > threshold, tf.float32).numpy()

y_test_rnn = y_test_rnn.numpy()

confuse_matrix = confusion_matrix(y_test_rnn, predicted)
recall = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[1,0])
precision = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[0,1])
f1 = (2*precision*recall)/(precision + recall)

print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')
print(f'Confusion Matrix : {confuse_matrix}')
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - AUC: 0.8389 - accuracy: 0.8216 - loss: 0.4892 
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Test Loss: 0.4969099164009094, Test Accuracy: 0.8146341443061829, Test AUC: 0.8741666078567505
Confusion Matrix : [[309  71]
 [  5  25]]
Precision:0.2604166666666667
Recall:0.8333333333333334
F1: 0.3968253968253969


In [57]:
# Fitting and Training RNN Model with Leaky ReLU activation in Hidden Layers
import tensorflow as tf

model_leaky = tf.keras.Sequential([tf.keras.layers.SimpleRNN(units=128, activation=None, input_shape=(1, X_train_resampled.shape[1])),
                             tf.keras.layers.LeakyReLU(alpha=0.01),
                             tf.keras.layers.Dense(1, activation='sigmoid')])

model_leaky.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

X_train_resampled_rnn = X_train_resampled.values.reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_train_resampled_rnn = tf.convert_to_tensor(X_train_resampled_rnn, dtype=tf.float32)
y_train_resampled_rnn = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

model_leaky.fit(X_train_resampled_rnn, y_train_resampled_rnn, epochs=10, batch_size=32)

  super().__init__(**kwargs)


Epoch 1/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - AUC: 0.7858 - accuracy: 0.7353 - loss: 4.8069
Epoch 2/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - AUC: 0.8837 - accuracy: 0.8188 - loss: 0.8770
Epoch 3/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.8960 - accuracy: 0.8298 - loss: 0.7896
Epoch 4/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.8930 - accuracy: 0.8304 - loss: 0.8383
Epoch 5/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - AUC: 0.9131 - accuracy: 0.8469 - loss: 0.6500
Epoch 6/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.9139 - accuracy: 0.8469 - loss: 0.6326
Epoch 7/10
[1m2421/2421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - AUC: 0.9042 - accuracy: 0.8409 - loss: 0.7359
Epoch 8/10
[1m2421/2421[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7a5a526ea2c0>

In [62]:
# Model Evaluation
X_test_dropped_rnn = X_test_dropped.values.reshape((X_test_dropped.shape[0], 1, X_test_dropped.shape[1]))
X_test_dropped_rnn = tf.convert_to_tensor(X_test_dropped_rnn, dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

loss, accuracy, auc = model_leaky.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)

threshold = 0.5
predicted = tf.cast(y_pred_rnn > threshold, tf.float32).numpy()

y_test_rnn = y_test_rnn.numpy()

confuse_matrix = confusion_matrix(y_test_rnn, predicted)
recall = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[1,0])
precision = confuse_matrix[1,1]/(confuse_matrix[1,1] + confuse_matrix[0,1])
f1 = (2*precision*recall)/(precision + recall)

print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')
print(f'Confusion Matrix : {confuse_matrix}')
print(f'Precision:{precision}')
print(f'Recall:{recall}')
print(f'F1: {f1}')

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - AUC: 0.8565 - accuracy: 0.6502 - loss: 0.9565 
Test Loss: 0.8614310622215271, Test Accuracy: 0.699999988079071, Test AUC: 0.88210529088974
Confusion Matrix : [[309  71]
 [  5  25]]
Precision:0.2604166666666667
Recall:0.8333333333333334
F1: 0.3968253968253969
