In [11]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Data Processing:

In [4]:
# Extract transaction data
num_chunks = 10000
chunks = []
for chunk in pd.read_csv("/content/drive/My Drive/CS230_Folder/train_transaction.csv",
                         chunksize=num_chunks, engine='python'):
    chunks.append(chunk)
train_trans_df = pd.concat(chunks)

In [5]:
# Extract identity data
num_chunks = 10000
chunks = []
for chunk in pd.read_csv("/content/drive/My Drive/CS230_Folder/train_identity.csv",
                         chunksize=num_chunks, engine='python'):
    chunks.append(chunk)
train_identity_df = pd.concat(chunks)

In [6]:
# Merge transaction and identity datasets
whole_df = train_trans_df.merge(train_identity_df, on='TransactionID')

In [7]:
# Drop features with too many null values
minimum = 0.5
null_percentage = whole_df.isnull().mean()
whole_df = whole_df.drop(columns=null_percentage[null_percentage > minimum].index)

In [8]:
# Define and encode categorical features
trans_category_features = ['isFraud', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

identity_category_features = [f'id_{i}' for i in range(12,39)] + ['DeviceType', 'DeviceInfo']

categories = trans_category_features + identity_category_features

for feature in categories:
  if feature in whole_df.columns:
    le = LabelEncoder()
    whole_df[feature] = le.fit_transform(whole_df[feature].astype(str))

# Fill null values with -999
whole_df.fillna(-999)

In [9]:
# Extract numerical features
numerical_features = [feature for feature in whole_df.columns if feature not in categories]

In [10]:
# Feature Scaling
scaler = StandardScaler()
whole_df[numerical_features] = scaler.fit_transform(whole_df[numerical_features])

In [12]:
# Remove low variance features
selector = VarianceThreshold(threshold=0.05)
matrix_whole_df = selector.fit_transform(whole_df)
columns_kept = whole_df.columns[selector.get_support()]
whole_df = pd.DataFrame(matrix_whole_df, columns=columns_kept)

In [13]:
# Get X and Y
X = whole_df.drop('isFraud', axis=1)
y = whole_df['isFraud']

In [14]:
# Get train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=20)

In [15]:
# Remove any rows with null values
X_train_dropped = X_train.dropna()
X_test_dropped = X_test.dropna()
y_train_dropped = y_train[X_train_dropped.index]
y_test = y_test[X_test_dropped.index]

In [16]:
# Resampling for unbalanced dataset
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=20)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_dropped, y_train_dropped)

Logistic Regression

In [25]:
import tensorflow as tf

class log_reg_model(tf.keras.Model):
  def __init__(self, input_size):
    super(log_reg_model, self).__init__()
    self.linear = tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.GlorotNormal())


  def call(self, x):
    z = self.linear(x)
    return z

In [26]:
# Model Training
input_dim = X_train_resampled.shape[1]
model_log_reg = log_reg_model(input_dim)

criterion = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

X_train_tensor = tf.convert_to_tensor(X_train_resampled.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

num_epochs = 4000
for epoch in range(num_epochs):
  with tf.GradientTape() as tape:
    outputs = model_log_reg(X_train_tensor)
    loss = criterion(y_train_tensor, outputs)

    gradients = tape.gradient(loss, model_log_reg.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model_log_reg.trainable_variables))

    if epoch % 20 == 0:
      print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.numpy():.4f}')

Epoch [1/4000], Loss: 144.3367
Epoch [21/4000], Loss: 137.0984
Epoch [41/4000], Loss: 129.9684
Epoch [61/4000], Loss: 122.9409
Epoch [81/4000], Loss: 116.0099
Epoch [101/4000], Loss: 109.1627
Epoch [121/4000], Loss: 102.3908
Epoch [141/4000], Loss: 95.6938
Epoch [161/4000], Loss: 89.0893
Epoch [181/4000], Loss: 82.6282
Epoch [201/4000], Loss: 76.3376
Epoch [221/4000], Loss: 70.2594
Epoch [241/4000], Loss: 64.4943
Epoch [261/4000], Loss: 59.0264
Epoch [281/4000], Loss: 53.8362
Epoch [301/4000], Loss: 48.9285
Epoch [321/4000], Loss: 44.3210
Epoch [341/4000], Loss: 40.0113
Epoch [361/4000], Loss: 36.1023
Epoch [381/4000], Loss: 32.6959
Epoch [401/4000], Loss: 29.7757
Epoch [421/4000], Loss: 27.3101
Epoch [441/4000], Loss: 25.2624
Epoch [461/4000], Loss: 23.5739
Epoch [481/4000], Loss: 22.2494
Epoch [501/4000], Loss: 21.2981
Epoch [521/4000], Loss: 20.6326
Epoch [541/4000], Loss: 20.1059
Epoch [561/4000], Loss: 19.6351
Epoch [581/4000], Loss: 19.1816
Epoch [601/4000], Loss: 18.7310
Epoch [

In [32]:
# Model Evaluation
from sklearn.metrics import roc_auc_score, confusion_matrix

X_test_tensor = tf.convert_to_tensor(X_test_dropped.values, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

y_pred_tensor = model_log_reg(X_test_tensor)
y_pred = tf.math.sigmoid(y_pred_tensor)

threshold = 0.5
predicted = tf.squeeze(tf.cast(y_pred > threshold, tf.float32))

y_test_tensor = tf.squeeze(y_test_tensor)

correct = tf.equal(predicted, y_test_tensor)
accuracy = tf.reduce_mean(tf.cast((correct), tf.float32)).numpy()
print(f'accuracy: {accuracy}')

# Calculate AUC
probs_np = y_pred.numpy().squeeze()
y_test_np = y_test_tensor.numpy()
auc = roc_auc_score(y_test_np, probs_np)
print(f'AUC: {auc}')

# Confusion matrix
confuse_matrix = confusion_matrix(y_test_np, predicted.numpy())
print(f'Confusion Matrix : {confuse_matrix}')

accuracy: 0.8304294347763062
AUC: 0.8344612278062125
Confusion Matrix : [[3218  619]
 [  72  166]]


XGBoost

In [31]:
# XGBoost
import xgboost as xgb

dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dtest = xgb.DMatrix(X_test_dropped, label=y_test)

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight= np.sum(y_train_resampled == 0)/np.sum(y_train_resampled == 1),
    max_depth = 10,
    learning_rate= 0.00001)

xgb_model_depth20 = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight= np.sum(y_train_resampled == 0)/np.sum(y_train_resampled == 1),
    max_depth = 20,
    learning_rate= 0.00001)

xgb_model.fit(X_train_resampled, y_train_resampled)
xgb_model_depth20.fit(X_train_resampled, y_train_resampled)

In [None]:
# Hyperparameter search

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

params = {
    'learning_rate': np.logspace(np.log10(0.00001), np.log10(0.1), num=5),
    'max_depth': [10, 20]
}

param_search = RandomizedSearchCV(estimator=xgb_model, param_distributions = params, scoring='roc_auc', cv=5, n_iter = 30, n_jobs=-1)
param_search.fit(X_train_resampled, y_train_resampled)

best_params = param_search.best_params_
best_model = param_search.best_estimator_

In [34]:
# Model Evaluation

y_pred_xgb = xgb_model.predict(X_test_dropped)
y_pred_xgb_depth20 = xgb_model_depth20.predict(X_test_dropped)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
accuracy_xgb_depth20 = accuracy_score(y_test, y_pred_xgb_depth20)

roc_xgb = roc_auc_score(y_test, y_pred_xgb)
roc_xgb_depth20 = roc_auc_score(y_test, y_pred_xgb_depth20)

confuse_matrix = confusion_matrix(y_test, y_pred_xgb)
confuse_matrix_depth20 = confusion_matrix(y_test, y_pred_xgb_depth20)

# Confuse matrix: [[TN, FP] [FN, TP]]
print(f"accuracy xgb: {accuracy_xgb}")
print(f"roc_xgb: {roc_xgb}")
print(f"confuse_matrix: {confuse_matrix}")
print(f"accuracy_xgb_depth20: {accuracy_xgb_depth20}")
print(f"roc_xgb_depth20: {roc_xgb_depth20}")
print(f"confuse_matrix_depth20: {confuse_matrix_depth20}")

accuracy xgb: 0.9484662576687116
roc_xgb: 0.8307566967365523
confuse_matrix: [[3699  138]
 [  72  166]]
accuracy_xgb_depth20: 0.9646625766871165
roc_xgb_depth20: 0.8413276960510554
confuse_matrix_depth20: [[3764   73]
 [  71  167]]


Recurrent Neural Network

In [37]:
# Fitting and Training RNN Model with ReLU activation in Hidden Layers
import tensorflow as tf

model_relu = tf.keras.Sequential([tf.keras.layers.SimpleRNN(units=128, activation='relu', input_shape=(1, X_train_resampled.shape[1])),
                             tf.keras.layers.Dense(1, activation='sigmoid')])

model_relu.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

X_train_resampled_rnn = X_train_resampled.values.reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_train_resampled_rnn = tf.convert_to_tensor(X_train_resampled_rnn, dtype=tf.float32)
y_train_resampled_rnn = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

model_relu.fit(X_train_resampled_rnn, y_train_resampled_rnn, epochs=10, batch_size=32)

  super().__init__(**kwargs)


Epoch 1/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - AUC: 0.7966 - accuracy: 0.7436 - loss: 2.9023
Epoch 2/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - AUC: 0.8685 - accuracy: 0.8127 - loss: 1.2423
Epoch 3/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.8846 - accuracy: 0.8267 - loss: 1.0175
Epoch 4/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - AUC: 0.8959 - accuracy: 0.8370 - loss: 0.9391
Epoch 5/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - AUC: 0.9035 - accuracy: 0.8427 - loss: 0.8433
Epoch 6/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - AUC: 0.9075 - accuracy: 0.8475 - loss: 0.7886
Epoch 7/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.9221 - accuracy: 0.8591 - loss: 0.6316
Epoch 8/10
[1m2228/2228[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f8eb9b11de0>

In [45]:
# Model Evaluation
X_test_dropped_rnn = X_test_dropped.values.reshape((X_test_dropped.shape[0], 1, X_test_dropped.shape[1]))
X_test_dropped_rnn = tf.convert_to_tensor(X_test_dropped_rnn, dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

loss, accuracy, auc = model_relu.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')

[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - AUC: 0.8792 - accuracy: 0.8471 - loss: 0.5523
Test Loss: 0.5517176985740662, Test Accuracy: 0.8488343358039856, Test AUC: 0.8821203112602234


In [38]:
# Fitting and Training RNN Model with Leaky ReLU activation in Hidden Layers
import tensorflow as tf

model_leaky = tf.keras.Sequential([tf.keras.layers.SimpleRNN(units=128, activation=None, input_shape=(1, X_train_resampled.shape[1])),
                             tf.keras.layers.LeakyReLU(alpha=0.01),
                             tf.keras.layers.Dense(1, activation='sigmoid')])

model_leaky.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

X_train_resampled_rnn = X_train_resampled.values.reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_train_resampled_rnn = tf.convert_to_tensor(X_train_resampled_rnn, dtype=tf.float32)
y_train_resampled_rnn = tf.convert_to_tensor(y_train_resampled.values, dtype=tf.float32)

model_leaky.fit(X_train_resampled_rnn, y_train_resampled_rnn, epochs=10, batch_size=32)



Epoch 1/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 2ms/step - AUC: 0.7905 - accuracy: 0.7380 - loss: 4.9334
Epoch 2/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.8766 - accuracy: 0.8124 - loss: 1.0064
Epoch 3/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - AUC: 0.8968 - accuracy: 0.8343 - loss: 0.8411
Epoch 4/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - AUC: 0.8967 - accuracy: 0.8338 - loss: 0.8714
Epoch 5/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - AUC: 0.9138 - accuracy: 0.8521 - loss: 0.7195
Epoch 6/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - AUC: 0.9083 - accuracy: 0.8471 - loss: 0.7673
Epoch 7/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - AUC: 0.9048 - accuracy: 0.8449 - loss: 0.8085
Epoch 8/10
[1m2228/2228[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f8eba89d900>

In [48]:
# Model Evaluation
X_test_dropped_rnn = X_test_dropped.values.reshape((X_test_dropped.shape[0], 1, X_test_dropped.shape[1]))
X_test_dropped_rnn = tf.convert_to_tensor(X_test_dropped_rnn, dtype=tf.float32)
y_test_rnn = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

loss, accuracy, auc = model_leaky.evaluate(X_test_dropped_rnn, y_test_rnn, batch_size=32)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test AUC: {auc}')

[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - AUC: 0.8541 - accuracy: 0.7339 - loss: 1.1060
Test Loss: 1.0798038244247437, Test Accuracy: 0.7344785332679749, Test AUC: 0.8690405488014221
