In [14]:
import sys
import os
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [15]:
# Preprocess Data
def load_dataset(filename):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    filename : anything accepted by ``pd.read_csv`` (the notebook passes file names).

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(filename)
    return frame

def avg_trans_amt_per_card(data):
    """Attach each card's mean transaction amount and the ratio of each amount to it.

    Two columns are added to the returned frame:

    * ``avg_amt_per_card`` - mean of ``amt`` over all rows sharing the same ``cc_num``.
    * ``amt_vs_avg``       - ``amt`` divided by ``avg_amt_per_card``; large values
      flag transactions that are high relative to the card's usual spend.

    The result comes from a left merge, so it is a new DataFrame and the
    frame passed in is left as it was.
    """
    # One row per card: cc_num plus its mean amount under the final column name.
    per_card_mean = (
        data.groupby('cc_num')['amt']
        .mean()
        .rename('avg_amt_per_card')
        .reset_index()
    )
    # Broadcast the per-card mean back onto every transaction of that card.
    enriched = data.merge(per_card_mean, how='left', on='cc_num')
    enriched['amt_vs_avg'] = enriched['amt'] / enriched['avg_amt_per_card']
    return enriched

def trans_freq_per_card(data):
    """Add ``trans_freq_per_day``: how many rows share a card and a transaction date.

    Rows are counted per (``cc_num``, ``trans_date``) pair and the count is
    joined back onto every row of that pair with a left merge. An unusually
    high count for a card on a given day can point to suspicious activity.

    Returns the merged DataFrame (a new object).
    """
    keys = ['cc_num', 'trans_date']
    # Size of each (card, day) group, as a frame with the count in its own column.
    daily_counts = data.groupby(keys).size().reset_index(name='trans_freq_per_day')
    return data.merge(daily_counts, how='left', on=keys)

def change_in_spending(data):
    """Add ``change_in_spending``: amount relative to the card's mean for that category.

    For every row the mean of ``amt`` over rows with the same ``cc_num`` and
    ``category`` is computed, and ``amt`` is divided by that mean. Values far
    from 1 mark transactions that deviate from the card's usual spend in the
    category.

    The column is written onto ``data`` itself, and the same object is returned.
    """
    # transform('mean') yields one value per input row, aligned with data's index.
    category_baseline = data.groupby(['cc_num', 'category'])['amt'].transform('mean')
    data['change_in_spending'] = data['amt'].div(category_baseline)
    return data

def handle_date_time(data):
    """Parse ``trans_date_trans_time`` and split it into date and time columns.

    The string column is parsed with the format ``%m/%d/%y %H:%M`` and replaced
    by the resulting datetimes. Two columns are then derived from it:

    * ``trans_date`` - the calendar date of each timestamp.
    * ``trans_time`` - the time-of-day of each timestamp.

    All three columns are written onto ``data`` itself, which is also returned.
    """
    parsed = pd.to_datetime(data['trans_date_trans_time'], format='%m/%d/%y %H:%M')
    data['trans_date_trans_time'] = parsed
    data['trans_date'] = parsed.dt.date
    data['trans_time'] = parsed.dt.time
    return data

def dates_since_last_purchase(data):
    """Add ``days_since_last``: whole days since the same card's previous transaction.

    ``data`` is sorted in place by ``cc_num`` and then ``trans_date_trans_time``
    so that consecutive rows of a card are in chronological order. The gap to
    the previous row of the same card is reduced to its whole-day component;
    the first transaction of each card has no predecessor and gets 0.

    The sort and the new column are applied to ``data`` itself, which is returned.
    """
    # Chronological order within each card is required for diff() to be meaningful.
    data.sort_values(by=['cc_num', 'trans_date_trans_time'], inplace=True)
    gaps = data.groupby('cc_num')['trans_date_trans_time'].diff()
    whole_days = gaps.dt.days
    data['days_since_last'] = whole_days.fillna(0).astype(int)
    return data

def convert_to_numerical_data(data):
    """Turn the date/time and categorical columns into numeric features.

    Expects ``trans_date_trans_time`` (datetime), ``trans_date``, ``trans_time``,
    ``merchant`` and ``category`` columns, as produced by ``handle_date_time``
    plus the raw data.

    Steps:

    * ``trans_time_seconds`` - seconds elapsed since midnight, taken from
      ``trans_date_trans_time``.
    * ``merchant`` and ``category`` are one-hot encoded with ``pd.get_dummies``.
    * ``trans_date`` is expanded into ``trans_date_year``, ``trans_date_month``
      and ``trans_date_day``.
    * The original ``trans_date_trans_time``, ``trans_date`` and ``trans_time``
      columns are dropped.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    timestamps = data['trans_date_trans_time']
    # Computed once, vectorized. ``trans_time`` is derived from this same
    # timestamp column, so a second row-by-row pass over ``trans_time`` would
    # only reproduce these values at Python-loop speed.
    seconds_since_midnight = (
        timestamps.dt.hour * 3600 + timestamps.dt.minute * 60 + timestamps.dt.second
    )

    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a column; get_dummies then appends the indicator columns.
    encoded = pd.get_dummies(
        data.assign(trans_time_seconds=seconds_since_midnight),
        columns=['merchant', 'category'],
    )

    # Break the calendar date into numeric parts.
    calendar_date = pd.to_datetime(encoded['trans_date'])
    encoded['trans_date_year'] = calendar_date.dt.year
    encoded['trans_date_month'] = calendar_date.dt.month
    encoded['trans_date_day'] = calendar_date.dt.day

    # The source date/time columns are non-numeric and now redundant.
    return encoded.drop(columns=['trans_date_trans_time', 'trans_date', 'trans_time'])

def drop_unnecessary_cols(data):
    """Return ``data`` without the personal/identifier columns not used as features.

    Every listed column must be present; ``DataFrame.drop`` raises ``KeyError``
    otherwise. The input frame is not modified.
    """
    discarded = [
        'first', 'last', 'gender', 'street', 'city', 'state',
        'zip', 'job', 'dob', 'trans_num', 'unix_time',
    ]
    return data.drop(columns=discarded)

def separate_features_and_labels(data, label_column_name):
    """Split a frame into its feature columns and its label column.

    Parameters
    ----------
    data : pandas.DataFrame
    label_column_name : name of the label column (the notebook uses ``'is_fraud'``).

    Returns
    -------
    (X, y)
        ``X`` is ``data`` without the label column; ``y`` is the label column.
    """
    labels = data[label_column_name]
    features = data.drop(columns=label_column_name)
    return features, labels

def preprocess_data(data):
    """Run the full feature-engineering pipeline on a raw transactions frame.

    The stages run in a fixed order because later ones read columns created by
    earlier ones: the date/time split comes first, the numeric conversion and
    column drop come last. Each stage receives the previous stage's result.

    Returns the fully processed DataFrame.
    """
    pipeline = (
        handle_date_time,           # parse timestamps, add trans_date / trans_time
        avg_trans_amt_per_card,     # avg_amt_per_card, amt_vs_avg
        trans_freq_per_card,        # trans_freq_per_day
        dates_since_last_purchase,  # days_since_last
        change_in_spending,         # change_in_spending
        convert_to_numerical_data,  # one-hot encoding and numeric date parts
        drop_unnecessary_cols,      # remove columns not used as features
    )
    # Note: a `split_features` stage is not part of the pipeline (it was left disabled).
    for stage in pipeline:
        data = stage(data)
    return data

def scale_data(X_train, X_val):
    """Standardize training and validation features with statistics from training only.

    Parameters
    ----------
    X_train : array-like of training features; the scaler is fitted on this.
    X_val : array-like of validation features; only transformed, never fitted on,
        so no information from the validation set leaks into the scaling.

    Returns
    -------
    (X_train_scaled, X_val_scaled)
        The two inputs after the same fitted transformation.
    """
    # StandardScaler is not among the names imported in the notebook's import
    # cell, so it is imported here to keep this function from raising NameError.
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    # Fit on the training data and transform it in one step.
    X_train_scaled = scaler.fit_transform(X_train)
    # Re-use the training statistics for the validation data.
    X_val_scaled = scaler.transform(X_val)
    return X_train_scaled, X_val_scaled

In [16]:
# ------------------------------ LOAD DATA ------------------------------
# Read the training file and the separate held-out file.
train_data = load_dataset('fraudTrain.csv')
test_data = load_dataset('fraudTest.csv')

# ------------------------------ DROP COLS ------------------------------
# Remove the first column of each file since it is unnecessary.
train_data = train_data.iloc[:, 1:]
test_data = test_data.iloc[:, 1:]

# --------------------------- PREPROCESS DATA ---------------------------
# Preprocess both files with the same pipeline.
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# --------------------------- SPLIT FEATURES ---------------------------
# Full training set (features / labels) from fraudTrain.csv.
X_all, y_all = separate_features_and_labels(train_data, 'is_fraud')

# Held-out set from fraudTest.csv. It lives under its own names so the
# train/validation split below cannot overwrite it.
X_holdout, y_holdout = separate_features_and_labels(test_data, 'is_fraud')
# The two files are one-hot encoded independently, so their indicator columns
# can differ. Align the held-out features to the training columns: columns
# missing from the held-out file are filled with 0, extra ones are dropped.
X_holdout = X_holdout.reindex(columns=X_all.columns, fill_value=0)

# Stratified 80/20 split of the training file. NOTE: `X_test` / `Y_test` are a
# validation split of fraudTrain.csv, not the contents of fraudTest.csv.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)

In [17]:
# Print the evaluation feature frame for a visual check.
print(X_test)
# Wrap the feature/label pairs in DMatrix objects, XGBoost's optimized data structure.
dtrain = xgb.DMatrix(
    data=X_train,
    label=Y_train,
    enable_categorical=True,
)
dtest = xgb.DMatrix(
    data=X_test,
    label=Y_test,
    enable_categorical=True,
)

            cc_num     amt      lat      long  city_pop  merch_lat  \
1772  6.011368e+15    1.68  41.2419  -81.7453      7646  41.490778   
986   4.561550e+12    3.27  34.9889 -106.0609      7268  35.863489   
609   3.819900e+13  127.20  43.1960  -72.3001       477  43.324280   
2912  2.242543e+15   55.70  38.4921  -85.4524       564  37.645404   
791   4.661996e+18  105.68  40.8555  -79.7372      2054  41.776480   
...            ...     ...      ...       ...       ...        ...   
3396  3.576432e+15   32.61  33.5623 -112.0559   1312922  32.972587   
4589  2.242543e+15   12.59  38.4921  -85.4524       564  39.315427   
3629  4.425161e+15  100.69  31.4647 -100.3900    103927  31.969096   
72    3.027300e+13   57.23  35.5762  -91.4539       111  36.144895   
61    3.547560e+15   56.66  44.5232  -86.2061       680  44.005290   

      merch_long  avg_amt_per_card  amt_vs_avg  trans_freq_per_day  ...  \
1772  -82.295114         72.142000    0.023287                   5  ...   
986  -105

In [18]:
# XGBoost configuration
params = dict(
    max_depth=3,                   # maximum depth of each tree
    eta=0.1,                       # training step (learning rate) per iteration
    objective='binary:logistic',   # binary classification
    eval_metric='logloss',         # evaluation metric
)
num_rounds = 100  # number of training iterations

# Fit the booster on the training matrix.
model = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Predicted probabilities above 0.5 become class 1, everything else class 0.
preds = (model.predict(dtest) > 0.5).astype('int')

In [19]:
# Evaluate the predictions
# Both lengths should match before any metric is computed.
print(*map(len, (preds, Y_test)))

accuracy = accuracy_score(y_true=Y_test, y_pred=preds)
print("Accuracy: %.2f%%" % (accuracy * 100))

# Per-class precision / recall / F1.
report = classification_report(y_true=Y_test, y_pred=preds)
print(report)


1001 1001
Accuracy: 99.90%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       997
           1       1.00      0.75      0.86         4

    accuracy                           1.00      1001
   macro avg       1.00      0.88      0.93      1001
weighted avg       1.00      1.00      1.00      1001

