In [None]:
# Import the libraries and helpers used throughout this notebook

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

: 

In [None]:
# Below, each preprocessing technique is implemented as its own function, with a brief explanation of what it does.
# This approach keeps the code organized and maintainable.
# Every time we train the model and review the results, we can come back to these functions and tweak them as needed.

In [20]:
def load_dataset(filename):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    filename : path or file-like
        Anything `pd.read_csv` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using `pd.read_csv` defaults.
    """
    frame = pd.read_csv(filename)
    return frame

def avg_trans_amt_per_card(data):
    """Add each card's mean transaction amount and the amount-to-mean ratio.

    Two columns are appended:

    * ``avg_amt_per_card`` - mean of ``amt`` over all rows sharing the row's ``cc_num``.
    * ``amt_vs_avg``       - ``amt`` divided by ``avg_amt_per_card``; values well
      above 1 mark transactions that are large for that card.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``cc_num`` and ``amt``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the left merge (a new object), with both columns added.
    """
    # One row per card holding that card's mean amount.
    per_card_mean = (
        data.groupby('cc_num')['amt']
        .mean()
        .reset_index()
        .rename(columns={'amt': 'avg_amt_per_card'})
    )
    # Left merge so every original transaction row is kept.
    enriched = data.merge(per_card_mean, how='left', on='cc_num')
    enriched['amt_vs_avg'] = enriched['amt'] / enriched['avg_amt_per_card']
    return enriched

def trans_freq_per_card(data):
    """Add the number of transactions each card made on each day.

    Appends ``trans_freq_per_day``: the count of rows sharing the row's
    (``cc_num``, ``trans_date``) pair. A count far above a card's usual level
    can indicate unusual activity.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``cc_num`` and ``trans_date``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the left merge (a new object), with the count column added.
    """
    keys = ['cc_num', 'trans_date']
    # One row per (card, day) with how many transactions fall in that group.
    daily_counts = data.groupby(keys).size().reset_index(name='trans_freq_per_day')
    return data.merge(daily_counts, how='left', on=keys)

def change_in_spending(data):
    """Add the ratio of each amount to the card's mean amount in that category.

    Appends ``change_in_spending`` = ``amt`` / mean(``amt``) over rows sharing
    the same (``cc_num``, ``category``). Large ratios flag transactions that
    deviate from how the card normally spends in that category.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``cc_num``, ``category`` and ``amt``. The new column is
        written onto this frame.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the new column.
    """
    # transform('mean') broadcasts each group's mean back onto its member rows.
    category_mean = data.groupby(['cc_num', 'category'])['amt'].transform('mean')
    data['change_in_spending'] = data['amt'].div(category_mean)
    return data

def handle_date_time(data, date_format='%m/%d/%y %H:%M'):
    """Parse ``trans_date_trans_time`` and split it into date and time columns.

    The ``trans_date_trans_time`` column is converted in place to datetime,
    then two columns are added:

    * ``trans_date`` - the calendar date of each timestamp (``datetime.date`` objects).
    * ``trans_time`` - the time of day of each timestamp (``datetime.time`` objects).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``trans_date_trans_time``. The frame is modified in place.
    date_format : str, optional
        ``strftime``-style format passed to `pd.to_datetime`. Defaults to
        ``'%m/%d/%y %H:%M'``; pass e.g. ``'%Y-%m-%d %H:%M:%S'`` for files whose
        timestamps are stored in that layout.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the parsed and derived columns.

    Raises
    ------
    ValueError
        Raised by `pd.to_datetime` when a value does not match ``date_format``.
    """
    parsed = pd.to_datetime(data['trans_date_trans_time'], format=date_format)
    data['trans_date_trans_time'] = parsed
    data['trans_date'] = parsed.dt.date
    data['trans_time'] = parsed.dt.time
    return data

def dates_since_last_purchase(data):
    # Days Since Last Purchase Per Card: Used to calculate the number of days between each transaction 
    # for the same credit card. Helps detect a pattern of how frequently the card is being used; for example, 
    # if the card is used 2-3 times a day and all of a sudden the card is being used 10 times a day for 
    # 2 days straight, it could be fraud.
    # Ensure data is sorted by date for correct days calculation
    data.sort_values(by=['cc_num', 'trans_date_trans_time'], inplace=True)
    # Calculate the number of days between each transaction for the same credit card
    data['days_since_last'] = data.groupby('cc_num')['trans_date_trans_time'].diff().dt.days.fillna(0).astype(int)
    return data

def convert_to_numerical_data(data):
    """Replace date/time and categorical columns with numeric features.

    Steps:

    1. ``trans_time_seconds`` - seconds since midnight, computed once from the
       datetime column ``trans_date_trans_time`` (hour, minute, second).
    2. ``merchant`` and ``category`` are one-hot encoded with `pd.get_dummies`.
       The dummy columns produced depend on the values present in ``data``, so
       two frames encoded separately can end up with different columns.
    3. ``trans_date`` is parsed and split into ``trans_date_year``,
       ``trans_date_month`` and ``trans_date_day``.
    4. ``trans_date_trans_time``, ``trans_date`` and ``trans_time`` are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``trans_date_trans_time`` plus ``trans_date``,
        ``trans_time``, ``merchant`` and ``category``. ``trans_time_seconds``
        is also written onto this input frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with the numeric features and without the original
        date/time and categorical columns.
    """
    # Vectorised seconds-since-midnight. This is the only computation of the
    # column; a second, row-wise pass over `trans_time` would just repeat it,
    # since `trans_time` is the time-of-day part of this same timestamp.
    timestamps = data['trans_date_trans_time']
    data['trans_time_seconds'] = (
        timestamps.dt.hour * 3600 + timestamps.dt.minute * 60 + timestamps.dt.second
    )

    # Perform one-hot encoding for 'merchant' and 'category'
    data = pd.get_dummies(data, columns=['merchant', 'category'])

    # Convert 'trans_date' to datetime and extract calendar features
    data['trans_date'] = pd.to_datetime(data['trans_date'])
    data['trans_date_year'] = data['trans_date'].dt.year
    data['trans_date_month'] = data['trans_date'].dt.month
    data['trans_date_day'] = data['trans_date'].dt.day

    # Drop the original non-numeric columns
    data = data.drop(columns=['trans_date_trans_time', 'trans_date', 'trans_time'])
    return data

def drop_unnecessary_cols(data):
    """Return ``data`` without the columns that are not used as model features.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column listed below; `DataFrame.drop` raises
        ``KeyError`` if one is missing.

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns.
    """
    unwanted = [
        'first', 'last', 'gender', 'street', 'city', 'state',
        'zip', 'job', 'dob', 'trans_num', 'unix_time',
    ]
    return data.drop(columns=unwanted)

def separate_features_and_labels(data, label_column_name):
    """Split a frame into a feature matrix and a label vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the features and the label column.
    label_column_name : str
        Name of the label column (e.g. ``'is_fraud'``).

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` - every column except the label; ``y`` - the label column.
    """
    labels = data[label_column_name]
    features = data.drop(columns=label_column_name)
    return features, labels

def preprocess_data(data):
    """Run every preprocessing step on ``data``, in order, and return the result.

    The order is significant:

    * ``handle_date_time`` runs first because it parses the timestamp and
      creates ``trans_date``, which ``trans_freq_per_card`` groups on.
    * ``change_in_spending`` runs before ``convert_to_numerical_data`` because
      it groups on ``category``, which the latter one-hot encodes away.
    * ``drop_unnecessary_cols`` runs last.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw transactions frame.

    Returns
    -------
    pandas.DataFrame
        The fully preprocessed frame.
    """
    ordered_steps = (
        handle_date_time,
        avg_trans_amt_per_card,
        trans_freq_per_card,
        dates_since_last_purchase,
        change_in_spending,
        convert_to_numerical_data,
        drop_unnecessary_cols,
    )
    for step in ordered_steps:
        data = step(data)
    return data

def scale_data(X_train, X_val):
    """Standardise training and validation features with training statistics.

    A `StandardScaler` is fitted on ``X_train`` only, so no information from
    the validation set influences the scaling, and is then applied to both sets.

    Parameters
    ----------
    X_train : array-like
        Training features; used to fit the scaler.
    X_val : array-like
        Validation features; transformed with the already-fitted scaler.

    Returns
    -------
    (X_train_scaled, X_val_scaled)
        The two transformed feature sets, in that order.
    """
    fitted = StandardScaler().fit(X_train)
    return fitted.transform(X_train), fitted.transform(X_val)

In [24]:
# ------------------------------ LOAD DATA ------------------------------
train_data = load_dataset('fraudTrain.csv')
test_data = load_dataset('fraudTest.csv')

# ------------------------------ DROP COLS ------------------------------
# Remove the first column (by position) since it is unnecessary
train_data = train_data.iloc[:, 1:]
test_data = test_data.iloc[:, 1:]

# --------------------------- PREPROCESS DATA ---------------------------
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# --------------------------- SPLIT FEATURES ---------------------------
X_train, y_train = separate_features_and_labels(train_data, 'is_fraud')
X_test, y_test = separate_features_and_labels(test_data, 'is_fraud')

# ---------------------------- ALIGN COLUMNS ----------------------------
# One-hot encoding runs separately on each file, so a merchant/category that
# appears in only one of them yields a column the other lacks. Give the test
# features exactly the training columns: dummies missing from test become 0,
# columns unseen during training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# ---------------------------- MISSING LABELS ---------------------------
# A label cannot be imputed from feature means (and a Series indexed by column
# names would not align with row labels anyway), so rows without a label are
# removed instead. This is a no-op when no label is missing.
train_has_label = y_train.notna()
X_train, y_train = X_train[train_has_label], y_train[train_has_label]
test_has_label = y_test.notna()
X_test, y_test = X_test[test_has_label], y_test[test_has_label]

# ------------------------ TRAIN / VALIDATION SPLIT ----------------------
# Randomly hold out 20% of the training data for validation. `stratify` keeps
# the fraud / non-fraud ratio the same in both parts; `random_state` makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# ------------------------- PLAN FOR EMPTY VALS -------------------------
# Fill missing feature values with the column means of the *training split*
# only, and reuse those same means for validation and test, so that no
# statistic of held-out data leaks into preprocessing.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_val = X_val.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# ------------------------------- SCALING -------------------------------
# X_train_scaled and X_val_scaled are ready for model training and validation
X_train_scaled, X_val_scaled = scale_data(X_train, X_val)

AttributeError: Can only use .dt accessor with datetimelike values

In [18]:
# ----------------------- LOGISTIC REGRESSION MODEL ----------------------
# Baseline model on the scaled training features.
# max_iter is raised to 1000; adjust it if convergence issues occur.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Score the held-out validation split
lr_val_predictions = lr_model.predict(X_val_scaled)

# Report accuracy plus per-class precision / recall / F1
print("Logistic Regression Accuracy:", accuracy_score(y_val, lr_val_predictions))
print(
    "Logistic Regression Classification Report:",
    classification_report(y_val, lr_val_predictions),
    sep="\n",
)


Logistic Regression Accuracy: 0.994005994005994
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       997
           1       0.33      0.50      0.40         4

    accuracy                           0.99      1001
   macro avg       0.67      0.75      0.70      1001
weighted avg       1.00      0.99      0.99      1001



In [19]:
# ----------------------- LOGISTIC REGRESSION MODEL ----------------------
# ------------------------------ USING SMOTE -----------------------------
# SMOTE oversamples the minority class; random_state fixes the synthetic samples.
smote = SMOTE(sampling_strategy='minority', random_state=42)

# Pipeline: oversample first, then fit the logistic regression on the result
pipeline = make_pipeline(smote, LogisticRegression(max_iter=1000))
pipeline.fit(X_train_scaled, y_train)

# Predict on the validation set
lr_val_predictions = pipeline.predict(X_val_scaled)

# Report accuracy plus per-class precision / recall / F1
print("Logistic Regression with SMOTE Accuracy:", accuracy_score(y_val, lr_val_predictions))
print(
    "Logistic Regression with SMOTE Classification Report:",
    classification_report(y_val, lr_val_predictions),
    sep="\n",
)


Logistic Regression with SMOTE Accuracy: 0.995004995004995
Logistic Regression with SMOTE Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       997
           1       0.43      0.75      0.55         4

    accuracy                           1.00      1001
   macro avg       0.71      0.87      0.77      1001
weighted avg       1.00      1.00      1.00      1001



In [None]:
# ----------------------- LOGISTIC REGRESSION MODEL ----------------------
# ------------------------------ USING SMOTE -----------------------------
# NOTE(review): this cell performs the same steps, with the same parameters and
# inputs, as the SMOTE cell executed as In [19]; confirm whether it is still needed.

# Oversampler for the minority class (seeded for reproducibility)
smote = SMOTE(sampling_strategy='minority', random_state=42)

# Oversample, then classify
classifier = LogisticRegression(max_iter=1000)
pipeline = make_pipeline(smote, classifier)

# Fit on the training data
pipeline.fit(X_train_scaled, y_train)

# Predict on the validation set
lr_val_predictions = pipeline.predict(X_val_scaled)

# Evaluate the model
print("Logistic Regression with SMOTE Accuracy:", accuracy_score(y_val, lr_val_predictions))
print("Logistic Regression with SMOTE Classification Report:")
print(classification_report(y_val, lr_val_predictions))