In [14]:
import sys
import os
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

In [15]:
def load_dataset(filename):
  return pd.read_csv(filename)

def avg_trans_amt_per_card(data):
    # Average Transaction Amount Per Card: Used to calculate the average amount of money 
    # spent in transactions for each credit card in the dataset. Once we have the average we 
    # will use it to compare each transaction amount to the average, and can help identify 
    # transactions that are too high.
    # Group by credit card number and calculate the mean transaction amount
    average_amt_per_card = data.groupby('cc_num')['amt'].mean().reset_index()
    average_amt_per_card.rename(columns={'amt': 'avg_amt_per_card'}, inplace=True)
    # Merge the average transaction amount per card back to both the train and test datasets
    data = data.merge(average_amt_per_card, on='cc_num', how='left')
    # Create a new column to compare transaction amount to the average per card
    data['amt_vs_avg'] = data['amt'] / data['avg_amt_per_card']
    return data

def trans_freq_per_card(data):
    # Transaction Frequency Per Card: Used to count how many transactions are made each 
    # day per credit card. Helps detect unusual activity if there are more transactions per 
    # day than the usual pattern.
    # Calculate transaction frequency per card per day
    trans_freq_per_card = data.groupby(['cc_num', 'trans_date']).size().reset_index(name='trans_freq_per_day')
    data = data.merge(trans_freq_per_card, on=['cc_num', 'trans_date'], how='left')
    return data

def change_in_spending(data):
    # Change in Spending Pattern Per Card: Used to compare the current transaction amount to the average
    # amount spent for a similar category. By detecting significant deviations in spending patters per 
    # category we can detect fraud.
    # Calculate average spending per card per category
    avg_spending_per_card_category = data.groupby(['cc_num', 'category'])['amt'].transform('mean')
    data['change_in_spending'] = data['amt'] / avg_spending_per_card_category
    return data

def handle_date_time(data):
    # Converting trans_date_trans_time into a datetime object, then making new cols: trans_date and trans_time
    data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'], format='%m/%d/%y %H:%M') # converting the 'trans_date_trans_time' column from a string to a datetime object 
    data['trans_date'] = data['trans_date_trans_time'].dt.date # extracts the date part from the 'trans_date_trans_time' datetime object and stores it in a new column called 'trans_date'.
    data['trans_time'] = data['trans_date_trans_time'].dt.time # extracts the time part from the 'trans_date_trans_time' datetime object and stores it in a new column called 'trans_time'.
    return data # returns 2 new cols

def dates_since_last_purchase(data):
    # Days Since Last Purchase Per Card: Used to calculate the number of days between each transaction 
    # for the same credit card. Helps detect a pattern of how frequently the card is being used; for example, 
    # if the card is used 2-3 times a day and all of a sudden the card is being used 10 times a day for 
    # 2 days straight, it could be fraud.
    # Ensure data is sorted by date for correct days calculation
    data.sort_values(by=['cc_num', 'trans_date_trans_time'], inplace=True)
    # Calculate the number of days between each transaction for the same credit card
    data['days_since_last'] = data.groupby('cc_num')['trans_date_trans_time'].diff().dt.days.fillna(0).astype(int)
    return data

def convert_to_numerical_data(data):
    # Convert 'trans_date_trans_time' to total seconds elapsed since midnight
    data['trans_time_seconds'] = data['trans_date_trans_time'].dt.hour * 3600 + data['trans_date_trans_time'].dt.minute * 60 + data['trans_date_trans_time'].dt.second
    
    # Perform one-hot encoding for 'merchant' and 'category'
    data = pd.get_dummies(data, columns=['merchant', 'category'])
    
    # Convert 'trans_date' to datetime and extract features
    data['trans_date'] = pd.to_datetime(data['trans_date'])
    data['trans_date_year'] = data['trans_date'].dt.year
    data['trans_date_month'] = data['trans_date'].dt.month
    data['trans_date_day'] = data['trans_date'].dt.day
    # Convert 'trans_time' to total seconds elapsed since midnight
    data['trans_time_seconds'] = data['trans_time'].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)
    # Drop the original non-numeric columns
    data.drop(['trans_date_trans_time', 'trans_date', 'trans_time'], axis=1, inplace=True)
    return data

def drop_unnecessary_cols(data):
    # Drop unnecessary columns
    cols_to_drop = ['first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'job', 'dob', 'trans_num', 'unix_time']
    data = data.drop(cols_to_drop, axis=1)
    return data

def separate_features_and_labels(data, label_column_name):
    # Since 'is_fraud' is the label column, separate the features and labels
    X = data.drop(label_column_name, axis=1)  # Drop the label column to create the features set
    y = data[label_column_name]              # Get the label column as the labels set
    return X, y

def preprocess_data(data):
    data = handle_date_time(data)
    data = avg_trans_amt_per_card(data)
    data = trans_freq_per_card(data)
    data = dates_since_last_purchase(data)
    data = change_in_spending(data)
    data = convert_to_numerical_data(data)
    data = drop_unnecessary_cols(data)
    # data = split_features(data)
    return data

def scale_data(X_train, X_val):
    # Initialize the scaler
    scaler = StandardScaler()
    # Fit the scaler only on the training data
    scaler.fit(X_train)
    # Apply the transformation to the training data
    X_train_scaled = scaler.transform(X_train)
    # Apply the same transformation to the validation data
    X_val_scaled = scaler.transform(X_val)
    return X_train_scaled, X_val_scaled

In [16]:
X = pd.read_csv('fraudTrain.csv')
Y = pd.read_csv('fraudTest.csv')
# Prep the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
# Load the complete dataset
data = pd.read_csv('fraudTrain.csv')

In [18]:
# Separate features and target
Y = X['is_fraud']  # assuming 'is_fraud' is the target column
X = X.drop(columns=['is_fraud'])
X = preprocess_data(X)

In [19]:
# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [20]:
# Performs One-Hot-Encoding on string data
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [21]:
# Initialize Models
# For classification
modelC = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
# For regression
modelR = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

In [22]:
print(X_test)
# Train models
modelC.fit(X_train, Y_train)
modelR.fit(X_train, Y_train)

      Unnamed: 0        cc_num     amt      lat      long  city_pop  \
4513        4513  2.131810e+14   23.86  40.5070 -123.9743      1139   
1261        1261  3.531130e+15    7.91  43.8065  -73.0882      5895   
1135        1135  3.538520e+15    6.61  41.5686  -83.3632       269   
3367        3367  3.055160e+13   59.52  35.1836  -81.4552      5621   
2640        2640  4.988300e+12   41.40  41.4575  -74.1659      2258   
...          ...           ...     ...      ...       ...       ...   
4502        4502  2.720433e+15    7.10  44.0575  -69.9656      3224   
4208        4208  4.003990e+15   77.11  33.3224  -86.9657     71463   
4627        4627  3.547560e+15   87.80  44.5232  -86.2061       680   
2797        2797  3.764450e+14  112.13  41.1558 -101.1360      1789   
1092        1092  6.011439e+15   68.59  34.2853  -91.3336      5161   

      merch_lat  merch_long  avg_amt_per_card  amt_vs_avg  ...  \
4513  39.864260 -123.116055         30.701538    0.777160  ...   
1261  44.742680

In [23]:
# Make predictions
predictions_R = modelR.predict(X_test)
predictions_C = modelC.predict(X_test)

In [27]:
# For classification
accuracy = accuracy_score(Y_test, predictions_C)
print(f"Classification Accuracy: {accuracy}")
mse = mean_squared_error(Y_test, predictions_R)
print(f" Classification Mean Squared Error: {mse}")

Classification Accuracy: 0.996003996003996
 Classification Mean Squared Error: 0.003529617916169468


In [28]:
# For regression
mse = mean_squared_error(Y_test, predictions_R)
print(f"Regression Mean Squared Error: {mse}")

Regression Mean Squared Error: 0.003529617916169468


In [None]:
# Make predictions on new data
new_predictions = model.predict(new_data)