In [24]:
#Importing dependencies
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# Importing Data From SQLite Database

We needed to retrieve the clean data from the SQLite database ('fraud_detection.db') and load it into a Pandas DataFrame to prepare it for analysis in the random forest model.

In [25]:
#Creating SQLAlchemy engine and connecting to the SQLite database
engine = create_engine('sqlite:///database/fraud_detection.db')
connection = engine.connect()

#Creating the pandas dataframe by reading the SQL table
model_df = pd.read_sql_table('CleanFraudDetection', con=connection)

model_df

Unnamed: 0,transaction_id,time,category,amount,age,gender,state,city_pop,is_fraud
0,119106,07:55:28,shopping_pos,1.07,59.0,F,CT,5438,0
1,179292,14:05:16,kids_pets,94.99,41.0,M,OK,7163,0
2,540729,16:22:29,kids_pets,31.28,69.0,F,MO,602,0
3,374360,10:44:50,gas_transport,73.06,41.0,M,OK,7163,0
4,314574,01:50:09,misc_net,9.99,52.0,F,IA,2036,0
...,...,...,...,...,...,...,...,...,...
111139,444284,12:42:49,food_dining,8.22,24.0,F,IA,1583,0
111140,89444,21:29:49,personal_care,205.69,35.0,F,OH,7646,0
111141,298536,01:05:31,misc_net,277.63,43.0,M,MA,47249,0
111142,301993,04:16:28,shopping_net,13.29,92.0,F,KY,571,0


# Dropping and Transforming Columns

Once the dataset was loaded into a pandas dataframe, certain columns needed to be dropped and transformed for the model to be created. In this case, it was necessary that the column 'time' was transformed into a datetime object and binned so that the model could interpret it. In addition, the transaction id needed to be deleted since it was not a feature or a target. Lastly, we used the pd.get_dummies function to convert the categorical data into numeric.

In [26]:
#Checking the data types
model_df.dtypes

transaction_id      int64
time               object
category           object
amount            float64
age               float64
gender             object
state              object
city_pop            int64
is_fraud            int64
dtype: object

In [27]:
#Converting 'time' column to datetime format
#We have to convert the 'time' column to a string first before converting it to a datetime object
model_df['time'] = model_df['time'].astype(str)
model_df['time'] = pd.to_datetime(model_df['time'])

#Defining custom time bins based on hour of the day
#Bins are as follows: early morning (12am-6am), late morning (6am-12pm).\
#afternoon (12pm-4pm), evening (4pm-8pm), night (8pm-12am)
bins = [0, 6, 12, 16, 20, 24]
labels = ['early morning', 'late morning', 'afternoon', 'evening', 'night']

#Extracting hour of the day and categorize into bins
model_df['hour'] = model_df['time'].dt.hour
model_df['time_bins'] = pd.cut(model_df['hour'], bins=bins, labels=labels, right=False)

#Dropping the 'hour' column if not needed
model_df = model_df.drop(columns=['hour', 'time'])

model_df

Unnamed: 0,transaction_id,category,amount,age,gender,state,city_pop,is_fraud,time_bins
0,119106,shopping_pos,1.07,59.0,F,CT,5438,0,late morning
1,179292,kids_pets,94.99,41.0,M,OK,7163,0,afternoon
2,540729,kids_pets,31.28,69.0,F,MO,602,0,evening
3,374360,gas_transport,73.06,41.0,M,OK,7163,0,late morning
4,314574,misc_net,9.99,52.0,F,IA,2036,0,early morning
...,...,...,...,...,...,...,...,...,...
111139,444284,food_dining,8.22,24.0,F,IA,1583,0,afternoon
111140,89444,personal_care,205.69,35.0,F,OH,7646,0,night
111141,298536,misc_net,277.63,43.0,M,MA,47249,0,early morning
111142,301993,shopping_net,13.29,92.0,F,KY,571,0,early morning


In [28]:
#Dropping the transaction_id since it's not a feature for the model
model_df.drop(columns='transaction_id', inplace=True)
model_df

Unnamed: 0,category,amount,age,gender,state,city_pop,is_fraud,time_bins
0,shopping_pos,1.07,59.0,F,CT,5438,0,late morning
1,kids_pets,94.99,41.0,M,OK,7163,0,afternoon
2,kids_pets,31.28,69.0,F,MO,602,0,evening
3,gas_transport,73.06,41.0,M,OK,7163,0,late morning
4,misc_net,9.99,52.0,F,IA,2036,0,early morning
...,...,...,...,...,...,...,...,...
111139,food_dining,8.22,24.0,F,IA,1583,0,afternoon
111140,personal_care,205.69,35.0,F,OH,7646,0,night
111141,misc_net,277.63,43.0,M,MA,47249,0,early morning
111142,shopping_net,13.29,92.0,F,KY,571,0,early morning


In [29]:
#Transforming the categorical data into numeric
categorical_df = pd.get_dummies(model_df)
categorical_df.shape

(111144, 75)

# Separating the Lables and the Features

In [30]:
#Separating the labels and the features 
X = categorical_df.drop(["is_fraud"], axis='columns').values
y = categorical_df["is_fraud"].values

In [31]:
#Reviewing the series of lables
y

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [32]:
#Reviewing the features
X

array([[1.0700e+00, 5.9000e+01, 5.4380e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [9.4990e+01, 4.1000e+01, 7.1630e+03, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [3.1280e+01, 6.9000e+01, 6.0200e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       ...,
       [2.7763e+02, 4.3000e+01, 4.7249e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.3290e+01, 9.2000e+01, 5.7100e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.0046e+02, 3.9000e+01, 1.3600e+03, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]])

# Split the Data

In [33]:
#Importing dependencies 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [34]:
#Splitting the data into training and testing sets. Random state of 1 will be assigned.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [35]:
#Creating a StandardScaler instance
scaler = StandardScaler()

#Fitting the StandardScaler
X_scaler = scaler.fit(X_train)

#Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Compile and Train the Model

In [36]:
#Importing dependencies 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score

In [37]:
#Creating an instance of the Random Forest Classifier
fraud_model = RandomForestClassifier(random_state=1)

#Fitting the Random Forest Classifier to the scaled training data
fraud_model.fit(X_train_scaled, y_train)

#Running the prediction on the test set
prediction = fraud_model.predict(X_test_scaled)

# Model Evaluation

In [38]:
#Calculating balanced accuracy score
balanced_acc = balanced_accuracy_score(y_test, prediction)
print("Balanced Accuracy Score:", balanced_acc)

Balanced Accuracy Score: 0.785601500367358


In [39]:
#Calculating confusion matrix
conf_matrix = confusion_matrix(y_test, prediction)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[22161     5]
 [   27    36]]


In [40]:
#Generating classification report
class_report = classification_report(y_test, prediction)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22166
           1       0.88      0.57      0.69        63

    accuracy                           1.00     22229
   macro avg       0.94      0.79      0.85     22229
weighted avg       1.00      1.00      1.00     22229

