In [11]:
#Importing dependencies
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# Importing Data From SQLite Database

We needed to retrieve the clean data from the SQLite database ('fraud_detection.db') and load it into a Pandas DataFrame to prepare it for analysis in the neural network model.

In [12]:
#Creating SQLAlchemy engine and connecting to the SQLite database
engine = create_engine('sqlite:///fraud_detection.db')
connection = engine.connect()

#Creating the pandas dataframe by reading the SQL table
model_df = pd.read_sql_table('CleanFraudDetection', con=connection)

model_df

Unnamed: 0,transaction_id,time,category,amount,age,gender,state,city_pop,is_fraud
0,119106,07:55:28,shopping_pos,1.07,59.0,F,CT,5438,0
1,179292,14:05:16,kids_pets,94.99,41.0,M,OK,7163,0
2,540729,16:22:29,kids_pets,31.28,69.0,F,MO,602,0
3,374360,10:44:50,gas_transport,73.06,41.0,M,OK,7163,0
4,314574,01:50:09,misc_net,9.99,52.0,F,IA,2036,0
...,...,...,...,...,...,...,...,...,...
111139,444284,12:42:49,food_dining,8.22,24.0,F,IA,1583,0
111140,89444,21:29:49,personal_care,205.69,35.0,F,OH,7646,0
111141,298536,01:05:31,misc_net,277.63,43.0,M,MA,47249,0
111142,301993,04:16:28,shopping_net,13.29,92.0,F,KY,571,0


# Dropping and Transforming Columns

Once the dataset was loaded into a pandas dataframe, certain columns needed to be dropped and transformed for the model to be created. In this case, it was necessary that the column 'time' was transformed into a datetime object and binned so that the model could interpret it. In addition, the transaction id needed to be deleted since it was not a feature or a target. Lastly, we used the pd.get_dummies function to convert the categorical data into numeric.

In [13]:
#Checking the data types
model_df.dtypes

transaction_id      int64
time               object
category           object
amount            float64
age               float64
gender             object
state              object
city_pop            int64
is_fraud            int64
dtype: object

In [14]:
#Converting 'time' column to datetime format
#We have to convert the 'time' column to a string first before converting it to a datetime object
model_df['time'] = model_df['time'].astype(str)
model_df['time'] = pd.to_datetime(model_df['time'])

#Defining custom time bins based on hour of the day
#Bins are as follows: early morning (12am-6am), late morning (6am-12pm).\
#afternoon (12pm-4pm), evening (4pm-8pm), night (8pm-12am)
bins = [0, 6, 12, 16, 20, 24]
labels = ['early morning', 'late morning', 'afternoon', 'evening', 'night']

#Extracting hour of the day and categorize into bins
model_df['hour'] = model_df['time'].dt.hour
model_df['time_bins'] = pd.cut(model_df['hour'], bins=bins, labels=labels, right=False)

#Dropping the 'hour' column if not needed
model_df = model_df.drop(columns=['hour', 'time'])

model_df

Unnamed: 0,transaction_id,category,amount,age,gender,state,city_pop,is_fraud,time_bins
0,119106,shopping_pos,1.07,59.0,F,CT,5438,0,late morning
1,179292,kids_pets,94.99,41.0,M,OK,7163,0,afternoon
2,540729,kids_pets,31.28,69.0,F,MO,602,0,evening
3,374360,gas_transport,73.06,41.0,M,OK,7163,0,late morning
4,314574,misc_net,9.99,52.0,F,IA,2036,0,early morning
...,...,...,...,...,...,...,...,...,...
111139,444284,food_dining,8.22,24.0,F,IA,1583,0,afternoon
111140,89444,personal_care,205.69,35.0,F,OH,7646,0,night
111141,298536,misc_net,277.63,43.0,M,MA,47249,0,early morning
111142,301993,shopping_net,13.29,92.0,F,KY,571,0,early morning


In [15]:
#Dropping the transaction_id since it's not a feature for the model
model_df.drop(columns='transaction_id', inplace=True)
model_df

Unnamed: 0,category,amount,age,gender,state,city_pop,is_fraud,time_bins
0,shopping_pos,1.07,59.0,F,CT,5438,0,late morning
1,kids_pets,94.99,41.0,M,OK,7163,0,afternoon
2,kids_pets,31.28,69.0,F,MO,602,0,evening
3,gas_transport,73.06,41.0,M,OK,7163,0,late morning
4,misc_net,9.99,52.0,F,IA,2036,0,early morning
...,...,...,...,...,...,...,...,...
111139,food_dining,8.22,24.0,F,IA,1583,0,afternoon
111140,personal_care,205.69,35.0,F,OH,7646,0,night
111141,misc_net,277.63,43.0,M,MA,47249,0,early morning
111142,shopping_net,13.29,92.0,F,KY,571,0,early morning


In [16]:
#Transforming the categorical data into numeric
categorical_df = pd.get_dummies(model_df)
categorical_df.shape

(111144, 75)

# Separating the Lables and the Features

In [17]:
#Separating the labels and the features 
X = categorical_df.drop(["is_fraud"], axis='columns').values
y = categorical_df["is_fraud"].values

In [18]:
#Reviewing the series of lables
y

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [19]:
#Reviewing the features
X

array([[1.0700e+00, 5.9000e+01, 5.4380e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [9.4990e+01, 4.1000e+01, 7.1630e+03, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [3.1280e+01, 6.9000e+01, 6.0200e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       ...,
       [2.7763e+02, 4.3000e+01, 4.7249e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.3290e+01, 9.2000e+01, 5.7100e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.0046e+02, 3.9000e+01, 1.3600e+03, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]])

# Split the Data

In [20]:
#Importing dependencies 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [21]:
#Splitting the data into training and testing sets. Random state of 1 will be assigned.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [22]:
#Creating a StandardScaler instance
scaler = StandardScaler()

#Fitting the StandardScaler
X_scaler = scaler.fit(X_train)

#Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Compile and Train the Model

Using the training and testing data, we created a neural network model with three layers. We created our model using 80 nodes in the first layer and 30 nodes in the second. We used 100 epochs for optimal model performance.

In [24]:
#Importing dependencies 
import tensorflow as tf
from sklearn.metrics import mean_absolute_error

In [25]:
#Defining the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  80
hidden_nodes_layer2 = 30
nn = tf.keras.models.Sequential()

#First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

#Second hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer2, input_dim=number_input_features, activation="relu")
)

#Output layer
nn.add(
    tf.keras.layers.Dense(units=1, activation="sigmoid")
)

#Checking the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                6000      
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 8461 (33.05 KB)
Trainable params: 8461 (33.05 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
#Compiling the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
#Training the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


# Model Evaluation

The model achieved 99.7% accuracy, signifying that it was able to correctly identify instances of fraud 99.7% of the time.

In [28]:
#Evaluating the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

695/695 - 1s - loss: 0.0376 - accuracy: 0.9978 - 875ms/epoch - 1ms/step
Loss: 0.03763430193066597, Accuracy: 0.9978406429290771


In [29]:
#Making a prediction with the model
prediction = nn.predict(X_test)
prediction



array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [30]:
#Calculating the mean absolute error
mae = mean_absolute_error(y_test, prediction)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 0.013470125234471756


In [70]:
#Calculating balanced accuracy score
balanced_acc = balanced_accuracy_score(y_test, prediction)
print("Balanced Accuracy Score:", balanced_acc)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [71]:
#Calculating confusion matrix
conf_matrix = confusion_matrix(y_test, prediction)
print("Confusion Matrix:")
print(conf_matrix)


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [72]:
#Generating classification report
class_report = classification_report(y_test, prediction)
print("Classification Report:")
print(class_report)

ValueError: Classification metrics can't handle a mix of binary and continuous targets