# Credit Card Fraud Detection Model
**Using data from Synthetic E-commerce Electronics Sales Data available on DataFabrica.** 

The data contains synthetic ecommerce sales data for different electronic products such as phones, tablets, and laptops.

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from collections import Counter

## Prepare Data

In [2]:
df = pd.read_csv(r"C:\Users\arich\Desktop\PY/\raw_data\fraud_detection_data.csv")
df.head(5)

Unnamed: 0,transaction_id,transaction_date,card_number,card_type,merchant_name,merchant_category,merchant_state,merchant_city,transaction_status,transaction_amount,merchant_category_code,fraud_flag,cardholder_name,items,prices
0,1ANBFI-TOFZ-J1FHXR,2023-02-17 07:16,4318249446099857,visa,Subway,Fast Food,New Hampshire,Manchester,Declined,6.59,MCC 5814,0,James Lee,['Italian BMT'],[6.59]
1,UIF4NV-FC9E-XKCNBG,2023-03-22 17:22,4101211350337682,visa,Subway,Fast Food,New Hampshire,Nashua,Declined,6.59,MCC 5814,0,Gabriella Hicks,['Italian BMT'],[6.59]
2,75UXRX-LGYI-JC802K,2023-06-14 16:49,4436281502955562,visa,Subway,Fast Food,Maine,Lewiston,Successful,6.59,MCC 5814,0,Mr. Eduardo Ford,['Italian BMT'],[6.59]
3,3D1LES-NSWC-UGGOMS,2023-03-23 07:13,4426321457580551,visa,Subway,Fast Food,Georgia,Augusta,Declined,13.08,MCC 5814,0,Justin Francis,"['Italian BMT', 'Oven Roasted Chicken']","[6.59, 6.49]"
4,8G8LEE-313G-XFTWHT,2023-03-13 22:42,4947626214893547,visa,Subway,Fast Food,New Hampshire,Nashua,Pending,6.59,MCC 5814,0,Sara Reese,['Italian BMT'],[6.59]


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1743 entries, 0 to 1742
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   transaction_id          1743 non-null   object 
 1   transaction_date        1743 non-null   object 
 2   card_number             1743 non-null   int64  
 3   card_type               1743 non-null   object 
 4   merchant_name           1743 non-null   object 
 5   merchant_category       1743 non-null   object 
 6   merchant_state          1743 non-null   object 
 7   merchant_city           1743 non-null   object 
 8   transaction_status      1743 non-null   object 
 9   transaction_amount      1743 non-null   float64
 10  merchant_category_code  1743 non-null   object 
 11  fraud_flag              1743 non-null   int64  
 12  cardholder_name         1743 non-null   object 
 13  items                   1743 non-null   object 
 14  prices                  1743 non-null   

In [4]:
# make card number a string
df['card_number'] = df['card_number'].astype(str)

In [5]:
df.describe()

Unnamed: 0,transaction_amount,fraud_flag
count,1743.0,1743.0
mean,8.32704,0.026391
std,5.127765,0.160342
min,6.59,0.0
25%,6.59,0.0
50%,6.59,0.0
75%,6.59,0.0
max,62.3,1.0


In [6]:
df.columns

Index(['transaction_id', 'transaction_date', 'card_number', 'card_type',
       'merchant_name', 'merchant_category', 'merchant_state', 'merchant_city',
       'transaction_status', 'transaction_amount', 'merchant_category_code',
       'fraud_flag', 'cardholder_name', 'items', 'prices'],
      dtype='object')

Using the Counter method to analyze the distirbution in fraudulent and legitimate transactions, where fraud has a label of 1 and a legitimate transaction is 0

In [7]:
print(Counter(df['fraud_flag']))

Counter({0: 1697, 1: 46})


In [8]:
# convert the categorical columns

df['merchant_state'] = df['merchant_state'].astype('category')
df['merchant_state_code'] = df['merchant_state'].cat.codes

df['merchant_city'] = df['merchant_city'].astype('category')
df['merchant_city_code'] = df['merchant_city'].cat.codes


df['card_type'] = df['card_type'].astype('category')
df['card_type_code'] = df['card_type'].cat.codes


df['cardholder_name'] = df['cardholder_name'].astype('category')
df['cardholder_name_code'] = df['cardholder_name'].cat.codes

In [9]:
import ast
number_of_items = [len(ast.literal_eval(x)) for x in list(df['items'])]

df['number_of_items'] = number_of_items

I will use Z-scores to filter outliers in transaction_amount values. Here I will define an outlier as any data point that falls 3 or more standard deviations from the mean


In [10]:
#remove outliers

threshold = 3
z_scores = np.abs(stats.zscore(df['transaction_amount']))
df_no_outliers = df[(z_scores < threshold)]
df_no_outliers

Unnamed: 0,transaction_id,transaction_date,card_number,card_type,merchant_name,merchant_category,merchant_state,merchant_city,transaction_status,transaction_amount,merchant_category_code,fraud_flag,cardholder_name,items,prices,merchant_state_code,merchant_city_code,card_type_code,cardholder_name_code,number_of_items
0,1ANBFI-TOFZ-J1FHXR,2023-02-17 07:16,4318249446099857,visa,Subway,Fast Food,New Hampshire,Manchester,Declined,6.59,MCC 5814,0,James Lee,['Italian BMT'],[6.59],28,53,3,116,1
1,UIF4NV-FC9E-XKCNBG,2023-03-22 17:22,4101211350337682,visa,Subway,Fast Food,New Hampshire,Nashua,Declined,6.59,MCC 5814,0,Gabriella Hicks,['Italian BMT'],[6.59],28,60,3,100,1
2,75UXRX-LGYI-JC802K,2023-06-14 16:49,4436281502955562,visa,Subway,Fast Food,Maine,Lewiston,Successful,6.59,MCC 5814,0,Mr. Eduardo Ford,['Italian BMT'],[6.59],18,46,3,213,1
3,3D1LES-NSWC-UGGOMS,2023-03-23 07:13,4426321457580551,visa,Subway,Fast Food,Georgia,Augusta,Declined,13.08,MCC 5814,0,Justin Francis,"['Italian BMT', 'Oven Roasted Chicken']","[6.59, 6.49]",9,3,3,148,2
4,8G8LEE-313G-XFTWHT,2023-03-13 22:42,4947626214893547,visa,Subway,Fast Food,New Hampshire,Nashua,Pending,6.59,MCC 5814,0,Sara Reese,['Italian BMT'],[6.59],28,60,3,255,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1738,IZ03FL-9QC6-7HGODT,2023-04-23 05:15,6011364363299557,discover,Subway,Fast Food,Kentucky,Lexington,Successful,6.59,MCC 5814,0,Ashley Owens,['Italian BMT'],[6.59],16,47,1,33,1
1739,75VXBE-L7JZ-4P45BB,2022-07-02 23:55,6557511708204946,discover,Subway,Fast Food,Pennsylvania,Pittsburgh,Pending,13.08,MCC 5814,0,Gabriel Davis,"['Italian BMT', 'Oven Roasted Chicken']","[6.59, 6.49]",37,74,1,99,2
1740,0E6LZT-76LT-882JBU,2021-05-08 21:14,6011361329927008,discover,Subway,Fast Food,Colorado,Denver,Successful,6.59,MCC 5814,0,Danielle Hammond,['Italian BMT'],[6.59],5,24,1,77,1
1741,LO6R00-IIAW-PDBS6T,2020-02-04 18:23,6011446757183347,discover,Subway,Fast Food,New Mexico,Albuquerque,Successful,6.59,MCC 5814,0,Ashley Owens,['Italian BMT'],[6.59],30,0,1,33,1


In [11]:
# we will define our features and targets. list of features and the fraud_flag column as our target

X  = ['merchant_state_code','merchant_city_code', 'card_type_code','cardholder_name_code',
             'transaction_amount', 'number_of_items']
y = 'fraud_flag'

In [12]:
features = df_no_outliers[X]

targets = df_no_outliers[y]

In [13]:
features.head(5)


Unnamed: 0,merchant_state_code,merchant_city_code,card_type_code,cardholder_name_code,transaction_amount,number_of_items
0,28,53,3,116,6.59,1
1,28,60,3,100,6.59,1
2,18,46,3,213,6.59,1
3,9,3,3,148,13.08,2
4,28,60,3,255,6.59,1


In [14]:
targets.head(5)

0    0
1    0
2    0
3    0
4    0
Name: fraud_flag, dtype: int64

## Building the model

**Using the random forest classification class**

I will this use to define our random forest model, the train/test split module which I will use to split data for training and testing, and the GridSearchCV method which I will use to perform our hyperparameter search

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris

 **I will define the grid I will use to perform our hyperparameter search.**

I will define a range of estimators, which are the number of individual decision trees used to construct the random forest, as 20–100 estimators, with an increment value of 20.

I will define the range of max depth values for each decision tree, which is the maximum number of nodes from the root of the decision tree to the farthest leaf, as 5–20 in increments of 5:

In [16]:
n_estimators_range = np.arange(20, 100, 20)
max_depth_range = np.arange(5, 30, 5)
param_grid = {
    'n_estimators': n_estimators_range,
    'max_depth': max_depth_range,
}

I will do is define our random forest classifier model object. I will also set a value for the random state which will ensure that our results are reproducible

In [17]:
rf_classifier = RandomForestClassifier(random_state=64)

I will then split the data for training and testing. I will set a value for random_state to ensure reproducibility, and define the test size as a randomized 20% of the data. The remaining 80% will be used for training

In [18]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, random_state=128, test_size = 0.2)

I will perform a random forest hyperparameter grid search, select the best performing model (based on within sample cross-validation), and generate predictions on the test set.

I will calling the **GridSearchCV** method with our random forest model object **rf_classifier**; **param_grid**. The number of folders for the cross validation as 5, and "precision" as our scoring metric for hyperparameter selection

In [19]:
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='precision')

In [20]:
# I will then perform the grid search using the training data: 

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=64),
             param_grid={'max_depth': array([ 5, 10, 15, 20, 25]),
                         'n_estimators': array([20, 40, 60, 80])},
             scoring='precision')

In [21]:
# The best set of hyperparameters:

best_rf_model = grid_search.best_estimator_
best_rf_model

RandomForestClassifier(max_depth=5, n_estimators=20, random_state=64)

In [22]:
# Optimal model has been defined and predictions were made on the test set
y_pred = best_rf_model.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Evaluate the model

In [23]:
validation_data = X_test
validation_data['actual'] = y_test
validation_data['predicted'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data['actual'] = y_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data['predicted'] = y_pred


In [24]:
#store the actual and predicticted values in variables called actual and predicted

actual, predicted = validation_data['actual'], validation_data['predicted']

print(actual.head())

print(predicted.head())

612     0
885     0
820     0
670     0
1308    0
Name: actual, dtype: int64
612     0
885     0
820     0
670     0
1308    0
Name: predicted, dtype: int64


In [25]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
precision = precision_score(actual, predicted)
print("precision: ", precision)

# A precision score of 1 means that every transaction predicted as fraudulent by the model was indeed a fraudulent transaction.

precision:  1.0


In [26]:
accuracy = accuracy_score(actual, predicted)
print("accuracy: ", accuracy)

# A value of 0.979 for precision. This means that 97.9% of predictions were correct. 
# This is deceptively high and should not be interpeted in isolation due to the fact that most of the transations are legitimate.

accuracy:  0.9793510324483776


In [27]:
recall = recall_score(actual, predicted)
print("recall: ", recall)

# A recall score of 0.125 means that the model is only capturing 12.5% of the total fraudulent transactions.
# The remaining 87.5% of fraudulent transactions are being missed (classified as false negatives).

recall:  0.125


In [28]:
f1_score = f1_score(actual, predicted)
print("f1_score: ", f1_score)

f1_score:  0.2222222222222222
