In [2]:
import pandas as pd
import numpy as np

from scipy import stats

from collections import Counter

In [3]:
# Load the raw transactions. card_number is read as text up front so its
# digits are never parsed as a number first (numeric parsing would drop
# leading zeros or reformat long values before any later string conversion).
df = pd.read_csv("./assets/fraud_detection_data.csv", dtype={"card_number": str})

In [4]:
# Cast card_number to str so it is handled as text rather than as a number
# (presumably because it is an identifier, not a quantity — confirm).
df['card_number'] = df['card_number'].astype(str)

In [5]:
# Print the frame's schema: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1743 entries, 0 to 1742
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   transaction_id          1743 non-null   object 
 1   transaction_date        1743 non-null   object 
 2   card_number             1743 non-null   object 
 3   card_type               1743 non-null   object 
 4   merchant_name           1743 non-null   object 
 5   merchant_category       1743 non-null   object 
 6   merchant_state          1743 non-null   object 
 7   merchant_city           1743 non-null   object 
 8   transaction_status      1743 non-null   object 
 9   transaction_amount      1743 non-null   float64
 10  merchant_category_code  1743 non-null   object 
 11  fraud_flag              1743 non-null   int64  
 12  cardholder_name         1743 non-null   object 
 13  items                   1743 non-null   object 
 14  prices                  1743 non-null   

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,transaction_amount,fraud_flag
count,1743.0,1743.0
mean,8.32704,0.026391
std,5.127765,0.160342
min,6.59,0.0
25%,6.59,0.0
50%,6.59,0.0
75%,6.59,0.0
max,62.3,1.0


In [8]:
# List every column name in the frame.
df.columns

Index(['transaction_id', 'transaction_date', 'card_number', 'card_type',
       'merchant_name', 'merchant_category', 'merchant_state', 'merchant_city',
       'transaction_status', 'transaction_amount', 'merchant_category_code',
       'fraud_flag', 'cardholder_name', 'items', 'prices'],
      dtype='object')

In [10]:
# Analyze the distribution of fraudulent and legitimate transactions,
# where fraud has a label of 1 and a legitimate transaction is 0:
Counter(df['fraud_flag'])

Counter({0: 1697, 1: 46})

In [11]:
import ast 

In [16]:
# Encode each categorical text column as integer category codes.
# The source column is converted to pandas' category dtype in place, and its
# codes are stored next to it in a companion "*_code" column.
for source_col, code_col in (
    ('merchant_state', 'merchant_state_code'),
    ('merchant_city', 'merchant_city_code'),
    ('card_type', 'card_type_code'),
    ('cardholder_name', 'cardholder_name_code'),
):
    df[source_col] = df[source_col].astype('category')
    df[code_col] = df[source_col].cat.codes

In [17]:
# Count the items in each transaction. Each `items` cell is text that is
# parsed as a Python literal (presumably a list — confirm) and then measured
# with len().
number_of_items = df['items'].map(lambda raw_items: len(ast.literal_eval(raw_items))).tolist()
df['number_of_items'] = number_of_items

In [19]:
# Remove transaction_amount outliers with a Z-score filter.
# A row is kept only when its amount lies strictly fewer than `threshold`
# standard deviations from the mean; everything at or beyond that is dropped.
threshold = 3
z_scores = np.abs(stats.zscore(df['transaction_amount']))
df_no_outliers = df.loc[z_scores < threshold]

In [20]:
# Model inputs (the four encoded categorical columns plus two numeric
# columns) and the name of the label column.
features = [
    'merchant_state_code',
    'merchant_city_code',
    'card_type_code',
    'cardholder_name_code',
    'transaction_amount',
    'number_of_items',
]
target = 'fraud_flag'

# Both selections come from the outlier-filtered frame, so their rows line up.
X = df_no_outliers.loc[:, features]
y = df_no_outliers.loc[:, target]

In [21]:
# Persist the feature matrix and the target column as two separate CSV
# files, leaving the index out of both.
X.to_csv(path_or_buf="features.csv", index=False)
y.to_csv(path_or_buf="targets.csv", index=False)