In [2]:
# Import libraries.
import numpy as np
import pandas as pd
import os
import pickle
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Data Preprocessing


In [3]:
# Read the cleaned dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
cleaned_data_path = 'data/cleaned_data'
df = pd.read_pickle(cleaned_data_path)

In [4]:
# Flag whether the CVV entered for the transaction equals the CVV stored for the card.
df['matchingCVV'] = df['cardCVV'] == df['enteredCVV']
# Keep a handle on the account numbers; the column itself is dropped from df later.
account_number = df['accountNumber']
# Encode the True/False columns as 1/0 integers.
# An explicit astype is used instead of replace({False: 0, True: 1}): replace relies
# on implicit downcasting of the result, which recent pandas versions deprecate
# (FutureWarning) and which can leave the columns with object dtype.
# NOTE(review): assumes these columns hold only True/False with no missing values
# (the null counts for them are 0) — confirm against the cleaned data.
for col in ['cardPresent', 'matchingCVV', 'expirationDateKeyInMatch', 'isFraud']:
    df[col] = df[col].astype('int64')

In [5]:
# Remove the columns that are not fed to the predictive model.
df.drop(
    columns=[
        'cardCVV',
        'cardLast4Digits',
        'merchantName',
        'transactionDateTime',
        'dateOfLastAddressChange',
        'accountOpenDate',
        'currentExpDate',
        'customerId',
        'accountNumber',
        'enteredCVV',
    ],
    inplace=True,
)

In [6]:
# Remaining columns, with the number of missing values in each.
df.isna().sum()

creditLimit                    0
availableMoney                 0
transactionAmount              0
acqCountry                  4562
merchantCountryCode          724
posEntryMode                4054
posConditionCode             409
merchantCategoryCode           0
transactionType              698
currentBalance                 0
cardPresent                    0
expirationDateKeyInMatch       0
isFraud                        0
matchingCVV                    0
dtype: int64

In [7]:
# Class balance of the target: share of fraudulent (1) vs. non-fraudulent (0) rows.
cnt_fraud = df['isFraud'].value_counts()
ratio_cases = cnt_fraud.div(len(df))
print(f'Ratio of fraudulent cases: {ratio_cases[1]}\nRatio of non-fraudulent cases: {ratio_cases[0]}')

Ratio of fraudulent cases: 0.01579041740264992
Ratio of non-fraudulent cases: 0.98420958259735


In [8]:
# The classes are heavily imbalanced, so the majority class is randomly
# undersampled to get a balanced dataset for modeling.
# A fixed random_state makes the drawn subset identical on every run; without it
# each execution of this cell yields a different sample (and different results
# downstream).
# NOTE(review): resampling is applied to the full dataset before the train/test
# split, so the test set is balanced too and evaluation metrics will not reflect
# the real-world fraud rate — consider splitting first and resampling only the
# training portion.
undersample = RandomUnderSampler(random_state=42)
# Separate the target from the features; df keeps only the feature columns.
y = df.pop('isFraud')
x_new, y_new = undersample.fit_resample(df, y)
print('old sample shape: ', df.shape)
print('new sample shape: ', x_new.shape)


old sample shape:  (786363, 13)
new sample shape:  (24834, 13)


In [9]:
# Split the resampled data into train and test sets.
# random_state makes the split reproducible, and stratify keeps the fraud /
# non-fraud proportions the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x_new, y_new, stratify=y_new, random_state=42)

# Categorical feature columns: missing values are imputed, then one-hot encoded.
categorical_columns = ['merchantCountryCode', 'merchantCategoryCode', 'posConditionCode',
                       'posEntryMode', 'transactionType', 'acqCountry']

# Categorical columns go through impute -> one-hot; every other column is passed
# through unchanged (remainder='passthrough').
pipeline = ColumnTransformer(
    [
        ('cat_pipe',
         Pipeline([
             ('cat_imputer', SimpleImputer(strategy='most_frequent')),
             # Categories unseen during fit are encoded as all zeros instead of raising.
             ('one_hot', OneHotEncoder(handle_unknown='ignore')),
         ]),
         categorical_columns),
    ],
    remainder='passthrough',
)

# Names of the columns that are passed through untouched.
other_columns = [col for col in df.columns if col not in categorical_columns]

# Fit the transformer on the training set only, then apply it to the test set.
x_train = pipeline.fit_transform(x_train)
x_test = pipeline.transform(x_test)
print(x_train.shape)

# Pickle preprocessed data
data = {'x_train': x_train, 'x_test': x_test, 'y_train': y_train, 'y_test': y_test}
with open('data/preprocessed_data', 'wb') as file:
    pickle.dump(data, file, protocol=4)

(18625, 45)
