## Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

In [2]:
# import the data into a dataframe
data = pd.read_csv("creditcard.csv")

# show the first five rows
data.head()

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:76274b691b16a6c49d3f159c883398e03cc...
1,size 150828752


## Data Exploration

In [3]:
data.describe()

Unnamed: 0,version https://git-lfs.github.com/spec/v1
count,2
unique,2
top,oid sha256:76274b691b16a6c49d3f159c883398e03cc...
freq,1


In [4]:
data.dtypes

version https://git-lfs.github.com/spec/v1    object
dtype: object

In [5]:
# check if there are any missing data
data.isnull().sum()

version https://git-lfs.github.com/spec/v1    0
dtype: int64

There are no missing data.

In [6]:
# separate the regular and fraudulent credit card transactions

genuine = data[data['Class'] == 0]
fraud = data[data['Class'] == 1]

# create a bar plot showcasing how many real vs fraud credit cards there are in the dataset

plt.title('Number of Genuine vs Fraudulent Credit Card Transactions')
plt.xlabel('Type of Credit Card Transaction')
plt.ylabel('Number of Transactions')
plt.bar(['Genuine', 'Fraud'], [len(genuine), len(fraud)])

KeyError: 'Class'

The number of genuine credit card transactions is significantly greater than the number of fraudulent credit card transactions. This can affect the model in a way such that the model may ignore the case of fraudulent credit card transactions when classifying the feature vectors. Thus, a sampling technique is needed before inputting the data into the model. In this particular case, I will be using a technique called random undersampling.

## Removing Duplicates

In [None]:
# remove duplicates
print(data.shape)
data.drop_duplicates(keep='first', inplace=True)
print(data.shape)

## Training and Test Sets

In [None]:
# seperate features from labels

X = data.drop('Class', axis=1)
y = data['Class']

In [None]:
# split into training and test data 
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.2, random_state = 0)

## Random Undersampling Technique

Like I said earlier, there's a lot more genuine credit card transactions than fraud credit card transactions so to balance the dataset, let's use a technique called random undersampling.

In [None]:
# perform random undersampling by balancing the training set such that the number of geniune 
# credit card transactions equals the number of fraud credit card transactions

genuine_X_train = X_train.loc[y_train[y_train == 0].index, :]
fraud_X_train = X_train.loc[y_train[y_train == 1].index, :]

genuine_X_undersample = genuine_X_train.sample(len(fraud_X_train))
genuine_Y_undersample = y[genuine_X_undersample.index]

X_undersample = pd.concat([genuine_X_undersample, fraud_X_train])
y_undersample = y_train[X_undersample.index]

## Pipeline

In [None]:
# create a Multilayer Perceptron Classifier using 100 hidden layers and 10000 iterations
classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=10000)

In [None]:
# create a pipeline that normalizes the feature vectors using MinMaxScaler 
# and then passes the normalized feature vectors into a Multilayer Perceptron Classifier 
pipe = Pipeline(steps=[('scaler', MinMaxScaler()), ('clf', classifier)])

In [None]:
# train the model on the undersample data
pipe.fit(X_undersample, y_undersample)

In [None]:
# predictthe labels of the test data
y_pred = pipe.predict(X_test)

In [None]:
# evaluation function for the model
def evaluate(pipeline, y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:,1], average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print('Evaluating the model:')
    print("Accuracy ", acc)
    print("Precision ", precision)
    print("Recall ", recall)
    print("ROC_AUC ", auc)
    print("f1 score ", f1)

In [None]:
evaluate(pipe, y_test, y_pred)