## Importing

### Import Libs

In [1]:
#STEP 1: IMPORTING PACKAGES

import pandas as pd # this package is for data processing
import numpy as np #library to work with arrays
import matplotlib.pyplot as plt #library to create visualizations in Python
import tensorflow as tf #TensorFlow
from termcolor import colored as cl #tool for text customization
import itertools #library in Python consisting of multiple methods used in iterators

from sklearn.preprocessing import StandardScaler #Tool for data normalization/standardization
from sklearn.model_selection import train_test_split #function that splits data arrays into 2 subsets: training & testing
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier #Decision tree algorithm
from sklearn.neighbors import KNeighborsClassifier #algorithm that implements learning based on the k nearest neighbors
from sklearn.linear_model import LogisticRegression #Logistic regression algorithm (ML algorithm used to predict the probability of a categorical dependent variable)
from sklearn.svm import SVC #Stands for "Support Vector Classification", and it's an SVM algorithm
from sklearn.ensemble import RandomForestClassifier #Random forest tree algorithm
from xgboost import XGBClassifier #Machine Learning Algorithm.

from sklearn.metrics import confusion_matrix #evaluation metric
from sklearn.metrics import accuracy_score #evaluation metric
from sklearn.metrics import f1_score #evaluation metric



import os

### Import Data

In [2]:
#Now I'm going to import my data from the Kaggle dataset which I have previously downloaded to my computer from
#the following website:

#kaggle dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud


from google.colab import files #On Nov 28 I introduced these two lines of code and it was one of the best decisions.

if not os.path.exists('./fraudTrain.csv'):
  uploadedTrain = files.upload() #This creates a widget prompting the user to browse for a file and uploads it to files inside Colab.
if not os.path.exists('./fraudTest.csv'):
  uploadedTrain = files.upload() #This creates a widget prompting the user to browse for a file and uploads it to files inside Colab.

train = pd.read_csv('./fraudTrain.csv')
test = pd.read_csv('./fraudTest.csv')

train.drop('Unnamed: 0', axis=1, inplace=True)
test.drop('Unnamed: 0', axis=1, inplace=True)
total = pd.concat([test,train])

del train
del test

total.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Processing



### Feature Engineering

In [4]:
total['trans_date_trans_time']=pd.to_datetime(total['trans_date_trans_time'])
total['trans_date']=total['trans_date_trans_time'].dt.strftime('%Y-%m-%d')
total['trans_date']=pd.to_datetime(total['trans_date'])
total['dob']=pd.to_datetime(total['dob'])

In [5]:
total["age"] = total["trans_date"]-total["dob"]
total["age"]=round(total["age"].dt.days / 365.25).astype('int32')

total['age']

Unnamed: 0,age
0,52
1,30
2,50
3,33
4,65
...,...
1296670,59
1296671,41
1296672,53
1296673,40


In [6]:
total['trans_month'] = pd.DatetimeIndex(total['trans_date']).month
total['trans_year'] = pd.DatetimeIndex(total['trans_date']).year

In [7]:
total['latitudinal_distance'] = abs(round(total['merch_lat']-total['lat'],3))
total['longitudinal_distance'] = abs(round(total['merch_long']-total['long'],3))

### Categorize Features

In [8]:
total['merchant'] = total['merchant'].map(lambda merchant: merchant.replace('fraud_', ''))
merchants_unique = total['merchant'].unique()
total['merchant'] = total['merchant'].map(lambda merchant: np.where(merchants_unique == merchant)[0][0]).astype('int32')

total['merchant']

Unnamed: 0,merchant
0,0
1,1
2,2
3,3
4,4
...,...
1296670,325
1296671,326
1296672,81
1296673,244


In [9]:
jobs_unique = total['job'].unique()
total['job'] = total['job'].map(lambda job: np.where(jobs_unique == job)[0][0]).astype('int16')

total['job']

Unnamed: 0,job
0,0
1,1
2,2
3,3
4,4
...,...
1296670,213
1296671,45
1296672,323
1296673,452


In [10]:
states_unique = total['state'].unique()
total['state'] = total['state'].map(lambda state: np.where(states_unique == state)[0][0]).astype('int16')

total['state']

Unnamed: 0,state
0,0
1,1
2,2
3,3
4,4
...,...
1296670,1
1296671,21
1296672,35
1296673,6


### One-hot encoding categories

In [11]:
total = pd.get_dummies(total, columns=['category', 'gender'])
total

Unnamed: 0,trans_date_trans_time,cc_num,merchant,amt,first,last,street,city,state,zip,...,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M
0,2020-06-21 12:14:25,2291163933867244,0,2.86,Jeff,Elliott,351 Darlene Green,Columbia,0,29209,...,False,False,False,False,True,False,False,False,False,True
1,2020-06-21 12:14:33,3573030041201292,1,29.84,Joanne,Williams,3638 Marsh Union,Altonah,1,84002,...,False,False,False,False,True,False,False,False,True,False
2,2020-06-21 12:14:53,3598215285024754,2,41.28,Ashley,Lopez,9333 Valentine Point,Bellmore,2,11710,...,False,False,False,False,False,False,False,False,True,False
3,2020-06-21 12:15:15,3591919803438423,3,60.05,Brian,Williams,32941 Krystal Mill Apt. 552,Titusville,3,32780,...,False,False,False,True,False,False,False,False,False,True
4,2020-06-21 12:15:17,3526826139003047,4,3.19,Nathan,Massey,5783 Evan Roads Apt. 465,Falmouth,4,49632,...,False,False,False,False,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,30263540414123,325,15.56,Erik,Patterson,162 Jessica Row Apt. 072,Hatch,1,84735,...,False,False,False,False,False,False,False,False,False,True
1296671,2020-06-21 12:12:19,6011149206456997,326,51.70,Jeffrey,White,8617 Holmes Terrace Suite 651,Tuscarora,21,21790,...,False,False,False,False,False,False,False,False,False,True
1296672,2020-06-21 12:12:32,3514865930894695,81,105.93,Christopher,Castaneda,1632 Cohen Drive Suite 639,High Rolls Mountain Park,35,88325,...,False,False,False,False,False,False,False,False,False,True
1296673,2020-06-21 12:13:36,2720012583106919,244,74.90,Joseph,Murray,42933 Ryan Underpass,Manderson,6,57756,...,False,False,False,False,False,False,False,False,False,True


### Data Cleaning

In [12]:
total.drop(['first','last','street','city','zip','unix_time','trans_num','dob','trans_date'],axis=1,inplace=True)


In [13]:
total['trans_date_trans_time'] = total['trans_date_trans_time'].astype('int64')
total.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0 to 1296674
Data columns (total 33 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   trans_date_trans_time    int64  
 1   cc_num                   int64  
 2   merchant                 int32  
 3   amt                      float64
 4   state                    int16  
 5   lat                      float64
 6   long                     float64
 7   city_pop                 int64  
 8   job                      int16  
 9   merch_lat                float64
 10  merch_long               float64
 11  is_fraud                 int64  
 12  age                      int32  
 13  trans_month              int32  
 14  trans_year               int32  
 15  latitudinal_distance     float64
 16  longitudinal_distance    float64
 17  category_entertainment   bool   
 18  category_food_dining     bool   
 19  category_gas_transport   bool   
 20  category_grocery_net     bool   
 21  category_groc

### Data Splitting

In [14]:
df_majority = total[(total['is_fraud']==0)]
df_minority = total[(total['is_fraud']==1)]

df_majority.shape,df_minority.shape


((1842743, 33), (9651, 33))

In [15]:
df_minority_upsampled = resample(df_minority,
                                 replace=True,    # sample with replacement
                                 n_samples= 1842743, # to match majority class
                                 random_state=42)  # reproducible results
df_minority_upsampled.shape

(1842743, 33)

In [16]:
total_upsampled = pd.concat([df_minority_upsampled, df_majority])
total_upsampled.shape

del df_majority
del df_minority

total = total_upsampled

In [17]:
X = total.drop('is_fraud', axis = 1).values
y = total['is_fraud'].values

del total

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print(cl('X_train samples : ', attrs= ['bold']), X_train[:1])
print(cl('X_test samples : ', attrs= ['bold']), X_test[0:1])
print(cl('y_train samples : ', attrs=['bold']), y_train[0:20])
print(cl('y_test samples: ', attrs = ['bold']), y_test[0:20])

X_train samples :  [[1601503803000000000 180020605265701 346 943.12 23 39.8616 -97.1825 314
  310 39.237628 -97.445436 19 9 2020 0.624 0.263 False False False False
  False False False False False False False True False False False True]]
X_test samples :  [[1600898614000000000 60487002085 180 1057.35 22 32.3739 -90.1293 233060
  215 32.205139 -90.731305 48 9 2020 0.169 0.602 False False False False
  False False False False False False False True False False False True]]
y_train samples :  [1 0 0 1 0 0 1 1 1 0 1 0 0 0 0 0 1 1 0 1]
y_test samples:  [1 0 1 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 0 0]


We have all the required components to build our classification models, which is our next step.

# **Building the Model**

For this project we will build 6 different types of classification models using algorithms by scikit-learn.

1. Decision Tree
2. K-Nearest Neighbors (KNN)
3. Logistic Regression
4. Support Vector Machine (SVM)
5. Random Forest
6. XGBoost

# **Evaluation Metrics for Classification Models**

We will use the following metrics to evaluate the models and decide which one is the best#

+ Accuracy Score
+ F1 Score
+ Confusion Matrix



In [18]:
# Models

# 1. Decision Tree

#When max_depth = 4 we allow the tree to split four times
#Criterion is a parameter that measures the quality of a split
#in our decision trees. "Gini Index" & "Entropy" are two
#different measures of impurity or disorder*?
tree_model = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
tree_model.fit(X_train, y_train)
tree_yhat = tree_model.predict(X_test)

In [19]:
#Let's print the evaluation metrics for this model.

# 1. Accuracy score

print(cl('ACCURACY SCORE', attrs = ['bold']))
print(cl('------------------------------------------------------------------------', attrs = ['bold']))
print(cl('Accuracy score of the Decision Tree Model is {}'.format(accuracy_score(y_test, tree_yhat)), attrs= ['bold']))
print(cl('------------------------------------------------------------------------', attrs = ['bold']))

ACCURACY SCORE
------------------------------------------------------------------------
Accuracy score of the Decision Tree Model is 0.9092752931089217
------------------------------------------------------------------------


In [None]:
#Models

#2. K-Nearest Neighbors

n = 5

knn = KNeighborsClassifier(n_neighbors = n)
knn.fit(X_train, y_train)
knn_yhat = knn.predict(X_test)

In [None]:
#Evaluation Metrics from this metric

print(cl('Accuracy score of the KNN model is {}'.format(accuracy_score(y_test, knn_yhat)), attrs = ['bold'], color = 'green'))


In [None]:
#Models

#3. Logistic Regression

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

In [None]:
print(cl('Accuracy score of the Logistic Regression model is {}'.format(accuracy_score(y_test, lr_yhat)), attrs= ['bold']))

In [None]:
#Models

#4. SVM

svm = SVC()
svm.fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

In [None]:
print(cl('Accuracy score of the SVM model is{}'.format(accuracy_score(y_test, svm_yhat)), attrs = ['bold']))

In [None]:
#Models

#5. Random Forest Tree

rf = RandomForestClassifier(max_depth = 4)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

In [None]:
print(cl('Accuracy score of the Random Forest Tree model is {}'.format(accuracy_score(y_test, rf_yhat)), attrs = ['bold']))

In [None]:
 #Models

 #6. XGBoost

 xgb = XGBClassifier(max_depth = 4)
 xgb.fit(X_train, y_train)
 xgb_yhat = xgb.predict(X_test)

In [None]:
print(cl('Accuracy score of the XGBoost model is {}'.format(accuracy_score(y_test, xgb_yhat)), attrs = ['bold']))

In [None]:
X_train[0]

In [None]:
 #Models

 #7. MLP

mlp = tf.keras.Sequential([
  tf.keras.Input(shape=X_train[0].shape),
  tf.keras.layers.Dense(30, activation='sigmoid'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dropout(0.3),
  tf.keras.layers.Dense(18, activation='sigmoid'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(9, activation='sigmoid'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

mlp.compile(
  optimizer=tf.keras.optimizers.Adamax(0.001),
  loss=tf.keras.losses.BinaryCrossentropy(),
  metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),
  tf.keras.metrics.AUC()]
)

mlp.fit(tf.convert_to_tensor(X_train), y_train, epochs=20)
mlp_yhat = mlp.predict(X_test)

In [None]:
# 2. F1 scores

print(cl('F1 SCORES', attrs = ['bold']))

print(cl('F1 score of the Decision Tree model is {}'.format(f1_score(y_test, tree_yhat)), attrs = ['bold']))
print(cl('F1 score of the KNN model is {}'.format(f1_score(y_test, knn_yhat)), attrs = ['bold'], color = 'green'))
print(cl('F1 score of the Logistic Regression model is {}'.format(f1_score(y_test, lr_yhat)), attrs = ['bold'], color = 'red'))
print(cl('F1 score of the SVM model is {}'.format(f1_score(y_test, svm_yhat)), attrs = ['bold']))
print(cl('F1 score of the Random Forest Tree model is {}'.format(f1_score(y_test, rf_yhat)), attrs = ['bold']))
print(cl('F1 score of the XGBoost model is {}'.format(f1_score(y_test, xgb_yhat)), attrs = ['bold']))

# **Confusion Matrix**

In [None]:
#3. Confusion Matrix

#defining the plot function

#Everything works perfectly and the code below creates a confusion matrix for each of the classification models
#You can see those on the files tab or download them as a png file to see.
#for some reason each picture it creates is blank. I need to fix this.

def plot_confusion_matrix(cm, classes, title, normalize = False, cmap = plt.cm.Blues):
  title = 'Confusion Matrix of {}'.format(title)
  if normalize:
    cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    #cmap stands for color map and it's part of a library. Matplotlib?
    plt.title(title) #plt is the pyplot package that we imported at the beginning of our code.
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
      plt.text(j, i, format(cm[i, j], fmt),
               horizontalalignment = 'center',
               color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#Now that we have the code above we are going to compute the confusion matrix for each of the classification models.

tree_matrix = confusion_matrix(y_test, tree_yhat, labels = [0, 1]) # Decision Tree
knn_matrix = confusion_matrix(y_test, knn_yhat, labels = [0, 1]) #K-Nearest Neighbors
lr_matrix = confusion_matrix(y_test, lr_yhat, labels = [0, 1]) #Logistic Regression
svm_matrix = confusion_matrix(y_test, svm_yhat, labels = [0, 1]) #Support Vector Machine
rf_matrix = confusion_matrix(y_test, rf_yhat, labels = [0, 1]) #Random Forest Tree
xgb_matrix = confusion_matrix(y_test, xgb_yhat, labels = [0, 1]) #XGBoost

#Plot each confusion matrix

plt.rcParams['figure.figsize'] = (6,6)

#1. Decision tree

tree_cm_plot = plot_confusion_matrix(tree_matrix,
                                     classes = ['Non-Default(0)', 'Default(1)'],
                                     normalize = False, title = 'Decision Tree')
plt.savefig('tree_cm_plot.png')
plt.show()

#2. K-Nearest Neighbors

knn_cm_plot = plot_confusion_matrix(knn_matrix,
                                    classes = ['Non-Default(0)', 'Default(1)'],
                                    normalize = False, title = 'KNN')
plt.savefig('knn_cm_plot.png')
plt.show()

#3. Logistic Regression

lr_cm_plot = plot_confusion_matrix(lr_matrix,
                                   classes = ['Non-Default(0)', 'Default(1)'],
                                   normalize = False, title = 'Logistic Regression')
plt.savefig('lr_cm_plot.png')
plt.show()

#4. Support Vector Machine

svm_cm_plot = plot_confusion_matrix(svm_matrix,
                                    classes = ['Non-Default(0)', 'Default(1)'],
                                    normalize = False, title = 'SVM')
plt.savefig('svm_cm_plot.png')
plt.show()

#5. Random Forest Tree

rf_cm_plot = plot_confusion_matrix(rf_matrix,
                                   classes = ['Non-Default(0)', 'Default(1)'],
                                   normalize = False, title = 'Random Forest Tree')
plt.savefig('rf_cm_plot.png')
plt.show()

#6. XGBoost

xgb_cm_plot = plot_confusion_matrix(xgb_matrix,
                                    classes = ['Non-Default(0)', 'Default(1)'],
                                    normalize = False, title = 'XGBoost')
plt.savefig('xgb_cm_plot.png')
plt.show()