In [None]:
#STEP 1: IMPORTING PACKAGES

import pandas as pd # this package is for data processing
import numpy as np #library to work with arrays
import matplotlib.pyplot as plt #library to create visualizations in Python
import tensorflow as tf #TensorFlow
from termcolor import colored as cl #tool for text customization
import itertools #library in Python consisting of multiple methods used in iterators

from sklearn.preprocessing import StandardScaler #Tool for data normalization/standardization
from sklearn.model_selection import train_test_split #function that splits data arrays into 2 subsets: training & testing
from sklearn.tree import DecisionTreeClassifier #Decision tree algorithm
from sklearn.neighbors import KNeighborsClassifier #algorithm that implements learning based on the k nearest neighbors
from sklearn.linear_model import LogisticRegression #Logistic regression algorithm (ML algorithm used to predict the probability of a categorical dependent variable)
from sklearn.svm import SVC #Stands for "Support Vector Classification", and it's an SVM algorithm
from sklearn.ensemble import RandomForestClassifier #Random forest tree algorithm
from xgboost import XGBClassifier #Machine Learning Algorithm.

from sklearn.metrics import confusion_matrix #evaluation metric
from sklearn.metrics import accuracy_score #evaluation metric
from sklearn.metrics import f1_score #evaluation metric

**Notes:**
- Iterators are objects that allow us to iterate over all the elements of a collection and return one element at a time.
- When a dataset contains variables that are on different scales, *StandardScaler* performs **standardization** (subtracting the mean and dividing by the standard deviation) so that they share a common scale.
- **Support Vector Machines** are supervised learning methods used for classification.
- **Random Forest Classifier** creates a set of decision trees from a random subset of the training set.
- **XGBoost** is an implementation of gradient boosted decision trees designed for speed and performance.
- A **Confusion Matrix** is a summary of prediction results on a classification problem.
- Accuracy is simply a ratio of correctly predicted observations to the total observations.
- Also known as the F-score, the **F1 Score** is the harmonic mean of the precision and recall scores. Used as an evaluation metric, a high F1 score is a sign of a well-performing model.

In [None]:
#Now I'm going to import my data from the Kaggle dataset which I have previously downloaded to my computer from
#the following website:

#kaggle dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud

import io

from google.colab import files

# files.upload() shows a widget prompting the user to browse for a file and
# returns a dict that maps each uploaded file name to its raw bytes.
uploaded = files.upload()
if not uploaded:
    raise ValueError('No file was uploaded; please select creditcard.csv')

# Read the bytes that were just uploaded. A path on the local computer such as
# '../../Downloads/creditcard.csv' is not related to the uploaded file.
csv_name = next(iter(uploaded))

# Build df in one expression (no inplace drop) so re-running the cell always
# yields the same frame instead of failing on the missing 'Time' column.
df = pd.read_csv(io.BytesIO(uploaded[csv_name])).drop(columns = 'Time')

# head() must be *called*; a bare `df.head` only displays the bound method.
df.head()

In [None]:
# Mount Google Drive into the Colab file system at /content/drive.
# NOTE(review): none of the visible cells read anything from the mounted drive,
# and this import lives outside the imports cell at the top of the notebook;
# confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Class balance of the dataset: Class == 0 is a non-fraud row, Class == 1 a fraud row.
cases = len(df)
nonfraud_count = len(df[df.Class == 0])
fraud_count = len(df[df.Class == 1])

# Percentage of fraud among ALL cases. Dividing by nonfraud_count instead would
# give the fraud-to-non-fraud ratio, which is not what the label below says.
fraud_percentage = round(fraud_count/cases*100, 2)

print(cl('CASE COUNT', attrs = ['bold']))
print(cl('--------------------------------------------------', attrs= ['bold']))
print(cl('Total number of cases are {}'.format(cases), attrs = ['bold']))
print(cl('Number of Non-fraud cases are {}'.format(nonfraud_count), attrs = ['bold']))
print(cl('Number of fraud cases are {}'.format(fraud_count), attrs = ['bold']))
print(cl('Percentage of fraud cases is {}'.format(fraud_percentage), attrs = ['bold']))
print(cl('--------------------------------------------------', attrs = ['bold']))

[1mCASE COUNT[0m
[1m--------------------------------------------------[0m
[1mTotal number of cases are 284807[0m
[1mNumber of Non-fraud cases are 284315[0m
[1mPercentage of fraud cases is 0.17[0m
[1m--------------------------------------------------[0m


**Notes:**
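- When working with the **print function** I was wondering... *What does the "cl" stand for?* It is simply the alias chosen in the import cell (`from termcolor import colored as cl`), so `cl(...)` calls termcolor's `colored` function, which wraps the text in the codes that make it bold or colored when printed.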

In [None]:
# `df['Class'] == 0` builds a True/False value for every row (a boolean mask);
# indexing df with that mask keeps only the rows where it is True. So these are
# two smaller DataFrames, not classes.
nonfraud_cases = df.loc[df['Class'] == 0]
fraud_cases = df.loc[df['Class'] == 1]

divider = cl('--------------------------------------------------', attrs= ['bold'])

print(cl('CASE AMOUNT STATISTICS', attrs = ['bold']))
print(divider)

# `.Amount` / `['Amount']` selects the 'Amount' column of the frame, and
# describe() summarises it (count, mean, std, min, quartiles, max).
for heading, subset in (('NON-FRAUD CASE AMOUNT STATS', nonfraud_cases),
                        ('FRAUD CASE AMOUNT STATS', fraud_cases)):
    print(cl(heading, attrs= ['bold']))
    print(subset['Amount'].describe())
    print(divider)

[1mCASE AMOUNT STATISTICS[0m
[1m--------------------------------------------------[0m
[1mNON-FRAUD CASE AMOUNT STATS[0m
count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64
[1m--------------------------------------------------[0m
[1mFRAUD CASE AMOUNT STATS[0m
count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64
[1m--------------------------------------------------[0m


In [None]:
#Standardize the "Amount" column: its raw values span a much wider range than
#the other columns, so we rescale it to zero mean and unit variance.
#NOTE(review): the scaler is fitted on every row before the train/test split
#below, so test rows influence the scaling statistics; consider fitting on the
#training rows only.

sc = StandardScaler()
amount = df['Amount'].to_numpy()

# scikit-learn transformers expect a 2-D array, hence the single-column reshape.
amount_2d = amount.reshape(-1, 1)
df['Amount'] = sc.fit_transform(amount_2d)

scaled_preview = df['Amount'].head(10)
print(cl(scaled_preview, attrs = ['bold']))

[1m0    0.244964
1   -0.342475
2    1.160686
3    0.140534
4   -0.073403
5   -0.338556
6   -0.333279
7   -0.190107
8    0.019392
9   -0.338516
Name: Amount, dtype: float64[0m


In [None]:
# DATA SPLIT
#X holds the independent variables (every column except 'Class');
#y holds the dependent variable (the 'Class' label).

X = df.drop(columns = 'Class').to_numpy()
y = df['Class'].to_numpy()

#train_test_split shuffles the rows and hands back four arrays: test_size = 0.2
#puts 20% of the rows in the test set (the rest is for training) and
#random_state = 0 fixes the shuffle so every run produces the same split.
#NOTE(review): with so few fraud rows, passing stratify = y would keep the
#fraud share equal in both subsets; not changed here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

#One feature row and the first twenty labels of each subset, as a sanity check.
for label, preview in (('X_train samples : ', X_train[:1]),
                       ('X_test samples : ', X_test[:1]),
                       ('y_train samples : ', y_train[:20]),
                       ('y_test samples: ', y_test[:20])):
    print(cl(label, attrs = ['bold']), preview)

[1mX_train samples : [0m [[-1.11504743  1.03558276  0.80071244 -1.06039825  0.03262117  0.85342216
  -0.61424348 -3.23116112  1.53994798 -0.81690879 -1.30559201  0.1081772
  -0.85960958 -0.07193421  0.90665563 -1.72092961  0.79785322 -0.0067594
   1.95677806 -0.64489556  3.02038533 -0.53961798  0.03315649 -0.77494577
   0.10586781 -0.43085348  0.22973694 -0.0705913  -0.30145418]]
[1mX_test samples : [0m [[-0.32333357  1.05745525 -0.04834115 -0.60720431  1.25982115 -0.09176072
   1.1591015  -0.12433461 -0.17463954 -1.64440065 -1.11886302  0.20264731
   1.14596495 -1.80235956 -0.24717793 -0.06094535  0.84660574  0.37945439
   0.84726224  0.18640942 -0.20709827 -0.43389027 -0.26161328 -0.04665061
   0.2115123   0.00829721  0.10849443  0.16113917 -0.19330595]]
[1my_train samples : [0m [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1my_test samples: [0m [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


We have all the required components to build our classification models, which is our next step.

# **Building the Model**

For this project we will build 6 different types of classification models: five using algorithms from scikit-learn and one using the separate XGBoost library.

1. Decision Tree
2. K-Nearest Neighbors (KNN)
3. Logistic Regression
4. Support Vector Machine (SVM)
5. Random Forest
6. XGBoost

# **Evaluation Metrics for Classification Models**

We will use the following metrics to evaluate the models and decide which one is the best:

+ Accuracy Score
+ F1 Score
+ Confusion Matrix



In [None]:
# Models

# 1. Decision Tree

#max_depth = 4 limits the tree to four levels of splits below the root
#(a cap on depth, not on the total number of splits).
#criterion is the measure used to judge the quality of a split; 'gini' and
#'entropy' are two different measures of impurity (disorder) in a node.
#random_state = 0 (same seed as the train/test split) makes the fit
#reproducible: scikit-learn permutes the features at each split, so equally
#good splits can otherwise be chosen differently from run to run.
tree_model = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy', random_state = 0)
tree_model.fit(X_train, y_train)
tree_yhat = tree_model.predict(X_test)

In [None]:
#Evaluation metrics for the Decision Tree model.

# 1. Accuracy score: share of test rows whose predicted class equals the true class.

tree_accuracy = accuracy_score(y_test, tree_yhat)
rule = cl('------------------------------------------------------------------------', attrs = ['bold'])

print(cl('ACCURACY SCORE', attrs = ['bold']))
print(rule)
print(cl('Accuracy score of the Decision Tree Model is {}'.format(tree_accuracy), attrs= ['bold']))
print(rule)

[1mACCURACY SCORE[0m
[1m------------------------------------------------------------------------[0m
[1mAccuracy score of the Decision Tree Model is 0.9993679997191109[0m
[1m------------------------------------------------------------------------[0m


In [None]:
#Models

#2. K-Nearest Neighbors

n = 5  # how many nearest neighbours are consulted for each prediction

# fit() returns the fitted estimator itself, so construction and training chain.
knn = KNeighborsClassifier(n_neighbors = n).fit(X_train, y_train)
knn_yhat = knn.predict(X_test)

In [None]:
#Accuracy of the KNN model on the test set, printed in bold green.

knn_accuracy = accuracy_score(y_test, knn_yhat)
knn_message = 'Accuracy score of the KNN model is {}'.format(knn_accuracy)
print(cl(knn_message, color = 'green', attrs = ['bold']))


[1m[32mAccuracy score of the KNN model is 0.9995259997893332[0m


In [None]:
#Models

#3. Logistic Regression

# Default hyper-parameters; fit() returns the fitted estimator, so the two
# steps are chained.
lr = LogisticRegression().fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

In [None]:
# Accuracy of the Logistic Regression model on the test set.
lr_accuracy = accuracy_score(y_test, lr_yhat)
print(cl('Accuracy score of the Logistic Regression model is {}'.format(lr_accuracy), attrs= ['bold']))

[1mAccuracy score of the Logistic Regression model is 0.9991924440855307[0m


In [None]:
#Models

#4. SVM

# Support Vector Classifier with default settings; fit() returns the fitted
# estimator, so construction and training are chained.
svm = SVC().fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

In [None]:
# Accuracy of the SVM model on the test set.
# The template needs a space before the placeholder; without it the message
# printed as "...model is0.99...".
print(cl('Accuracy score of the SVM model is {}'.format(accuracy_score(y_test, svm_yhat)), attrs = ['bold']))

[1mAccuracy score of the SVM model is0.9993153330290369[0m


In [None]:
#Models

#5. Random Forest Tree

# A random forest draws random row samples and random feature subsets, so
# without a seed every run trains a different forest and reports different
# scores. random_state = 0 (same seed as the train/test split) makes the
# result reproducible. max_depth = 4 caps the depth of each tree.
rf = RandomForestClassifier(max_depth = 4, random_state = 0)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

In [None]:
# Accuracy of the Random Forest model on the test set.
rf_accuracy = accuracy_score(y_test, rf_yhat)
print(cl('Accuracy score of the Random Forest Tree model is {}'.format(rf_accuracy), attrs = ['bold']))

[1mAccuracy score of the Random Forest Tree model is 0.9992802219023208[0m


In [None]:
 #Models

 #6. XGBoost

 #NOTE(review): every line of this cell starts with one stray space. The
 #notebook runs it, but the same lines pasted into a plain .py file would
 #raise an "unexpected indent" error; consider removing the leading space.
 #Gradient-boosted trees capped at depth 4; fit() returns the fitted model.
 xgb = XGBClassifier(max_depth = 4).fit(X_train, y_train)
 xgb_yhat = xgb.predict(X_test)

In [None]:
# Accuracy of the XGBoost model on the test set.
xgb_accuracy = accuracy_score(y_test, xgb_yhat)
print(cl('Accuracy score of the XGBoost model is {}'.format(xgb_accuracy), attrs = ['bold']))

[1mAccuracy score of the XGBoost model is 0.9994733330992591[0m


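The model with the lowest accuracy score is the Logistic Regression model, whereas the model with the highest accuracy score is KNN. Keep in mind that only about 0.17% of the cases are fraud, so a model that always predicted "non-fraud" would still score above 99.8% accuracy; accuracy alone is therefore not enough to compare these models.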

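Another metric widely used in evaluating classification models is the F1 score. It is the harmonic mean of precision and recall: you divide the product of precision and recall by their sum and multiply by two.

$$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$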

In [None]:
# 2. F1 scores

print(cl('F1 SCORES', attrs = ['bold']))

# One row per model: (message template, test-set predictions, highlight colour).
# KNN is highlighted in green and Logistic Regression in red; None keeps the
# terminal's default colour.
f1_rows = [
    ('F1 score of the Decision Tree model is {}', tree_yhat, None),
    ('F1 score of the KNN model is {}', knn_yhat, 'green'),
    ('F1 score of the Logistic Regression model is {}', lr_yhat, 'red'),
    ('F1 score of the SVM model is {}', svm_yhat, None),
    ('F1 score of the Random Forest Tree model is {}', rf_yhat, None),
    ('F1 score of the XGBoost model is {}', xgb_yhat, None),
]

for template, yhat, colour in f1_rows:
    print(cl(template.format(f1_score(y_test, yhat)), color = colour, attrs = ['bold']))

[1mF1 SCORES[0m
[1mF1 score of the Decision Tree model is 0.8105263157894738[0m
[1m[32mF1 score of the KNN model is 0.8571428571428572[0m
[1m[31mF1 score of the Logistic Regression model is 0.7356321839080459[0m
[1mF1 score of the SVM model is 0.7771428571428572[0m
[1mF1 score of the Random Forest Tree model is 0.768361581920904[0m
[1mF1 score of the XGBoost model is 0.8421052631578948[0m


# **Confusion Matrix**

In [None]:
#3. Confusion Matrix

#defining the plot function

#The drawing code has to run for every call. When it was indented under
#`if normalize:` a call with normalize = False drew nothing, which is why the
#saved pictures came out blank.

def plot_confusion_matrix(cm, classes, title, normalize = False, cmap = plt.cm.Blues):
  """Draw a confusion matrix as an annotated heat map on the current figure.

  Parameters
  ----------
  cm : 2-D numpy array
      Confusion matrix; rows are true labels, columns are predicted labels.
  classes : sequence of str
      Tick labels, in the same order as the rows/columns of ``cm``.
  title : str
      Model name; the plot title becomes 'Confusion Matrix of <title>'.
  normalize : bool, default False
      If True, each row is divided by its sum so cells show the fraction of
      that true class instead of raw counts.
  cmap : matplotlib colormap, default plt.cm.Blues
      Color map used for the heat map.

  Returns
  -------
  None. The plot is drawn with pyplot; the caller saves and/or shows it.
  """
  title = 'Confusion Matrix of {}'.format(title)
  if normalize:
    # Row-wise normalization: [:, np.newaxis] turns the row sums into a column
    # so that every row is divided by its own total.
    cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]

  plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
  plt.title(title)
  plt.colorbar()
  tick_marks = np.arange(len(classes))
  plt.xticks(tick_marks, classes, rotation = 45)
  plt.yticks(tick_marks, classes)

  # Fractions get two decimals, raw counts are printed as integers.
  fmt = '.2f' if normalize else 'd'
  # Cells darker than half of the maximum get white text so they stay readable.
  thresh = cm.max() / 2.
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
             horizontalalignment = 'center',
             color = 'white' if cm[i, j] > thresh else 'black')

  plt.ylabel('True label')
  plt.xlabel('Predicted label')
  # Called last so the layout also accounts for the axis labels set above.
  plt.tight_layout()

#Compute the confusion matrix for each of the classification models.
#labels = [0, 1] fixes the row/column order: index 0 is non-fraud, index 1 is fraud.

tree_matrix = confusion_matrix(y_test, tree_yhat, labels = [0, 1]) # Decision Tree
knn_matrix = confusion_matrix(y_test, knn_yhat, labels = [0, 1]) #K-Nearest Neighbors
lr_matrix = confusion_matrix(y_test, lr_yhat, labels = [0, 1]) #Logistic Regression
svm_matrix = confusion_matrix(y_test, svm_yhat, labels = [0, 1]) #Support Vector Machine
rf_matrix = confusion_matrix(y_test, rf_yhat, labels = [0, 1]) #Random Forest Tree
xgb_matrix = confusion_matrix(y_test, xgb_yhat, labels = [0, 1]) #XGBoost

#Plot each confusion matrix

plt.rcParams['figure.figsize'] = (6,6)

#Class 1 marks a fraud case in this dataset, so the ticks are labelled
#fraud / non-fraud (not "default").
class_labels = ['Non-Fraud(0)', 'Fraud(1)']

#One entry per model: (confusion matrix, title used in the plot, PNG file name).
cm_plots = [
    (tree_matrix, 'Decision Tree', 'tree_cm_plot.png'),
    (knn_matrix, 'KNN', 'knn_cm_plot.png'),
    (lr_matrix, 'Logistic Regression', 'lr_cm_plot.png'),
    (svm_matrix, 'SVM', 'svm_cm_plot.png'),
    (rf_matrix, 'Random Forest Tree', 'rf_cm_plot.png'),
    (xgb_matrix, 'XGBoost', 'xgb_cm_plot.png'),
]

for matrix, model_title, png_name in cm_plots:
    #plot_confusion_matrix returns nothing, so its result is not stored.
    plot_confusion_matrix(matrix,
                          classes = class_labels,
                          normalize = False, title = model_title)
    #Save before show(): the figure is saved while it is still the current one.
    plt.savefig(png_name)
    plt.show()

<Figure size 432x432 with 0 Axes>

<Figure size 432x432 with 0 Axes>

<Figure size 432x432 with 0 Axes>

<Figure size 432x432 with 0 Axes>

<Figure size 432x432 with 0 Axes>

<Figure size 432x432 with 0 Axes>