# Project Overview

#### 1. Importing libraries and loading data

In [1]:
#Importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import seaborn as sns
import math
%matplotlib inline 

In [2]:
#Importing sklearn libraries 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
#laoding the dataset 
dataset = pd.read_csv('C:\script\python\MachineLearning\Main\Credit-Card-Fraud-Detection\creditcard.csv')

In [4]:
dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


#### 2. Creating feature and target data

In [5]:
X = dataset.iloc[:, 1:30].values
y = dataset.iloc[:, 30].values

In [6]:
print("Number of the rows for the input: ", X.shape)
print("Nmber of the rows for the output: ", y.shape)

Number of the rows for the input:  (284807, 29)
Nmber of the rows for the output:  (284807,)


In [7]:
print("Sample input: \n", X[0:3])

Sample input: 
 [[-1.35980713e+00 -7.27811733e-02  2.53634674e+00  1.37815522e+00
  -3.38320770e-01  4.62387778e-01  2.39598554e-01  9.86979013e-02
   3.63786970e-01  9.07941720e-02 -5.51599533e-01 -6.17800856e-01
  -9.91389847e-01 -3.11169354e-01  1.46817697e+00 -4.70400525e-01
   2.07971242e-01  2.57905802e-02  4.03992960e-01  2.51412098e-01
  -1.83067779e-02  2.77837576e-01 -1.10473910e-01  6.69280749e-02
   1.28539358e-01 -1.89114844e-01  1.33558377e-01 -2.10530535e-02
   1.49620000e+02]
 [ 1.19185711e+00  2.66150712e-01  1.66480113e-01  4.48154078e-01
   6.00176493e-02 -8.23608088e-02 -7.88029833e-02  8.51016549e-02
  -2.55425128e-01 -1.66974414e-01  1.61272666e+00  1.06523531e+00
   4.89095016e-01 -1.43772296e-01  6.35558093e-01  4.63917041e-01
  -1.14804663e-01 -1.83361270e-01 -1.45783041e-01 -6.90831352e-02
  -2.25775248e-01 -6.38671953e-01  1.01288021e-01 -3.39846476e-01
   1.67170404e-01  1.25894532e-01 -8.98309914e-03  1.47241692e-02
   2.69000000e+00]
 [-1.35835406e+00 -1.3

In [8]:
print("Sample output: \n", y[0:3])

Sample output: 
 [0 0 0]


#### 3. Handling Missing Values

In [9]:
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer = imputer.fit(X[:, 1:30])
X[:, 1:30] = imputer.fit_transform(X[:, 1:30])



#### 4. Spliting testing and training data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [11]:
print("X_train.shape: ", X_train.shape)
print("y_train.shape: ", y_train.shape)

X_train.shape:  (213605, 29)
y_train.shape:  (213605,)


In [12]:
print("X_test.shape: ", X_test.shape)
print("y_test.shape: ", y_test.shape)

X_test.shape:  (71202, 29)
y_test.shape:  (71202,)


#### 5. Feature Scaling 

In [13]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
print("Standardised Training set : \n", X_train[0])

Standardised Training set : 
 [ 1.04272047  0.06657394 -1.19051456  0.05060912  0.18235446 -1.31399333
  0.58133086 -0.40257892 -0.09319222  0.16481198  1.60036637  1.18028602
 -0.24273404  1.08764203 -0.35935009 -0.76863613 -0.28881862 -0.39536117
  0.13774039 -0.34055771  0.32484688  1.13026957  0.03716189  0.90724443
  0.61754959  0.39904973 -0.21031503 -0.2607924  -0.35356699]


In [15]:
print("Standardised Test set : \n", X_train[0])

Standardised Test set : 
 [ 1.04272047  0.06657394 -1.19051456  0.05060912  0.18235446 -1.31399333
  0.58133086 -0.40257892 -0.09319222  0.16481198  1.60036637  1.18028602
 -0.24273404  1.08764203 -0.35935009 -0.76863613 -0.28881862 -0.39536117
  0.13774039 -0.34055771  0.32484688  1.13026957  0.03716189  0.90724443
  0.61754959  0.39904973 -0.21031503 -0.2607924  -0.35356699]


#### 6. Applying Decision Tree Classification

In [16]:
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [17]:
#Prediction
y_pred_decision_tree = dt_classifier.predict(X_test)
print("Decision Tree prediction: \n", y_pred_decision_tree)

Decision Tree prediction: 
 [0 0 0 ... 0 0 0]


In [18]:
#Confusion Matrix
cm_decision = confusion_matrix(y_test, y_pred_decision_tree)
print("Confusion Metrixfor Decision Tree: \n", cm_decision)

Confusion Metrixfor Decision Tree: 
 [[71052    30]
 [   25    95]]


In [19]:
#Accuracy
acc_decision = accuracy_score(y_test, y_pred_decision_tree)
print("Accuracy Score for Decision Tree: \n", acc_decision)

Accuracy Score for Decision Tree: 
 0.9992275497879273


#### Decision Tree Classification Result

In [20]:
Accuracy_Decison = ((cm_decision[0][0] + cm_decision[1][1]) / cm_decision.sum()) *100
print("Accuracy_Decison    : ", Accuracy_Decison)

Error_rate_Decison = ((cm_decision[0][1] + cm_decision[1][0]) / cm_decision.sum()) *100
print("Error_rate_Decison  : ", Error_rate_Decison)

# True Fake Recognition Rate
Specificity_Decison = (cm_decision[1][1] / (cm_decision[1][1] + cm_decision[0][1])) *100
print("Specificity_Decison : ", Specificity_Decison)

# True Genuine Recognition Rate
Sensitivity_Decison = (cm_decision[0][0] / (cm_decision[0][0] + cm_decision[1][0])) *100
print("Sensitivity_Decison : ", Sensitivity_Decison)

Accuracy_Decison    :  99.92275497879272
Error_rate_Decison  :  0.07724502120726946
Specificity_Decison :  76.0
Sensitivity_Decison :  99.96482687789299


#### 7. Applying Random Forest Classification 

In [21]:
svc_classifier = SVC(kernel='rbf', random_state=0)
svc_classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

In [22]:
#Prediction 
y_pred2 = svc_classifier.predict(X_test)
print("Prediction for Random Forest Classifier: \n", y_pred2)

Prediction for Random Forest Classifier: 
 [0 0 0 ... 0 0 0]


In [23]:
cm2 = confusion_matrix(y_test, y_pred2)
print("Confusion Matrix for Random Forest Classifier: \n", cm2)

Confusion Matrix for Random Forest Classifier: 
 [[71077     5]
 [   44    76]]


In [24]:
#Accuracy
acc_randomForest = accuracy_score(y_test, y_pred_decision_tree)
print("Accuracy Score for Random Forest Classifier: \n", acc_randomForest)

Accuracy Score for Random Forest Classifier: 
 0.9992275497879273


#### Random Forest Classification Result

In [25]:
# Validating the Prediction
Accuracy_svc = ((cm2[0][0] + cm2[1][1]) / cm2.sum()) *100
print("Accuracy_svc    : ", Accuracy_svc)

Error_rate_svc = ((cm2[0][1] + cm2[1][0]) / cm2.sum()) *100
print("Error_rate_svc  : ", Error_rate_svc)

# True Fake Recognition Rate
Specificity_svc = (cm2[1][1] / (cm2[1][1] + cm2[0][1])) *100
print("Specificity_svc : ", Specificity_svc)

# True Genuine Recognition Rate
Sensitivity_svc = (cm2[0][0] / (cm2[0][0] + cm2[1][0])) *100
print("Sensitivity_svc : ", Sensitivity_svc)

Accuracy_svc    :  99.93118170837899
Error_rate_svc  :  0.06881829162102188
Specificity_svc :  93.82716049382715
Sensitivity_svc :  99.93813360329578
