### Importing necessary packages 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('creditcard.csv')

In [3]:
data.shape

(284807, 31)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time      284807 non-null float64
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26  

In [5]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Checking for missing values 

In [6]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
data['Class'].unique()

array([0, 1])

In [8]:
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

### lets do some preprocessing on amount column 

In [9]:
scale= StandardScaler()

In [10]:
data['Amount']= scale.fit_transform(data['Amount'])



In [11]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


### lets split the data into Independent and dependent variables

In [12]:
y = data['Class'].values 

In [13]:
X= data.drop(['Time', 'Class'], axis =1 ).values 

In [14]:
y.shape

(284807,)

In [15]:
X.shape

(284807, 29)

### Spliting data into train/test for validation 

In [16]:
X_train,X_test, y_train, y_test= train_test_split(X,y, random_state=42)

In [17]:
X_train.shape

(213605, 29)

In [18]:
X_test.shape

(71202, 29)

In [19]:
y_train.shape

(213605,)

In [20]:
y_test.shape

(71202,)

### Building ML models 

### 1. LogisticRegression

In [21]:
log= LogisticRegression()

In [22]:
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
y_pred= log.predict(X_test)

In [24]:
log.score(X_test, y_test)

0.99918541613999612

In [25]:
confusion_matrix(y_test, y_pred)

array([[71077,    12],
       [   46,    67]])

In [26]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71089
          1       0.85      0.59      0.70       113

avg / total       1.00      1.00      1.00     71202



In [27]:
roc_auc_score(y_test, y_pred)

0.79637577574763874

### 2. KNN

In [28]:
knn= KNeighborsClassifier()

In [29]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [30]:
y_pred1= knn.predict(X_test)

In [31]:
knn.score(X_test, y_test)

0.9994522625768939

In [32]:
confusion_matrix(y_test, y_pred1)

array([[71076,    13],
       [   26,    87]])

In [33]:
print(classification_report(y_test, y_pred1))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71089
          1       0.87      0.77      0.82       113

avg / total       1.00      1.00      1.00     71202



In [34]:
roc_auc_score(y_test, y_pred1)

0.88486431753191852

### 3. RandomForest

In [35]:
random = RandomForestClassifier()

In [36]:
random.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [37]:
y_pred2= random.predict(X_test)

In [38]:
random.score(X_test, y_test)

0.99949439622482517

In [39]:
confusion_matrix(y_test, y_pred2)

array([[71082,     7],
       [   29,    84]])

In [40]:
print(classification_report(y_test, y_pred2))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71089
          1       0.92      0.74      0.82       113

avg / total       1.00      1.00      1.00     71202



In [41]:
roc_auc_score(y_test, y_pred2)

0.87163218187048841

### Conclusion:

** Because of the class Imbalance in this dataset, we need to look for recall and roc score ignoring other metrics for evaluation. So here knn has more recall and roc_auc score when compare to other models.**