In [59]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score


In [60]:
credit = pd.read_csv('creditcard.csv')

In [61]:
credit.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,Target
0,0.114697,0.796303,-0.149553,-0.823011,0.878763,-0.553152,0.939259,-0.108502,0.111137,-0.390521,...,-0.335776,-0.807853,-0.05594,-1.025281,-0.369557,0.204653,0.242724,0.085713,0.89,0
1,-0.039318,0.495784,-0.810884,0.546693,1.986257,4.386342,-1.344891,-1.743736,-0.563103,-0.616315,...,-1.377003,-0.0722,-0.197573,1.014807,1.011293,-0.167684,0.113136,0.256836,85.0,0
2,2.275706,-1.531508,-1.021969,-1.602152,-1.220329,-0.462376,-1.196485,-0.147058,-0.950224,1.560463,...,-0.193271,-0.103533,0.150945,-0.811083,-0.197913,-0.128446,0.014197,-0.051289,42.7,0
3,1.940137,-0.357671,-1.210551,0.382523,0.050823,-0.171322,-0.109124,-0.002115,0.869258,-0.001965,...,0.157994,0.650355,0.034206,0.739535,0.223605,-0.195509,-0.012791,-0.056841,29.99,0
4,1.081395,-0.502615,1.075887,-0.543359,-1.472946,-1.065484,-0.443231,-0.143374,1.659826,-1.131238,...,0.224157,0.821209,-0.137223,0.986259,0.563228,-0.574206,0.089673,0.052036,68.0,0


In [62]:
credit.shape

(56962, 30)

In [63]:
credit.dtypes

Unnamed: 0,0
V1,float64
V2,float64
V3,float64
V4,float64
V5,float64
V6,float64
V7,float64
V8,float64
V9,float64
V10,float64


In [64]:
(credit.isnull().sum()/len(credit))*100

Unnamed: 0,0
V1,0.0
V2,0.0
V3,0.0
V4,0.0
V5,0.0
V6,0.0
V7,0.0
V8,0.0
V9,0.0
V10,0.0


In [65]:
credit.duplicated().sum()

np.int64(675)

In [66]:
credit.drop_duplicates(inplace=True)

In [67]:
credit.shape

(56287, 30)

In [68]:
(credit['Target'].value_counts() / len(credit))*100

Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
0,99.825892
1,0.174108


In [69]:
X = credit.drop(columns='Target')
y = credit['Target']

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100, stratify = y)

In [71]:
#Splitting the data into train, validation and test set
train_data, temp_data = train_test_split(credit, train_size =0.7, test_size = 0.3, random_state= 45)

validation_data, test_data = train_test_split(temp_data, train_size = 0.5, test_size = 0.5, random_state = 45)


In [72]:
train_data.shape


(39400, 30)

In [73]:
validation_data.shape


(8443, 30)

In [74]:
test_data.shape

(8444, 30)

In [49]:
#Apply random forest algo
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)

In [50]:
y_pred = random_forest.predict(X_test)

In [51]:
accuracy_score(y_test, y_pred)

0.9993782199324924

In [52]:
roc_auc_score(y_test, y_pred)

np.float64(0.8499555080975262)

In [53]:
#Apply Decision Tree Classifier
decision_tree = DecisionTreeClassifier(criterion='gini', max_depth = 12, min_samples_split = 10)
decision_tree.fit(X_train, y_train)

In [54]:
decision_tree.get_depth()

12

In [55]:
y_pred2 = decision_tree.predict(X_test)

In [56]:
roc_auc_score(y_test, y_pred2) # to be used with imbalanced data

np.float64(0.8749110161950525)

In [57]:
# To check overfitting
y_pred_train = decision_tree.predict(X_train)

In [58]:
roc_auc_score(y_train, y_pred_train)

np.float64(0.9230769230769231)