In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score

In [18]:
df=pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,Target
0,0.114697,0.796303,-0.149553,-0.823011,0.878763,-0.553152,0.939259,-0.108502,0.111137,-0.390521,...,-0.335776,-0.807853,-0.05594,-1.025281,-0.369557,0.204653,0.242724,0.085713,0.89,0
1,-0.039318,0.495784,-0.810884,0.546693,1.986257,4.386342,-1.344891,-1.743736,-0.563103,-0.616315,...,-1.377003,-0.0722,-0.197573,1.014807,1.011293,-0.167684,0.113136,0.256836,85.0,0
2,2.275706,-1.531508,-1.021969,-1.602152,-1.220329,-0.462376,-1.196485,-0.147058,-0.950224,1.560463,...,-0.193271,-0.103533,0.150945,-0.811083,-0.197913,-0.128446,0.014197,-0.051289,42.7,0
3,1.940137,-0.357671,-1.210551,0.382523,0.050823,-0.171322,-0.109124,-0.002115,0.869258,-0.001965,...,0.157994,0.650355,0.034206,0.739535,0.223605,-0.195509,-0.012791,-0.056841,29.99,0
4,1.081395,-0.502615,1.075887,-0.543359,-1.472946,-1.065484,-0.443231,-0.143374,1.659826,-1.131238,...,0.224157,0.821209,-0.137223,0.986259,0.563228,-0.574206,0.089673,0.052036,68.0,0


In [19]:
df.shape

(56962, 30)

In [20]:
df.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'Target'],
      dtype='object')

In [21]:
df['Target'].value_counts()

0    56864
1       98
Name: Target, dtype: int64

In [22]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [23]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.15,random_state=1,stratify=y)

In [24]:
print(f'the value count of y_train is :',y_train.value_counts())

the value count of y_train is : 0    48334
1       83
Name: Target, dtype: int64


In [25]:
print(f'the value count of y_test is :',y_test.value_counts())

the value count of y_test is : 0    8530
1      15
Name: Target, dtype: int64


### FITTING RANDOM FOREST

In [26]:
rf = RandomForestClassifier() # For GBM, use GradientBoostingClassifier()


# Fit the object to train dataset
rf.fit(x_train, y_train)

In [27]:
rf_train_preds=rf.predict(x_train)

In [28]:
rf_test_preds=rf.predict(x_test)

In [29]:
print(f'the accuracy for train data is :{accuracy_score(y_train,rf_train_preds)}')
print(f'the accuracy for train data is :{accuracy_score(y_test,rf_test_preds)}')

the accuracy for train data is :1.0
the accuracy for train data is :0.9991808074897601


In [30]:
roc_auc_score(y_train,rf_train_preds)

1.0

In [31]:
roc_auc_score(y_test,rf_test_preds)

0.7999413833528722

### GBM FITTING

In [32]:
gbm = GradientBoostingClassifier()
gbm.fit(x_train,y_train)

In [33]:
gbm_train_preds=gbm.predict(x_train)
gbm_test_preds=gbm.predict(x_test)

In [34]:
print(f'the accuracy for train data is :{accuracy_score(y_train,gbm_train_preds)}')
print(f'the accuracy for train data is :{accuracy_score(y_test,gbm_test_preds)}')

the accuracy for train data is :0.999442344631018
the accuracy for train data is :0.9989467524868344


In [35]:
print(f'the area under the curve index of train model is : {roc_auc_score(y_train,gbm_train_preds)}')
print(f'the area under the curve index of train model is : {roc_auc_score(y_test,gbm_test_preds)}')

the area under the curve index of train model is : 0.8974869145967742
the area under the curve index of train model is : 0.7998241500586167


### GRIDSEARCH CV WITH RANDOM FOREST

In [36]:
rf2 = RandomForestClassifier() # For GBM, use GradientBoostingClassifier()
grid_values = {'n_estimators':[50, 80,  100], 'max_depth':[3, 5, 7]}
classifier = GridSearchCV(rf2, param_grid = grid_values, scoring = 'roc_auc', cv=5)

# Fit the object to train dataset
classifier.fit(x_train, y_train)

In [37]:
rf2_train_preds=classifier.predict(x_train)
rf2_test_preds=classifier.predict(x_test)
print(f'the accuracy for train data is :{accuracy_score(y_train,rf2_train_preds)}')
print(f'the accuracy for train data is :{accuracy_score(y_test,rf2_test_preds)}')
print(f'the area under the curve index of train model is : {roc_auc_score(y_train,rf2_train_preds)}')
print(f'the area under the curve index of train model is : {roc_auc_score(y_test,rf2_test_preds)}')

the accuracy for train data is :0.9996901914616767
the accuracy for train data is :0.9991808074897601
the area under the curve index of train model is : 0.9096385542168675
the area under the curve index of train model is : 0.7999413833528722


#### RF ACCURACY
the accuracy for train data is :1.0 and the accuracy for train data is :0.9991808074897601

### GRID SEARCH CV WITH GBM

In [39]:
gbm2 = GradientBoostingClassifier() # For GBM, use GradientBoostingClassifier()
grid_values = {'n_estimators':[50, 80,  100], 'max_depth':[3, 5, 7]}
gbm2_classifier = GridSearchCV(gbm2, param_grid = grid_values, scoring = 'roc_auc', cv=5)

# Fit the object to train dataset
gbm2_classifier.fit(x_train, y_train)

In [45]:
gbm2_train_preds=gbm2_classifier.predict(x_train)
gbm2_test_preds=gbm2_classifier.predict(x_test)
print(f'the accuracy for train data is :{accuracy_score(y_train,gbm2_train_preds)}')
print(f'the accuracy for test data is :{accuracy_score(y_test,gbm2_test_preds)}')
print(f'the area under the curve index of train model is : {roc_auc_score(y_train,gbm2_train_preds)}')
print(f'the area under the curve index of test model is : {roc_auc_score(y_test,gbm2_test_preds)}')

the accuracy for train data is :0.9999793460974451
the accuracy for test data is :0.9989467524868344
the area under the curve index of train model is : 0.9999896553150991
the area under the curve index of test model is : 0.7998241500586167


#### GBM ACCURACY
the accuracy for train data is :0.999442344631018 and the accuracy for test data is :0.9989467524868344

### FEATURED SELECETION

In [41]:
print(len(rf.feature_importances_))
print(len(x_train.columns))

29
29


In [42]:
features_importance=pd.Series(rf.feature_importances_,index=x_train.columns)
features_importance.sort_values(ascending=False).head(3)

V17    0.151552
V12    0.126372
V14    0.122123
dtype: float64

In [43]:
x=df[['V14','V17','V12']]
y=df['Target']

In [44]:
x_train2,x_test2,y_train2,y_test2=train_test_split(x,y,test_size=0.1,random_state=1)

In [72]:
random=RandomForestClassifier()
grid_values = {'n_estimators':[50, 80,  100], 'max_depth':[3, 5, 7]}
random1 = GridSearchCV(random, param_grid = grid_values, scoring = 'roc_auc', cv=5)

# Fit the object to train dataset
random1.fit(x_train2, y_train2)