In [145]:
import pandas as pd
import numpy as np
import pandas_profiling

# For UnBalanced Dataset 
from imblearn.under_sampling import NearMiss

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import roc_auc_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.metrics import roc_curve, precision_recall_curve, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso, RidgeClassifier, ElasticNet 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

%matplotlib inline
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [146]:
org_train = pd.read_csv('credit_train.csv');
org_test = pd.read_csv('credit_test.csv');

In [147]:
org_data = pd.concat([org_train,org_test],axis='rows',sort=False,ignore_index=False)
org_data.shape

(284807, 30)

In [148]:
profile = pandas_profiling.ProfileReport(org_data);
profile.to_file(outputfile="eda.html");

In [149]:
org_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 284807 entries, 0 to 56961
Data columns (total 30 columns):
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26       284807 non-null float64
V27   

In [150]:
org_data['Class'].unique()

array(["'0'", "'1'"], dtype=object)

In [151]:
org_data['Class'] = org_data['Class'].replace({"'0'": 0, "'1'": 1});

In [152]:
org_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 284807 entries, 0 to 56961
Data columns (total 30 columns):
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26       284807 non-null float64
V27   

In [153]:
pd.value_counts(org_data['Class'])

0    284315
1       492
Name: Class, dtype: int64

# Target variable is Un-balanced So we are doing undersampling.

In [154]:
x = org_data.drop(columns=['Class'])
y = org_data['Class']
nm = NearMiss()
x_res,y_res=nm.fit_sample(x,y)
print('xshape', x_res.shape,'yShape',y_res.shape)

xshape (984, 29) yShape (984,)


# Scaling and Split Daataset

In [155]:
stdScal = StandardScaler();
x_std = stdScal.fit_transform(x_res)

In [156]:
x_train,x_test,y_train,y_test = train_test_split(x_std,y_res,test_size=0.3,random_state=1)

In [157]:
def roc_draw(x_test, y_test,logreg):
    logit_roc_auc = roc_auc_score(y_test, logreg.predict(x_test))
    fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(x_test)[:,1])
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show() 

In [158]:
algorithName=[]
f1_train=[];
f1_test=[];
model_name=[];
def model_fit(model,train_x,train_y,test_x,test_y,name,roc= False):
    model.fit(train_x,train_y)
    #model.predict_proba(train_x)
    y_train_pred = model.predict(train_x);
    f1Train=f1_score(train_y,y_train_pred)
    print(" Train F1-Score :",f1Train)
    #print(classification_report(train_y,y_train_pred))
    y_test_pred = model.predict(x_test);
    f1Test=f1_score(test_y,y_test_pred)
    print(" Test F1-Score :",f1Test)
    #print(classification_report(test_y,y_test_pred))
    algorithName.append(name)
    f1_train.append(f1Train)
    f1_test.append(f1Test)
    model_name.append(name)
    if (roc):
        roc_draw(x_train, y_train,model)

In [159]:
def exportResult(name,f1_train,f1_test):
    result = pd.DataFrame([name,f1_train,f1_test]).T;
    result.columns = ['Model','F1_Train','F1_Test'];
    result.to_excel('credit_fraud1.xlsx',index=False);
    return result

# Predictive Modelling Algorithm

In [160]:
#logistic Regression Ridge
ridge = LogisticRegression(penalty='l2')
model_fit(ridge,x_train,y_train,x_test,y_test,'Ridge_logistic');

 Train F1-Score : 0.9865067466266867
 Test F1-Score : 0.9836065573770492


In [161]:
#logistic Regression Lasso
lasso = LogisticRegression(solver='saga',penalty='l1')
model_fit(lasso,x_train,y_train,x_test,y_test,'Lasso_logistic');

 Train F1-Score : 0.9834586466165414
 Test F1-Score : 0.9801324503311257




In [162]:
#logistic Regression Elastic Net
elastic = LogisticRegression(solver='saga',penalty='elasticnet',l1_ratio=0.7)
model_fit(elastic,x_train,y_train,x_test,y_test,'Elastic_logistic');

 Train F1-Score : 0.9834586466165414
 Test F1-Score : 0.9801324503311257




In [163]:
# Gradient descent
gradient = SGDClassifier(loss='log',random_state=2)
model_fit(elastic,x_train,y_train,x_test,y_test,'Gradient_logistic');

 Train F1-Score : 0.9834586466165414
 Test F1-Score : 0.9801324503311257




In [164]:
# K-neighbor
knn = KNeighborsClassifier()
model_fit(knn,x_train,y_train,x_test,y_test,'KNeighbors_Classifier')

 Train F1-Score : 0.9631901840490796
 Test F1-Score : 0.9594594594594594


In [165]:
# Bernoulli Naiye Bayes
bnb = BernoulliNB()
model_fit(bnb,x_train,y_train,x_test,y_test,'BernoulliNB')

 Train F1-Score : 0.9470404984423676
 Test F1-Score : 0.9415807560137458


In [166]:
# Gaussian Naiye Bayes
nb = GaussianNB()
model_fit(bnb,x_train,y_train,x_test,y_test,'GaussianNB')

 Train F1-Score : 0.9470404984423676
 Test F1-Score : 0.9415807560137458


In [167]:
# Decision Tree Gini
dt_gini = DecisionTreeClassifier(criterion="gini")
model_fit(dt_gini,x_train,y_train,x_test,y_test,'DecisionTreeClassifier_Gini')

 Train F1-Score : 1.0
 Test F1-Score : 0.9869281045751634


In [168]:
# Decision Tree Entropy
dt_entropy = DecisionTreeClassifier(criterion="entropy")
model_fit(dt_entropy,x_train,y_train,x_test,y_test,'DecisionTreeClassifier_Entropy')

 Train F1-Score : 1.0
 Test F1-Score : 0.9770491803278688


In [169]:
# AdaBoost Classifier
ada = AdaBoostClassifier(n_estimators=50,learning_rate=1.0)
model_fit(ada,x_train,y_train,x_test,y_test,'AdaBoostClassifier')

 Train F1-Score : 1.0
 Test F1-Score : 0.990228013029316


In [170]:
# Gradient Classifier
gb=GradientBoostingClassifier(n_estimators=100,min_samples_leaf=1,max_features='auto')
model_fit(gb,x_train,y_train,x_test,y_test,'GradientBoostingClassifier')

 Train F1-Score : 1.0
 Test F1-Score : 0.9869281045751634


In [171]:
# Xgb Boost
xgbBoost=XGBClassifier(n_estimators=50,learning_rate=0.05)
model_fit(xgbBoost,x_train,y_train,x_test,y_test,'XGBClassifier')

 Train F1-Score : 0.9970326409495549
 Test F1-Score : 0.9836065573770492


In [172]:
# Random Forest
rnd = RandomForestClassifier(n_estimators=50)
model_fit(rnd,x_train,y_train,x_test,y_test,'RandomForestClassifier')

 Train F1-Score : 1.0
 Test F1-Score : 0.990228013029316


In [173]:
# SVM Classifier
svm = SVC()
model_fit(svm,x_train,y_train,x_test,y_test,'SVMClassifier')

 Train F1-Score : 0.9819277108433735
 Test F1-Score : 0.9868421052631579


In [174]:
resultFile = exportResult(algorithName,f1_train,f1_test);

In [175]:
resultFile

Unnamed: 0,Model,F1_Train,F1_Test
0,Ridge_logistic,0.986507,0.983607
1,Lasso_logistic,0.983459,0.980132
2,Elastic_logistic,0.983459,0.980132
3,Gradient_logistic,0.983459,0.980132
4,KNeighbors_Classifier,0.96319,0.959459
5,BernoulliNB,0.94704,0.941581
6,GaussianNB,0.94704,0.941581
7,DecisionTreeClassifier_Gini,1.0,0.986928
8,DecisionTreeClassifier_Entropy,1.0,0.977049
9,AdaBoostClassifier,1.0,0.990228


# Hyper Tuning consider random Forest and Ada Boost

In [176]:
params={
 "criterion"    : ['gini','entropy'] ,
 "n_estimators" : [1, 2, 4, 8, 16, 32, 64, 100, 200]
} 

In [177]:
rnd_hyp = RandomForestClassifier();
random_search=RandomizedSearchCV(rnd_hyp,param_distributions=params,n_iter=100,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

In [178]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


AttributeError: 'LokyProcess' object has no attribute 'env'