In [1]:
# load libraries 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix,classification_report,roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)

# load train data set
df=pd.read_csv('experiment1.csv')
df.head()

Unnamed: 0,policy_deductable,policy_annual_premium,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,vehicle_claim,customer,Customer_age_group,have_umbrella_limit,policy_csl,insured_sex,insured_education_level,incident_severity,property_damage,police_report_available,fraud_reported,insured_occupation_armed-forces,insured_occupation_craft-repair,insured_occupation_exec-managerial,insured_occupation_farming-fishing,insured_occupation_handlers-cleaners,insured_occupation_machine-op-inspct,insured_occupation_other-service,insured_occupation_priv-house-serv,insured_occupation_prof-specialty,insured_occupation_protective-serv,insured_occupation_sales,insured_occupation_tech-support,insured_occupation_transport-moving,insured_relationship_not-in-family,insured_relationship_other-relative,insured_relationship_own-child,insured_relationship_unmarried,insured_relationship_wife,incident_type_Parked Car,incident_type_Single Vehicle Collision,incident_type_Vehicle Theft,collision_type_Rear Collision,collision_type_Side Collision,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Other,authorities_contacted_Police
0,1000,1406.91,53300,0,5,1,1,2,52080,0,0,1,2.5,1,6,3,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1
1,2000,1197.22,0,0,8,1,0,0,3510,0,0,0,2.5,1,6,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1
2,2000,1413.14,35100,0,7,3,2,3,23100,0,1,0,1.0,0,7,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1
3,2000,1415.74,48900,-62400,5,1,1,2,50720,0,0,0,2.5,0,7,3,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1
4,1000,1583.91,66000,-46000,20,1,0,1,4550,0,0,0,5.0,1,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0


In [4]:
# Divide data into X and y
X=df.drop('fraud_reported',axis=1)
y=df['fraud_reported']

# Train test split

In [6]:
# Splitting the data into train and test set for model training
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)


print("Shape of x_train :", x_train.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of y_train :", y_train.shape)
print("Shape of y_test :", y_test.shape)

Shape of x_train : (800, 45)
Shape of x_test : (200, 45)
Shape of y_train : (800,)
Shape of y_test : (200,)


# Data standardization

In [8]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
x_train=pd.DataFrame(ss.fit_transform(x_train),columns=x_train.columns)
x_test=pd.DataFrame(ss.transform(x_test),columns=x_test.columns)

# Model Building

Logestic Regression

In [15]:
lr=LogisticRegression()
lr.fit(x_train,y_train)
y_train_pred=lr.predict(x_train)
y_train_prob=lr.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
print("AUC of train:",roc_auc_score(y_train,y_train_prob))


y_test_pred=lr.predict(x_test)
y_test_prob=lr.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[597  13]
 [176  14]]
Accuracy of train: 0.76375
AUC of train: 0.7000000000000001



Confusion matrix 
 [[137   6]
 [ 54   3]]
Accuracy of test: 0.7
AUC of test: 0.5401791191264875



Classification report: '
'               precision    recall  f1-score   support

           0       0.72      0.96      0.82       143
           1       0.33      0.05      0.09        57

    accuracy                           0.70       200
   macro avg       0.53      0.51      0.46       200
weighted avg       0.61      0.70      0.61       200



Hyper parameter tuning to get the best ROC-AUC score

In [68]:
grid={'penalty' : ['l1', 'l2', 'none'],
        'C': [1.0,10,0.0001,1000000.0,1e-06,100000.0],
        'solver' : ['lbfgs', 'liblinear'],
        'max_iter':[50,100,500,1000,10000],
        'n_jobs' : [-1],
        }
    
lr = LogisticRegression(class_weight='balanced',random_state=42)
fold = KFold(n_splits=4, shuffle=True, random_state=42)
gs = GridSearchCV(lr,param_grid= grid, scoring='roc_auc', cv=fold)
gs.fit(x_train, y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.5894576335374684


In [69]:
gs.best_estimator_

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
lr=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

lr.fit(x_train,y_train)
y_train_pred=lr.predict(x_train)
y_train_prob=lr.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
lr_train_score= roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
lr_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=lr.predict(x_test)
y_test_prob=lr.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
lr_test_score= roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
lr_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[368 242]
 [ 58 132]]
Accuracy of train: 0.625
AUC of train: 0.6996980155306298
F1-score train:  0.4680851063829788



Confusion matrix 
 [[84 59]
 [30 27]]
Accuracy of test: 0.555
AUC of test: 0.5470494417862839
F1-score test:  0.37762237762237755



Classification report: '
'               precision    recall  f1-score   support

           0       0.74      0.59      0.65       143
           1       0.31      0.47      0.38        57

    accuracy                           0.56       200
   macro avg       0.53      0.53      0.52       200
weighted avg       0.62      0.56      0.58       200



Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
dt=dt=DecisionTreeClassifier(random_state=42)

dt.fit(x_train,y_train)
y_train_pred=dt.predict(x_train)
y_train_prob=dt.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
print("AUC of train:",roc_auc_score(y_train,y_train_prob))


y_test_pred=dt.predict(x_test)
y_test_prob=dt.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[610   0]
 [  0 190]]
Accuracy of train: 1.0
AUC of train: 1.0



Confusion matrix 
 [[116  27]
 [ 28  29]]
Accuracy of test: 0.725
AUC of test: 0.6599803705066863



Classification report: '
'               precision    recall  f1-score   support

           0       0.81      0.81      0.81       143
           1       0.52      0.51      0.51        57

    accuracy                           0.73       200
   macro avg       0.66      0.66      0.66       200
weighted avg       0.72      0.72      0.72       200



HyperParameter tuning decission tree

In [73]:
grid={'criterion':['gini','entropy'],
    'max_depth': [2,3,5,9],
    'max_features': [18,34,44,41,36,'auto'],
        }
    
dt = DecisionTreeClassifier(class_weight='balanced',random_state=42)
fold = KFold(n_splits=4, shuffle=True, random_state=42)
gs = GridSearchCV(dt,param_grid= grid, scoring='roc_auc', cv=fold)
gs.fit(x_train, y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.765018111272444


In [74]:
gs.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=2, max_features=44, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [20]:
dt=DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=2, max_features=44, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

dt.fit(x_train,y_train)
y_train_pred=dt.predict(x_train)
y_train_prob=dt.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
dt_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
dt_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=dt.predict(x_test)
y_test_prob=dt.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
dt_test_score= roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
dt_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[521  89]
 [ 62 128]]
Accuracy of train: 0.81125
AUC of train: 0.7888352027610008
F1-score train:  0.628992628992629



Confusion matrix 
 [[123  20]
 [ 18  39]]
Accuracy of test: 0.81
AUC of test: 0.7467795362532204
F1-score test:  0.6724137931034484



Classification report: '
'               precision    recall  f1-score   support

           0       0.87      0.86      0.87       143
           1       0.66      0.68      0.67        57

    accuracy                           0.81       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.81      0.81      0.81       200



Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
rf=RandomForestClassifier(random_state=42)

rf.fit(x_train, y_train)
y_train_pred=rf.predict(x_train)
y_train_prob=rf.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
print("AUC of train:",roc_auc_score(y_train,y_train_prob))


y_test_pred=rf.predict(x_test)
y_test_prob=rf.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[610   0]
 [  0 190]]
Accuracy of train: 1.0
AUC of train: 1.0



Confusion matrix 
 [[138   5]
 [ 54   3]]
Accuracy of test: 0.705
AUC of test: 0.728560912771439



Classification report: '
'               precision    recall  f1-score   support

           0       0.72      0.97      0.82       143
           1       0.38      0.05      0.09        57

    accuracy                           0.70       200
   macro avg       0.55      0.51      0.46       200
weighted avg       0.62      0.70      0.62       200



In [78]:
grid={'n_estimators':[10,20,50,100,500],
    'criterion':['gini', 'entropy'],
    'bootstrap' : [True],
    'oob_score': [True],
    'n_jobs':[-1],
    'class_weight':["balanced"],
    'max_depth': [2,3,5,9,None],
    'max_features': [18,34,44,41,36,'auto']
    }
rf = RandomForestClassifier(random_state=42)
fold = KFold(n_splits=4, shuffle=True, random_state=42)
gs = RandomizedSearchCV(rf,param_distributions= grid, scoring='roc_auc', cv=fold,n_iter=50,n_jobs=-1,random_state=42)
gs.fit(x_train, y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.7744745145900029


In [80]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=2, max_features=18,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [23]:
rf=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=2, max_features=18,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

rf.fit(x_train,y_train)
y_train_pred=rf.predict(x_train)
y_train_prob=rf.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
rf_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
rf_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=rf.predict(x_test)
y_test_prob=rf.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
rf_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
rf_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[340 270]
 [ 34 156]]
Accuracy of train: 0.62
AUC of train: 0.8173166522864538
F1-score train:  0.5064935064935066



Confusion matrix 
 [[84 59]
 [15 42]]
Accuracy of test: 0.63
AUC of test: 0.7552447552447552
F1-score test:  0.5316455696202531



Classification report: '
'               precision    recall  f1-score   support

           0       0.85      0.59      0.69       143
           1       0.42      0.74      0.53        57

    accuracy                           0.63       200
   macro avg       0.63      0.66      0.61       200
weighted avg       0.73      0.63      0.65       200



Support vectore Classifier

In [24]:
from sklearn.svm import SVC

In [25]:
svc=SVC(random_state=42,probability=True)
svc.fit(x_train,y_train)
y_train_pred=svc.predict(x_train)
y_train_prob=svc.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
print("AUC of train:",roc_auc_score(y_train,y_train_prob))


y_test_pred=svc.predict(x_test)
y_test_prob=svc.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[610   0]
 [181   9]]
Accuracy of train: 0.77375
AUC of train: 0.99893011216566



Confusion matrix 
 [[143   0]
 [ 57   0]]
Accuracy of test: 0.715
AUC of test: 0.5280333701386333



Classification report: '
'               precision    recall  f1-score   support

           0       0.71      1.00      0.83       143
           1       0.00      0.00      0.00        57

    accuracy                           0.71       200
   macro avg       0.36      0.50      0.42       200
weighted avg       0.51      0.71      0.60       200



Hyperameter tuning svc

In [97]:
grid={'C': [0.1,0.01, 0.001,0.5, 1, 10],  
      'gamma': ['auto', 'scale'], 
      'kernel': ['rbf','linear','poly'],
      'degree' : [0, 1, 2, 3, 4, 5, 6],
      'class_weight' : ["balanced",None]}

svc=SVC(random_state=42)
fold=KFold(n_splits=4,random_state=42,shuffle=True)
gs=RandomizedSearchCV(svc,param_distributions=grid,scoring='roc_auc',n_jobs=-1,n_iter=50,cv=fold)
gs.fit(x_train,y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.585647463495043


In [98]:
gs.best_estimator_ 

SVC(C=0.001, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=2, gamma='auto',
    kernel='linear', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False)

In [26]:
svc=SVC(C=0.001, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', gamma='auto',
    kernel='linear', max_iter=-1, probability=True, random_state=42,
    shrinking=True, tol=0.001, verbose=False)


svc.fit(x_train,y_train)
y_train_pred=svc.predict(x_train)
y_train_prob=svc.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
svc_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
svc_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=svc.predict(x_test)
y_test_prob=svc.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
svc_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
svc_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[152 458]
 [ 13 177]]
Accuracy of train: 0.41125
AUC of train: 0.6786712683347714
F1-score train:  0.4290909090909091



Confusion matrix 
 [[ 35 108]
 [  7  50]]
Accuracy of test: 0.425
AUC of test: 0.5471721261194946
F1-score test:  0.4651162790697675



Classification report: '
'               precision    recall  f1-score   support

           0       0.83      0.24      0.38       143
           1       0.32      0.88      0.47        57

    accuracy                           0.42       200
   macro avg       0.57      0.56      0.42       200
weighted avg       0.69      0.42      0.40       200



KNN Classifier

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
knn=KNeighborsClassifier()

knn.fit(x_train,y_train)
y_train_pred=knn.predict(x_train)
y_train_prob=knn.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
print("AUC of train:",roc_auc_score(y_train,y_train_prob))


y_test_pred=knn.predict(x_test)
y_test_prob=knn.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[581  29]
 [141  49]]
Accuracy of train: 0.7875
AUC of train: 0.7811345987920622



Confusion matrix 
 [[124  19]
 [ 50   7]]
Accuracy of test: 0.655
AUC of test: 0.5163783584836217



Classification report: '
'               precision    recall  f1-score   support

           0       0.71      0.87      0.78       143
           1       0.27      0.12      0.17        57

    accuracy                           0.66       200
   macro avg       0.49      0.49      0.48       200
weighted avg       0.59      0.66      0.61       200



KNN Hyperparameter Tunning

In [107]:
grid = {'algorithm' :['auto','ball_tree', 'kd_tree', 'brute'],
         'n_jobs':[-1,None], 
         'n_neighbors':list(range(1,41,2)), 
         'p':[1,2],
         'weights':['uniform','distance']
       }

knn=KNeighborsClassifier()
fold=KFold(n_splits=4,random_state=42,shuffle=True)
gs=RandomizedSearchCV(knn,param_distributions=grid,scoring='roc_auc',n_jobs=-1,n_iter=50,cv=fold,random_state=42)
gs.fit(x_train,y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.6445789496117058


In [108]:
gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=37, p=1,
                     weights='uniform')

In [29]:
knn=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=37, p=1,
                     weights='uniform')

knn.fit(x_train,y_train)
y_train_pred=knn.predict(x_train)
y_train_prob=knn.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
knn_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
knn_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=knn.predict(x_test)
y_test_prob=knn.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
knn_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
knn_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[610   0]
 [189   1]]
Accuracy of train: 0.76375
AUC of train: 0.7308541846419327
F1-score train:  0.010471204188481674



Confusion matrix 
 [[143   0]
 [ 57   0]]
Accuracy of test: 0.715
AUC of test: 0.6211507790455159
F1-score test:  0.0



Classification report: '
'               precision    recall  f1-score   support

           0       0.71      1.00      0.83       143
           1       0.00      0.00      0.00        57

    accuracy                           0.71       200
   macro avg       0.36      0.50      0.42       200
weighted avg       0.51      0.71      0.60       200



# Bagging Classifier

In [30]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

Bagged Decision Tree

In [32]:
dt_bag=BaggingClassifier(base_estimator=dt,random_state=42,n_jobs=-1)

dt_bag.fit(x_train,y_train)
y_train_pred=dt_bag.predict(x_train)
y_train_prob=dt_bag.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
dt_bagged_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
dt_bagged_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))


y_test_pred=dt_bag.predict(x_test)
y_test_prob=dt_bag.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
dt_bagged_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
dt_bagged_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[521  89]
 [ 62 128]]
Accuracy of train: 0.81125
AUC of train: 0.8009534081104401
F1-score train:  0.628992628992629



Confusion matrix 
 [[123  20]
 [ 18  39]]
Accuracy of test: 0.81
AUC of test: 0.7736474052263526
F1-score test:  0.6724137931034484



Classification report: '
'               precision    recall  f1-score   support

           0       0.87      0.86      0.87       143
           1       0.66      0.68      0.67        57

    accuracy                           0.81       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.81      0.81      0.81       200



Bagged Random Forest

In [34]:
rf_bag=BaggingClassifier(base_estimator=rf,random_state=42,n_jobs=-1)
rf_bag.fit(x_train,y_train)
y_train_pred=rf_bag.predict(x_train)
y_train_prob=rf_bag.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
rf_bagged_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
rf_bagged_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))


y_test_pred=rf_bag.predict(x_test)
y_test_prob=rf_bag.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
rf_bagged_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
rf_bagged_test_f1_score= f1_score(y_test,y_test_pred)
print('F1_Score test: ',f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[383 227]
 [ 37 153]]
Accuracy of train: 0.67
AUC of train: 0.8193097497842967
F1-score train:  0.5368421052631579



Confusion matrix 
 [[94 49]
 [15 42]]
Accuracy of test: 0.68
AUC of test: 0.7758557232241443
F1_Score test:  0.5675675675675675



Classification report: '
'               precision    recall  f1-score   support

           0       0.86      0.66      0.75       143
           1       0.46      0.74      0.57        57

    accuracy                           0.68       200
   macro avg       0.66      0.70      0.66       200
weighted avg       0.75      0.68      0.70       200



Bagged KNN

In [35]:
knn_bag=BaggingClassifier(base_estimator=knn,random_state=42)
knn_bag.fit(x_train,y_train)
y_train_pred=knn_bag.predict(x_train)
y_train_prob=knn_bag.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
knn_bagged_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
knn_bagged_train_f1_score= f1_score(y_train,y_train_pred)
print('F1_Score train: ',f1_score(y_train,y_train_pred))

y_test_pred=knn_bag.predict(x_test)
y_test_prob=knn_bag.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
knn_bagged_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
knn_bagged_test_f1_score= f1_score(y_test,y_test_pred)
print('F1_Score test: ',f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[609   1]
 [189   1]]
Accuracy of train: 0.7625
AUC of train: 0.7313934426229508
F1_Score train:  0.010416666666666666



Confusion matrix 
 [[143   0]
 [ 57   0]]
Accuracy of test: 0.715
AUC of test: 0.6188811188811189
F1_Score test:  0.0



Classification report: '
'               precision    recall  f1-score   support

           0       0.71      1.00      0.83       143
           1       0.00      0.00      0.00        57

    accuracy                           0.71       200
   macro avg       0.36      0.50      0.42       200
weighted avg       0.51      0.71      0.60       200



# Voting Classifier

In [36]:
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score

vote_est = [ 
    ('bag_dt', dt_bag),
    ('bag_rf', rf_bag),
    
    
    ]

voting = VotingClassifier(estimators = vote_est , voting = 'soft',weights=[1,4])
voting.fit(x_train, y_train)

y_train_pred=voting.predict(x_train)
y_train_prob=voting.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
voting_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
voting_train_f1_score= f1_score(y_train,y_train_pred) 
print("F1_Score:", f1_score(y_train,y_train_pred))

y_test_pred=voting.predict(x_test)
y_test_prob=voting.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
voting_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
voting_test_f1_score= f1_score(y_test,y_test_pred) 
print("F1_Score:", f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[518  92]
 [ 57 133]]
Accuracy of train: 0.81375
AUC of train: 0.819672131147541
F1_Score: 0.6409638554216868



Confusion matrix 
 [[122  21]
 [ 18  39]]
Accuracy of test: 0.805
AUC of test: 0.7857931542142068
F1_Score: 0.6666666666666667



Classification report: '
'               precision    recall  f1-score   support

           0       0.87      0.85      0.86       143
           1       0.65      0.68      0.67        57

    accuracy                           0.81       200
   macro avg       0.76      0.77      0.76       200
weighted avg       0.81      0.81      0.81       200



In [37]:
from sklearn.model_selection import cross_val_score

column_names=['w1','w2','score']
score_frame=pd.DataFrame(columns=column_names)
i=0
for w1 in range(1,5):
    for w2 in range(1,5):
        if len(set((w1,w2)))==1:
            continue
        clf=VotingClassifier(estimators=[('bag_dt', dt_bag),('bag_rf', rf_bag)],
                                     weights=[w1,w2],voting='soft')
        kfold = KFold(shuffle=True,n_splits=4,random_state=42)
        score = cross_val_score(clf,x_train,y_train,cv=kfold,n_jobs=-1,scoring='roc_auc')
        score_frame.loc[i]=[w1,w2,np.mean(score)]
        i=i+1
score_frame.sort_values(by='score',ascending=False)

Unnamed: 0,w1,w2,score
2,1.0,4.0,0.773626
1,1.0,3.0,0.773541
0,1.0,2.0,0.772312
5,2.0,4.0,0.772312
4,2.0,3.0,0.771696
8,3.0,4.0,0.771306
11,4.0,3.0,0.770212
7,3.0,2.0,0.769983
3,2.0,1.0,0.768777
10,4.0,2.0,0.768777


# Model Comparison

In [38]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree ', 'Random Forest','Bagged Random Forest','Bagged Decision Tree', 
              'Support Vector Classifier','KNN','Bagged KNN', 'Voting Classifier'],
              
    'Train AUC Score': [lr_train_score, dt_train_score, rf_train_score, rf_bagged_train_score, dt_bagged_train_score,
              svc_train_score, knn_train_score, knn_bagged_train_score, voting_train_score],
    
    'Test AUC Score': [lr_test_score, dt_test_score, rf_test_score, rf_bagged_test_score, dt_bagged_test_score,
              svc_test_score,  knn_test_score, knn_bagged_test_score, voting_test_score],
    
    'Train F1_Score': [lr_train_f1_score, dt_train_f1_score, rf_train_f1_score, rf_bagged_train_f1_score, dt_bagged_train_f1_score,
              svc_train_f1_score, knn_train_f1_score, knn_bagged_train_f1_score, voting_train_f1_score],
    
    'Test F1_Score': [lr_test_f1_score, dt_test_f1_score, rf_test_f1_score, rf_bagged_test_f1_score, dt_bagged_test_f1_score,
              svc_test_f1_score, knn_test_f1_score, knn_bagged_test_f1_score, voting_test_f1_score]
            })

result = results.sort_values(by='Train AUC Score', ascending=False)
result = result.set_index('Model')
display(result)

Unnamed: 0_level_0,Train AUC Score,Test AUC Score,Train F1_Score,Test F1_Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Voting Classifier,0.819672,0.785793,0.640964,0.666667
Bagged Random Forest,0.81931,0.775856,0.536842,0.567568
Random Forest,0.817317,0.755245,0.506494,0.531646
Bagged Decision Tree,0.800953,0.773647,0.628993,0.672414
Decision Tree,0.788835,0.74678,0.628993,0.672414
Bagged KNN,0.731393,0.618881,0.010417,0.0
KNN,0.730854,0.621151,0.010471,0.0
Logistic Regression,0.699698,0.547049,0.468085,0.377622
Support Vector Classifier,0.678671,0.547172,0.429091,0.465116
