In [1]:
# load libraries 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold

In [2]:
# Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)

# load train data set
df=pd.read_csv('experiment1.csv')
df.head()

Unnamed: 0,policy_deductable,policy_annual_premium,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,vehicle_claim,customer,Customer_age_group,have_umbrella_limit,policy_csl,insured_sex,insured_education_level,incident_severity,property_damage,police_report_available,fraud_reported,insured_occupation_armed-forces,insured_occupation_craft-repair,insured_occupation_exec-managerial,insured_occupation_farming-fishing,insured_occupation_handlers-cleaners,insured_occupation_machine-op-inspct,insured_occupation_other-service,insured_occupation_priv-house-serv,insured_occupation_prof-specialty,insured_occupation_protective-serv,insured_occupation_sales,insured_occupation_tech-support,insured_occupation_transport-moving,insured_relationship_not-in-family,insured_relationship_other-relative,insured_relationship_own-child,insured_relationship_unmarried,insured_relationship_wife,incident_type_Parked Car,incident_type_Single Vehicle Collision,incident_type_Vehicle Theft,collision_type_Rear Collision,collision_type_Side Collision,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Other,authorities_contacted_Police
0,1000,1406.91,53300,0,5,1,1,2,52080,0,0,1,2.5,1,6,3,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1
1,2000,1197.22,0,0,8,1,0,0,3510,0,0,0,2.5,1,6,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1
2,2000,1413.14,35100,0,7,3,2,3,23100,0,1,0,1.0,0,7,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1
3,2000,1415.74,48900,-62400,5,1,1,2,50720,0,0,0,2.5,0,7,3,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1
4,1000,1583.91,66000,-46000,20,1,0,1,4550,0,0,0,5.0,1,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0


In [3]:
# Divide data into X and y
X=df.drop('fraud_reported',axis=1)
y=df['fraud_reported']

# Applying Sampling Techniques

In [4]:
# Over Sampling with SMOTE
from imblearn.over_sampling import SMOTE

x_resample, y_resample  = SMOTE().fit_sample(X, y.values.ravel())

print(x_resample.shape)
print(y_resample.shape)

(1506, 45)
(1506,)


# Train test split

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_resample, y_resample, test_size = 0.2, random_state = 0)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1204, 45)
(1204,)
(302, 45)
(302,)


# Data standardization

In [6]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Model Building

Logestic Regression

In [21]:
lr=LogisticRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
print("Training Accuracy: ", lr.score(x_train, y_train))
print('Testing Accuarcy: ', lr.score(x_test, y_test))
print("roc_auc_score: ",roc_auc_score(y_test,  y_pred))

# making a confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# making a classification report
cr = classification_report(y_test,  y_pred)
print(cr)

Training Accuracy:  0.7998338870431894
Testing Accuarcy:  0.7715231788079471
roc_auc_score:  0.76991926991927
[[131  23]
 [ 46 102]]
              precision    recall  f1-score   support

           0       0.74      0.85      0.79       154
           1       0.82      0.69      0.75       148

    accuracy                           0.77       302
   macro avg       0.78      0.77      0.77       302
weighted avg       0.78      0.77      0.77       302



Hyper parameter tuning to get the best ROC-AUC score

In [32]:
grid={'penalty' : ['l1', 'l2', 'none'],
        'C': [1.0,10,0.0001,1000000.0,1e-06,100000.0],
        'solver' : ['lbfgs', 'liblinear'],
        'max_iter':[50,100,500,1000,10000],
        'n_jobs' : [-1],
        }
    
lr = LogisticRegression(class_weight='balanced',random_state=42)
fold = KFold(n_splits=4, shuffle=True, random_state=42)
gs = GridSearchCV(lr,param_grid= grid, scoring='roc_auc', cv=fold)
gs.fit(x_train, y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.8573314273476095


In [33]:
gs.best_estimator_

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
lr=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

lr.fit(x_train,y_train)
y_train_pred=lr.predict(x_train)
y_train_prob=lr.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
lr_train_score= roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
lr_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=lr.predict(x_test)
y_test_prob=lr.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
lr_test_score= roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
lr_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[517  82]
 [159 446]]
Accuracy of train: 0.7998338870431894
AUC of train: 0.8789856372190565
F1-score train:  0.7872903795233893



Confusion matrix 
 [[131  23]
 [ 47 101]]
Accuracy of test: 0.7682119205298014
AUC of test: 0.8158564408564409
F1-score test:  0.7426470588235295



Classification report: '
'               precision    recall  f1-score   support

           0       0.74      0.85      0.79       154
           1       0.81      0.68      0.74       148

    accuracy                           0.77       302
   macro avg       0.78      0.77      0.77       302
weighted avg       0.77      0.77      0.77       302



# Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
dt=dt=DecisionTreeClassifier(random_state=42)

dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)

print("Training Accuracy: ", dt.score(x_train, y_train))
print('Testing Accuarcy: ', dt.score(x_test, y_test))
print("ROC_Score",roc_auc_score(y_test,  y_pred))

# making a confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# making a classification report
cr = classification_report(y_test,  y_pred)
print(cr)

Training Accuracy:  1.0
Testing Accuarcy:  0.7317880794701986
ROC_Score 0.7317479817479818
[[113  41]
 [ 40 108]]
              precision    recall  f1-score   support

           0       0.74      0.73      0.74       154
           1       0.72      0.73      0.73       148

    accuracy                           0.73       302
   macro avg       0.73      0.73      0.73       302
weighted avg       0.73      0.73      0.73       302



HyperParameter tuning decission tree

In [38]:
grid={'criterion':['gini','entropy'],
    'max_depth': [2,3,5,9],
    'max_features': [18,34,44,41,36,'auto'],
        }
    
dt = DecisionTreeClassifier(class_weight='balanced',random_state=42)
fold = KFold(n_splits=4, shuffle=True, random_state=42)
gs = GridSearchCV(dt,param_grid= grid, scoring='roc_auc', cv=fold)
gs.fit(x_train, y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.8495957834418058


In [39]:
gs.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=5, max_features=41,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort='deprecated', random_state=42, splitter='best')

In [27]:
dt=DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=5, max_features=41,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort='deprecated', random_state=42, splitter='best')


dt.fit(x_train,y_train)
y_train_pred=dt.predict(x_train)
y_train_prob=dt.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
dt_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
dt_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=dt.predict(x_test)
y_test_prob=dt.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
dt_test_score= roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
dt_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[438 161]
 [ 59 546]]
Accuracy of train: 0.8172757475083057
AUC of train: 0.8985954552353096
F1-score train:  0.8323170731707317



Confusion matrix 
 [[100  54]
 [ 24 124]]
Accuracy of test: 0.7417218543046358
AUC of test: 0.7984819234819234
F1-score test:  0.7607361963190185



Classification report: '
'               precision    recall  f1-score   support

           0       0.81      0.65      0.72       154
           1       0.70      0.84      0.76       148

    accuracy                           0.74       302
   macro avg       0.75      0.74      0.74       302
weighted avg       0.75      0.74      0.74       302



# Random Forest Classifier

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
rf=RandomForestClassifier(random_state=42)

rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

print("Training Accuracy: ", rf.score(x_train, y_train))
print('Testing Accuarcy: ', rf.score(x_test, y_test))
print("ROC_Score",roc_auc_score(y_test,  y_pred))

# making a classification report
cr = classification_report(y_test,  y_pred_rf)
print(cr)

# confusion matrix
cm = confusion_matrix(y_test, y_pred_rf)
print(cm)

Training Accuracy:  1.0
Testing Accuarcy:  0.8311258278145696
ROC_Score 0.7567128817128816
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       154
           1       0.86      0.78      0.82       148

    accuracy                           0.83       302
   macro avg       0.83      0.83      0.83       302
weighted avg       0.83      0.83      0.83       302

[[135  19]
 [ 32 116]]


In [43]:
grid={'n_estimators':[10,20,50,100,500],
    'criterion':['gini', 'entropy'],
    'bootstrap' : [True],
    'oob_score': [True],
    'n_jobs':[-1],
    'class_weight':["balanced"],
    'max_depth': [2,3,5,9,None],
    'max_features': [18,34,44,41,36,'auto']
    }
rf = RandomForestClassifier(random_state=42)
fold = KFold(n_splits=4, shuffle=True, random_state=42)
gs = RandomizedSearchCV(rf,param_distributions= grid, scoring='roc_auc', cv=fold)
gs.fit(x_train, y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.9207824372145024


In [44]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features=18,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [29]:
rf=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features=18,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

rf.fit(x_train,y_train)
y_train_pred=rf.predict(x_train)
y_train_prob=rf.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
rf_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
rf_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=rf.predict(x_test)
y_test_prob=rf.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
rf_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
rf_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[599   0]
 [  0 605]]
Accuracy of train: 1.0
AUC of train: 1.0
F1-score train:  1.0



Confusion matrix 
 [[132  22]
 [ 28 120]]
Accuracy of test: 0.8344370860927153
AUC of test: 0.9018734643734644
F1-score test:  0.8275862068965518



Classification report: '
'               precision    recall  f1-score   support

           0       0.82      0.86      0.84       154
           1       0.85      0.81      0.83       148

    accuracy                           0.83       302
   macro avg       0.84      0.83      0.83       302
weighted avg       0.83      0.83      0.83       302



# Support vectore Classifier

In [30]:
from sklearn.svm import SVC

In [47]:
svc=SVC(random_state=42)
svc.fit(x_train,y_train)
y_pred=svc.predict(x_test)

print("Training Accuracy: ", svc.score(x_train, y_train))
print('Testing Accuarcy: ', svc.score(x_test, y_test))
print("ROC_Score",roc_auc_score(y_test,  y_pred))

# making a confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# making a classification report
cr = classification_report(y_test,  y_pred)
print(cr)

Training Accuracy:  0.8853820598006644
Testing Accuarcy:  0.804635761589404
ROC_Score 0.8012021762021763
[[150   4]
 [ 55  93]]
              precision    recall  f1-score   support

           0       0.73      0.97      0.84       154
           1       0.96      0.63      0.76       148

    accuracy                           0.80       302
   macro avg       0.85      0.80      0.80       302
weighted avg       0.84      0.80      0.80       302



Hyperameter tuning svc

In [170]:
grid={'C': [0.1, 1, 10, 100, 1000],  
      'gamma': [1, 0.1, 0.01, 0.001, 0.0001,'auto', 'scale'], 
      'kernel': ['rbf','linear','poly'],
      'degree' : [0, 1, 2, 3, 4, 5, 6],
      'class_weight' : ["balanced",None]}

svc=SVC(random_state=42)
fold=KFold(n_splits=4,random_state=42,shuffle=True)
gs=RandomizedSearchCV(svc,param_distributions=grid,scoring='roc_auc',n_jobs=-1,n_iter=50,cv=fold)
gs.fit(x_train,y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.8764492483041175


In [48]:
gs.best_estimator_ 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features=18,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [31]:
svc=SVC(C=10, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=4, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=True, random_state=42, shrinking=True, tol=0.001,
    verbose=False)


svc.fit(x_train,y_train)
y_train_pred=svc.predict(x_train)
y_train_prob=svc.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
svc_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
svc_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=svc.predict(x_test)
y_test_prob=svc.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
svc_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
svc_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[599   0]
 [  0 605]]
Accuracy of train: 1.0
AUC of train: 1.0
F1-score train:  1.0



Confusion matrix 
 [[153   1]
 [ 59  89]]
Accuracy of test: 0.8013245033112583
AUC of test: 0.853939978939979
F1-score test:  0.7478991596638654



Classification report: '
'               precision    recall  f1-score   support

           0       0.72      0.99      0.84       154
           1       0.99      0.60      0.75       148

    accuracy                           0.80       302
   macro avg       0.86      0.80      0.79       302
weighted avg       0.85      0.80      0.79       302



# KNN Classifier

In [32]:
from sklearn.neighbors import KNeighborsClassifier

In [51]:
knn=KNeighborsClassifier()

knn.fit(x_train,y_train)
y_pred= knn.predict(x_test)

print("Training Accuracy: ", knn.score(x_train, y_train))
print('Testing Accuarcy: ', knn.score(x_test, y_test))
print("ROC_Score",roc_auc_score(y_test,  y_pred))

# making a confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# making a classification report
cr = classification_report(y_test,  y_pred)
print(cr)

Training Accuracy:  0.8289036544850499
Testing Accuarcy:  0.7450331125827815
ROC_Score 0.7465777465777466
[[103  51]
 [ 26 122]]
              precision    recall  f1-score   support

           0       0.80      0.67      0.73       154
           1       0.71      0.82      0.76       148

    accuracy                           0.75       302
   macro avg       0.75      0.75      0.74       302
weighted avg       0.75      0.75      0.74       302



KNN Hyperparameter Tunning

In [33]:
grid = {'algorithm' :['auto','ball_tree', 'kd_tree', 'brute'],
         'n_jobs':[-1,None], 
         'n_neighbors':list(range(1,41,2)), 
         'p':[1,2],
         'weights':['uniform','distance']
       }

knn=KNeighborsClassifier()
fold=KFold(n_splits=4,random_state=42,shuffle=True)
gs=RandomizedSearchCV(knn,param_distributions=grid,scoring='roc_auc',n_jobs=-1,n_iter=50,cv=fold,random_state=42)
gs.fit(x_train,y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.8730960947618893


In [34]:
gs.best_estimator_

KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=21, p=1,
                     weights='distance')

In [35]:
knn=KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=21, p=1,
                     weights='distance')


knn.fit(x_train,y_train)
y_train_pred=knn.predict(x_train)
y_train_prob=knn.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
knn_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
knn_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))



y_test_pred=knn.predict(x_test)
y_test_prob=knn.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
knn_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
knn_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[599   0]
 [  0 605]]
Accuracy of train: 1.0
AUC of train: 1.0
F1-score train:  1.0



Confusion matrix 
 [[ 38 116]
 [  9 139]]
Accuracy of test: 0.5860927152317881
AUC of test: 0.8469857844857845
F1-score test:  0.6898263027295285



Classification report: '
'               precision    recall  f1-score   support

           0       0.81      0.25      0.38       154
           1       0.55      0.94      0.69       148

    accuracy                           0.59       302
   macro avg       0.68      0.59      0.53       302
weighted avg       0.68      0.59      0.53       302



# Bagging Classifier

In [50]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

Bagged Decision Treee

In [51]:
dt_bag=BaggingClassifier(base_estimator=dt,random_state=42,n_jobs=-1)

dt_bag.fit(x_train,y_train)
y_train_pred=dt_bag.predict(x_train)
y_train_prob=dt_bag.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
dt_bagged_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
dt_bagged_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))


y_test_pred=dt_bag.predict(x_test)
y_test_prob=dt_bag.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
dt_bagged_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
dt_bagged_test_f1_score= f1_score(y_test,y_test_pred)
print("F1-score test: ",f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[518  81]
 [104 501]]
Accuracy of train: 0.8463455149501661
AUC of train: 0.9375846797003269
F1-score train:  0.8441449031171021



Confusion matrix 
 [[124  30]
 [ 42 106]]
Accuracy of test: 0.7615894039735099
AUC of test: 0.8248288873288873
F1-score test:  0.7464788732394366



Classification report: '
'               precision    recall  f1-score   support

           0       0.75      0.81      0.77       154
           1       0.78      0.72      0.75       148

    accuracy                           0.76       302
   macro avg       0.76      0.76      0.76       302
weighted avg       0.76      0.76      0.76       302



Bagged random Forest

In [52]:
rf_bag=BaggingClassifier(base_estimator=rf,random_state=42,n_jobs=-1)
rf_bag.fit(x_train,y_train)
y_train_pred=rf_bag.predict(x_train)
y_train_prob=rf_bag.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
rf_bagged_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
rf_bagged_train_f1_score= f1_score(y_train,y_train_pred)
print("F1-score train: ",f1_score(y_train,y_train_pred))


y_test_pred=rf_bag.predict(x_test)
y_test_prob=rf_bag.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
rf_bagged_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
rf_bagged_test_f1_score= f1_score(y_test,y_test_pred)
print('F1_Score test: ',f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[591   8]
 [ 12 593]]
Accuracy of train: 0.9833887043189369
AUC of train: 0.9989596986713393
F1-score train:  0.9834162520729686



Confusion matrix 
 [[130  24]
 [ 26 122]]
Accuracy of test: 0.8344370860927153
AUC of test: 0.8996797121797121
F1_Score test:  0.8299319727891157



Classification report: '
'               precision    recall  f1-score   support

           0       0.83      0.84      0.84       154
           1       0.84      0.82      0.83       148

    accuracy                           0.83       302
   macro avg       0.83      0.83      0.83       302
weighted avg       0.83      0.83      0.83       302



KNN Bagging

In [53]:
knn_bag=BaggingClassifier(base_estimator=knn,random_state=42)
knn_bag.fit(x_train,y_train)
y_train_pred=knn_bag.predict(x_train)
y_train_prob=knn_bag.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
knn_bagged_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
knn_bagged_train_f1_score= f1_score(y_train,y_train_pred)
print('F1_Score train: ',f1_score(y_train,y_train_pred))

y_test_pred=knn_bag.predict(x_test)
y_test_prob=knn_bag.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
knn_bagged_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
knn_bagged_test_f1_score= f1_score(y_test,y_test_pred)
print('F1_Score test: ',f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[583  16]
 [  0 605]]
Accuracy of train: 0.9867109634551495
AUC of train: 0.9999862029001504
F1_Score train:  0.9869494290375205



Confusion matrix 
 [[ 39 115]
 [  8 140]]
Accuracy of test: 0.5927152317880795
AUC of test: 0.84498946998947
F1_Score test:  0.6947890818858562



Classification report: '
'               precision    recall  f1-score   support

           0       0.83      0.25      0.39       154
           1       0.55      0.95      0.69       148

    accuracy                           0.59       302
   macro avg       0.69      0.60      0.54       302
weighted avg       0.69      0.59      0.54       302



Bagged Logistic Regression

In [40]:
grid = { 'n_jobs':[-1,None], 
         'max_samples':[1,2,3,4,5,6,7],
         'max_features':[1,2,3,4,5,6,7],
        }

lr_bag=BaggingClassifier(base_estimator=lr,random_state=42)
fold=KFold(n_splits=4,random_state=42,shuffle=True)
gs=RandomizedSearchCV(lr_bag,param_distributions=grid,scoring='roc_auc',n_jobs=-1,n_iter=50,cv=fold,random_state=42)
gs.fit(x_train,y_train)

print ('gs.best_score_:', gs.best_score_)

gs.best_score_: 0.7143424686816553


In [41]:
gs.best_estimator_

BaggingClassifier(base_estimator=LogisticRegression(C=1.0,
                                                    class_weight='balanced',
                                                    dual=False,
                                                    fit_intercept=True,
                                                    intercept_scaling=1,
                                                    l1_ratio=None, max_iter=50,
                                                    multi_class='auto',
                                                    n_jobs=-1, penalty='l2',
                                                    random_state=42,
                                                    solver='liblinear',
                                                    tol=0.0001, verbose=0,
                                                    warm_start=False),
                  bootstrap=True, bootstrap_features=False, max_features=6,
                  max_samples=7, n_estimators=10, n_jobs=None, 

In [54]:
lr_bag=BaggingClassifier(base_estimator=lr,max_samples= 7,random_state=42,max_features=6)
lr_bag.fit(x_train,y_train)
y_train_pred=lr_bag.predict(x_train)
y_train_prob=lr_bag.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
lr_bagged_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
lr_bagged_train_f1_score= f1_score(y_train,y_train_pred)
print('F1_Score train: ',f1_score(y_train,y_train_pred))


y_test_pred=lr_bag.predict(x_test)
y_test_prob=lr_bag.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
lr_bagged_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
lr_bagged_test_f1_score= f1_score(y_test,y_test_pred)
print('F1_Score test: ',f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[400 199]
 [150 455]]
Accuracy of train: 0.7101328903654485
AUC of train: 0.7612605030422606
F1_Score train:  0.7227958697378871



Confusion matrix 
 [[107  47]
 [ 53  95]]
Accuracy of test: 0.6688741721854304
AUC of test: 0.7027465777465777
F1_Score test:  0.6551724137931034



Classification report: '
'               precision    recall  f1-score   support

           0       0.67      0.69      0.68       154
           1       0.67      0.64      0.66       148

    accuracy                           0.67       302
   macro avg       0.67      0.67      0.67       302
weighted avg       0.67      0.67      0.67       302



Bagged SVC

In [55]:
svc_bag=BaggingClassifier(base_estimator=svc,random_state=42)
svc_bag.fit(x_train,y_train)
y_train_pred=svc_bag.predict(x_train)
y_train_prob=svc_bag.predict_proba(x_train)[:,1]

print('Confusion matrix','\n',confusion_matrix(y_train,y_train_pred))
print("Accuracy of train:",accuracy_score(y_train,y_train_pred))
svc_bagged_train_score=roc_auc_score(y_train,y_train_prob)
print("AUC of train:",roc_auc_score(y_train,y_train_prob))
svc_bagged_train_f1_score= f1_score(y_train,y_train_pred)
print('F1_Score train: ',f1_score(y_train,y_train_pred))


y_test_pred=svc_bag.predict(x_test)
y_test_prob=svc_bag.predict_proba(x_test)[:,1]

print('\n'*2)
print('Confusion matrix','\n',confusion_matrix(y_test,y_test_pred))
print("Accuracy of test:",accuracy_score(y_test,y_test_pred))
svc_bagged_test_score=roc_auc_score(y_test,y_test_prob)
print("AUC of test:",roc_auc_score(y_test,y_test_prob))
svc_bagged_test_f1_score= f1_score(y_test,y_test_pred)
print('F1_Score test: ',f1_score(y_test,y_test_pred))
print('\n'*2)
print("Classification report: '\n'",classification_report(y_test,y_test_pred))

Confusion matrix 
 [[599   0]
 [  3 602]]
Accuracy of train: 0.9975083056478405
AUC of train: 0.9999668869603608
F1_Score train:  0.9975144987572493



Confusion matrix 
 [[151   3]
 [ 57  91]]
Accuracy of test: 0.8013245033112583
AUC of test: 0.853939978939979
F1_Score test:  0.7520661157024794



Classification report: '
'               precision    recall  f1-score   support

           0       0.73      0.98      0.83       154
           1       0.97      0.61      0.75       148

    accuracy                           0.80       302
   macro avg       0.85      0.80      0.79       302
weighted avg       0.84      0.80      0.79       302



In [45]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree ', 'Random Forest','Bagged Random Forest','Bagged Decision Tree', 
              'Support Vector Classifier','Bagged SVC','KNN','Bagged KNN','Bagged LR'
              ],
              
    'Train AUC Score': [lr_train_score, dt_train_score, rf_train_score, rf_bagged_train_score, dt_bagged_train_score,
              svc_train_score, svc_bagged_train_score, knn_train_score, knn_bagged_train_score, lr_bagged_train_score
              ],
    
    'Test AUC Score': [lr_test_score, dt_test_score, rf_test_score, rf_bagged_test_score, dt_bagged_test_score,
              svc_test_score, svc_bagged_test_score, knn_test_score, knn_bagged_test_score, lr_bagged_test_score
              ],
    
    'Train F1_Score': [lr_train_f1_score, dt_train_f1_score, rf_train_f1_score, rf_bagged_train_f1_score, dt_bagged_train_f1_score,
              svc_train_f1_score, svc_bagged_train_f1_score, knn_train_f1_score, knn_bagged_train_f1_score, lr_bagged_train_f1_score
              ],
    
    'Test F1_Score': [lr_test_f1_score, dt_test_f1_score, rf_test_f1_score, rf_bagged_test_f1_score, dt_bagged_test_f1_score,
              svc_test_f1_score, svc_bagged_test_f1_score, knn_test_f1_score, knn_bagged_test_f1_score, lr_bagged_test_f1_score
              ]
            })

result = results.sort_values(by='Test AUC Score', ascending=False)
result = result.set_index('Model')
display(result)

Unnamed: 0_level_0,Train AUC Score,Test AUC Score,Train F1_Score,Test F1_Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest,1.0,0.901873,1.0,0.827586
Bagged Random Forest,0.99896,0.89968,0.983416,0.829932
Support Vector Classifier,1.0,0.85394,1.0,0.747899
Bagged SVC,0.999967,0.85394,0.997514,0.752066
KNN,1.0,0.846986,1.0,0.689826
Bagged KNN,0.999986,0.844989,0.986949,0.694789
Bagged Decision Tree,0.937585,0.824829,0.844145,0.746479
Logistic Regression,0.878986,0.815856,0.78729,0.742647
Decision Tree,0.898595,0.798482,0.832317,0.760736
Bagged LR,0.761261,0.702747,0.722796,0.655172
