- In this notebook, the independent variables are scaled using `MinMaxScaler()`.
- The evaluation metric is the area under the ROC curve (ROC AUC); higher is better.

In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import sklearn.metrics
from sklearn.model_selection import train_test_split, GridSearchCV
import Build_Evaluate_Model as bem
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score


ModuleNotFoundError: No module named 'Build_Evaluate_Model'


### OBTAINING TRAIN AND TEST SET (MINMAXSCALER)

In [2]:
# Fetch the train/test split from the helper module, asking for MinMax scaling.
# The helper returns four objects, unpacked in the order: X_train, y_train, X_test, y_test.
(X_train, y_train,
 X_test, y_test) = bem.get_xy_traintest(scaler='MinMax', scale=True)

### BASIC LOGISTIC REGRESSION MODEL

In [3]:
# Build the helper module's baseline logistic regression and keep the returned
# score table; later cells pass this table back in as `score_df`.
log_score_minmax = bem.build_basic_model(
    X_train, y_train, X_test, y_test,
    classifier='Logistic Regression',
)

In [4]:
# ROC AUC scores of the basic logistic regression: the table holds the train
# score, the test score and their difference (see the column names in the output).
log_score_minmax

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.18021640744759232, 0.15789543226653285, 0.3...","[0.2860633049858224, 0.343463978140098, 0.2603...",0.777212,0.754446,0.022766


### CLASS WEIGHT PARAMETER

In [5]:
# Manual class weights: errors on class 1 are weighted 77 against 22 for class 0.
w = {0: 22, 1: 77}

# L2-penalised logistic regression using the weights above, seeded for reproducibility.
log_cw = LogisticRegression(
    penalty='l2',
    class_weight=w,
    random_state=42,
    max_iter=1000,
)

In [6]:
# Evaluate the class-weighted model and add it to the running score table.
# NOTE(review): the table is both input (`score_df`) and output, so re-running
# this cell presumably adds a duplicate 'log_cw' row — confirm in bem.build_model.
log_score_minmax = bem.build_model(
    X_train, y_train, X_test, y_test,
    classifier=log_cw,
    classifier_name='log_cw',
    score_df=log_score_minmax,
)

In [7]:
# Show the score table after adding the class-weighted model ('log_cw').
log_score_minmax

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.18021640744759232, 0.15789543226653285, 0.3...","[0.2860633049858224, 0.343463978140098, 0.2603...",0.777212,0.754446,0.022766
1,log_cw,"LogisticRegression(class_weight={0: 22, 1: 77}...","[0.4365481227588432, 0.3972976900648298, 0.707...","[0.6967057592838225, 0.6559601850205833, 0.544...",0.777627,0.755056,0.022571


### HYPERPARAMETER TUNING FOR LOGISTIC REGRESSION

In [8]:
# Candidate class-weight dictionaries for the grid search, listed as
# (weight for class 0, weight for class 1) pairs. The float/int spelling of
# each weight is kept exactly as originally written.
w = [
    {0: weight_0, 1: weight_1}
    for weight_0, weight_1 in (
        (1.0, 10), (1, 100), (1, 150), (1, 1),
        (1.0, 200), (1.0, 500), (100, 1000), (22, 77),
        (30, 70), (40, 60), (20, 80), (10, 90),
    )
]

# Candidate values for the C parameter, spanning 0.001 to 1000 in powers of ten.
c_values = [
    0.001, 0.01, 0.1,
    1, 10, 100, 1000,
]


In [9]:
# Search space: every class-weight candidate crossed with every C value,
# with the solver fixed to liblinear and the penalty fixed to L1.
hyperparam_grid = dict(
    class_weight=w,
    solver=['liblinear'],
    penalty=['l1'],
    C=c_values,
)

In [10]:
# Base estimator whose hyperparameters the grid search varies.
log_search_wt = LogisticRegression(random_state=42, max_iter=500)

# Exhaustive search over `hyperparam_grid`, scored by ROC AUC with 3-fold
# cross-validation; n_jobs=-1 runs the fits in parallel.
grid = GridSearchCV(
    estimator=log_search_wt,
    param_grid=hyperparam_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')

Best score: 0.7706529891354151 with param: {'C': 0.01, 'class_weight': {0: 100, 1: 1000}, 'penalty': 'l1', 'solver': 'liblinear'}


- The grid search above achieved its best cross-validated ROC AUC (≈ 0.7707) with the parameters: {'C': 0.01, 'class_weight': {0: 100, 1: 1000}, 'penalty': 'l1', 'solver': 'liblinear'}

In [12]:
# Rebuild the best configuration reported by the grid search.
# max_iter and random_state are the values of the estimator that was searched
# (max_iter=500, random_state=42): liblinear uses the seed, so leaving it out
# made this refit non-reproducible and different from the configuration the
# search actually scored.
log_best = LogisticRegression(
    C=0.01,
    class_weight={0: 100, 1: 1000},
    penalty='l1',
    solver='liblinear',
    max_iter=500,
    random_state=42,
)

In [13]:
# Evaluate the tuned model and add it to the running score table.
# NOTE(review): the table is both input (`score_df`) and output, so re-running
# this cell presumably adds a duplicate 'log_best' row — confirm in bem.build_model.
log_score_minmax = bem.build_model(
    X_train, y_train, X_test, y_test,
    classifier=log_best,
    classifier_name='log_best',
    score_df=log_score_minmax,
)

In [14]:
# Show the score table after adding the tuned model ('log_best').
log_score_minmax

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.18021640744759232, 0.15789543226653285, 0.3...","[0.2860633049858224, 0.343463978140098, 0.2603...",0.777212,0.754446,0.022766
1,log_cw,"LogisticRegression(class_weight={0: 22, 1: 77}...","[0.4365481227588432, 0.3972976900648298, 0.707...","[0.6967057592838225, 0.6559601850205833, 0.544...",0.777627,0.755056,0.022571
2,log_best,"LogisticRegression(C=0.01, class_weight={0: 10...","[0.6908074590197232, 0.6535389050694503, 0.870...","[0.8668631950877487, 0.8479692342136933, 0.772...",0.777637,0.755437,0.022199


### CHANGING C PARAMETER

`C` parameter: the inverse of the regularization strength. The smaller the value, the stronger the regularization.

In [15]:
# Try stronger L1 regularisation: C values at or below the smallest value in the
# earlier grid (0.001), with a fixed 30/70 class weighting.
C = [0.001, 0.0002, 0.0005, 0.0001]

for c in C:
    # Each run is recorded under a name that embeds its C value, e.g. 'log_c_0.001'.
    classifier_name = f'log_c_{c}'
    # random_state/max_iter match the other models in this notebook; liblinear
    # uses the seed, so without it the recorded scores are not reproducible.
    log = LogisticRegression(
        C=c,
        class_weight={0: 30, 1: 70},
        penalty='l1',
        solver='liblinear',
        max_iter=500,
        random_state=42,
    )
    # NOTE(review): the score table is both input and output, so re-running this
    # cell presumably adds duplicate rows — confirm in bem.build_model.
    log_score_minmax = bem.build_model(
        X_train, y_train, X_test, y_test,
        classifier=log,
        classifier_name=classifier_name,
        score_df=log_score_minmax,
    )

### LOGISTIC REGRESSION ENSEMBLE

In [16]:
# Add the helper module's ensemble entry to the running score table.
# NOTE(review): how the ensemble is formed from the rows already in `score_df`
# is decided inside bem.build_ensemble — see that helper for details.
log_score_minmax = bem.build_ensemble(
    X_train, y_train, X_test, y_test,
    classifier_name='log',
    score_df=log_score_minmax,
)

In [17]:
# Rank every model by its DIFFERENCE column, smallest first.
log_score_minmax.sort_values(by='DIFFERENCE', ascending=True)

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
6,log_c_0.0001,"LogisticRegression(C=0.0001, class_weight={0: ...","[0.40931312696431155, 0.39662822150709814, 0.5...","[0.41530278411762234, 0.5162386017098676, 0.47...",0.768658,0.756304,0.012355
4,log_c_0.0002,"LogisticRegression(C=0.0002, class_weight={0: ...","[0.3649483120425892, 0.3498199553402743, 0.501...","[0.37633244719602715, 0.5409804973485722, 0.45...",0.772187,0.755137,0.01705
5,log_c_0.0005,"LogisticRegression(C=0.0005, class_weight={0: ...","[0.3361783533505103, 0.32001903654777947, 0.50...","[0.31913030685936794, 0.5498732836677761, 0.44...",0.773981,0.754599,0.019382
7,log_ensemble,,"[0.3933712541285023, 0.36897402123730494, 0.56...","[0.46506113236011887, 0.5719203015698848, 0.48...",0.77658,0.756262,0.020318
3,log_c_0.001,"LogisticRegression(C=0.001, class_weight={0: 3...","[0.3355869973159465, 0.3076189078651693, 0.508...","[0.2950301289904209, 0.5489563308886034, 0.448...",0.775479,0.754911,0.020567
2,log_best,"LogisticRegression(C=0.01, class_weight={0: 10...","[0.6908074590197232, 0.6535389050694503, 0.870...","[0.8668631950877487, 0.8479692342136933, 0.772...",0.777637,0.755437,0.022199
1,log_cw,"LogisticRegression(class_weight={0: 22, 1: 77}...","[0.4365481227588432, 0.3972976900648298, 0.707...","[0.6967057592838225, 0.6559601850205833, 0.544...",0.777627,0.755056,0.022571
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.18021640744759232, 0.15789543226653285, 0.3...","[0.2860633049858224, 0.343463978140098, 0.2603...",0.777212,0.754446,0.022766


In [None]:
# Best logistic model: the row with the smallest DIFFERENCE between train and test score

In [18]:
# Pick the top row after ranking by DIFFERENCE (smallest first).
# NOTE(review): this selects on the train-test gap, while the notebook states
# ROC AUC (higher is better) as its metric — confirm this is the intended criterion.
(
    log_score_minmax
    .sort_values(by='DIFFERENCE', ascending=True)
    .iloc[0]
)

MODEL                                                log_c_0.0001
PARAMS          LogisticRegression(C=0.0001, class_weight={0: ...
y_train_prob    [0.40931312696431155, 0.39662822150709814, 0.5...
y_test_prob     [0.41530278411762234, 0.5162386017098676, 0.47...
TRAIN SCORE                                              0.768658
TEST SCORE                                               0.756304
DIFFERENCE                                              0.0123545
Name: 6, dtype: object