In [1]:
import os

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [4]:
import xgboost

In [5]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [6]:
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression

In [7]:
from sklearn.compose import ColumnTransformer

In [8]:
from sklearn.metrics import roc_auc_score

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.tree import DecisionTreeClassifier

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import brier_score_loss

In [11]:
import weight_of_evidence

In [177]:
import importlib
importlib.reload(weight_of_evidence)

<module 'weight_of_evidence' from '/Users/olivercairns/Desktop/code/WeightOfEvidenceDemo/weight_of_evidence.py'>

In [188]:
data = pd.read_csv('~/Downloads/application_train.csv')

In [189]:
EXCLUDE_COLS = ['SK_ID_CURR','TARGET', 'CODE_GENDER',"ORGANIZATION_TYPE"]

In [190]:
CATERORICAL_COLS = data.drop(columns=EXCLUDE_COLS).select_dtypes('O').columns

In [191]:
NUMERIC_COLS = data.drop(columns=EXCLUDE_COLS).select_dtypes('int64').columns

In [192]:
data[CATERORICAL_COLS] = data[CATERORICAL_COLS].fillna('MISSING')

In [193]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [194]:
ohe_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUMERIC_COLS),
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATERORICAL_COLS),
    ]
)

In [199]:
log_reg_cv = LogisticRegressionCV(penalty='l2', solver='lbfgs',max_iter=1e6,cv=3)

In [200]:
log_reg = LogisticRegression(solver='lbfgs',max_iter=1e6)

In [201]:
ohe_logit = Pipeline(steps=[
    ('preprocessor', ohe_preprocessor),
    ('log_reg_classifier',log_reg ),
])

In [202]:
ohe_logit_cv = Pipeline(steps=[
    ('preprocessor', ohe_preprocessor),
    ('log_reg_classifier',log_reg_cv ),
])

In [203]:
ohe_xgb = Pipeline(steps=[('preprocessor', ohe_preprocessor),
                      ('xgb_classifier', xgboost.XGBClassifier())
                         ]
                  )


In [204]:
ohe_lgb = Pipeline(steps=[('preprocessor', ohe_preprocessor),
                      ('lgb_classifier', lgb.LGBMClassifier())
                         ]
                  )


In [205]:
scorecardpy_logit = Pipeline(
    steps=[
        ("scorecardpy_calibrate", weight_of_evidence.ScorecardPyEncoder(filter_flag=False)),
        ('simple_imputer',SimpleImputer()),
        ("standard_scale", StandardScaler()),
         ('log_reg_classifier',log_reg ),
    ]
)


In [208]:
combined_results = pd.DataFrame()

In [209]:
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1234)

In [210]:
X = data.drop(columns=EXCLUDE_COLS)
y = data.TARGET

In [211]:
def get_cv_results(X, y, cv, clf, model_name):
    results = {'brier':[],'auc':[],'fold':[]}
    fold = 1
    for train, test in cv.split(X,y):
        print(f'fold {fold}')
        prediction = clf.fit(X.iloc[train],y.iloc[train]).predict_proba(X.iloc[test])
        brier = brier_score_loss(y_true=y.iloc[test], y_prob=prediction[:,1])
        auc = roc_auc_score(y_true=y.iloc[test], y_score=prediction[:,1])
        results['brier'].append(brier)
        results['auc'].append(auc)
        results['fold'].append(fold)
        print(f'auc {auc}')
        fold +=1
    results_df=pd.DataFrame(results)
    results_df['model'] = model_name
    return results_df
    

In [212]:
combined_results = get_cv_results(X,y,cv,ohe_logit,'ohe logit' )

fold 1
auc 0.6611380729879756
fold 2
auc 0.665069014060448
fold 3
auc 0.6556058566642657
fold 4
auc 0.660426676353663
fold 5
auc 0.6671906486712682


In [213]:
#combined_results = get_cv_results(X,y,cv,ohe_logit_cv,'ohe logit cv' )

In [214]:
combined_results.groupby('model')[['auc','brier']].mean()

Unnamed: 0_level_0,auc,brier
model,Unnamed: 1_level_1,Unnamed: 2_level_1
ohe logit,0.661886,0.072209


In [215]:
#combined_results = combined_results.append(get_cv_results(X,y,cv,ohe_xgb,'ohe xgb' ))

In [216]:
combined_results = combined_results.append(get_cv_results(X,y,cv,ohe_lgb,'ohe lgbm' ))

fold 1
auc 0.6654692828931603
fold 2
auc 0.6663359632538226
fold 3
auc 0.6549244093125763
fold 4
auc 0.6624041068133852
fold 5
auc 0.6680383142937649


In [217]:
combined_results = combined_results.append(get_cv_results(X,y,cv,scorecardpy_logit,'scorecardpy logit' ))

fold 1
[INFO] creating woe binning ...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_i

Binning on 246008 rows and 119 columns in 00:01:14
[INFO] converting into woe values ...
Woe transformating on 246008 rows and 118 columns in 00:00:50
[INFO] converting into woe values ...
Woe transformating on 61503 rows and 118 columns in 00:00:12
auc 0.733649330733793
fold 2
[INFO] creating woe binning ...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_i

Binning on 246009 rows and 119 columns in 00:01:15
[INFO] converting into woe values ...
Woe transformating on 246009 rows and 118 columns in 00:00:49
[INFO] converting into woe values ...
Woe transformating on 61502 rows and 118 columns in 00:00:12
auc 0.7341535788280847
fold 3
[INFO] creating woe binning ...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_i

Binning on 246009 rows and 119 columns in 00:01:16
[INFO] converting into woe values ...
Woe transformating on 246009 rows and 118 columns in 00:00:51
[INFO] converting into woe values ...
Woe transformating on 61502 rows and 118 columns in 00:00:12
auc 0.7391203553907901
fold 4
[INFO] creating woe binning ...


 (ColumnNames: FLAG_MOBIL)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, ke

Binning on 246009 rows and 118 columns in 00:01:16
[INFO] converting into woe values ...
Woe transformating on 246009 rows and 117 columns in 00:00:50
[INFO] converting into woe values ...
Woe transformating on 61502 rows and 117 columns in 00:00:12
auc 0.7378976214651187
fold 5
[INFO] creating woe binning ...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_index()\
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  binning = pd.concat(bin_list, keys=bin_list.keys()).reset_i

Binning on 246009 rows and 119 columns in 00:01:22
[INFO] converting into woe values ...
Woe transformating on 246009 rows and 118 columns in 00:00:50
[INFO] converting into woe values ...
Woe transformating on 61502 rows and 118 columns in 00:00:12
auc 0.7374126214986947


In [219]:
combined_results.groupby('model')[['brier','auc']].mean()

Unnamed: 0_level_0,brier,auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1
ohe lgbm,0.072127,0.663434
ohe logit,0.072209,0.661886
scorecardpy logit,0.069228,0.736447


In [220]:
woebin_logit = Pipeline(
        steps=[
            ("tree_bin", weight_of_evidence.TreeBinner()),
            ("woe_scale", weight_of_evidence.WoeScaler()),
            ("standard_scale", StandardScaler()),
            ('log_reg_classifier',log_reg ),
        ]
    )


In [221]:
combined_results = combined_results.append(get_cv_results(X,
                                                          y,
                                                          cv,
                                                          woebin_logit,
                                                          'woe regression' ))

fold 1
auc 0.7316253678113342
fold 2
auc 0.7337678196319173
fold 3
auc 0.7391306330403349
fold 4
auc 0.7394971140734135
fold 5
auc 0.7364482021336152


In [222]:
combined_results.groupby('model')[['brier','auc']].mean()

Unnamed: 0_level_0,brier,auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1
ohe lgbm,0.072127,0.663434
ohe logit,0.072209,0.661886
scorecardpy logit,0.069228,0.736447
woe regression,0.069085,0.736094


In [None]:
sns.scatterplot(data=results_df,x='model',y='brier',hue='model')