# Home Credit Baseline Model - Logistic Regression

In [1]:
import logreg_report_v2

In [2]:
!pip install catboost
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
# standard library imports
import gc
import os
import random
import sys
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# third party imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import polars as pl
import pyarrow.parquet as pq
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    log_loss,
    SCORERS, 
    get_scorer,
    classification_report, 
    ConfusionMatrixDisplay, 
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb

## imputation on 100k rows train df

In [4]:
data = pd.read_parquet('trian_sample_no_imputation.parquet')

In [5]:
data

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,pmts_month_706T,pmts_overdue_1140A,pmts_overdue_1152A,pmts_year_1139T,pmts_year_507T,subjectroles_name_541M,subjectroles_name_838M,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
0,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2018.0,,a55475b1,ab3c25cf,,,
1,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2018.0,,a55475b1,a55475b1,,,
2,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,,,,,,,
3,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,,,,,,,
4,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2017.0,,a55475b1,daf49a8a,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941052,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,2.0,,,,2010.0,ab3c25cf,a55475b1,,,
941053,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,3.0,,,,2010.0,a55475b1,a55475b1,,,
941054,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,4.0,,0.0,,2010.0,a55475b1,a55475b1,,,
941055,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,2.0,,,2019.0,2007.0,ab3c25cf,ab3c25cf,,,


In [6]:
top20_cols = [
    'actualdpd_943P',
    'purposeofcred_426M',
    'amount_1115A',
    'credacc_actualbalance_314A',
    'actualdpdtolerance_344P',
    'annuity_780A',
    'numinstpaidearly_338L',
    'empl_employedtotal_800L',
    'empl_industry_691L',
    'maininc_215A',
    'debtoverdue_47A',
    'totalsettled_863A',
    'totaloutstanddebtvalue_39A',
    'avgdbddpdlast24m_3658932P',
    'avgdbddpdlast3m_4187120P',
    'clientscnt12m_3712952L',
    'applicationscnt_1086L',
    'applicationcnt_361L',
    'applications30d_658L',
    'WEEK_NUM',
    'target'
]

In [7]:
train_df = data[top20_cols]
train_df.head()

Unnamed: 0,actualdpd_943P,purposeofcred_426M,amount_1115A,credacc_actualbalance_314A,actualdpdtolerance_344P,annuity_780A,numinstpaidearly_338L,empl_employedtotal_800L,empl_industry_691L,maininc_215A,...,totalsettled_863A,totaloutstanddebtvalue_39A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,clientscnt12m_3712952L,applicationscnt_1086L,applicationcnt_361L,applications30d_658L,WEEK_NUM,target
0,0.0,96a8fdfe,,,,6064.6,,,,,...,0.0,305398.22,,,0.0,8.0,0.0,0.0,16,0
1,0.0,96a8fdfe,,,,6064.6,,,,,...,0.0,305398.22,,,0.0,8.0,0.0,0.0,16,0
2,0.0,a55475b1,,,,6064.6,,,,,...,0.0,,,,0.0,8.0,0.0,0.0,16,0
3,0.0,a55475b1,,,,6064.6,,,,,...,0.0,,,,0.0,8.0,0.0,0.0,16,0
4,0.0,96a8fdfe,,,,6064.6,,,,,...,0.0,3932.352,,,0.0,8.0,0.0,0.0,16,0


In [8]:
train_df.shape

(941057, 21)

In [9]:
# Assuming train_df is already loaded
null_df = train_df[train_df.columns[train_df.isnull().any()]]

# Impute NaN values with 0 in numerical columns
zero_impute_columns = list(null_df.select_dtypes(exclude="object").columns)
for column in zero_impute_columns:
    train_df[column] = train_df[column].fillna(0)

# Impute NaN values with the mode in categorical columns
mode_impute_columns = list(null_df.select_dtypes(include="object").columns)

for column in mode_impute_columns:
    mode_value = train_df[column].mode()[0]
    train_df[column] = train_df[column].fillna(mode_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[column] = train_df[column].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[column] = train_df[column].fillna(mode_value)


In [10]:
train_df.head()

Unnamed: 0,actualdpd_943P,purposeofcred_426M,amount_1115A,credacc_actualbalance_314A,actualdpdtolerance_344P,annuity_780A,numinstpaidearly_338L,empl_employedtotal_800L,empl_industry_691L,maininc_215A,...,totalsettled_863A,totaloutstanddebtvalue_39A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,clientscnt12m_3712952L,applicationscnt_1086L,applicationcnt_361L,applications30d_658L,WEEK_NUM,target
0,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,305398.22,0.0,0.0,0.0,8.0,0.0,0.0,16,0
1,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,305398.22,0.0,0.0,0.0,8.0,0.0,0.0,16,0
2,0.0,a55475b1,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,16,0
3,0.0,a55475b1,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,16,0
4,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,3932.352,0.0,0.0,0.0,8.0,0.0,0.0,16,0


In [11]:
train_df.shape

(941057, 21)

## logistic regression model on 100k rows train df

In [12]:
X, y = train_df.drop(columns=["target"]), train_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer,  X_train.select_dtypes(exclude="object").columns),
        ('cat', categorical_transformer,  X_train.select_dtypes(include="object").columns)])

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", LogisticRegression(random_state=0, max_iter=1000)),
    ]
)

pipe.fit(X_train, y_train)

# y_pred = pipe.predict(X_train)
y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

In [13]:
logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

Log Loss: 0.1467831087602166
AUC: 0.6492962017383511
Gini Stability Score: 0.23487120856987692


{'AUC': 0.6492962017383511,
 'Gini Stability': 0.23487120856987692,
 'log_loss': 0.1467831087602166}

In [14]:
logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])

Log Loss: 0.14652831364924607
AUC: 0.6453191195966845
Gini Stability Score: 0.20642652798942238


{'AUC': 0.6453191195966845,
 'Gini Stability': 0.20642652798942238,
 'log_loss': 0.14652831364924607}

### LightGBM

In [15]:
# LightGBM model
lgb_model = lgb.LGBMClassifier(random_state=0)

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", lgb_model),
    ]
)

pipe.fit(X_train, y_train)

y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])

[LightGBM] [Info] Number of positive: 26238, number of negative: 726607
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 752845, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034852 -> initscore=-3.321177
[LightGBM] [Info] Start training from score -3.321177
Log Loss: 0.08890141297571764
AUC: 0.9215393591303105
Gini Stability Score: 0.7752015508754609
Log Loss: 0.09033382715016416
AUC: 0.9147646780894713
Gini Stability Score: 0.7527116749052835


{'AUC': 0.9147646780894713,
 'Gini Stability': 0.7527116749052835,
 'log_loss': 0.09033382715016416}

### CatBoost

In [16]:
# CatBoost model
catboost_model = CatBoostClassifier(random_state=0)

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", catboost_model),
    ]
)

pipe.fit(X_train, y_train)

y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])

Learning rate set to 0.174296
0:	learn: 0.4577616	total: 111ms	remaining: 1m 50s
1:	learn: 0.3279595	total: 156ms	remaining: 1m 17s
2:	learn: 0.2545917	total: 198ms	remaining: 1m 5s
3:	learn: 0.2116678	total: 239ms	remaining: 59.5s
4:	learn: 0.1869139	total: 275ms	remaining: 54.7s
5:	learn: 0.1705000	total: 326ms	remaining: 54.1s
6:	learn: 0.1611695	total: 371ms	remaining: 52.6s
7:	learn: 0.1543801	total: 415ms	remaining: 51.5s
8:	learn: 0.1493468	total: 460ms	remaining: 50.7s
9:	learn: 0.1465033	total: 500ms	remaining: 49.5s
10:	learn: 0.1446144	total: 541ms	remaining: 48.6s
11:	learn: 0.1431846	total: 583ms	remaining: 48s
12:	learn: 0.1421097	total: 622ms	remaining: 47.2s
13:	learn: 0.1412407	total: 668ms	remaining: 47s
14:	learn: 0.1406102	total: 709ms	remaining: 46.5s
15:	learn: 0.1400901	total: 750ms	remaining: 46.2s
16:	learn: 0.1396553	total: 792ms	remaining: 45.8s
17:	learn: 0.1390987	total: 837ms	remaining: 45.6s
18:	learn: 0.1386732	total: 880ms	remaining: 45.4s
19:	learn: 0.

{'AUC': 0.9644951344201025,
 'Gini Stability': 0.858088531415678,
 'log_loss': 0.06541674755735993}

### XGBoost

In [17]:
# XGBoost model
xgb_model = xgb.XGBClassifier(random_state=0)

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", xgb_model),
    ]
)

pipe.fit(X_train, y_train)

y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])

Log Loss: 0.0757857049983682
AUC: 0.9507813570366641
Gini Stability Score: 0.8372565082148934
Log Loss: 0.07776078874066843
AUC: 0.9451837990980816
Gini Stability Score: 0.8165296591887462


{'AUC': 0.9451837990980816,
 'Gini Stability': 0.8165296591887462,
 'log_loss': 0.07776078874066843}