# Home Credit Baseline Model - Logistic Regression

In [19]:
import logreg_report_v2

In [20]:
!pip install catboost
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [21]:
# standard library imports
import gc
import os
import random
import sys
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# third party imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import polars as pl
import pyarrow.parquet as pq
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    log_loss,
    SCORERS, 
    get_scorer,
    classification_report, 
    ConfusionMatrixDisplay, 
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb

## imputation on 100k rows train df

In [22]:
data = pd.read_parquet('trian_sample_no_imputation.parquet')

In [23]:
data

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,pmts_month_706T,pmts_overdue_1140A,pmts_overdue_1152A,pmts_year_1139T,pmts_year_507T,subjectroles_name_541M,subjectroles_name_838M,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
0,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2018.0,,a55475b1,ab3c25cf,,,
1,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2018.0,,a55475b1,a55475b1,,,
2,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,,,,,,,
3,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,,,,,,,
4,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2017.0,,a55475b1,daf49a8a,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941052,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,2.0,,,,2010.0,ab3c25cf,a55475b1,,,
941053,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,3.0,,,,2010.0,a55475b1,a55475b1,,,
941054,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,4.0,,0.0,,2010.0,a55475b1,a55475b1,,,
941055,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,2.0,,,2019.0,2007.0,ab3c25cf,ab3c25cf,,,


In [24]:
top20_cols = [
    'case_id',
    'actualdpd_943P',
    'purposeofcred_426M',
    'amount_1115A',
    'credacc_actualbalance_314A',
    'actualdpdtolerance_344P',
    'annuity_780A',
    'numinstpaidearly_338L',
    'empl_employedtotal_800L',
    'empl_industry_691L',
    'maininc_215A',
    'debtoverdue_47A',
    'totalsettled_863A',
    'totaloutstanddebtvalue_39A',
    'avgdbddpdlast24m_3658932P',
    'avgdbddpdlast3m_4187120P',
    'clientscnt12m_3712952L',
    'applicationscnt_1086L',
    'applicationcnt_361L',
    'applications30d_658L',
    'WEEK_NUM',
    'target'
]

In [25]:
train_df = data[top20_cols]
train_df.head()

Unnamed: 0,case_id,actualdpd_943P,purposeofcred_426M,amount_1115A,credacc_actualbalance_314A,actualdpdtolerance_344P,annuity_780A,numinstpaidearly_338L,empl_employedtotal_800L,empl_industry_691L,...,totalsettled_863A,totaloutstanddebtvalue_39A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,clientscnt12m_3712952L,applicationscnt_1086L,applicationcnt_361L,applications30d_658L,WEEK_NUM,target
0,12866,0.0,96a8fdfe,,,,6064.6,,,,...,0.0,305398.22,,,0.0,8.0,0.0,0.0,16,0
1,12866,0.0,96a8fdfe,,,,6064.6,,,,...,0.0,305398.22,,,0.0,8.0,0.0,0.0,16,0
2,12866,0.0,a55475b1,,,,6064.6,,,,...,0.0,,,,0.0,8.0,0.0,0.0,16,0
3,12866,0.0,a55475b1,,,,6064.6,,,,...,0.0,,,,0.0,8.0,0.0,0.0,16,0
4,12866,0.0,96a8fdfe,,,,6064.6,,,,...,0.0,3932.352,,,0.0,8.0,0.0,0.0,16,0


In [26]:
train_df.shape

(941057, 22)

In [27]:
# Assuming train_df is already loaded
null_df = train_df[train_df.columns[train_df.isnull().any()]]

# Impute NaN values with 0 in numerical columns
zero_impute_columns = list(null_df.select_dtypes(exclude="object").columns)
for column in zero_impute_columns:
    train_df[column] = train_df[column].fillna(0)

# Impute NaN values with the mode in categorical columns
mode_impute_columns = list(null_df.select_dtypes(include="object").columns)

for column in mode_impute_columns:
    mode_value = train_df[column].mode()[0]
    train_df[column] = train_df[column].fillna(mode_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[column] = train_df[column].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[column] = train_df[column].fillna(mode_value)


In [28]:
train_df.head()

Unnamed: 0,case_id,actualdpd_943P,purposeofcred_426M,amount_1115A,credacc_actualbalance_314A,actualdpdtolerance_344P,annuity_780A,numinstpaidearly_338L,empl_employedtotal_800L,empl_industry_691L,...,totalsettled_863A,totaloutstanddebtvalue_39A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,clientscnt12m_3712952L,applicationscnt_1086L,applicationcnt_361L,applications30d_658L,WEEK_NUM,target
0,12866,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,...,0.0,305398.22,0.0,0.0,0.0,8.0,0.0,0.0,16,0
1,12866,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,...,0.0,305398.22,0.0,0.0,0.0,8.0,0.0,0.0,16,0
2,12866,0.0,a55475b1,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,16,0
3,12866,0.0,a55475b1,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,16,0
4,12866,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,...,0.0,3932.352,0.0,0.0,0.0,8.0,0.0,0.0,16,0


In [29]:
train_df.shape

(941057, 22)

## logistic regression model on 100k rows train df

In [33]:
X, y = train_df.drop(columns=["target"]), train_df["target"]

# Splitting by 'case_id'
case_ids = train_df['case_id'].unique()
train_case_ids, test_case_ids = train_test_split(case_ids, test_size=0.2, random_state=42)

X_train = train_df[train_df['case_id'].isin(train_case_ids)].drop(columns=["target"])
y_train = train_df[train_df['case_id'].isin(train_case_ids)]["target"]
X_test = train_df[train_df['case_id'].isin(test_case_ids)].drop(columns=["target"])
y_test = train_df[train_df['case_id'].isin(test_case_ids)]["target"]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer,  X_train.select_dtypes(exclude="object").columns),
        ('cat', categorical_transformer,  X_train.select_dtypes(include="object").columns)])

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", LogisticRegression(random_state=0, max_iter=1000)),
    ]
)

pipe.fit(X_train, y_train)

y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])


Log Loss: 0.1458044682073487
AUC: 0.6546474559842228
Gini Stability Score: 0.24286156848759943
Log Loss: 0.1503287381770734
AUC: 0.6297747330541386
Gini Stability Score: 0.10314143668137307


{'AUC': 0.6297747330541386,
 'Gini Stability': 0.10314143668137307,
 'log_loss': 0.1503287381770734}

### LightGBM

In [34]:
# LightGBM model
lgb_model = lgb.LGBMClassifier(random_state=0)

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", lgb_model),
    ]
)

pipe.fit(X_train, y_train)

y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])

[LightGBM] [Info] Number of positive: 26135, number of negative: 727712
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 753847, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034669 -> initscore=-3.326630
[LightGBM] [Info] Start training from score -3.326630
Log Loss: 0.08153854821595083
AUC: 0.9370424492896514
Gini Stability Score: 0.8035397495913719
Log Loss: 0.15655465890362613
AUC: 0.6719436886900731
Gini Stability Score: 0.0999257986667911


{'AUC': 0.6719436886900731,
 'Gini Stability': 0.0999257986667911,
 'log_loss': 0.15655465890362613}

### CatBoost

In [35]:
# CatBoost model
catboost_model = CatBoostClassifier(random_state=0)

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", catboost_model),
    ]
)

pipe.fit(X_train, y_train)

y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])

Learning rate set to 0.174395
0:	learn: 0.4613113	total: 59.3ms	remaining: 59.3s
1:	learn: 0.3296515	total: 107ms	remaining: 53.2s
2:	learn: 0.2554036	total: 151ms	remaining: 50.1s
3:	learn: 0.2111062	total: 195ms	remaining: 48.5s
4:	learn: 0.1844707	total: 240ms	remaining: 47.7s
5:	learn: 0.1685607	total: 283ms	remaining: 46.9s
6:	learn: 0.1594595	total: 327ms	remaining: 46.4s
7:	learn: 0.1529784	total: 370ms	remaining: 45.8s
8:	learn: 0.1484097	total: 416ms	remaining: 45.8s
9:	learn: 0.1457269	total: 458ms	remaining: 45.4s
10:	learn: 0.1436777	total: 503ms	remaining: 45.2s
11:	learn: 0.1422285	total: 547ms	remaining: 45s
12:	learn: 0.1409486	total: 591ms	remaining: 44.9s
13:	learn: 0.1399598	total: 639ms	remaining: 45s
14:	learn: 0.1390563	total: 684ms	remaining: 44.9s
15:	learn: 0.1383801	total: 726ms	remaining: 44.6s
16:	learn: 0.1377341	total: 767ms	remaining: 44.4s
17:	learn: 0.1370604	total: 811ms	remaining: 44.3s
18:	learn: 0.1366919	total: 850ms	remaining: 43.9s
19:	learn: 0.1

{'AUC': 0.6167291866066124,
 'Gini Stability': 0.055004444540091096,
 'log_loss': 0.17569787738207196}

### XGBoost

In [36]:
# XGBoost model
xgb_model = xgb.XGBClassifier(random_state=0)

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", xgb_model),
    ]
)

pipe.fit(X_train, y_train)

y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])

Log Loss: 0.06104287513135302
AUC: 0.9726355194414554
Gini Stability Score: 0.880469516345206
Log Loss: 0.17530816319142387
AUC: 0.6306868523432143
Gini Stability Score: 0.02705542537788072


{'AUC': 0.6306868523432143,
 'Gini Stability': 0.02705542537788072,
 'log_loss': 0.17530816319142387}