# Home Credit Baseline Model - Logistic Regression

In [1]:
import logreg_report_v2

In [2]:
# standard library imports
import gc
import os
import random
import sys
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# third party imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import polars as pl
import pyarrow.parquet as pq
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    log_loss,
    SCORERS, 
    get_scorer,
    classification_report, 
    ConfusionMatrixDisplay, 
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC

## imputation on 100k rows train df

In [3]:
data = pd.read_parquet('data/train/trian_sample_no_imputation.parquet')

In [4]:
data

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,pmts_month_706T,pmts_overdue_1140A,pmts_overdue_1152A,pmts_year_1139T,pmts_year_507T,subjectroles_name_541M,subjectroles_name_838M,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
0,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2018.0,,a55475b1,ab3c25cf,,,
1,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2018.0,,a55475b1,a55475b1,,,
2,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,,,,,,,
3,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,,,,,,,
4,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2017.0,,a55475b1,daf49a8a,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941052,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,2.0,,,,2010.0,ab3c25cf,a55475b1,,,
941053,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,3.0,,,,2010.0,a55475b1,a55475b1,,,
941054,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,4.0,,0.0,,2010.0,a55475b1,a55475b1,,,
941055,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,2.0,,,2019.0,2007.0,ab3c25cf,ab3c25cf,,,


In [5]:
top20_cols = [
    'actualdpd_943P',
    'purposeofcred_426M',
    'amount_1115A',
    'credacc_actualbalance_314A',
    'actualdpdtolerance_344P',
    'annuity_780A',
    'numinstpaidearly_338L',
    'empl_employedtotal_800L',
    'empl_industry_691L',
    'maininc_215A',
    'debtoverdue_47A',
    'totalsettled_863A',
    'totaloutstanddebtvalue_39A',
    'avgdbddpdlast24m_3658932P',
    'avgdbddpdlast3m_4187120P',
    'clientscnt12m_3712952L',
    'applicationscnt_1086L',
    'applicationcnt_361L',
    'applications30d_658L',
    'WEEK_NUM',
    'target'
]

In [6]:
train_df = data[top20_cols]
train_df.head()

Unnamed: 0,actualdpd_943P,purposeofcred_426M,amount_1115A,credacc_actualbalance_314A,actualdpdtolerance_344P,annuity_780A,numinstpaidearly_338L,empl_employedtotal_800L,empl_industry_691L,maininc_215A,...,totalsettled_863A,totaloutstanddebtvalue_39A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,clientscnt12m_3712952L,applicationscnt_1086L,applicationcnt_361L,applications30d_658L,WEEK_NUM,target
0,0.0,96a8fdfe,,,,6064.6,,,,,...,0.0,305398.22,,,0.0,8.0,0.0,0.0,16,0
1,0.0,96a8fdfe,,,,6064.6,,,,,...,0.0,305398.22,,,0.0,8.0,0.0,0.0,16,0
2,0.0,a55475b1,,,,6064.6,,,,,...,0.0,,,,0.0,8.0,0.0,0.0,16,0
3,0.0,a55475b1,,,,6064.6,,,,,...,0.0,,,,0.0,8.0,0.0,0.0,16,0
4,0.0,96a8fdfe,,,,6064.6,,,,,...,0.0,3932.352,,,0.0,8.0,0.0,0.0,16,0


In [7]:
train_df.shape

(941057, 21)

In [8]:
# Assuming train_df is already loaded
null_df = train_df[train_df.columns[train_df.isnull().any()]]

# Impute NaN values with 0 in numerical columns
zero_impute_columns = list(null_df.select_dtypes(exclude="object").columns)
for column in zero_impute_columns:
    train_df[column] = train_df[column].fillna(0)

# Impute NaN values with the mode in categorical columns
mode_impute_columns = list(null_df.select_dtypes(include="object").columns)

for column in mode_impute_columns:
    mode_value = train_df[column].mode()[0]
    train_df[column] = train_df[column].fillna(mode_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[column] = train_df[column].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[column] = train_df[column].fillna(mode_value)


In [9]:
train_df.head()

Unnamed: 0,actualdpd_943P,purposeofcred_426M,amount_1115A,credacc_actualbalance_314A,actualdpdtolerance_344P,annuity_780A,numinstpaidearly_338L,empl_employedtotal_800L,empl_industry_691L,maininc_215A,...,totalsettled_863A,totaloutstanddebtvalue_39A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,clientscnt12m_3712952L,applicationscnt_1086L,applicationcnt_361L,applications30d_658L,WEEK_NUM,target
0,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,305398.22,0.0,0.0,0.0,8.0,0.0,0.0,16,0
1,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,305398.22,0.0,0.0,0.0,8.0,0.0,0.0,16,0
2,0.0,a55475b1,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,16,0
3,0.0,a55475b1,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,16,0
4,0.0,96a8fdfe,0.0,0.0,0.0,6064.6,0.0,MORE_FIVE,OTHER,0.0,...,0.0,3932.352,0.0,0.0,0.0,8.0,0.0,0.0,16,0


In [10]:
train_df.shape

(941057, 21)

## logistic regression model on 100k rows train df

In [11]:
X, y = train_df.drop(columns=["target"]), train_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer,  X_train.select_dtypes(exclude="object").columns),
        ('cat', categorical_transformer,  X_train.select_dtypes(include="object").columns)])

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", LogisticRegression(random_state=0, max_iter=1000)),
    ]
)

pipe.fit(X_train, y_train)

# y_pred = pipe.predict(X_train)
y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

In [12]:
logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

Log Loss: 0.14678357048696805
AUC: 0.649291854230281
Gini Stability Score: 0.23485037282449384


{'AUC': 0.649291854230281,
 'Gini Stability': 0.23485037282449384,
 'log_loss': 0.14678357048696805}

In [13]:
logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])

Log Loss: 0.14652903422535235
AUC: 0.64529721678667
Gini Stability Score: 0.2064278504506235


{'AUC': 0.64529721678667,
 'Gini Stability': 0.2064278504506235,
 'log_loss': 0.14652903422535235}