# Home Credit Baseline Model - Logistic Regression

In [1]:
import logreg_report_v2

In [2]:
# standard library imports
import gc
import os
import random
import sys
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# third party imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import polars as pl
import pyarrow.parquet as pq
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    log_loss,
    SCORERS, 
    get_scorer,
    classification_report, 
    ConfusionMatrixDisplay, 
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC

## getting data

In [3]:
data = pd.read_parquet('data/train/trian_sample_no_imputation.parquet')

In [4]:
data

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,pmts_month_706T,pmts_overdue_1140A,pmts_overdue_1152A,pmts_year_1139T,pmts_year_507T,subjectroles_name_541M,subjectroles_name_838M,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
0,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2018.0,,a55475b1,ab3c25cf,,,
1,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2018.0,,a55475b1,a55475b1,,,
2,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,,,,,,,
3,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,,,,,,,
4,12866,2019-04-24,201904,16,0,,,6064.6,0.0,0.0,...,,,,2017.0,,a55475b1,daf49a8a,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941052,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,2.0,,,,2010.0,ab3c25cf,a55475b1,,,
941053,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,3.0,,,,2010.0,a55475b1,a55475b1,,,
941054,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,4.0,,0.0,,2010.0,a55475b1,a55475b1,,,
941055,168975,2019-10-21,201910,41,0,0.0,5072.0,6518.2,2536.0,0.0,...,2.0,,,2019.0,2007.0,ab3c25cf,ab3c25cf,,,


In [5]:
def impute_default(df):
    
    train_df = df.copy()

    # Assuming train_df is already loaded
    null_df = train_df[train_df.columns[train_df.isnull().any()]]

    # Impute NaN values with 0 in numerical columns
    zero_impute_columns = list(null_df.select_dtypes(exclude="object").columns)
    for column in zero_impute_columns:
        train_df[column] = train_df[column].fillna(0)

    # Impute NaN values with the mode in categorical columns
    mode_impute_columns = list(null_df.select_dtypes(include="object").columns)

    for column in mode_impute_columns:
        mode_value = train_df[column].mode()[0]
        train_df[column] = train_df[column].fillna(mode_value)
        
    return train_df

## function for logistic regression model

In [6]:
def train_model(train_df):

    X, y = train_df.drop(columns=["target"]), train_df["target"]

    # Splitting by 'case_id'
    case_ids = train_df['case_id'].unique()
    train_case_ids, test_case_ids = train_test_split(case_ids, test_size=0.2, random_state=42)

    X_train = train_df[train_df['case_id'].isin(train_case_ids)].drop(columns=["target"])
    y_train = train_df[train_df['case_id'].isin(train_case_ids)]["target"]
    X_test = train_df[train_df['case_id'].isin(test_case_ids)].drop(columns=["target"])
    y_test = train_df[train_df['case_id'].isin(test_case_ids)]["target"]

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer,  X_train.select_dtypes(exclude="object").columns),
            ('cat', categorical_transformer,  X_train.select_dtypes(include="object").columns)])

    pipe = Pipeline(
        [
            ('preprocessor', preprocessor),
            ("clf", LogisticRegression(random_state=0, max_iter=1000)),
        ]
    )

    pipe.fit(X_train, y_train)

    # y_pred = pipe.predict(X_train)
    y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]
    return pipe, X_train, y_train, X_test, y_test

## pipeline for imputation + modeling + results process

In [7]:
def model_pipeline(data, cols):
    
    base_cols = [
    'case_id',
    'WEEK_NUM',
    'target'
    ]
    
    # grab necessary cols
    train_df = data[base_cols + cols]
    
    # impute NaN
    train_df = impute_default(train_df)
    
    # fit model
    pipe, X_train, y_train, X_test, y_test = train_model(train_df)
    
    # evaluate model
    train_stats = logreg_report_v2.generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])
    test_stats = logreg_report_v2.generate_report(pipe, X_test, y_test, list(X_test.columns), ['log_loss'])
    
    return pipe, train_stats, test_stats

## accuracy when looking at all columns (is slow)

In [21]:
all_cols = list(data.columns)
all_cols.remove('date_decision')
all_cols.remove('MONTH')
all_cols.remove('WEEK_NUM')
all_cols.remove('case_id')
all_cols.remove('target')

In [50]:
model_pipeline(data, all_cols)

Log Loss: 0.00828574891750127
AUC: 0.999535653418272
Gini Stability Score: 0.9081090032688874
Log Loss: 0.22446293161177763
AUC: 0.7029879070923011
Gini Stability Score: 0.22736629708313305


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler())]),
                                                   Index(['case_id', 'WEEK_NUM', 'actualdpdtolerance_344P',
        'amtinstpaidbefduel24m_4187115A', 'annuity_780A',
        'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L',
        'applicationscnt_1086L', 'applicationscnt_464L',
        ...
        'pmts_dpd_107...
        'empls_employedfrom_796D', 'empls_employer_name_740M',
        'relatedpersons_role_762T', 'collater_typofvalofguarant_298M',
        'collater_typofvalofguarant_407M', 'collaterals_typeofguarante_359M',
        'collaterals_typeofguarante_669M', 'subjectroles_name_541M',
        'subjectroles_name_838M', 'pmts_date_1107D'],
       dtype='object', length=156))])),
                 ('clf', Lo

## Use metrics to choose columns

In [8]:
metrics = pd.read_csv('model_metrics.csv')
metrics

Unnamed: 0,columns,train AUC,train gini,test AUC,test gini
0,"['actualdpdtolerance_344P', 'amtinstpaidbefdue...",0.680265,0.281248,0.655037,0.113800
1,"['amtinstpaidbefduel24m_4187115A', 'annuity_78...",0.680305,0.281470,0.655385,0.114548
2,"['annuity_780A', 'annuitynextmonth_57A', 'appl...",0.654121,0.209124,0.641964,0.105539
3,"['annuitynextmonth_57A', 'applicationcnt_361L'...",0.651671,0.203835,0.634877,0.077457
4,"['applicationcnt_361L', 'applications30d_658L'...",0.651558,0.204617,0.631739,0.068214
...,...,...,...,...,...
442,"['conts_role_79M', 'empls_economicalst_849M', ...",0.569088,0.075141,0.562909,0.019772
443,"['empls_economicalst_849M', 'empls_employedfro...",0.569041,0.075262,0.563148,0.022188
444,"['empls_employedfrom_796D', 'empls_employer_na...",0.572390,0.083731,0.565747,0.034017
445,"['empls_employer_name_740M', 'relatedpersons_r...",0.572378,0.083763,0.565762,0.034244


In [9]:
metrics['train AUC diff'] = metrics['train AUC'].diff()
metrics['train gini diff'] = metrics['train gini'].diff()
metrics['test AUC diff'] = metrics['test AUC'].diff()
metrics['test gini diff'] = metrics['test gini'].diff()

In [10]:
metrics

Unnamed: 0,columns,train AUC,train gini,test AUC,test gini,train AUC diff,train gini diff,test AUC diff,test gini diff
0,"['actualdpdtolerance_344P', 'amtinstpaidbefdue...",0.680265,0.281248,0.655037,0.113800,,,,
1,"['amtinstpaidbefduel24m_4187115A', 'annuity_78...",0.680305,0.281470,0.655385,0.114548,0.000040,0.000222,0.000348,0.000748
2,"['annuity_780A', 'annuitynextmonth_57A', 'appl...",0.654121,0.209124,0.641964,0.105539,-0.026184,-0.072346,-0.013421,-0.009009
3,"['annuitynextmonth_57A', 'applicationcnt_361L'...",0.651671,0.203835,0.634877,0.077457,-0.002449,-0.005290,-0.007086,-0.028083
4,"['applicationcnt_361L', 'applications30d_658L'...",0.651558,0.204617,0.631739,0.068214,-0.000113,0.000782,-0.003138,-0.009243
...,...,...,...,...,...,...,...,...,...
442,"['conts_role_79M', 'empls_economicalst_849M', ...",0.569088,0.075141,0.562909,0.019772,-0.000637,-0.001931,-0.000178,-0.002227
443,"['empls_economicalst_849M', 'empls_employedfro...",0.569041,0.075262,0.563148,0.022188,-0.000047,0.000121,0.000239,0.002416
444,"['empls_employedfrom_796D', 'empls_employer_na...",0.572390,0.083731,0.565747,0.034017,0.003350,0.008468,0.002599,0.011828
445,"['empls_employer_name_740M', 'relatedpersons_r...",0.572378,0.083763,0.565762,0.034244,-0.000012,0.000032,0.000015,0.000227


In [11]:
met_stats = metrics.describe()
met_stats

Unnamed: 0,train AUC,train gini,test AUC,test gini,train AUC diff,train gini diff,test AUC diff,test gini diff
count,447.0,447.0,447.0,447.0,446.0,446.0,446.0,446.0
mean,0.725484,0.363081,0.58655,0.006495,-0.000242,-0.000443,-0.0002,-0.000179
std,0.137225,0.278038,0.048107,0.080058,0.019705,0.041873,0.009044,0.024041
min,0.512553,-0.07444,0.495779,-0.171625,-0.168913,-0.36147,-0.053623,-0.136316
25%,0.591929,0.113081,0.56296,-0.04131,-0.001084,-0.003423,-0.001412,-0.006237
50%,0.713892,0.336677,0.587488,0.022117,-1.7e-05,-5.1e-05,2.3e-05,1.3e-05
75%,0.834379,0.601643,0.608714,0.06294,0.001186,0.002505,0.001486,0.005378
max,0.990925,0.910852,0.694022,0.189145,0.113702,0.224264,0.062018,0.138928


### first let's look at which column(s), when added, increases our gini and AUC scores (by at least the 75th percentile)

In [12]:
met_stats.loc['75%']['test gini']

0.06293988510764639

In [13]:
def find_inc(col_name):
    threshold = met_stats.loc['75%'][col_name + ' diff']
    metrics[col_name + ' inc'] = metrics[col_name + ' diff'] > threshold
    return metrics

In [14]:
metrics = find_inc('train AUC')
metrics = find_inc('train gini')
metrics = find_inc('test AUC')
metrics = find_inc('test gini')
metrics

Unnamed: 0,columns,train AUC,train gini,test AUC,test gini,train AUC diff,train gini diff,test AUC diff,test gini diff,train AUC inc,train gini inc,test AUC inc,test gini inc
0,"['actualdpdtolerance_344P', 'amtinstpaidbefdue...",0.680265,0.281248,0.655037,0.113800,,,,,False,False,False,False
1,"['amtinstpaidbefduel24m_4187115A', 'annuity_78...",0.680305,0.281470,0.655385,0.114548,0.000040,0.000222,0.000348,0.000748,False,False,False,False
2,"['annuity_780A', 'annuitynextmonth_57A', 'appl...",0.654121,0.209124,0.641964,0.105539,-0.026184,-0.072346,-0.013421,-0.009009,False,False,False,False
3,"['annuitynextmonth_57A', 'applicationcnt_361L'...",0.651671,0.203835,0.634877,0.077457,-0.002449,-0.005290,-0.007086,-0.028083,False,False,False,False
4,"['applicationcnt_361L', 'applications30d_658L'...",0.651558,0.204617,0.631739,0.068214,-0.000113,0.000782,-0.003138,-0.009243,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,"['conts_role_79M', 'empls_economicalst_849M', ...",0.569088,0.075141,0.562909,0.019772,-0.000637,-0.001931,-0.000178,-0.002227,False,False,False,False
443,"['empls_economicalst_849M', 'empls_employedfro...",0.569041,0.075262,0.563148,0.022188,-0.000047,0.000121,0.000239,0.002416,False,False,False,False
444,"['empls_employedfrom_796D', 'empls_employer_na...",0.572390,0.083731,0.565747,0.034017,0.003350,0.008468,0.002599,0.011828,True,True,True,True
445,"['empls_employer_name_740M', 'relatedpersons_r...",0.572378,0.083763,0.565762,0.034244,-0.000012,0.000032,0.000015,0.000227,False,False,False,False


In [15]:
best_cols = metrics[(metrics['train AUC inc'] == True) & (metrics['train gini inc'] == True)
       & (metrics['test AUC inc'] == True) & (metrics['test gini inc'] == True)]
best_cols

Unnamed: 0,columns,train AUC,train gini,test AUC,test gini,train AUC diff,train gini diff,test AUC diff,test gini diff,train AUC inc,train gini inc,test AUC inc,test gini inc
18,"['avgpmtlast12m_4525200A', 'bankacctype_710L',...",0.560252,0.029661,0.586486,0.00537,0.010604,0.046369,0.062018,0.138928,True,True,True,True
21,"['clientscnt12m_3712952L', 'clientscnt3m_37129...",0.603924,0.138641,0.592305,0.03611,0.00423,0.02559,0.004817,0.018066,True,True,True,True
27,"['clientscnt_1130L', 'clientscnt_136L', 'clien...",0.922236,0.773257,0.627422,0.035404,0.099032,0.220067,0.023759,0.057929,True,True,True,True
38,"['cntpmts24_3658933L', 'commnoinclast6m_354684...",0.984163,0.901156,0.629344,0.096698,0.03197,0.073167,0.016329,0.080442,True,True,True,True
61,"['interestrate_311L', 'interestrategrace_34L',...",0.97823,0.883504,0.565646,-0.047207,0.02088,0.037842,0.016352,0.040011,True,True,True,True
86,"['mastercontrelectronic_519L', 'mastercontrexi...",0.861131,0.647036,0.568162,-0.105166,0.001253,0.005117,0.002223,0.008796,True,True,True,True
91,"['maxdbddpdtollast12m_3658940P', 'maxdbddpdtol...",0.87819,0.695004,0.61687,-0.008531,0.01767,0.052751,0.043832,0.097911,True,True,True,True
95,"['maxdpdinstldate_3546855D', 'maxdpdinstlnum_3...",0.878577,0.69644,0.617389,-0.007227,0.001657,0.003548,0.00571,0.010985,True,True,True,True
98,"['maxdpdlast24m_143P', 'maxdpdlast3m_392P', 'm...",0.705332,0.325446,0.650246,0.101793,0.004812,0.008155,0.004115,0.014935,True,True,True,True
99,"['maxdpdlast3m_392P', 'maxdpdlast6m_474P', 'ma...",0.711487,0.336677,0.66147,0.124726,0.006155,0.011232,0.011224,0.022933,True,True,True,True


In [16]:
def get_new_col(columns):
    columns = columns.strip('[').strip(']').split(',')[-1].strip(" '")
    return columns

cols_set_1 = best_cols['columns'].apply(get_new_col).to_list()

In [17]:
cols_set_1

['cntincpaycont9m_3716944L',
 'credamount_770A',
 'datelastunpaid_3546854D',
 'firstdatedue_489D',
 'lastrejectdate_50D',
 'maxoutstandbalancel12m_4187113A',
 'monthsannuity_845L',
 'numcontrs3months_479L',
 'numinstls_657L',
 'numinstlsallpaid_934L',
 'numrejects9m_859L',
 'pmtnum_254L',
 'dateofbirth_337D',
 'days120_123L',
 'numberofqueries_373L',
 'pmtaverage_3A',
 'pmtaverage_4527227A',
 'pmtscount_423L',
 'pmtssum_45A',
 'riskassesment_302T',
 'approvaldate_319D',
 'maxdpdtolerance_577P',
 'pmtnum_8L',
 'rejectreason_755M',
 'classificationofcontr_400M',
 'dateofcredend_353D',
 'lastupdate_1112D',
 'numberofoverdueinstlmaxdat_148D',
 'periodicityofpmts_1102L',
 'contractenddate_991D',
 'birth_259D',
 'contaddr_district_15M',
 'education_927M',
 'language1_981M',
 'registaddr_district_1083M',
 'num_group2',
 'collater_typofvalofguarant_298M',
 'collater_typofvalofguarant_407M',
 'collaterals_typeofguarante_359M',
 'pmts_month_158T',
 'pmts_date_1107D']

In [40]:
model_pipeline(data, cols_set_1)

Log Loss: 0.037515780435693376
AUC: 0.9913919259077064
Gini Stability Score: 0.9184455119476553
Log Loss: 0.21504487090275615
AUC: 0.6431659550868074
Gini Stability Score: 0.1439366964165261


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler())]),
                                                   Index(['case_id', 'WEEK_NUM', 'cntincpaycont9m_3716944L', 'credamount_770A',
        'maxoutstandbalancel12m_4187113A', 'monthsannuity_845L',
        'numcontrs3months_479L', 'numinstls_657L', 'numinstlsallpaid_934L',
        'numrejects9m_859L', 'pmtnum_254L', 'days1...
        'lastupdate_1112D', 'numberofoverdueinstlmaxdat_148D',
        'contractenddate_991D', 'birth_259D', 'contaddr_district_15M',
        'education_927M', 'language1_981M', 'registaddr_district_1083M',
        'collater_typofvalofguarant_298M', 'collater_typofvalofguarant_407M',
        'collaterals_typeofguarante_359M', 'pmts_date_1107D'],
       dtype='object'))])),
                 ('clf', LogisticRegressio

### seems to be overfitting, lets just look at cols where test gini and auc increase

In [41]:
cols_set_2 = metrics[(metrics['test AUC inc'] == True) & (metrics['test gini inc'] == True)]
cols_set_2 = cols_set_2['columns'].apply(get_new_col).to_list()

model_pipeline(data, cols_set_2)

Log Loss: 0.030127286517692736
AUC: 0.9949786906955759
Gini Stability Score: 0.9147935606880729
Log Loss: 0.2119675100254908
AUC: 0.6795107407896358
Gini Stability Score: 0.217892867674466


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler())]),
                                                   Index(['case_id', 'WEEK_NUM', 'clientscnt_1071L', 'cntincpaycont9m_3716944L',
        'credamount_770A', 'mastercontrexist_109L',
        'maxdbddpdtollast12m_3658940P', 'maxdebt4_972A', 'maxdpdlast6m_474P',
        'maxoutstandbalancel12m_4187113A', 'mobilephncnt_59...
        'numberofoverdueinstlmaxdat_148D', 'contractenddate_991D', 'birth_259D',
        'contaddr_district_15M', 'education_927M', 'empladdr_district_926M',
        'language1_981M', 'registaddr_district_1083M',
        'collater_typofvalofguarant_298M', 'collater_typofvalofguarant_407M',
        'collaterals_typeofguarante_359M', 'pmts_date_1107D'],
       dtype='object'))])),
                 ('clf', LogisticRegr

In [42]:
cols_set_2

['clientscnt_1071L',
 'cntincpaycont9m_3716944L',
 'credamount_770A',
 'datelastunpaid_3546854D',
 'firstdatedue_489D',
 'lastapprcommoditycat_1041M',
 'lastcancelreason_561M',
 'lastdelinqdate_224D',
 'lastrejectdate_50D',
 'lastrejectreason_759M',
 'lastst_736L',
 'mastercontrexist_109L',
 'maxdbddpdtollast12m_3658940P',
 'maxdebt4_972A',
 'maxdpdlast6m_474P',
 'maxoutstandbalancel12m_4187113A',
 'mobilephncnt_593L',
 'monthsannuity_845L',
 'numcontrs3months_479L',
 'numincomingpmts_3546848L',
 'numinstls_657L',
 'numinstlsallpaid_934L',
 'numinstlswithdpd5_4187116L',
 'numinstpaidlastcontr_4325080L',
 'numinsttopaygr_769L',
 'numrejects9m_859L',
 'pctinstlsallpaidearl3d_427L',
 'pctinstlsallpaidlate4d_3546849L',
 'pmtnum_254L',
 'price_1097A',
 'assignmentdate_238D',
 'dateofbirth_337D',
 'days120_123L',
 'description_5085714M',
 'formonth_535L',
 'fortoday_1092L',
 'forweek_1077L',
 'numberofqueries_373L',
 'pmtaverage_3A',
 'pmtaverage_4527227A',
 'pmtscount_423L',
 'pmtssum_45A',

In [43]:
len(cols_set_2)

78

### let's instead look at which columns seem to decrease performance, and remove those

In [18]:
def find_dec(col_name):
    threshold = met_stats.loc['25%'][col_name + ' diff']
    metrics[col_name + ' dec'] = metrics[col_name + ' diff'] < threshold
    return metrics

In [19]:
metrics = find_dec('train AUC')
metrics = find_dec('train gini')
metrics = find_dec('test AUC')
metrics = find_dec('test gini')
metrics

Unnamed: 0,columns,train AUC,train gini,test AUC,test gini,train AUC diff,train gini diff,test AUC diff,test gini diff,train AUC inc,train gini inc,test AUC inc,test gini inc,train AUC dec,train gini dec,test AUC dec,test gini dec
0,"['actualdpdtolerance_344P', 'amtinstpaidbefdue...",0.680265,0.281248,0.655037,0.113800,,,,,False,False,False,False,False,False,False,False
1,"['amtinstpaidbefduel24m_4187115A', 'annuity_78...",0.680305,0.281470,0.655385,0.114548,0.000040,0.000222,0.000348,0.000748,False,False,False,False,False,False,False,False
2,"['annuity_780A', 'annuitynextmonth_57A', 'appl...",0.654121,0.209124,0.641964,0.105539,-0.026184,-0.072346,-0.013421,-0.009009,False,False,False,False,True,True,True,True
3,"['annuitynextmonth_57A', 'applicationcnt_361L'...",0.651671,0.203835,0.634877,0.077457,-0.002449,-0.005290,-0.007086,-0.028083,False,False,False,False,True,True,True,True
4,"['applicationcnt_361L', 'applications30d_658L'...",0.651558,0.204617,0.631739,0.068214,-0.000113,0.000782,-0.003138,-0.009243,False,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,"['conts_role_79M', 'empls_economicalst_849M', ...",0.569088,0.075141,0.562909,0.019772,-0.000637,-0.001931,-0.000178,-0.002227,False,False,False,False,False,False,False,False
443,"['empls_economicalst_849M', 'empls_employedfro...",0.569041,0.075262,0.563148,0.022188,-0.000047,0.000121,0.000239,0.002416,False,False,False,False,False,False,False,False
444,"['empls_employedfrom_796D', 'empls_employer_na...",0.572390,0.083731,0.565747,0.034017,0.003350,0.008468,0.002599,0.011828,True,True,True,True,False,False,False,False
445,"['empls_employer_name_740M', 'relatedpersons_r...",0.572378,0.083763,0.565762,0.034244,-0.000012,0.000032,0.000015,0.000227,False,False,False,False,False,False,False,False


In [22]:
bad_cols_1 = metrics[(metrics['train AUC dec'] == True) & (metrics['train gini dec'] == True)
       & (metrics['test AUC dec'] == True) & (metrics['test gini dec'] == True)]
bad_cols_1 = bad_cols_1['columns'].apply(get_new_col).to_list()
cols_set_3 = [x for x in all_cols if x not in bad_cols_1]

bad_cols_2 = metrics[(metrics['test AUC dec'] == True) & (metrics['test gini dec'] == True)]
bad_cols_2 = bad_cols_2['columns'].apply(get_new_col).to_list()
cols_set_4 = [x for x in all_cols if x not in bad_cols_2]

In [55]:
model_pipeline(data, cols_set_3)

Log Loss: 0.008838026142817477
AUC: 0.999482378316923
Gini Stability Score: 0.9081137829702375
Log Loss: 0.22090017066319875
AUC: 0.7045680047703939
Gini Stability Score: 0.22893809152900504


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler())]),
                                                   Index(['case_id', 'WEEK_NUM', 'actualdpdtolerance_344P',
        'amtinstpaidbefduel24m_4187115A', 'annuity_780A',
        'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L',
        'applicationscnt_1086L', 'applicationscnt_464L',
        ...
        'pmts_dpd_107...
        'empls_employedfrom_796D', 'empls_employer_name_740M',
        'relatedpersons_role_762T', 'collater_typofvalofguarant_298M',
        'collater_typofvalofguarant_407M', 'collaterals_typeofguarante_359M',
        'collaterals_typeofguarante_669M', 'subjectroles_name_541M',
        'subjectroles_name_838M', 'pmts_date_1107D'],
       dtype='object', length=148))])),
                 ('clf', Lo

In [56]:
model_pipeline(data, cols_set_4)

Log Loss: 0.014116755831871424
AUC: 0.9988862693644347
Gini Stability Score: 0.9092075164343567
Log Loss: 0.22161969471354007
AUC: 0.6951387083237094
Gini Stability Score: 0.21852144008490165


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler())]),
                                                   Index(['case_id', 'WEEK_NUM', 'actualdpdtolerance_344P',
        'amtinstpaidbefduel24m_4187115A', 'annuity_780A',
        'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L',
        'applicationscnt_1086L', 'applicationscnt_464L',
        ...
        'pmts_dpd_107...
        'empls_employedfrom_796D', 'empls_employer_name_740M',
        'relatedpersons_role_762T', 'collater_typofvalofguarant_298M',
        'collater_typofvalofguarant_407M', 'collaterals_typeofguarante_359M',
        'collaterals_typeofguarante_669M', 'subjectroles_name_541M',
        'subjectroles_name_838M', 'pmts_date_1107D'],
       dtype='object', length=128))])),
                 ('clf', Lo

In [59]:
len(cols_set_3)

424

### this set of columns gives us the best test gini stability, with only a small (1%) decrease AUC compared to using all the columns

In [60]:
np.array(cols_set_3)

array(['actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A',
       'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L',
       'applications30d_658L', 'applicationscnt_1086L',
       'applicationscnt_464L', 'applicationscnt_629L',
       'applicationscnt_867L', 'avgdbddpdlast24m_3658932P',
       'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P',
       'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A',
       'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P',
       'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A',
       'bankacctype_710L', 'cardtype_51L', 'clientscnt6m_3712949L',
       'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L',
       'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L',
       'clientscnt_304L', 'clientscnt_360L', 'clientscnt_946L',
       'cntincpaycont9m_3716944L', 'cntpmts24_3658933L',
       'commnoinclast6m_3546845L', 'credamount_770A', 'credtype_322L',
       'currdebt_22A', 'currdebtcredtyp

## let's work on imputation for this subset of columns

In [38]:
subset3 = data[cols_set_3]
subset3

Unnamed: 0,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,...,pmts_month_706T,pmts_overdue_1140A,pmts_overdue_1152A,pmts_year_1139T,pmts_year_507T,subjectroles_name_541M,subjectroles_name_838M,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
0,,,6064.6,0.0,0.0,0.0,8.0,0.0,0.0,1.0,...,,,,2018.0,,a55475b1,ab3c25cf,,,
1,,,6064.6,0.0,0.0,0.0,8.0,0.0,0.0,1.0,...,,,,2018.0,,a55475b1,a55475b1,,,
2,,,6064.6,0.0,0.0,0.0,8.0,0.0,0.0,1.0,...,,,,,,,,,,
3,,,6064.6,0.0,0.0,0.0,8.0,0.0,0.0,1.0,...,,,,,,,,,,
4,,,6064.6,0.0,0.0,0.0,8.0,0.0,0.0,1.0,...,,,,2017.0,,a55475b1,daf49a8a,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941052,0.0,5072.0,6518.2,2536.0,0.0,3.0,0.0,0.0,0.0,8.0,...,2.0,,,,2010.0,ab3c25cf,a55475b1,,,
941053,0.0,5072.0,6518.2,2536.0,0.0,3.0,0.0,0.0,0.0,8.0,...,3.0,,,,2010.0,a55475b1,a55475b1,,,
941054,0.0,5072.0,6518.2,2536.0,0.0,3.0,0.0,0.0,0.0,8.0,...,4.0,,0.0,,2010.0,a55475b1,a55475b1,,,
941055,0.0,5072.0,6518.2,2536.0,0.0,3.0,0.0,0.0,0.0,8.0,...,2.0,,,2019.0,2007.0,ab3c25cf,ab3c25cf,,,


In [42]:
def get_nan_counts(subset):
    # Count the number of NaN values in each column
    nan_counts = subset.isna().sum()

    # Calculate the percentage of NaN values in each column
    nan_percentage = (nan_counts / len(subset)) * 100

    # Combine the counts and percentages into a DataFrame for better visualization
    nan_distribution = pd.DataFrame({
        'Number of NaNs': nan_counts,
        'Percentage of NaNs': nan_percentage
    })

    # Print the distribution of NaN values
    return nan_distribution

In [43]:
get_nan_counts(subset3)

Unnamed: 0,Number of NaNs,Percentage of NaNs
actualdpdtolerance_344P,43240,4.594833
amtinstpaidbefduel24m_4187115A,129178,13.726905
annuity_780A,0,0.000000
annuitynextmonth_57A,0,0.000000
applicationcnt_361L,0,0.000000
...,...,...
subjectroles_name_541M,309552,32.894075
subjectroles_name_838M,309552,32.894075
pmts_date_1107D,933556,99.202918
pmts_dpdvalue_108P,933612,99.208868


In [59]:
cols_set_5 = (
    metrics[(metrics['test AUC inc'] == True) & (metrics['test gini inc'] == True)]
    .sort_values('test gini diff', ascending=False).reset_index().iloc[0:41]['columns']
    .apply(get_new_col).to_list()
)
cols_set_5.remove('num_group2')

In [60]:
model_pipeline(data, cols_set_5)

Log Loss: 0.039688173217185437
AUC: 0.9900648384882131
Gini Stability Score: 0.9128871088882473
Log Loss: 0.2108549020252464
AUC: 0.6527893644485451
Gini Stability Score: 0.1800044012860304


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler())]),
                                                   Index(['case_id', 'WEEK_NUM', 'cntincpaycont9m_3716944L',
        'numincomingpmts_3546848L', 'monthsannuity_845L', 'price_1097A',
        'pctinstlsallpaidearl3d_427L', 'numberofqueries_373L', 'pmtnum_254L',
        'maxdebt4_972A', 'forweek_1077L', 'numinsttopaygr_7...
        'lastdelinqdate_224D', 'birth_259D', 'collaterals_typeofguarante_359M',
        'lastrejectdate_50D', 'approvaldate_319D', 'lastcancelreason_561M',
        'numberofoverdueinstlmaxdat_148D', 'language1_981M', 'education_927M',
        'credtype_587L', 'employedfrom_700D', 'contaddr_district_15M',
        'riskassesment_302T'],
       dtype='object'))])),
                 ('clf', LogisticRegression(max_iter=

In [103]:
subset5 = data[cols_set_5]
subset5_nans = get_nan_counts(subset5)
subset5_nans

Unnamed: 0,Number of NaNs,Percentage of NaNs
cntincpaycont9m_3716944L,0,0.0
numincomingpmts_3546848L,59694,6.343293
monthsannuity_845L,59141,6.284529
classificationofcontr_400M,107687,11.443196
price_1097A,201196,21.379789
firstdatedue_489D,67496,7.17236
datelastunpaid_3546854D,296033,31.457499
pctinstlsallpaidearl3d_427L,61051,6.487492
collater_typofvalofguarant_407M,309552,32.894075
lastdelinqdate_224D,379041,40.278219


In [69]:
def unique_in_col(col):
    return data[col].unique()

In [75]:
defs = pd.read_csv('feature_definitions.csv')
defs = defs.set_index('Variable')
defs

Unnamed: 0_level_0,Description
Variable,Unnamed: 1_level_1
actualdpd_943P,Days Past Due (DPD) of previous contract (actu...
actualdpdtolerance_344P,DPD of client with tolerance.
addres_district_368M,District of the person's address.
addres_role_871L,Role of person's address.
addres_zip_823M,Zip code of the address.
...,...
totinstallast1m_4525188A,Total amount of monthly instalments paid in th...
twobodfilling_608L,Type of application process.
type_25L,Contact type of a person.
typesuite_864L,Persons accompanying the client during the loa...


In [83]:
subset5_values = defs.loc[cols_set_5]
subset5_values

Unnamed: 0_level_0,Description
Variable,Unnamed: 1_level_1
cntincpaycont9m_3716944L,Number of incoming payments in the past 9 months.
numincomingpmts_3546848L,Number of incoming payments.
monthsannuity_845L,Monthly annuity amount for the applicant.
classificationofcontr_400M,Classificiation of the closed contract.
price_1097A,Credit price.
firstdatedue_489D,Date of the first due date.
datelastunpaid_3546854D,Date of the last unpaid instalment.
pctinstlsallpaidearl3d_427L,Percentage of installments paid at least 3 day...
collater_typofvalofguarant_407M,Collateral valuation type (closed contract).
lastdelinqdate_224D,Date of the last delinquency occurrence.


In [84]:
subset5_values = subset5_values.reset_index()
subset5_values['Unique'] = subset5_values['Variable'].apply(unique_in_col)
subset5_values

Unnamed: 0,Variable,Description,Unique
0,cntincpaycont9m_3716944L,Number of incoming payments in the past 9 months.,"[nan, 0.0, 9.0, 7.0, 6.0, 5.0, 12.0, 4.0, 1.0,..."
1,numincomingpmts_3546848L,Number of incoming payments.,"[nan, 10.0, 152.0, 12.0, 49.0, 44.0, 6.0, 20.0..."
2,monthsannuity_845L,Monthly annuity amount for the applicant.,"[nan, 12.0, 104.0, 42.0, 43.0, 6.0, 9.0, 14.0,..."
3,classificationofcontr_400M,Classificiation of the closed contract.,"[a55475b1, None, 00135d9c, ea6782cc, 01f63ac8,..."
4,price_1097A,Credit price.,"[nan, 78978.0, 19080.0, 17000.0, 24580.0, 5159..."
5,firstdatedue_489D,Date of the first due date.,"[None, 2012-10-25, 2010-03-14, 2019-09-11, 201..."
6,datelastunpaid_3546854D,Date of the last unpaid instalment.,"[None, 2015-05-21, 2015-09-07, 2019-11-07, 201..."
7,pctinstlsallpaidearl3d_427L,Percentage of installments paid at least 3 day...,"[nan, 0.66667, 0.89103, 0.91667, 0.54348, 0.81..."
8,collater_typofvalofguarant_407M,Collateral valuation type (closed contract).,"[a55475b1, None, 8fd95e4b, 9a0c095e, 06fb9ba8,..."
9,lastdelinqdate_224D,Date of the last delinquency occurrence.,"[None, 2015-05-21, 2016-09-07, 2019-11-07, 201..."


In [86]:
subset5_values.merge(subset5_nans, left_on='Variable', right_index=True)

Unnamed: 0,Variable,Description,Unique,Number of NaNs,Percentage of NaNs
0,cntincpaycont9m_3716944L,Number of incoming payments in the past 9 months.,"[nan, 0.0, 9.0, 7.0, 6.0, 5.0, 12.0, 4.0, 1.0,...",59746,6.348818
1,numincomingpmts_3546848L,Number of incoming payments.,"[nan, 10.0, 152.0, 12.0, 49.0, 44.0, 6.0, 20.0...",59694,6.343293
2,monthsannuity_845L,Monthly annuity amount for the applicant.,"[nan, 12.0, 104.0, 42.0, 43.0, 6.0, 9.0, 14.0,...",59141,6.284529
3,classificationofcontr_400M,Classificiation of the closed contract.,"[a55475b1, None, 00135d9c, ea6782cc, 01f63ac8,...",107687,11.443196
4,price_1097A,Credit price.,"[nan, 78978.0, 19080.0, 17000.0, 24580.0, 5159...",201196,21.379789
5,firstdatedue_489D,Date of the first due date.,"[None, 2012-10-25, 2010-03-14, 2019-09-11, 201...",67496,7.17236
6,datelastunpaid_3546854D,Date of the last unpaid instalment.,"[None, 2015-05-21, 2015-09-07, 2019-11-07, 201...",296033,31.457499
7,pctinstlsallpaidearl3d_427L,Percentage of installments paid at least 3 day...,"[nan, 0.66667, 0.89103, 0.91667, 0.54348, 0.81...",61051,6.487492
8,collater_typofvalofguarant_407M,Collateral valuation type (closed contract).,"[a55475b1, None, 8fd95e4b, 9a0c095e, 06fb9ba8,...",309552,32.894075
9,lastdelinqdate_224D,Date of the last delinquency occurrence.,"[None, 2015-05-21, 2016-09-07, 2019-11-07, 201...",379041,40.278219


In [97]:
def impute_specific(df, cols, value):

    # Impute NaN values with specific value provided
    for col in cols:
        df[col] = df[col].fillna(value)
    
    return df

In [104]:
subset5 = impute_specific(subset5, ['cntincpaycont9m_3716944L'], 0)
subset5 = impute_specific(subset5, ['credtype_587L', 'lastrejectdate_50D', ], 'None')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(value)


In [105]:
get_nan_counts(subset5)

Unnamed: 0,Number of NaNs,Percentage of NaNs
cntincpaycont9m_3716944L,0,0.0
numincomingpmts_3546848L,59694,6.343293
monthsannuity_845L,59141,6.284529
classificationofcontr_400M,107687,11.443196
price_1097A,201196,21.379789
firstdatedue_489D,67496,7.17236
datelastunpaid_3546854D,296033,31.457499
pctinstlsallpaidearl3d_427L,61051,6.487492
collater_typofvalofguarant_407M,309552,32.894075
lastdelinqdate_224D,379041,40.278219


In [106]:
base_cols = [
    'case_id',
    'WEEK_NUM',
    'target'
    ]

subset5_df = data[base_cols + cols_set_5]

model_pipeline(subset5_df, cols_set_5)

Log Loss: 0.039659559493115414
AUC: 0.9901335928903418
Gini Stability Score: 0.9125739883774643
Log Loss: 0.21093026839241183
AUC: 0.6528303951460666
Gini Stability Score: 0.1802885345233621


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler())]),
                                                   Index(['case_id', 'WEEK_NUM', 'cntincpaycont9m_3716944L',
        'numincomingpmts_3546848L', 'monthsannuity_845L', 'price_1097A',
        'pctinstlsallpaidearl3d_427L', 'numberofqueries_373L', 'pmtnum_254L',
        'maxdebt4_972A', 'forweek_1077L', 'numinsttopaygr_7...
        'lastdelinqdate_224D', 'birth_259D', 'collaterals_typeofguarante_359M',
        'lastrejectdate_50D', 'approvaldate_319D', 'lastcancelreason_561M',
        'numberofoverdueinstlmaxdat_148D', 'language1_981M', 'education_927M',
        'credtype_587L', 'employedfrom_700D', 'contaddr_district_15M',
        'riskassesment_302T'],
       dtype='object'))])),
                 ('clf', LogisticRegression(max_iter=