# Logistic Regression Baseline Model

In [49]:
# standard library imports
import gc
import os
import random
import sys
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# third party imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import polars as pl
import pyarrow.parquet as pq
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    log_loss,
    SCORERS, 
    get_scorer,
    classification_report, 
    ConfusionMatrixDisplay, 
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC

# dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"
DATA_PATH = ""

## Load the test data

In [65]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # make case_id int
        if col == 'case_id':
            df = df.with_columns(pl.col(col).cast(pl.Int64).alias(col))
        if col == 'num_group1':
            df = df.with_columns(pl.col(col).cast(pl.Int64).alias(col))
        if col == 'num_group2':
            df = df.with_columns(pl.col(col).cast(pl.Int64).alias(col))
        

    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

In [66]:
test_basetable = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_base.csv"))

test_static = pl.concat(
    [
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_static_0_0.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_static_0_1.csv")).pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)

test_static_cb = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_static_cb_0.csv")).pipe(set_table_dtypes)

test_applprev_1 = pl.concat(
    [
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_applprev_1_0.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_applprev_1_1.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_applprev_1_2.csv")).pipe(set_table_dtypes)
    ],
    how="vertical_relaxed",
) 

test_applprev_2 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_applprev_2.csv")).pipe(set_table_dtypes)

test_other_1 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_other_1.csv")).pipe(set_table_dtypes) 

test_tax_registry_a_1 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_tax_registry_a_1.csv")).pipe(set_table_dtypes)  

test_tax_registry_b_1 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_tax_registry_b_1.csv")).pipe(set_table_dtypes)  

test_tax_registry_c_1 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_tax_registry_c_1.csv")).pipe(set_table_dtypes)  

test_credit_bureau_a_1 = pl.concat(
    [
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_1_0.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_1_1.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_1_2.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_1_3.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_1_4.csv")).pipe(set_table_dtypes)
    ],
    how="vertical_relaxed",
)  

test_credit_bureau_a_2 = pl.concat(
    [
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_0.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_1.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_2.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_3.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_4.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_5.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_6.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_7.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_8.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_9.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_10.csv")).pipe(set_table_dtypes),
        pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_a_2_11.csv")).pipe(set_table_dtypes)
    ],
    how="vertical_relaxed",
) 

test_credit_bureau_b_1 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_b_1.csv")).pipe(set_table_dtypes)   

test_credit_bureau_b_2 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_b_2.csv")).pipe(set_table_dtypes)   

test_deposit_1 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_deposit_1.csv")).pipe(set_table_dtypes)   

test_person_1 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_person_1.csv")).pipe(set_table_dtypes)   

test_debitcard_1 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_debitcard_1.csv")).pipe(set_table_dtypes)   

test_applprev_2 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_applprev_2.csv")).pipe(set_table_dtypes)   

test_person_2 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_person_2.csv")).pipe(set_table_dtypes)   

test_credit_bureau_b_2 = pl.read_csv(os.path.join(DATA_PATH, "csv_files/test/test_credit_bureau_b_2.csv")).pipe(set_table_dtypes)

In [74]:
test_data = (
    test_basetable
    .join(test_static, how="left", on="case_id")
    .join(test_static_cb, how="left", on="case_id")
    .join(test_applprev_1, how="left", on="case_id")
    .join(test_applprev_2, how="left", on="case_id")
    .join(test_other_1, how="left", on=["case_id", "num_group1"])
    .join(test_tax_registry_a_1, how="left", on=["case_id", "num_group1"])
    .join(test_tax_registry_b_1, how="left", on=["case_id", "num_group1"])
    .join(test_tax_registry_c_1, how="left", on=["case_id", "num_group1"])
    .join(test_credit_bureau_a_1, how="left", on=["case_id", "num_group1"])
    .join(test_credit_bureau_b_1, how="left", on=["case_id", "num_group1"])
    .join(test_deposit_1, how="left", on=["case_id", "num_group1"])
    .join(test_person_1, how="left", on=["case_id", "num_group1"])
    .join(test_debitcard_1, how="left", on=["case_id", "num_group1"])
    .join(test_applprev_2, how="left", on=["case_id", "num_group1"])
    .join(test_person_2, how="left", on=["case_id", "num_group1", "num_group2"])
    .join(test_credit_bureau_b_1, how="left", on=["case_id", "num_group1"])
    .join(
        test_credit_bureau_b_2, how="left", on=["case_id", "num_group1", "num_group2"]
    )
)

In [75]:
test_data

case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,…,credor_3940957M_right,credquantity_1099L_right,credquantity_984L_right,debtpastduevalue_732A_right,debtvalue_227A_right,dpd_550P_right,dpd_733P_right,dpdmax_851P_right,dpdmaxdatemonth_804T_right,dpdmaxdateyear_742T_right,installmentamount_644A_right,installmentamount_833A_right,instlamount_892A_right,interesteffectiverate_369L_right,interestrateyearly_538L_right,lastupdate_260D_right,maxdebtpduevalodued_3940955A_right,numberofinstls_810L_right,overdueamountmax_950A_right,overdueamountmaxdatemonth_494T_right,overdueamountmaxdateyear_432T_right,periodicityofpmts_997L_right,periodicityofpmts_997M_right,pmtdaysoverdue_1135P_right,pmtmethod_731M_right,pmtnumpending_403L_right,purposeofcred_722M_right,residualamount_1093A_right,residualamount_127A_right,residualamount_3940956A_right,subjectrole_326M_right,subjectrole_43M_right,totalamount_503A_right,totalamount_881A_right,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
i64,str,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,…,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,str,str,f64,str,f64,str,f64,f64,f64,str,str,f64,f64,str,f64,f64
57543,"""2021-05-14""",202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57543,"""2021-05-14""",202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57543,"""2021-05-14""",202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57543,"""2021-05-14""",202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57543,"""2021-05-14""",202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57630,"""2021-03-16""",202201,100,0.0,0.0,8905.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57631,"""2022-06-04""",202201,100,0.0,,2540.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57632,"""2022-02-05""",202201,100,0.0,63647.402,4732.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-7.0,-6.0,-7.0,0.0,3536.0,,0.0,10581.714,3536.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
57633,"""2022-01-25""",202201,100,0.0,,8273.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,,,,,,,,,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [76]:
test_data = test_data.to_pandas()
test_data.head()

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,...,residualamount_1093A_right,residualamount_127A_right,residualamount_3940956A_right,subjectrole_326M_right,subjectrole_43M_right,totalamount_503A_right,totalamount_881A_right,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
0,57543,2021-05-14,202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,...,,,,,,,,,,
1,57543,2021-05-14,202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,...,,,,,,,,,,
2,57543,2021-05-14,202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,...,,,,,,,,,,
3,57543,2021-05-14,202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,...,,,,,,,,,,
4,57543,2021-05-14,202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,...,,,,,,,,,,


## Load the train data

In [44]:
train_data = pd.read_parquet('data/train/trian_sample_no_imputation.parquet')

In [31]:
base_cols = [
    'case_id',
    'WEEK_NUM',
    'target'
]

top20_cols = [
    'actualdpd_943P',
    'purposeofcred_426M',
    'amount_1115A',
    'credacc_actualbalance_314A',
    'actualdpdtolerance_344P',
    'annuity_780A',
    'numinstpaidearly_338L',
    'empl_employedtotal_800L',
    'empl_industry_691L',
    'maininc_215A',
    'debtoverdue_47A',
    'totalsettled_863A',
    'totaloutstanddebtvalue_39A',
    'avgdbddpdlast24m_3658932P',
    'avgdbddpdlast3m_4187120P',
    'clientscnt12m_3712952L',
    'applicationscnt_1086L',
    'applicationcnt_361L',
    'applications30d_658L'
]

In [46]:
train_df = train_data[base_cols + top20_cols]
train_df.head()

Unnamed: 0,case_id,WEEK_NUM,target,actualdpd_943P,purposeofcred_426M,amount_1115A,credacc_actualbalance_314A,actualdpdtolerance_344P,annuity_780A,numinstpaidearly_338L,...,maininc_215A,debtoverdue_47A,totalsettled_863A,totaloutstanddebtvalue_39A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,clientscnt12m_3712952L,applicationscnt_1086L,applicationcnt_361L,applications30d_658L
0,12866,16,0,0.0,96a8fdfe,,,,6064.6,,...,,,0.0,305398.22,,,0.0,8.0,0.0,0.0
1,12866,16,0,0.0,96a8fdfe,,,,6064.6,,...,,,0.0,305398.22,,,0.0,8.0,0.0,0.0
2,12866,16,0,0.0,a55475b1,,,,6064.6,,...,,,0.0,,,,0.0,8.0,0.0,0.0
3,12866,16,0,0.0,a55475b1,,,,6064.6,,...,,,0.0,,,,0.0,8.0,0.0,0.0
4,12866,16,0,0.0,96a8fdfe,,,,6064.6,,...,,0.0,0.0,3932.352,,,0.0,8.0,0.0,0.0


## Training Logistic Regression

Training our baseline model.

### Imputation

In [47]:
# Assuming train_df is already loaded
null_df = train_df[train_df.columns[train_df.isnull().any()]]

# Impute NaN values with 0 in numerical columns
zero_impute_columns = list(null_df.select_dtypes(exclude="object").columns)
for column in zero_impute_columns:
    train_df[column] = train_df[column].fillna(0)

# Impute NaN values with the mode in categorical columns
mode_impute_columns = list(null_df.select_dtypes(include="object").columns)

for column in mode_impute_columns:
    mode_value = train_df[column].mode()[0]
    train_df[column] = train_df[column].fillna(mode_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[column] = train_df[column].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[column] = train_df[column].fillna(mode_value)


### Model

In [50]:
X, y = train_df.drop(columns=["target"]), train_df["target"]

# Splitting by 'case_id'
case_ids = train_df['case_id'].unique()
train_case_ids, test_case_ids = train_test_split(case_ids, test_size=0.2, random_state=42)

X_train = train_df[train_df['case_id'].isin(train_case_ids)].drop(columns=["target"])
y_train = train_df[train_df['case_id'].isin(train_case_ids)]["target"]
X_test = train_df[train_df['case_id'].isin(test_case_ids)].drop(columns=["target"])
y_test = train_df[train_df['case_id'].isin(test_case_ids)]["target"]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer,  X_train.select_dtypes(exclude="object").columns),
        ('cat', categorical_transformer,  X_train.select_dtypes(include="object").columns)])

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ("clf", LogisticRegression(random_state=0, max_iter=1000)),
    ]
)

pipe.fit(X_train, y_train)

# y_pred = pipe.predict(X_train)
y_pred = [probs[1] for probs in pipe.predict_proba(X_train)]

## Evaluation

In [51]:
def make_base(X, y, y_pred):
    base = pd.concat([X, y.to_frame()], axis=1).reset_index()
    base = pd.concat([base, pd.Series(y_pred, name='score').to_frame()], axis=1)
    return base

def gini(target, score):
    try:
        gini = 2*roc_auc_score(target, score)-1
    except ValueError:
        gini = 0
        pass
    return gini

def gini_stability(X, y, y_pred, w_fallingrate=88.0, w_resstd=-0.5):
    base = make_base(X, y, y_pred)
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: gini(x.target, x.score)).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

def generate_report(mdl, X, y, cols, extra_metrics=None):
    '''Takes in a [what kind?] model as a string, pandas dataframe of a subset of the test data, 
    and list of columns to be used in the model.
    If necessary, can include a list of other metrics to be outputted, but default is None.
    Returns a dictionary of the following breakdown of scores: AUC, F1-Score, and any extra metrics
    '''
    # Load in a model we have trained and saved somewhere else in the repo
#     model = load(os.path.join(MODELS_DIR, mdl))

    # for now, we will be using a model trained in the notebook
    model = mdl
#     X_test, y_test = df[cols], df["target"]
#     y_pred = model.predict(X)
    y_pred = [probs[1] for probs in model.predict_proba(X)]
    
    res = dict()
    res['AUC'] = roc_auc_score(y, y_pred)
    res['Gini Stability'] = gini_stability(X, y, y_pred)
        
    for metric in extra_metrics:
        if metric == 'log_loss':
            res[metric] = log_loss(y, y_pred)
            print(f"Log Loss: {log_loss(y, y_pred)}")
        else:
            res[metric] = get_scorer(metric)._score_func(y, y_pred)
            print(f"{metric}: {get_scorer(metric)._score_func(y, y_pred)}")
        

    # Results
    print(f"AUC: {roc_auc_score(y, y_pred)}")
    print(f"Gini Stability Score: {gini_stability(X, y, y_pred)}")
        
    return res

In [52]:
generate_report(pipe, X_train, y_train, list(X_train.columns), ['log_loss'])

Log Loss: 0.14580354890288633
AUC: 0.6546572002612966
Gini Stability Score: 0.24296974748302075


{'AUC': 0.6546572002612966,
 'Gini Stability': 0.24296974748302075,
 'log_loss': 0.14580354890288633}

In [53]:
generate_report(pipe, X_test, y_test, list(X_train.columns), ['log_loss'])

Log Loss: 0.15032929353848243
AUC: 0.629782372873686
Gini Stability Score: 0.10315543470215377


{'AUC': 0.629782372873686,
 'Gini Stability': 0.10315543470215377,
 'log_loss': 0.15032929353848243}

# Test predictions

### Make test df

In [61]:
base_cols = [
    'case_id',
    'WEEK_NUM'
]

top20_cols = [
    'actualdpd_943P',
    'purposeofcred_426M',
    'amount_1115A',
    'credacc_actualbalance_314A',
    'actualdpdtolerance_344P',
    'annuity_780A',
    'numinstpaidearly_338L',
    'empl_employedtotal_800L',
    'empl_industry_691L',
    'maininc_215A',
    'debtoverdue_47A',
    'totalsettled_863A',
    'totaloutstanddebtvalue_39A',
    'avgdbddpdlast24m_3658932P',
    'avgdbddpdlast3m_4187120P',
    'clientscnt12m_3712952L',
    'applicationscnt_1086L',
    'applicationcnt_361L',
    'applications30d_658L'
]

In [77]:
test_df = test_data[base_cols + top20_cols]

### Imputation

In [89]:
for column in mode_impute_columns:
    mode_value = test_df[column].mode()
    print(mode_value)

0    a55475b1
dtype: object
0    17800.0
dtype: object
Series([], dtype: object)
Series([], dtype: object)


In [94]:
# Assuming train_df is already loaded
null_df = test_df[test_df.columns[test_df.isnull().any()]]

# Impute NaN values with 0 in numerical columns
zero_impute_columns = list(null_df.select_dtypes(exclude="object").columns)
for column in zero_impute_columns:
    test_df[column] = test_df[column].fillna(0)

# Impute NaN values with the mode in categorical columns
mode_impute_columns = list(null_df.select_dtypes(include="object").columns)

for column in mode_impute_columns:
    # if no mode, use forward fill
    if len(test_df[column].mode()) == 0:
        test_df[column] = test_df[column].fillna('ffill')
    else:
        mode_value = test_df[column].mode()[0]
        test_df[column] = test_df[column].fillna(mode_value)

In [95]:
X_valid = test_df
y_pred_valid = [probs[1] for probs in pipe.predict_proba(X_valid)]

In [98]:
test_df.head()

Unnamed: 0,case_id,WEEK_NUM,actualdpd_943P,purposeofcred_426M,amount_1115A,credacc_actualbalance_314A,actualdpdtolerance_344P,annuity_780A,numinstpaidearly_338L,empl_employedtotal_800L,...,maininc_215A,debtoverdue_47A,totalsettled_863A,totaloutstanddebtvalue_39A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,clientscnt12m_3712952L,applicationscnt_1086L,applicationcnt_361L,applications30d_658L
0,57543,100,0.0,a55475b1,0.0,17800.0,0.0,3674.6,25.0,ffill,...,34000.0,0.0,456031.1,0.0,1.0,2.0,0.0,0.0,0.0,0.0
1,57543,100,0.0,a55475b1,0.0,17800.0,0.0,3674.6,25.0,ffill,...,34000.0,0.0,456031.1,0.0,1.0,2.0,0.0,0.0,0.0,0.0
2,57543,100,0.0,a55475b1,0.0,17800.0,0.0,3674.6,25.0,ffill,...,34000.0,0.0,456031.1,0.0,1.0,2.0,0.0,0.0,0.0,0.0
3,57543,100,0.0,a55475b1,0.0,17800.0,0.0,3674.6,25.0,ffill,...,34000.0,0.0,456031.1,0.0,1.0,2.0,0.0,0.0,0.0,0.0
4,57543,100,0.0,a55475b1,0.0,17800.0,0.0,3674.6,25.0,ffill,...,34000.0,0.0,456031.1,0.0,1.0,2.0,0.0,0.0,0.0,0.0


## Submission

Scoring the submission dataset is below, we need to take care of new categories. Then we save the score as a last step. 

In [99]:
submission = pd.DataFrame({
    "case_id": test_df["case_id"].to_numpy(),
    "score": y_pred_valid
}).set_index('case_id')
submission.to_csv("./submission.csv")