Credit:

https://www.kaggle.com/code/lorresprz/home-credit-lgb-cat-ensemble

https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook

In [1]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

# Data preprocessing functions

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))

    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

# 1. Data Loading

In [3]:
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes)
#Depth 1
train_person_1 = pl.read_csv(dataPath + "csv_files/train/train_person_1.csv").pipe(set_table_dtypes) 
train_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [4]:
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv(dataPath + "csv_files/test/test_person_1.csv").pipe(set_table_dtypes) 
test_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [5]:
train_static.shape, test_static.shape, train_basetable.shape, train_static_cb.shape

((1526659, 168), (30, 168), (1526659, 5), (1500476, 53))

In [6]:
# We will process in this examples only A-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "P"):
        selected_static_cols.append(col)
print(selected_static_cols)
print('\n')
selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "P"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

['actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcredamount_781A', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcredamount_222A', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdbddpdlast1m_3658939P', 'maxdbddpdtollast12m_3658940P', 'maxdbddpdtollast6m_4187119P', 'maxdebt4_972A', 'maxdpdfrom6mto36m_3546853P', 'maxdpdinstlnum_3546846P', 'maxdpdlast12m_727P', 'maxdpdlast24m_143P', 'maxdpdlast3m_392P', 'maxdpdlast6m_474P', 'maxdpdlast9m_1059P', 'maxdpdtolerance_374P', 'maxinstallast24m_3658928A', 'maxln

# Depth-1 data

In [7]:
#Look at the depth 1 table
train_person_1.columns

['case_id',
 'birth_259D',
 'birthdate_87D',
 'childnum_185L',
 'contaddr_district_15M',
 'contaddr_matchlist_1032L',
 'contaddr_smempladdr_334L',
 'contaddr_zipcode_807M',
 'education_927M',
 'empl_employedfrom_271D',
 'empl_employedtotal_800L',
 'empl_industry_691L',
 'empladdr_district_926M',
 'empladdr_zipcode_114M',
 'familystate_447L',
 'gender_992L',
 'housetype_905L',
 'housingtype_772L',
 'incometype_1044T',
 'isreference_387L',
 'language1_981M',
 'mainoccupationinc_384A',
 'maritalst_703L',
 'num_group1',
 'personindex_1023L',
 'persontype_1072L',
 'persontype_792L',
 'registaddr_district_1083M',
 'registaddr_zipcode_184M',
 'relationshiptoclient_415T',
 'relationshiptoclient_642T',
 'remitter_829L',
 'role_1084L',
 'role_993L',
 'safeguarantyflag_411L',
 'sex_738L',
 'type_25L']

In [8]:
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

train_person_1_feats_1 

case_id,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed
i64,f64,bool
116195,8994.4,false
27990,30000.0,false
649897,30000.0,false
175964,18000.0,false
702917,48000.0,false
…,…,…
1479422,38000.0,false
2660494,160000.0,false
1262414,40000.0,false
1697979,60000.0,false


In [9]:
# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

train_person_1_feats_2.head()

case_id,person_housetype
i64,str
0,
1,
2,
3,
4,


In [10]:
# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

train_credit_bureau_b_2_feats.head()

case_id,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
i64,f64,bool
218515,9.400001,True
1899122,1.4,False
185881,11.2,True
1327403,0.0,False
982880,0.0,False


# Depth-1 data for test set

In [11]:
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"))

# Join all selected tables

In [12]:
# Join all tables together.
train_data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats_1, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)

In [13]:
test_data = test_basetable.join(
    test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats_1, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)

# Categorical data

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [15]:
#convert to pandas
train_data=train_data.to_pandas()
test_data=test_data.to_pandas()

In [16]:
train_data.drop(columns = ["case_id", "MONTH", "WEEK_NUM", "date_decision"], inplace = True)
test_data.drop(columns = ["case_id", "MONTH", "WEEK_NUM", "date_decision"], inplace = True)
train_data = convert_strings(train_data)
test_data = convert_strings(test_data)

In [17]:
train_data.dtypes

target                                        int64
actualdpdtolerance_344P                     float64
amtinstpaidbefduel24m_4187115A              float64
annuity_780A                                float64
annuitynextmonth_57A                        float64
                                             ...   
mainoccupationinc_384A_max                  float64
mainoccupationinc_384A_any_selfemployed        bool
person_housetype                           category
pmts_pmtsoverdue_635A_max                   float64
pmts_dpdvalue_108P_over31                  category
Length: 63, dtype: object

In [18]:
test_data.dtypes

actualdpdtolerance_344P                     float64
amtinstpaidbefduel24m_4187115A              float64
annuity_780A                                float64
annuitynextmonth_57A                        float64
avgdbddpdlast24m_3658932P                   float64
                                             ...   
mainoccupationinc_384A_max                  float64
mainoccupationinc_384A_any_selfemployed    category
person_housetype                           category
pmts_pmtsoverdue_635A_max                   float64
pmts_dpdvalue_108P_over31                  category
Length: 62, dtype: object

In [19]:
# one-hot encode the categorical features
cat_attribs = ['mainoccupationinc_384A_any_selfemployed','person_housetype','pmts_dpdvalue_108P_over31']
full_pipeline = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)], remainder='passthrough')

In [20]:
X = train_data.drop(['target'], axis = 1)
y = train_data['target']

In [21]:
encoder = full_pipeline.fit(X)
X = encoder.transform(X)
test_data= encoder.transform(test_data)

In [22]:
#There are NaN values, which can be handled by XGBoost, CatBoost and LGB
X.shape, test_data.shape

((1526659, 71), (10, 71))

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1373993, 71), (152666, 71), (1373993,), (152666,))

# 3. XGBoost modelling

In [24]:
from sklearn.metrics import  roc_auc_score
from xgboost import XGBClassifier

In [25]:
params2 = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "alpha": 0.1,  
    "lambda": 10,  
    #"tree_method": 'gpu_hist' if device == 'gpu' else 'auto',
    "random_state": 42,
    "verbosity": 0,
    "enable_categorical":True,
}

In [26]:
%%time
#xgb_classifier = XGBClassifier(**params2)
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

y_pred2 = xgb_classifier.predict_proba(X_test)[:,1]
auc_score_xgb = roc_auc_score(y_test, y_pred2)
print(f'ROC = {auc_score_xgb.round(3)}')

ROC = 0.775
CPU times: user 1min 24s, sys: 774 ms, total: 1min 25s
Wall time: 23.7 s


In [27]:
#https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html
xgb_classifier.save_model('/kaggle/working/xgb_no_params.json')

#Loading
#xgb_model = xgb.Booster()
#xgb_model.load_model(path_to_file)

# 4. CatBoostClassifier

In [28]:
%%time
#https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_fit

from catboost import CatBoostClassifier
clf = CatBoostClassifier(eval_metric='AUC')
#clf = CatBoostClassifier()

clf.fit(X_train, y=y_train, eval_set = (X_test, y_test), verbose = False)
y_pred_valid = clf.predict_proba(X_test)[:,1]
auc_score_cat = roc_auc_score(y_test, y_pred_valid)
print(f'ROC = {auc_score_cat.round(3)}')

ROC = 0.779
CPU times: user 21min 48s, sys: 12.9 s, total: 22min 1s
Wall time: 6min 40s


In [29]:
#https://stackoverflow.com/questions/51895761/how-to-correctly-load-pretrained-model-in-catboost-in-python
clf.save_model('/kaggle/working/catboost_no_params')

# LGB

In [30]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 2000,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    #"device": device, 
    "verbose": -1,
}

In [31]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

lgb_model = lgb.LGBMClassifier(**params)
#lgb_model = lgb.LGBMClassifier()
lgb_model.fit(
        X_train, y_train,
        eval_set = [(X_test, y_test)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)] )
y_pred_lgb = lgb_model.predict_proba(X_test)[:,1]
auc_lgb = roc_auc_score(y_test, y_pred_lgb)
print(f'AUC {auc_lgb}')

Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.77459
[400]	valid_0's auc: 0.778928
[600]	valid_0's auc: 0.780495
[800]	valid_0's auc: 0.781339
[1000]	valid_0's auc: 0.781962
[1200]	valid_0's auc: 0.782088
Early stopping, best iteration is:
[1245]	valid_0's auc: 0.782179
AUC 0.7821790962559


In [32]:
import pickle
file = '/kaggle/working/lgb_model.pkl'
pickle.dump(lgb_model, open(file, 'wb'))
print('Trained LGB model was saved!')

Trained LGB model was saved!


# 5. Submit

In [33]:
pred_xgb = xgb_classifier.predict_proba(test_data)[:, 1]
pred_cat = clf.predict_proba(test_data)[:, 1]
pred_lgb = lgb_model.predict_proba(test_data)[:,1]
predictions = (pred_xgb+pred_cat+pred_lgb)/2

In [34]:
subs = pd.read_csv('/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv')
subs['score'] = predictions
subs.to_csv('submission.csv', index=False)
subs.head()

Unnamed: 0,case_id,score
0,57543,0.044167
1,57549,0.025323
2,57551,0.021083
3,57552,0.014957
4,57569,0.101973
