Credit:

https://www.kaggle.com/code/lorresprz/home-credit-lgb-cat-ensemble

https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook

In [1]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

# Root directory of the competition data; the CSV paths in the loading cells
# are built by appending a relative path to this prefix.
dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

# Data preprocessing functions

In [3]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in "P" or "A" to Float64.

    The last letter of a column name is used as a type marker. Only the "P"
    and "A" markers are handled here; every other column is left untouched.
    Extend the cast list to cover further markers as needed.

    Args:
        df: Table whose columns should be normalised.

    Returns:
        The table with the matching columns cast to ``pl.Float64``; the input
        itself when no column name matches.
    """
    float_casts = [
        pl.col(name).cast(pl.Float64).alias(name)
        for name in df.columns
        if name[-1] in ("P", "A")
    ]
    if not float_casts:
        return df
    return df.with_columns(float_casts)

# Convert string/object columns to pandas categoricals; they are one-hot encoded later.
def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn every object/string column of ``df`` into an ordered categorical.

    Each converted column keeps its observed values as categories (in the
    order pandas assigns when casting to ``category``) and gains one extra
    trailing category, "Unknown". Columns of any other dtype are not touched.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for name in df.columns:
        if df[name].dtype.name not in ('object', 'string'):
            continue
        # Go through the "string" dtype first so mixed object values become text.
        as_category = df[name].astype("string").astype('category')
        labels = as_category.cat.categories.to_list()
        labels.append("Unknown")
        df[name] = as_category.astype(pd.CategoricalDtype(categories=labels, ordered=True))
    return df

# Data Loading

In [4]:
# Base table of the training set.
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")

# The static table ships as two CSV files: normalise the dtypes of each file,
# then stack them vertically into one table.
train_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + "csv_files/train/train_static_0_0.csv")),
        set_table_dtypes(pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv")),
    ],
    how="vertical_relaxed",
)
train_static_cb = set_table_dtypes(pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv"))

# Deeper tables: person_1 is indexed by num_group1; credit_bureau_b_2 carries
# both num_group1 and num_group2.
train_person_1 = set_table_dtypes(pl.read_csv(dataPath + "csv_files/train/train_person_1.csv"))
train_credit_bureau_b_2 = set_table_dtypes(pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv"))

In [5]:
# Load the test split the same way as the training split.
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")

# The test static table ships as three CSV files: normalise the dtypes of each
# file, then stack them vertically into one table.
test_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv")),
        set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv")),
        set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_static_0_2.csv")),
    ],
    how="vertical_relaxed",
)
test_static_cb = set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv"))
test_person_1 = set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_person_1.csv"))
test_credit_bureau_b_2 = set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv"))

In [6]:
# Sanity check: (rows, columns) of the static and base tables just loaded.
train_static.shape, test_static.shape, train_basetable.shape, train_static_cb.shape

((1526659, 168), (30, 168), (1526659, 5), (1500476, 53))

In [7]:
# In this example only A-type, P-type and M-type columns are processed. The
# type is the last letter of the column name, so collect the matching names
# from each static table.
selected_static_cols = [
    name for name in train_static.columns if name[-1] in ("A", "P", "M")
]
print(f'Selected static cols: {len(selected_static_cols)}')
print('\n')
selected_static_cb_cols = [
    name for name in train_static_cb.columns if name[-1] in ("A", "P", "M")
]
print(f'Selected static cb_cols: {len(selected_static_cb_cols)}')

Selected static cols: 61


Selected static cb_cols: 9


# Depth-1 data

In [8]:
# Inspect the columns of the depth-1 person table before deriving features from it.
train_person_1.columns

['case_id',
 'birth_259D',
 'birthdate_87D',
 'childnum_185L',
 'contaddr_district_15M',
 'contaddr_matchlist_1032L',
 'contaddr_smempladdr_334L',
 'contaddr_zipcode_807M',
 'education_927M',
 'empl_employedfrom_271D',
 'empl_employedtotal_800L',
 'empl_industry_691L',
 'empladdr_district_926M',
 'empladdr_zipcode_114M',
 'familystate_447L',
 'gender_992L',
 'housetype_905L',
 'housingtype_772L',
 'incometype_1044T',
 'isreference_387L',
 'language1_981M',
 'mainoccupationinc_384A',
 'maritalst_703L',
 'num_group1',
 'personindex_1023L',
 'persontype_1072L',
 'persontype_792L',
 'registaddr_district_1083M',
 'registaddr_zipcode_184M',
 'relationshiptoclient_415T',
 'relationshiptoclient_642T',
 'remitter_829L',
 'role_1084L',
 'role_993L',
 'safeguarantyflag_411L',
 'sex_738L',
 'type_25L']

In [9]:
# Collapse the person table to one row per case_id:
#   - the largest mainoccupationinc_384A value among the case's rows
#   - whether any of the case's rows has incometype_1044T == "SELFEMPLOYED"
train_person_1_feats_1 = (
    train_person_1
    .group_by("case_id")
    .agg(
        [
            pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
            pl.col("incometype_1044T").eq("SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed"),
        ]
    )
)

train_person_1_feats_1

case_id,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed
i64,f64,bool
2666652,170000.0,false
2702239,56000.0,false
1691403,60000.0,false
116734,60000.0,false
1937511,24000.0,false
…,…,…
1896317,30000.0,false
825926,80000.0,false
2566242,76000.0,false
2610538,130000.0,false


In [10]:
# num_group1 == 0 has a special meaning: that row is the person who applied for
# the loan. Keep only that row and expose its house type as "person_housetype".
train_person_1_feats_2 = (
    train_person_1
    .filter(pl.col("num_group1") == 0)
    .select(["case_id", pl.col("housetype_905L").alias("person_housetype")])
)

train_person_1_feats_2.head()

case_id,person_housetype
i64,str
0,
1,
2,
3,
4,


In [11]:
# Inspect the columns of train_credit_bureau_b_2 before aggregating it.
train_credit_bureau_b_2.columns

['case_id',
 'num_group1',
 'num_group2',
 'pmts_date_1107D',
 'pmts_dpdvalue_108P',
 'pmts_pmtsoverdue_635A']

In [12]:
# This table is indexed by both num_group1 and num_group2, so it is aggregated
# straight down to one row per case_id:
#   - the largest pmts_pmtsoverdue_635A value
#   - whether any pmts_dpdvalue_108P value is greater than 31
train_credit_bureau_b_2_feats = (
    train_credit_bureau_b_2
    .group_by("case_id")
    .agg(
        [
            pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
            pl.col("pmts_dpdvalue_108P").gt(31).max().alias("pmts_dpdvalue_108P_over31"),
        ]
    )
)

train_credit_bureau_b_2_feats.head()

case_id,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
i64,f64,bool
908671,2.8,True
1946803,5.0,True
839717,0.0,False
1926150,0.0,False
1006198,0.6,True


# Depth-1 data for test set

In [13]:
# Build the same three feature tables for the test split, with the same column
# names as on the training side.

# Per case_id: largest income value and "any row is SELFEMPLOYED" flag.
test_person_1_feats_1 = (
    test_person_1
    .group_by("case_id")
    .agg(
        [
            pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
            pl.col("incometype_1044T").eq("SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed"),
        ]
    )
)

# House type taken from the num_group1 == 0 row of each case.
test_person_1_feats_2 = (
    test_person_1
    .filter(pl.col("num_group1") == 0)
    .select(["case_id", pl.col("housetype_905L").alias("person_housetype")])
)

# Per case_id: largest overdue value and "any dpd value above 31" flag.
test_credit_bureau_b_2_feats = (
    test_credit_bureau_b_2
    .group_by("case_id")
    .agg(
        [
            pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
            pl.col("pmts_dpdvalue_108P").gt(31).max().alias("pmts_dpdvalue_108P_over31"),
        ]
    )
)

# Join all selected tables

In [16]:
# Left-join every training feature table onto the base table by case_id, so
# each base row is kept even when a feature table has no match for it.
train_feature_tables = [
    train_static.select(["case_id"] + selected_static_cols),
    train_static_cb.select(["case_id"] + selected_static_cb_cols),
    train_person_1_feats_1,
    train_person_1_feats_2,
    train_credit_bureau_b_2_feats,
]
train_data = train_basetable
for feature_table in train_feature_tables:
    train_data = train_data.join(feature_table, how="left", on="case_id")

In [15]:
# Left-join every test feature table onto the test base table by case_id. The
# static column lists are the ones selected from the training tables, so train
# and test end up with the same feature columns.
test_feature_tables = [
    test_static.select(["case_id"] + selected_static_cols),
    test_static_cb.select(["case_id"] + selected_static_cb_cols),
    test_person_1_feats_1,
    test_person_1_feats_2,
    test_credit_bureau_b_2_feats,
]
test_data = test_basetable
for feature_table in test_feature_tables:
    test_data = test_data.join(feature_table, how="left", on="case_id")

# Categorical data

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [18]:
# Convert the joined polars frames to pandas for the encoding and modelling steps.
# The isinstance guard keeps this cell re-runnable: without it a second run
# calls .to_pandas() on a frame that is already pandas and raises AttributeError.
if isinstance(train_data, pl.DataFrame):
    train_data = train_data.to_pandas()
if isinstance(test_data, pl.DataFrame):
    test_data = test_data.to_pandas()

In [19]:
# Base-table columns that are not used as model features. Declared once so the
# train and test frames cannot drift apart.
non_feature_cols = ["case_id", "MONTH", "WEEK_NUM", "date_decision"]

# Drop by reassignment rather than inplace=True, then turn the remaining
# object/string columns into categoricals.
train_data = convert_strings(train_data.drop(columns=non_feature_cols))
test_data = convert_strings(test_data.drop(columns=non_feature_cols))

In [None]:
#list(train_data.dtypes)

In [None]:
#list(test_data.dtypes)

In [51]:
# (rows, columns) of the pandas training frame; the target column is still included here.
train_data.shape

(1526659, 76)

In [20]:
# Categorical columns of the training frame: every column whose dtype is neither
# int64 nor float64. The dtypes are walked once instead of rebuilding
# list(train_data.dtypes) on every loop iteration.
cat_cols = [
    name
    for name, dtype in train_data.dtypes.items()
    if dtype.type not in (np.int64, np.float64)
]

# Categorical columns of the test frame: columns that actually carry a pandas
# categorical dtype. Uses the public pd.CategoricalDtype instead of the private
# pd.core.dtypes.dtypes.CategoricalDtypeType.
cat_cols_test = [
    name
    for name, dtype in test_data.dtypes.items()
    if isinstance(dtype, pd.CategoricalDtype)
]

In [25]:
# The two counts should match; a difference means train and test disagree on
# which columns are categorical.
len(cat_cols),len(cat_cols_test), cat_cols

(16,
 16,
 ['lastapprcommoditycat_1041M',
  'lastapprcommoditytypec_5251766M',
  'lastcancelreason_561M',
  'lastrejectcommoditycat_161M',
  'lastrejectcommodtypec_5251769M',
  'lastrejectreason_759M',
  'lastrejectreasonclient_4145040M',
  'previouscontdistrict_112M',
  'description_5085714M',
  'education_1103M',
  'education_88M',
  'maritalst_385M',
  'maritalst_893M',
  'mainoccupationinc_384A_any_selfemployed',
  'person_housetype',
  'pmts_dpdvalue_108P_over31'])

In [26]:
# One-hot encode the categorical features and pass every other column through.
# The encoder is fitted on the training features, so the column list must come
# from the training frame (cat_cols), not from the test frame. Categories seen
# only at transform time are encoded as all zeros (handle_unknown='ignore').
full_pipeline = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)], remainder='passthrough')

In [27]:
# Separate the label vector from the feature matrix.
y = train_data['target']
X = train_data.drop(columns=['target'])

In [28]:
# Fit the column transformer on the training features only, then apply the same
# fitted transformer to both train and test.
encoder = full_pipeline.fit(X)
# NOTE: X and test_data are overwritten with the encoded output, so this cell
# cannot be re-run without first re-running the cells that build them.
X = encoder.transform(X)
test_data= encoder.transform(test_data)

In [29]:
# The encoded matrices still contain NaN values; per the author these can be
# handled by XGBoost, CatBoost and LightGBM, so no imputation is done here.
# Train and test must report the same number of columns.
X.shape, test_data.shape

((1526659, 920), (10, 920))

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training rows for evaluation; the seed is fixed so the
# split is reproducible.
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split_parts
tuple(part.shape for part in split_parts)

((1373993, 920), (152666, 920), (1373993,), (152666,))

# XGBoost modelling

In [31]:
from sklearn.metrics import  roc_auc_score
from xgboost import XGBClassifier

In [32]:
# Hyper-parameters unpacked into XGBClassifier(**params2) in the next cell.
params2 = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    # presumably the L1 / L2 regularisation strengths (same values as reg_alpha /
    # reg_lambda in the LightGBM params of this notebook) — confirm against the XGBoost docs
    "alpha": 0.1,  
    "lambda": 10,  
    #"tree_method": 'gpu_hist' if device == 'gpu' else 'auto',
    "random_state": 42,
    "verbosity": 0,
    # NOTE(review): the model is fitted on the one-hot encoded matrix, so this
    # flag may have no effect here — confirm.
    "enable_categorical":True,
}

In [33]:
%%time
# Train XGBoost on the training split with the hyper-parameters from params2.
xgb_classifier = XGBClassifier(**params2)
xgb_classifier.fit(X_train, y_train)

# Score the hold-out split using the predicted probability of the positive class.
y_pred2 = xgb_classifier.predict_proba(X_test)[:,1]
auc_score_xgb = roc_auc_score(y_test, y_pred2)
# Built-in round() works whether roc_auc_score returns a NumPy scalar or a plain
# Python float; the .round() method exists only on the former.
print(f'ROC = {round(auc_score_xgb, 3)}')

ROC = 0.793
CPU times: user 18min 6s, sys: 2.2 s, total: 18min 8s
Wall time: 4min 43s


In [34]:
# Persist the fitted XGBoost model to the Kaggle working directory.
# Reference for the save/load formats:
#https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html
# NOTE(review): the file name says "no_params" although the classifier is built
# from params2 — confirm the name is intended.
xgb_classifier.save_model('/kaggle/working/xgb_no_params.json')

# Loading sketch (kept commented out). It uses the alias `xgb`, which this
# notebook never imports — add `import xgboost as xgb` before enabling it.
#Loading
#xgb_model = xgb.Booster()
#xgb_model.load_model(path_to_file)

In [62]:
#FEATURE IMPORTANCE
# Importances of the fitted XGBoost model: the array's shape, its largest value,
# and the position of that value in the encoded feature matrix.
xgb_feats = xgb_classifier.feature_importances_
xgb_feats.shape, xgb_feats.max(), xgb_feats.argmax()

((920,), 0.028746726, 582)

# CatBoostClassifier

In [35]:
%%time
#https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_fit

from catboost import CatBoostClassifier
# CatBoost with its other settings left at their defaults; AUC is the evaluation metric.
clf = CatBoostClassifier(eval_metric='AUC')

# NOTE(review): the hold-out split is passed as eval_set and is also the split
# scored below. If CatBoost uses eval_set to choose the final iteration, the
# reported AUC is optimistic — confirm against the CatBoost docs.
clf.fit(X_train, y=y_train, eval_set = (X_test, y_test), verbose = False)
y_pred_valid = clf.predict_proba(X_test)[:,1]
auc_score_cat = roc_auc_score(y_test, y_pred_valid)
# Built-in round() works whether roc_auc_score returns a NumPy scalar or a plain
# Python float; the .round() method exists only on the former.
print(f'ROC = {round(auc_score_cat, 3)}')

ROC = 0.789
CPU times: user 21min 28s, sys: 16.8 s, total: 21min 45s
Wall time: 5min 45s


In [36]:
# Persist the fitted CatBoost model to the Kaggle working directory.
# Reference for loading it back:
#https://stackoverflow.com/questions/51895761/how-to-correctly-load-pretrained-model-in-catboost-in-python
clf.save_model('/kaggle/working/catboost_no_params')

In [54]:
# Importances of the fitted CatBoost model: the array's shape, its largest value,
# and the position of that value in the encoded feature matrix.
clf_feat_imp = clf.get_feature_importance()
clf_feat_imp.shape, clf_feat_imp.max(), clf_feat_imp.argmax()

((920,), 8.803075598383154, 912)

# LGB

In [37]:
# Hyper-parameters unpacked into lgb.LGBMClassifier(**params) in the next cell.
# "verbose" used to appear twice in this literal; the duplicate key is removed
# (both entries had the same value, so the resulting dict is unchanged).
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    # Upper bound on boosting rounds; the fit cell stops earlier through its
    # early-stopping callback.
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    #"device": device, 
}

In [38]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Train LightGBM with the hyper-parameters from params. Progress is logged every
# 200 rounds and training stops once the validation score has not improved for
# 100 rounds.
# NOTE: the hold-out split drives early stopping and is also the split scored
# below, so the printed AUC is measured on data that influenced training length.
lgb_model = lgb.LGBMClassifier(**params)
lgb_callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)]
lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=lgb_callbacks)

holdout_proba_lgb = lgb_model.predict_proba(X_test)
y_pred_lgb = holdout_proba_lgb[:, 1]
auc_lgb = roc_auc_score(y_test, y_pred_lgb)
print(f'AUC {auc_lgb}')

Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.785665
[400]	valid_0's auc: 0.789824
[800]	valid_0's auc: 0.792733
[1000]	valid_0's auc: 0.793127
[1200]	valid_0's auc: 0.793846
[1400]	valid_0's auc: 0.794106
[1600]	valid_0's auc: 0.794229
Early stopping, best iteration is:
[1686]	valid_0's auc: 0.794343
AUC 0.7943432696285865


In [58]:
#Feature importance
# Importances of the fitted LightGBM model: the array's shape, its largest value,
# and the position of that value in the encoded feature matrix.
lgb_feat_imp = lgb_model.feature_importances_
lgb_feat_imp.shape, lgb_feat_imp.max(), lgb_feat_imp.argmax()

((920,), 3126, 908)

In [39]:
import pickle
# Persist the fitted LightGBM model with pickle. The context manager closes the
# file handle (and flushes the write) even if pickling fails; the original left
# the handle returned by open() unclosed.
file = '/kaggle/working/lgb_model.pkl'
with open(file, 'wb') as model_file:
    pickle.dump(lgb_model, model_file)
print('Trained LGB model was saved!')

Trained LGB model was saved!


# Submission

In [40]:
# Score the encoded test matrix with each fitted model, keeping the predicted
# probability of the positive class.
pred_xgb, pred_cat, pred_lgb = (
    fitted.predict_proba(test_data)[:, 1]
    for fitted in (xgb_classifier, clf, lgb_model)
)
# Only the CatBoost scores are submitted; the equal-weight blend of the three
# models is left disabled.
#predictions = (pred_xgb+pred_cat+pred_lgb)/3
predictions = pred_cat

In [41]:
# Use the sample submission as a template and replace its score column with the
# model predictions.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as the rows of test_data. case_id was dropped earlier, so the alignment is not
# checked here.
subs = pd.read_csv(
    '/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv'
).assign(score=predictions)
subs.to_csv('submission.csv', index=False)
subs.head()

Unnamed: 0,case_id,score
0,57543,0.010694
1,57549,0.014251
2,57551,0.006429
3,57552,0.016566
4,57569,0.068441
