# Home-credit-24: Ensemble of boosting models

In [1]:
import numpy as np
import polars as pl
import pandas as pd
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
device='gpu'

# Load data

In [4]:
dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [5]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))

    return df

#this is to convert string, object dtype to categorical which is one-hot-encoded later
def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

In [6]:
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes)
#Depth 1
train_person_1 = pl.read_csv(dataPath + "csv_files/train/train_person_1.csv").pipe(set_table_dtypes) 
train_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [7]:
# Do the same thing for test data
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv(dataPath + "csv_files/test/test_person_1.csv").pipe(set_table_dtypes) 
test_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [8]:
# We will process in this examples only A-type, P-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    #if col[-1] in ("A", "P", "M"):
    if col[-1] in ("A",  "M"):
        selected_static_cols.append(col)
print(f'Selected static cols: {len(selected_static_cols)}')
print('\n')
selected_static_cb_cols = []
for col in train_static_cb.columns:
    #if col[-1] in ("A", "P", "M"):
    if col[-1] in ("A", "M"):
        selected_static_cb_cols.append(col)
print(f'Selected static cb_cols: {len(selected_static_cb_cols)}')

Selected static cols: 39


Selected static cb_cols: 9


# Depth-1 data

In [9]:
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

In [10]:
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"))

# Final dataframe in tables

In [11]:
# Join all train tables together.
train_data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats_1, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)

In [12]:
# Join all test tables together.
test_data = test_basetable.join(
    test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats_1, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)

In [13]:
test_index = test_data['case_id']
#test_index

# Categorical data transformation

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [15]:
#convert to pandas
train_data=train_data.to_pandas()
test_data=test_data.to_pandas()

In [16]:
train_data.drop(columns = ["case_id", "MONTH", "WEEK_NUM", "date_decision"], inplace = True)
test_data.drop(columns = ["case_id", "MONTH", "WEEK_NUM", "date_decision"], inplace = True)
train_data = convert_strings(train_data)
test_data = convert_strings(test_data)

In [17]:
#find out the categorical columns of train_data 
cat_cols = []
for i in range(len(list(train_data.dtypes))):
    if list(train_data.dtypes)[i].type != np.int64 and list(train_data.dtypes)[i].type != np.float64:
     #if list(train_data.dtypes)[i].type == pd.core.dtypes.dtypes.CategoricalDtypeType:
        cat_cols.append(train_data.columns[i])    
        
#find out the categorical columns of test_data
cat_cols_test = []
for i in range(len(list(test_data.dtypes))):
    #if list(train_data.dtypes)[i].type != numpy.int64 and list(train_data.dtypes)[i].type != numpy.float64:
     if list(test_data.dtypes)[i].type == pd.core.dtypes.dtypes.CategoricalDtypeType:
        cat_cols_test.append(test_data.columns[i])   

In [18]:
len(cat_cols),len(cat_cols_test), cat_cols

(16,
 16,
 ['lastapprcommoditycat_1041M',
  'lastapprcommoditytypec_5251766M',
  'lastcancelreason_561M',
  'lastrejectcommoditycat_161M',
  'lastrejectcommodtypec_5251769M',
  'lastrejectreason_759M',
  'lastrejectreasonclient_4145040M',
  'previouscontdistrict_112M',
  'description_5085714M',
  'education_1103M',
  'education_88M',
  'maritalst_385M',
  'maritalst_893M',
  'mainoccupationinc_384A_any_selfemployed',
  'person_housetype',
  'pmts_dpdvalue_108P_over31'])

In [19]:
# one-hot encode the categorical features
full_pipeline = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols_test)], remainder='passthrough')

In [20]:
X = train_data.drop(['target'], axis = 1)
y = train_data['target']

In [21]:
encoder = full_pipeline.fit(X)
X = encoder.transform(X)
test_data= encoder.transform(test_data)

X.shape, test_data.shape

((1526659, 898), (10, 898))

# First layer ensemble of models [XGBoost, CatBoost, Light LGB]

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

https://www.kaggle.com/code/harrychan123/lgb-cat-ensemble-stacking

In [22]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 2000,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device, 
    "verbose": -1,
}

params2 = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "alpha": 0.1,  
    "lambda": 10,  
    "tree_method": 'gpu_hist' if device == 'gpu' else 'auto',
    "random_state": 42,
    "verbosity": 0,
    "enable_categorical":True,
}

In [23]:
n_est = 6000
models = [
    ('CatBoost', CatBoostClassifier(eval_metric='AUC', task_type='GPU',
                                    learning_rate=0.03, iterations=n_est, random_seed=3107)),
    ('LightGBM', LGBMClassifier(**params)),
    ('XGBoost', XGBClassifier(**params2))]

In [24]:
fitted_models_cb = []
fitted_models_lgb = []
fitted_models_xgb = []
cv_scores_cb = []
cv_scores_lgb = []
cv_scores_xgb = []

# Store the prediction results of the first layer model
meta_features = pd.DataFrame(index=train_data.index, columns=['CatBoost','LightGBM', 'XGBoost'])
meta_features

Unnamed: 0,CatBoost,LightGBM,XGBoost
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
1526654,,,
1526655,,,
1526656,,,
1526657,,,


In [25]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=False)
#The format of cv is [[train,val], [train,val], [train,val], ...]
list(enumerate(cv.split(X, y)))

[(0,
  (array([ 305332,  305333,  305334, ..., 1526656, 1526657, 1526658]),
   array([     0,      1,      2, ..., 305329, 305330, 305331]))),
 (1,
  (array([      0,       1,       2, ..., 1526656, 1526657, 1526658]),
   array([305332, 305333, 305334, ..., 610661, 610662, 610663]))),
 (2,
  (array([      0,       1,       2, ..., 1526656, 1526657, 1526658]),
   array([610664, 610665, 610666, ..., 915993, 915994, 915995]))),
 (3,
  (array([      0,       1,       2, ..., 1526656, 1526657, 1526658]),
   array([ 915996,  915997,  915998, ..., 1221325, 1221326, 1221327]))),
 (4,
  (array([      0,       1,       2, ..., 1221325, 1221326, 1221327]),
   array([1221328, 1221329, 1221330, ..., 1526656, 1526657, 1526658])))]

In [26]:
len(list(cv.split(X, y))[0][0]), len(list(cv.split(X, y))[0][1])

(1221327, 305332)

In [27]:
for name, model in models:
    for i, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
        X_train, y_train = X[idx_train], y[idx_train]
        X_valid, y_valid = X[idx_valid], y[idx_valid]
        print(f'X_train shape and indices {X_train.shape}, {idx_train}')
        print(f'X_valid shape and indices {X_valid.shape}, {idx_valid}')
        if name == 'CatBoost':
            model.fit(X_train, y=y_train, 
                      eval_set = (X_valid, y_valid), verbose = False)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            fitted_models_cb.append(model)
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_cb.append(auc_score)
        elif name == 'LightGBM':
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                      callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)])
            fitted_models_lgb.append(model)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_lgb.append(auc_score)
        else:  # XGBoost
            model.fit(X_train, y_train, 
                      eval_set=[(X_valid, y_valid)], 
                      early_stopping_rounds=100, verbose=False)
            fitted_models_xgb.append(model)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_xgb.append(auc_score)

        #meta_features.loc[X_valid.index, name] = y_pred_valid
        #write the prediction into the dataframe
        meta_features.loc[list(cv.split(X, y))[i][1], name] = y_pred_valid


X_train shape and indices (1221327, 898), [ 305332  305333  305334 ... 1526656 1526657 1526658]
X_valid shape and indices (305332, 898), [     0      1      2 ... 305329 305330 305331]


Default metric period is 5 because AUC is/are not implemented for GPU


X_train shape and indices (1221327, 898), [      0       1       2 ... 1526656 1526657 1526658]
X_valid shape and indices (305332, 898), [305332 305333 305334 ... 610661 610662 610663]


Default metric period is 5 because AUC is/are not implemented for GPU


X_train shape and indices (1221327, 898), [      0       1       2 ... 1526656 1526657 1526658]
X_valid shape and indices (305332, 898), [610664 610665 610666 ... 915993 915994 915995]


Default metric period is 5 because AUC is/are not implemented for GPU


X_train shape and indices (1221327, 898), [      0       1       2 ... 1526656 1526657 1526658]
X_valid shape and indices (305332, 898), [ 915996  915997  915998 ... 1221325 1221326 1221327]


Default metric period is 5 because AUC is/are not implemented for GPU


X_train shape and indices (1221328, 898), [      0       1       2 ... 1221325 1221326 1221327]
X_valid shape and indices (305331, 898), [1221328 1221329 1221330 ... 1526656 1526657 1526658]


Default metric period is 5 because AUC is/are not implemented for GPU


X_train shape and indices (1221327, 898), [ 305332  305333  305334 ... 1526656 1526657 1526658]
X_valid shape and indices (305332, 898), [     0      1      2 ... 305329 305330 305331]




Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.698668
Early stopping, best iteration is:
[290]	valid_0's auc: 0.699672
X_train shape and indices (1221327, 898), [      0       1       2 ... 1526656 1526657 1526658]
X_valid shape and indices (305332, 898), [305332 305333 305334 ... 610661 610662 610663]
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.699143
[400]	valid_0's auc: 0.704701
[600]	valid_0's auc: 0.707535
[800]	valid_0's auc: 0.709085
[1000]	valid_0's auc: 0.709668
[1200]	valid_0's auc: 0.710413
[1400]	valid_0's auc: 0.710928
[1600]	valid_0's auc: 0.71144
[1800]	valid_0's auc: 0.711822
Early stopping, best iteration is:
[1876]	valid_0's auc: 0.711962
X_train shape and indices (1221327, 898), [      0       1       2 ... 1526656 1526657 1526658]
X_valid shape and indices (305332, 898), [610664 610665 610666 ... 915993 915994 915995]
Training until validation scores don't improve for 100 rounds
[200]	val

In [28]:
meta_features

Unnamed: 0,CatBoost,LightGBM,XGBoost
0,0.091534,0.066972,0.054619
1,0.083281,0.06305,0.038843
2,0.111591,0.095145,0.058984
3,0.128839,0.122828,0.085858
4,0.158115,0.105206,0.119872
...,...,...,...
1526654,0.004325,0.006666,0.003424
1526655,0.003191,0.004267,0.004651
1526656,0.031893,0.032301,0.044135
1526657,0.003372,0.003751,0.005775


# Second layer ensemble model

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

params = {
    'n_estimators': 12,
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_samples_split': 3,
    'min_samples_leaf': 1
}

meta_model = GradientBoostingClassifier(**params)

meta_model.fit(meta_features, y)

# Build the same 2-layer ensemble for the test data

In [30]:
test_meta_features = pd.DataFrame(index=test_index, columns=['CatBoost','LightGBM', 'XGBoost'])

In [31]:
# FIRST LAYER ENSEMBLE MODEL ON TEST DATA
for model in fitted_models_cb:
    y_pred_test = model.predict_proba(test_data)[:, 1]
    test_meta_features['CatBoost'] = test_meta_features['CatBoost'].add(y_pred_test, fill_value=0)

test_meta_features['CatBoost'] /= len(fitted_models_cb)

# LightGBM
for model in fitted_models_lgb:
    y_pred_test = model.predict_proba(test_data)[:, 1]
    test_meta_features['LightGBM'] = test_meta_features['LightGBM'].add(y_pred_test, fill_value=0)

test_meta_features['LightGBM'] /= len(fitted_models_lgb)

# XGBoost
for model in fitted_models_xgb:
    y_pred_test = model.predict_proba(test_data)[:, 1]
    test_meta_features['XGBoost'] = test_meta_features['XGBoost'].add(y_pred_test, fill_value=0)

test_meta_features['XGBoost'] /= len(fitted_models_xgb)

In [32]:
test_meta_features

Unnamed: 0,CatBoost,LightGBM,XGBoost
57543,0.011268,0.007732,0.008411
57549,0.01477,0.018745,0.023321
57551,0.005865,0.010339,0.005932
57552,0.009653,0.012794,0.0085
57569,0.0493,0.065197,0.05126
57630,0.009703,0.01093,0.010986
57631,0.027086,0.039044,0.018703
57632,0.000925,0.002821,0.002168
57633,0.052585,0.074046,0.036282
57634,0.003912,0.008816,0.008711


In [33]:
# USE THE SECOND-LAYER MODEL ON THE TEST META FEATURE
y_pred = pd.Series(meta_model.predict_proba(test_meta_features)[:, 1], index=test_index)

In [35]:
y_pred 

57543    0.014687
57549    0.025009
57551    0.014746
57552    0.016374
57569    0.053341
57630    0.017032
57631    0.025627
57632    0.013144
57633    0.049804
57634    0.014163
dtype: float64

# Submission

In [34]:
subs = pd.read_csv('/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv')
subs['score'] = y_pred 
subs.to_csv('submission.csv', index=False)
subs.head()

Unnamed: 0,case_id,score
0,57543,
1,57549,
2,57551,
3,57552,
4,57569,
