Credit:

https://www.kaggle.com/code/lorresprz/home-credit-lgb-cat-ensemble

In [1]:
import numpy as np
import pandas as pd
import warnings as wr
wr.filterwarnings('ignore')

# 1. Data Loading

In [2]:
# Using only depth=0 data
import glob

directory = "/kaggle/input/home-credit-credit-risk-model-stability/csv_files/"


def _read_csv_shards(pattern):
    """Read every CSV matching ``directory + pattern`` into a single DataFrame.

    The static table is split across numbered files. Discovering the shards
    with a glob (instead of listing each filename by hand) keeps the cell
    working if the number of shards differs between train and test or changes.

    Parameters
    ----------
    pattern : str
        Glob pattern relative to ``directory``, e.g. ``"train/train_static_0_*.csv"``.

    Returns
    -------
    pandas.DataFrame
        All matching shards concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern.
    """
    # Sorted so the concatenation order is deterministic across runs.
    shard_paths = sorted(glob.glob(directory + pattern))
    if not shard_paths:
        raise FileNotFoundError(f"No files match {directory + pattern}")
    return pd.concat([pd.read_csv(path) for path in shard_paths], ignore_index=True)


# Base tables (one row per case_id).
train_base_df = pd.read_csv(directory + "train/train_base.csv")
test_base_df  = pd.read_csv(directory + "test/test_base.csv")

# Static tables, stored as several shards per split. The trailing "_0_*" keeps
# the pattern from matching the separate "static_cb_0" files read below.
train_static = _read_csv_shards("train/train_static_0_*.csv")
test_static  = _read_csv_shards("test/test_static_0_*.csv")

# Static "cb" tables (a single file per split).
train_static_cb = pd.read_csv(directory + "train/train_static_cb_0.csv")
test_static_cb  = pd.read_csv(directory + "test/test_static_cb_0.csv")

In [3]:
# Quick look at the base training table: its column names and its row count.
train_base_df.columns, len(train_base_df)

(Index(['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target'], dtype='object'),
 1526659)

In [4]:
# Size of the concatenated static table: number of columns, the first ten column names, and the row count.
len(train_static.columns), train_static.columns[:10], len(train_static)

(168,
 Index(['case_id', 'actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A',
        'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L',
        'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L',
        'applicationscnt_629L'],
       dtype='object'),
 1526659)

In [5]:
# Add depth-1 data
#tr_d1_1 = pd.read_csv('/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_applprev_1_0.csv')

In [6]:
#len(tr_d1_1.columns),len(tr_d1_1)

# 2. Data Preparation

### Merge "train":

In [7]:
# Feature selection for the training split: keep only the columns whose name
# ends in "A" or "P" (the float-typed columns), for both the static and the
# static_cb tables.
selected_static_cols = [col for col in train_static.columns if col[-1] in ("A", "P")]
selected_static_cb_cols = [col for col in train_static_cb.columns if col[-1] in ("A", "P")]

# Left-join the selected features onto the base table by case_id, so every
# base row is kept even when a feature table has no matching record.
train_data = (
    train_base_df
    .merge(train_static[["case_id"] + selected_static_cols], how="left", on="case_id")
    .merge(train_static_cb[["case_id"] + selected_static_cb_cols], how="left", on="case_id")
)

In [8]:
# Number of columns and rows in the merged training frame.
len(train_data.columns), len(train_data)

(62, 1526659)

### Merge "test":

In [9]:
# Feature selection for the test split: keep only the columns whose name ends
# in "A" or "P" (the float-typed columns), for both the static and the
# static_cb tables. Note that this rebinds the two selection lists.
selected_static_cols = [col for col in test_static.columns if col[-1] in ("A", "P")]
selected_static_cb_cols = [col for col in test_static_cb.columns if col[-1] in ("A", "P")]

# Left-join the selected features onto the test base table by case_id.
test_data = (
    test_base_df
    .merge(test_static[["case_id"] + selected_static_cols], how="left", on="case_id")
    .merge(test_static_cb[["case_id"] + selected_static_cb_cols], how="left", on="case_id")
)

In [10]:
from sklearn.model_selection import train_test_split

# Identifier and calendar columns are not used as model features.
non_feature_cols = ["case_id", "MONTH", "WEEK_NUM", "date_decision"]

# Reassign instead of dropping in place, and ignore already-missing columns,
# so that re-running this cell does not raise a KeyError.
train_data = train_data.drop(columns=non_feature_cols, errors="ignore")
test_data = test_data.drop(columns=non_feature_cols, errors="ignore")

# Feature matrix and binary target for training.
X = train_data.drop(columns=["target"])
y = train_data["target"]

# The test features were selected independently from the test files; force
# them to the exact training feature set and order. A feature missing from
# the test data raises a KeyError here instead of silently mis-scoring later.
test_data = test_data[X.columns]

# Hold out 20% of the training rows for validation (called X_test / y_test
# below; this is NOT the competition test set held in test_data).
# NOTE(review): the split is purely random; consider stratify=y if the target
# is imbalanced - confirm before changing, as it alters the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Shapes of the training / hold-out feature matrices and target vectors produced by train_test_split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1221327, 57), (305332, 57), (1221327,), (305332,))

# 3. XGBoost modelling

In [12]:
from sklearn.metrics import  roc_auc_score
from xgboost import XGBClassifier

In [13]:
# Hyper-parameters for the XGBoost classifier, passed as XGBClassifier(**params2).
params2 = dict(
    # Learner, loss and evaluation metric.
    booster="gbtree",
    objective="binary:logistic",
    eval_metric="auc",
    # Tree depth and boosting schedule.
    max_depth=10,
    learning_rate=0.05,
    n_estimators=1000,
    # Column subsampling per tree and per node.
    colsample_bytree=0.8,
    colsample_bynode=0.8,
    # Regularisation strengths. "lambda" is a reserved word in Python, so it
    # cannot be written as a keyword argument and is unpacked from a dict.
    alpha=0.1,
    **{"lambda": 10},
    # GPU selection is disabled; `device` is not defined in the visible cells:
    # tree_method='gpu_hist' if device == 'gpu' else 'auto',
    # Reproducibility, logging and categorical-dtype support.
    random_state=42,
    verbosity=0,
    enable_categorical=True,
)

In [14]:
%%time
# Fit XGBoost on the training split with the hyper-parameters from params2.
xgb_classifier = XGBClassifier(**params2)
#xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Probability of the positive class (column 1) on the hold-out split.
y_pred2 = xgb_classifier.predict_proba(X_test)[:, 1]
auc_score_xgb = roc_auc_score(y_test, y_pred2)
# Built-in round() works whether the metric is returned as a NumPy scalar or
# as a plain Python float (which has no .round method).
print(f'ROC = {round(auc_score_xgb, 3)}')

ROC = 0.776
CPU times: user 13min 50s, sys: 1.56 s, total: 13min 52s
Wall time: 3min 33s


# 4. CatBoostClassifier

In [15]:
%%time
#https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_fit

from catboost import CatBoostClassifier

# CatBoost with AUC as its evaluation metric; the hold-out split is passed as
# the evaluation set and per-iteration logging is silenced.
clf = CatBoostClassifier(eval_metric='AUC', learning_rate=0.03)
clf.fit(X_train, y=y_train, eval_set=(X_test, y_test), verbose=False)

# Probability of the positive class (column 1) on the hold-out split.
# NOTE(review): the same split is used as eval_set during fitting, so this
# score may be optimistic - confirm how CatBoost uses eval_set here.
y_pred_valid = clf.predict_proba(X_test)[:, 1]
auc_score_cat = roc_auc_score(y_test, y_pred_valid)
# Built-in round() works whether the metric is returned as a NumPy scalar or
# as a plain Python float (which has no .round method).
print(f'ROC = {round(auc_score_cat, 3)}')

ROC = 0.771
CPU times: user 19min 1s, sys: 12.7 s, total: 19min 13s
Wall time: 5min 6s


# LightGBM

In [16]:
# Hyper-parameters for the LightGBM classifier, passed as LGBMClassifier(**params).
# The original literal listed "verbose" twice; a dict keeps only one entry per
# key, so the duplicate is removed to avoid two copies drifting apart.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    # Upper bound on boosting rounds; the fitting cell adds early stopping.
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    # Silence LightGBM's own logging.
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 64,
    #"device": device,
}

In [17]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Log the validation metric every 200 rounds and stop once it has not improved
# for 100 consecutive rounds.
training_callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)]

# Fit LightGBM with the hyper-parameters from params, monitoring the hold-out split.
lgb_model = LGBMClassifier(**params)
lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=training_callbacks)

# Probability of the positive class (column 1) on the hold-out split.
# NOTE(review): this split also drives early stopping, so the AUC below may be
# slightly optimistic.
y_pred_lgb = lgb_model.predict_proba(X_test)[:, 1]
auc_lgb = roc_auc_score(y_test, y_pred_lgb)
print(f'AUC {auc_lgb}')

Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.769884
[400]	valid_0's auc: 0.773812
[600]	valid_0's auc: 0.775114
[800]	valid_0's auc: 0.775506
[1000]	valid_0's auc: 0.775707
[1200]	valid_0's auc: 0.775955
[1400]	valid_0's auc: 0.776063
Early stopping, best iteration is:
[1419]	valid_0's auc: 0.776086
AUC 0.7760857040194817


# 5. Submit

In [18]:
# Score the competition test features with each fitted model (XGBoost,
# CatBoost, LightGBM, in that order), keeping the positive-class probability.
pred_xgb, pred_cat, pred_lgb = (
    fitted_model.predict_proba(test_data)[:, 1]
    for fitted_model in (xgb_classifier, clf, lgb_model)
)

# Ensemble: unweighted mean of the three probability vectors.
predictions = (pred_xgb + pred_cat + pred_lgb) / 3

In [19]:

# Pair every test case_id with its ensemble score.
test_Id = test_base_df["case_id"]
submission = test_Id.to_frame().assign(score=predictions)

# Write the submission file without the index column, then preview the first rows.
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,case_id,score
0,57543,0.040254
1,57549,0.00931
2,57551,0.010064
3,57552,0.017014
4,57569,0.076819
