In [1]:
import pandas as pd
import polars as pl
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import gc
import joblib

from sklearn.metrics import accuracy_score

import optuna
from optuna.samplers import TPESampler

pd.set_option('display.max_columns', None)

dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [2]:
# helper functions
# copied from
# https://www.kaggle.com/code/liamhealy/lightgbm-feature-importance-all-datasets

def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    )

def summary(df):
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#total'] = df.shape[0]
    summ['#missing'] = df.isnull().sum().values 
    summ['%missing'] = df.isnull().sum().values / len(df)* 100
    summ['#unique'] = df.nunique().values
    summ['#duplicates'] = summ['#total'] - summ['#unique']
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    return summ

def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

def drop_outliers(df, field_name):
    iqr = 1.5 * (np.percentile(df[field_name], 75) - np.percentile(df[field_name], 25))
    df.drop(df[df[field_name] > (iqr + np.percentile(df[field_name], 75))].index, inplace=True)
    df.drop(df[df[field_name] < (np.percentile(df[field_name], 25) - iqr)].index, inplace=True)

In [16]:
#############################################################################################
# TRAINING DATA SET
#############################################################################################

### BASE TABLE
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")

### FEATURES
train_feature_set = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)

#############################################################################################
# JOIN TABLES TOGETHER
#############################################################################################
data = train_basetable.join( 
    train_feature_set, how="left", on="case_id"
)

#############################################################################################
# TRAINING AND TESTING SAMPLES
#############################################################################################
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

#############################################################################################
# TRAINING LIGHTGBM
#############################################################################################

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params =  {
    'device': 'gpu',
    'boosting_type': "gbdt",
    "objective": "binary",
    "metric": "auc",
    'lambda_l2': 1.0782358229300964e-07,
    'num_leaves': 244,
    'feature_fraction': 0.6699472800879643,
    'bagging_fraction': 0.747033768632279,
    'bagging_freq': 3,
    'min_child_samples': 27,
    'max_depth': 19,
    'min_data_in_leaf': 14,
    'learning_rate': 0.012841623760847382,
    'n_estimators': 656
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
)

#############################################################################################
# STABILITY METRICS
#############################################################################################

for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}') 
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}') 
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')  

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')  



[LightGBM] [Info] Number of positive: 28872, number of negative: 887123
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4314
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (17.47 MB) transferred to GPU in 0.017022 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031520 -> initscore=-3.425111
[LightGBM] [Info] Start training from score -3.425111
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (13.06 MB) transferred to GPU in 0.014695 secs. 1 sparse feature groups
Training until validation scores don't improve for 100 rounds
[LightGBM] [Info] Size of histogram bin ent

In [24]:

def objective(trial):
    """
    Objective function to be minimized.
    """
    param = {
#         'device': 'gpu',
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
#         "num_class": 2,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        'min_data_in_bin':trial.suggest_int('min_data_in_bin', 1, 10),
        'max_bin': trial.suggest_int('max_bin', 128, 1024)
    }
    
    gbm = lgb.LGBMClassifier(**param)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

sampler = TPESampler(seed=1)
study = optuna.create_study(study_name="lightgbm", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=10)

print('Best value:', study.best_value)
print('Best parameters:', study.best_params)

[I 2024-05-23 18:49:05,661] A new study created in memory with name: lightgbm
[I 2024-05-23 18:50:21,963] Trial 0 finished with value: 0.9541941231184415 and parameters: {'lambda_l2': 5.6649755830282306e-05, 'num_leaves': 185, 'feature_fraction': 0.40006862489040695, 'bagging_fraction': 0.5813995435791038, 'bagging_freq': 2, 'min_child_samples': 13, 'max_depth': 7, 'min_data_in_leaf': 10, 'learning_rate': 0.40279979948836325, 'n_estimators': 585}. Best is trial 0 with value: 0.9541941231184415.
[I 2024-05-23 18:50:50,038] Trial 1 finished with value: 0.9679856680596859 and parameters: {'lambda_l2': 5.925849666609232e-05, 'num_leaves': 176, 'feature_fraction': 0.5226713498389105, 'bagging_fraction': 0.9268704618345672, 'bagging_freq': 1, 'min_child_samples': 69, 'max_depth': 11, 'min_data_in_leaf': 13, 'learning_rate': 0.14898306920928145, 'n_estimators': 278}. Best is trial 1 with value: 0.9679856680596859.
[I 2024-05-23 18:52:29,476] Trial 2 finished with value: 0.9679692924423251 and

Best value: 0.9682869794191241
Best parameters: {'lambda_l2': 1.0782358229300964e-07, 'num_leaves': 244, 'feature_fraction': 0.6699472800879643, 'bagging_fraction': 0.747033768632279, 'bagging_freq': 3, 'min_child_samples': 27, 'max_depth': 19, 'min_data_in_leaf': 14, 'learning_rate': 0.012841623760847382, 'n_estimators': 656}


In [17]:
train_static_cb_score = pd.concat([base_train, base_valid, base_test])[['case_id', 'score', 'target']].rename(columns={'score':'pred_1'})
train_static_cb_score

Unnamed: 0,case_id,pred_1,target
0,0,0.031820,0
1,2,0.031820,0
2,5,0.031820,0
3,6,0.031820,0
4,7,0.031820,0
...,...,...,...
305327,2703422,0.009405,0
305328,2703436,0.068938,0
305329,2703437,0.009409,0
305330,2703443,0.021803,0


In [10]:
#############################################################################################
# TRAINING DATA SET
#############################################################################################

### BASE TABLE
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")

### FEATURES
train_feature_set = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)

#############################################################################################
# JOIN TABLES TOGETHER
#############################################################################################
data = train_basetable.join( 
    train_feature_set, how="left", on="case_id"
)

#############################################################################################
# TRAINING AND TESTING SAMPLES
#############################################################################################
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

#############################################################################################
# TRAINING LIGHTGBM
#############################################################################################

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

# params = {
#     'device':'gpu',
#     "boosting_type": "gbdt",
#     "objective": "binary",
#     "metric": "auc",
#     "max_depth": 12,
#     "num_leaves": 64,
#     "min_data_in_leaf": 10,
#     "learning_rate": 0.05,
#     "feature_fraction": 0.8,
#     "bagging_fraction": 0.8,
#     "bagging_freq": 10,
#     "n_estimators": 1000,
#     'min_data_in_bin':1,
# #     'max_bin': 1024,
#     "verbose": -1,
# #     "random_state": 42,
# #     'n_jobs': -1
# }

params = {
    'device': 'gpu',
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "boosting_type": "gbdt",
    'lambda_l2': 1.0782358229300964e-07,
    'num_leaves': 244, 
    'feature_fraction': 0.6699472800879643,
    'bagging_fraction': 0.747033768632279, 
    'bagging_freq': 3,
    'min_child_samples': 27,
    'max_depth': 19,
    'min_data_in_leaf': 14,
    'learning_rate': 0.012841623760847382,
    'n_estimators': 656
    
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(1000), lgb.early_stopping(100)]
)

#############################################################################################
# STABILITY METRICS
#############################################################################################

for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred
    

print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}') 
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}') 
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')  

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')



Training until validation scores don't improve for 1000 rounds
[100]	valid_0's auc: 0.635762
[200]	valid_0's auc: 0.639914
[300]	valid_0's auc: 0.641043
[400]	valid_0's auc: 0.641927
[500]	valid_0's auc: 0.641924
[600]	valid_0's auc: 0.642221
Did not meet early stopping. Best iteration is:
[609]	valid_0's auc: 0.642472
The AUC score on the train set is: 0.7258272162683007
The AUC score on the valid set is: 0.6424723159019011
The AUC score on the test set is: 0.6357653793652086
The stability score on the train set is: 0.3138798932300399
The stability score on the valid set is: 0.20455253517480781
The stability score on the test set is: 0.20077828140770168


In [11]:
joblib.dump(gbm, 'model_opt_static_0_1.pkl')

['model_opt_static_0_1.pkl']

In [15]:
train_static_0_score = pd.concat([base_train, base_valid, base_test])[['case_id', 'score', 'target']].rename(columns={'score':'pred_2'})
train_static_0_score

Unnamed: 0,case_id,pred_2,target
0,0,0.030681,0
1,2,0.030681,0
2,5,0.030681,0
3,6,0.030681,0
4,7,0.030681,0
...,...,...,...
305327,2703422,0.001112,0
305328,2703436,0.017143,0
305329,2703437,0.001980,0
305330,2703443,0.002216,0


In [6]:

def objective(trial):
    """
    Objective function to be minimized.
    """
    param = {
#         'device': 'gpu',
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
#         "num_class": 2,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        'min_data_in_bin':trial.suggest_int('min_data_in_bin', 1, 10),
        'max_bin': trial.suggest_int('max_bin', 128, 1024)
    }
    
    gbm = lgb.LGBMClassifier(**param)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

sampler = TPESampler(seed=1)
study = optuna.create_study(study_name="lightgbm", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=10)

print('Best value:', study.best_value)
print('Best parameters:', study.best_params)

[I 2024-05-25 09:23:02,236] A new study created in memory with name: lightgbm
[I 2024-05-25 09:25:16,585] Trial 0 finished with value: 0.9682313023200975 and parameters: {'lambda_l1': 5.6649755830282306e-05, 'lambda_l2': 0.030403280126677572, 'num_leaves': 2, 'feature_fraction': 0.5813995435791038, 'bagging_fraction': 0.48805353449026784, 'bagging_freq': 1, 'min_child_samples': 22, 'max_depth': 10, 'min_data_in_leaf': 11, 'learning_rate': 0.5434285666633234, 'n_estimators': 477, 'min_data_in_bin': 7, 'max_bin': 311}. Best is trial 0 with value: 0.9682313023200975.
[I 2024-05-25 09:42:48,210] Trial 5 finished with value: 0.9643240800178167 and parameters: {'lambda_l1': 0.0006653767987574282, 'lambda_l2': 0.009422188901528314, 'num_leaves': 516, 'feature_fraction': 0.966756853594488, 'bagging_fraction': 0.7519330243011957, 'bagging_freq': 10, 'min_child_samples': 18, 'max_depth': 7, 'min_data_in_leaf': 17, 'learning_rate': 0.40370006861567825, 'n_estimators': 248, 'min_data_in_bin': 10, 

Best value: 0.9682313023200975
Best parameters: {'lambda_l1': 5.6649755830282306e-05, 'lambda_l2': 0.030403280126677572, 'num_leaves': 2, 'feature_fraction': 0.5813995435791038, 'bagging_fraction': 0.48805353449026784, 'bagging_freq': 1, 'min_child_samples': 22, 'max_depth': 10, 'min_data_in_leaf': 11, 'learning_rate': 0.5434285666633234, 'n_estimators': 477, 'min_data_in_bin': 7, 'max_bin': 311}


In [20]:
dataset_stage_2 = train_static_cb_score.merge(train_static_0_score, how = 'left', on = ['case_id', 'target'])
dataset_stage_2

Unnamed: 0,case_id,pred_1,target,pred_2
0,0,0.031820,0,0.030681
1,2,0.031820,0,0.030681
2,5,0.031820,0,0.030681
3,6,0.031820,0,0.030681
4,7,0.031820,0,0.030681
...,...,...,...,...
1526654,2703422,0.009405,0,0.001112
1526655,2703436,0.068938,0,0.017143
1526656,2703437,0.009409,0,0.001980
1526657,2703443,0.021803,0,0.002216


In [22]:
y = dataset_stage_2['target']
X = dataset_stage_2.drop(columns=['target'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.4, shuffle=True)


def objective(trial):
    """
    Objective function to be minimized.
    """
    param = {
#         'device': 'gpu',
        "objective": "binary",
        "metric": "auc",
#         "verbosity": -1,
        "boosting_type": "gbdt",
#         "num_class": 2,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        'min_data_in_bin':trial.suggest_int("min_data_in_bin", 1, 1000),
        'max_bin': trial.suggest_int("max_bin",64, 1024),
    }
    
    gbm = lgb.LGBMClassifier(**param)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

sampler = TPESampler(seed=1)
study = optuna.create_study(study_name="lightgbm", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=10)

print('Best value:', study.best_value)
print('Best parameters:', study.best_params)

[I 2024-05-25 10:48:57,939] A new study created in memory with name: lightgbm


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:49:11,398] Trial 0 finished with value: 0.40704544561329964 and parameters: {'lambda_l1': 5.6649755830282306e-05, 'lambda_l2': 0.030403280126677572, 'num_leaves': 2, 'feature_fraction': 0.5813995435791038, 'bagging_fraction': 0.48805353449026784, 'bagging_freq': 1, 'min_child_samples': 22, 'max_depth': 10, 'min_data_in_leaf': 11, 'learning_rate': 0.5434285666633234, 'n_estimators': 477, 'min_data_in_bin': 686, 'max_bin': 260}. Best is trial 0 with value: 0.40704544561329964.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 687
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:50:43,606] Trial 1 finished with value: 0.9543005646312867 and parameters: {'lambda_l1': 0.7999391045172093, 'lambda_l2': 1.763958399884789e-08, 'num_leaves': 172, 'feature_fraction': 0.6503828814202761, 'bagging_fraction': 0.735213897067451, 'bagging_freq': 1, 'min_child_samples': 24, 'max_depth': 17, 'min_data_in_leaf': 20, 'learning_rate': 0.3202899363776504, 'n_estimators': 723, 'min_data_in_bin': 877, 'max_bin': 923}. Best is trial 1 with value: 0.9543005646312867.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 243
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:51:19,608] Trial 2 finished with value: 0.9702586037493613 and parameters: {'lambda_l1': 5.8263678744131555e-08, 'lambda_l2': 2.2464308214708563e-08, 'num_leaves': 45, 'feature_fraction': 0.9268855020576479, 'bagging_fraction': 0.4590081002998301, 'bagging_freq': 3, 'min_child_samples': 96, 'max_depth': 13, 'min_data_in_leaf': 16, 'learning_rate': 0.3223604746960023, 'n_estimators': 718, 'min_data_in_bin': 835, 'max_bin': 81}. Best is trial 2 with value: 0.9702586037493613.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2148
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:51:40,736] Trial 3 finished with value: 0.9733945344739496 and parameters: {'lambda_l1': 0.05640256237813216, 'lambda_l2': 7.938707489172784, 'num_leaves': 192, 'feature_fraction': 0.5682663952386431, 'bagging_fraction': 0.8735675970708932, 'bagging_freq': 1, 'min_child_samples': 47, 'max_depth': 19, 'min_data_in_leaf': 9, 'learning_rate': 0.29489758520048526, 'n_estimators': 217, 'min_data_in_bin': 20, 'max_bin': 716}. Best is trial 3 with value: 0.9733945344739496.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 336
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:53:34,653] Trial 4 finished with value: 0.9686177668898117 and parameters: {'lambda_l1': 8.028844449625994e-07, 'lambda_l2': 2.454256412239902e-06, 'num_leaves': 127, 'feature_fraction': 0.43201752707024826, 'bagging_fraction': 0.7444705632952078, 'bagging_freq': 2, 'min_child_samples': 61, 'max_depth': 16, 'min_data_in_leaf': 6, 'learning_rate': 0.41991542794137265, 'n_estimators': 725, 'min_data_in_bin': 415, 'max_bin': 112}. Best is trial 3 with value: 0.9733945344739496.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013588 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 648
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:53:52,909] Trial 5 finished with value: 0.9699867685011725 and parameters: {'lambda_l1': 0.0006653767987574282, 'lambda_l2': 0.009422188901528314, 'num_leaves': 133, 'feature_fraction': 0.966756853594488, 'bagging_fraction': 0.7519330243011957, 'bagging_freq': 7, 'min_child_samples': 18, 'max_depth': 7, 'min_data_in_leaf': 17, 'learning_rate': 0.40370006861567825, 'n_estimators': 248, 'min_data_in_bin': 928, 'max_bin': 398}. Best is trial 3 with value: 0.9733945344739496.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 522
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:54:15,820] Trial 6 finished with value: 0.9219636330289652 and parameters: {'lambda_l1': 0.05718852932029385, 'lambda_l2': 0.03419651651296183, 'num_leaves': 227, 'feature_fraction': 0.7742033242333654, 'bagging_fraction': 0.8505654604164024, 'bagging_freq': 3, 'min_child_samples': 30, 'max_depth': 19, 'min_data_in_leaf': 11, 'learning_rate': 0.9651916466769017, 'n_estimators': 697, 'min_data_in_bin': 622, 'max_bin': 174}. Best is trial 3 with value: 0.9733945344739496.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022431 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 678
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:54:48,849] Trial 7 finished with value: 0.9663284555827755 and parameters: {'lambda_l1': 3.510777677785371, 'lambda_l2': 0.00011199772508910995, 'num_leaves': 149, 'feature_fraction': 0.6448820816567687, 'bagging_fraction': 0.5422161881458166, 'bagging_freq': 7, 'min_child_samples': 60, 'max_depth': 5, 'min_data_in_leaf': 14, 'learning_rate': 0.3333784527543752, 'n_estimators': 574, 'min_data_in_bin': 886, 'max_bin': 407}. Best is trial 3 with value: 0.9733945344739496.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 795
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:54:53,183] Trial 8 finished with value: 0.9624015825396618 and parameters: {'lambda_l1': 1.50250649583485, 'lambda_l2': 0.004076064447663881, 'num_leaves': 6, 'feature_fraction': 0.9576623402462567, 'bagging_fraction': 0.8145381505101543, 'bagging_freq': 7, 'min_child_samples': 21, 'max_depth': 7, 'min_data_in_leaf': 19, 'learning_rate': 0.6998499798750012, 'n_estimators': 159, 'min_data_in_bin': 756, 'max_bin': 788}. Best is trial 3 with value: 0.9733945344739496.


[LightGBM] [Info] Number of positive: 28779, number of negative: 887216
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 996
[LightGBM] [Info] Number of data points in the train set: 915995, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031418 -> initscore=-3.428443
[LightGBM] [Info] Start training from score -3.428443


[I 2024-05-25 10:56:21,370] Trial 9 finished with value: 0.9696101293018747 and parameters: {'lambda_l1': 2.02871397043961, 'lambda_l2': 0.025335111948742706, 'num_leaves': 33, 'feature_fraction': 0.4119280803038774, 'bagging_fraction': 0.4157265921266316, 'bagging_freq': 1, 'min_child_samples': 28, 'max_depth': 18, 'min_data_in_leaf': 13, 'learning_rate': 0.5572937588989082, 'n_estimators': 858, 'min_data_in_bin': 125, 'max_bin': 332}. Best is trial 3 with value: 0.9733945344739496.


Best value: 0.9733945344739496
Best parameters: {'lambda_l1': 0.05640256237813216, 'lambda_l2': 7.938707489172784, 'num_leaves': 192, 'feature_fraction': 0.5682663952386431, 'bagging_fraction': 0.8735675970708932, 'bagging_freq': 1, 'min_child_samples': 47, 'max_depth': 19, 'min_data_in_leaf': 9, 'learning_rate': 0.29489758520048526, 'n_estimators': 217, 'min_data_in_bin': 20, 'max_bin': 716}


In [24]:
param = {
#      'device': 'gpu',
     "objective": "binary",
     "metric": "auc",
     "boosting_type": "gbdt",
     'lambda_l1': 0.05640256237813216, 
     'lambda_l2': 7.938707489172784, 
     'num_leaves': 192, 
     'feature_fraction': 0.5682663952386431, 
     'bagging_fraction': 0.8735675970708932, 
     'bagging_freq': 1, 'min_child_samples': 47, 
     'max_depth': 19, 'min_data_in_leaf': 9, 
     'learning_rate': 0.29489758520048526, 
     'n_estimators': 217, 'min_data_in_bin': 20, 
     'max_bin': 716
}

gbm = lgb.LGBMClassifier(**param)
gbm.fit(X, y)
# preds = gbm.predict(X_test)

[LightGBM] [Info] Number of positive: 47994, number of negative: 1478665
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036565 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2148
[LightGBM] [Info] Number of data points in the train set: 1526659, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031437 -> initscore=-3.427819
[LightGBM] [Info] Start training from score -3.427819


In [25]:
accuracy_score(gbm.predict(X), y)



0.9741206123960885

In [26]:
# save model
joblib.dump(gbm, 'model_stage_2_opt.pkl')

['model_stage_2_opt.pkl']

In [None]:
#############################################################################################
# TEST DATA SET
#############################################################################################

### BASE TABLE
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")

### FEATURES
test_feature_set  = pl.concat(
    [
    pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes)
    ],
    how="vertical_relaxed"
)


#############################################################################################
# JOIN TABLES TOGETHER
#############################################################################################
test = test_basetable.join( 
    test_feature_set, how="left", on="case_id"
)

cols_pred = []
for col in X_train.columns:
#     if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)
                
test_static = test[cols_pred].to_pandas()
# test_static = convert_strings(test_static)

numeric_cols = ['deferredmnthsnum_166L', 'interestrategrace_34L']

for col in numeric_cols:
    test_static[col] = test_static[col].astype('float64')

categorical_cols = set(X_train.select_dtypes(include=['category']).columns)

for col in categorical_cols:
    test_static[col] = test_static[col].astype('category')
    

y_pred = gbm.predict(test_static, num_iteration=gbm.best_iteration)
y_pred