In [None]:
import os
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

In [None]:
# import file

dir=r".\storage\data"
train=pd.read_csv(os.path.join(dir,"train.csv"))
test=pd.read_csv(os.path.join(dir,"test.csv"))
sub_=pd.read_csv(os.path.join(dir,"sample_submission.csv"))
orig=pd.read_csv(os.path.join(dir,"loan_dataset_20000.csv"))

print("train shape :", train.shape)
print("test shape :",test.shape)
print("orig shape :",orig.shape)
train.head()

train shape : (593994, 13)
test shape : (254569, 12)
orig shape : (20000, 22)


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [26]:
TARGET = 'loan_paid_back'
CATS = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
BASE = [col for col in train.columns if col not in ['id', TARGET]]

In [27]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,KFold
import random
import catboost as cat
from catboost import CatBoostClassifier

In [28]:
#This took a lot of time to code, but I've already finished and saved it as a  file, 
# which you can directly import to get the mapping.
import joblib
os.makedirs("./storage",exist_ok=True)
# joblib.dump(mapping, "./storage/mapping.joblib")
mapping=joblib.load("./storage/mapping.joblib")

# Establish a baseline model

In [36]:
def cv_train(params,SEED_5=5,SEED=42,cv=5,train=train.copy(),test=test.copy()):
   
    oof_preds=np.zeros(len(train))
    test_preds=np.zeros(len(test))
    random.seed(SEED)
    seeds=random.sample(np.arange(len(train)).tolist(),SEED_5)
    kf=StratifiedKFold(n_splits=cv,random_state=SEED,shuffle=True)
    
    for i,(train_idx,val_idx) in enumerate(kf.split(train,train[TARGET])):
        print(f"< {i+1}/{cv} > is training !")
        (x_train,y_train),(x_val,y_val),x_test=mapping[i]
        x_train[CATS]=x_train[CATS].astype("category")
        x_val[CATS]=x_val[CATS].astype("category")
        x_test[CATS]=x_test[CATS].astype("category")

        # 5 seed trainings ‚Äî‚Äî Ensure stable results
        for _ in seeds:
            model=CatBoostClassifier(**params)
            model.fit(x_train,y_train,
                     eval_set=[(x_val,y_val)],
                     verbose=200,
                     cat_features=CATS)
            val_preds=model.predict_proba(x_val)[:,1]
            preds=model.predict_proba(x_test)[:,1]
            
            oof_preds[val_idx]+=val_preds/len(seeds)
            test_preds+=preds/len(seeds)/cv
        print(f"AUC is {roc_auc_score(y_val,oof_preds[val_idx]):.5f}\n")
    print(f"üî•overall AUC is {roc_auc_score(train[TARGET],oof_preds):.5f}üöÄ")
    return oof_preds,test_preds

In [30]:
def to_submission(oof_preds,test_preds,desc):
    sub_["loan_paid_back"]=test_preds
    sub_.to_csv(f"./ensemble/{desc}_test.csv",index=False)

    df={"id":np.arange(len(oof_preds)),
       "oof_preds":oof_preds}
    df=pd.DataFrame(df)
    df.to_csv(f"./ensemble/{desc}_oof.csv",index=False)

In [31]:
import optuna
import tqdm 

In [32]:
def objective(trial,SEED=42,cv=5):    
    params={
        "learning_rate":trial.suggest_float("learning_rate",1e-3,0.2),
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "iterations":3000,
        "task_type":"GPU",
        "use_best_model":True,
        "train_dir":"Single_CAT",
        "depth":trial.suggest_int("depth",3,7),
        "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1,10),
        "subsample":0.8,
        "random_seed":42,
        "early_stopping_rounds":50,
        "bootstrap_type":"Bernoulli"
        }
    random.seed()  # ‰ΩøÁî®Á≥ªÁªüÊó∂Èó¥Ëá™Âä®ÈöèÊú∫Âåñ
    i=random.choice(range(cv))
    
    (x_train,y_train),(x_val,y_val),_=mapping[i]
    x_train[CATS]=x_train[CATS].astype("category")
    x_val[CATS]=x_val[CATS].astype("category")
    
    model=CatBoostClassifier(**params)
    model.fit(x_train,y_train,
             eval_set=[(x_val,y_val)],
              cat_features=CATS,
             verbose=200)
    val_preds=model.predict_proba(x_val)[:,1]                
    return roc_auc_score(y_val,val_preds)

In [33]:
def study_start(N_TRIALS,study_stopping_rounds=20):
    pbar=tqdm.tqdm(total=N_TRIALS)
    
    def fixed_params():
        return {
                "loss_function": "Logloss",
                "eval_metric": "AUC",
                "iterations":3000,
                "task_type":"GPU",
                "use_best_model":True,
                "train_dir":"Single_CAT",
                "subsample":0.8,
                "random_seed":42,
                "early_stopping_rounds":50,
                "bootstrap_type":"Bernoulli"
            }
        
    def logging_callback(study,trial):
        """
        Display progress bar and log.
        """
        pbar.update(1)
        pbar.set_description(f"<{trial.number+1}/{N_TRIALS} > <best value>{study.best_value}")
   
    def stopping_callbacks(study,trial):
        """
        Set early stop for study
        """
        if trial.number>study.best_trial.number+study_stopping_rounds:
            print("\n\nStudy is Stopping!")
            study.stop()
        else:pass
    
    study=optuna.create_study(direction="maximize",load_if_exists=True,\
                              study_name="Single_CAT",
                             storage="sqlite:///./storage/Single_CAT.db")
    print("Study is starting...\n")
    study.optimize(objective,n_trials=N_TRIALS,callbacks=[logging_callback,stopping_callbacks])
    print(f"<best value> {study.best_value}")

    # The last parameter is the optimal parameter and the fixed value parameter.
    best_params=study.best_params | fixed_params()
    pbar.close()
    return best_params

In [34]:
# catboost
cat_best_params=study_start(200)
cat_best_params


  0%|                                                                                          | 0/200 [00:00<?, ?it/s][A[I 2025-11-23 14:10:15,431] Using an existing study with name 'Single_CAT' instead of creating a new one.


Study is starting...



Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.9061343	best: 0.9061343 (0)	total: 152ms	remaining: 7m 35s
200:	test: 0.9204969	best: 0.9204983 (199)	total: 9.66s	remaining: 2m 14s
400:	test: 0.9231457	best: 0.9231457 (400)	total: 19.2s	remaining: 2m 4s
600:	test: 0.9242307	best: 0.9242307 (600)	total: 28.7s	remaining: 1m 54s
800:	test: 0.9247716	best: 0.9247716 (800)	total: 38.2s	remaining: 1m 44s
1000:	test: 0.9251025	best: 0.9251025 (1000)	total: 47.9s	remaining: 1m 35s
1200:	test: 0.9253233	best: 0.9253233 (1200)	total: 57.3s	remaining: 1m 25s
1400:	test: 0.9254726	best: 0.9254726 (1400)	total: 1m 7s	remaining: 1m 16s
1600:	test: 0.9255968	best: 0.9255968 (1600)	total: 1m 16s	remaining: 1m 7s
1800:	test: 0.9256797	best: 0.9256797 (1792)	total: 1m 26s	remaining: 57.4s
bestTest = 0.9256971478
bestIteration = 1865
Shrink model to first 1866 iterations.


[I 2025-11-23 14:11:51,019] Trial 40 finished with value: 0.9256971236673952 and parameters: {'learning_rate': 0.0074111637983533185, 'depth': 5, 'l2_leaf_reg': 4.826983050886218}. Best is trial 13 with value: 0.926742904513193.

  0%|‚ñç                                                                               | 1/200 [01:35<5:18:06, 95.91s/it][A
<41/200 > <best value>0.926742904513193:   0%|‚ñè                                      | 1/200 [01:35<5:18:11, 95.94s/it][A



Study is Stopping!
<best value> 0.926742904513193





{'learning_rate': 0.009208019189984352,
 'depth': 5,
 'l2_leaf_reg': 5.839362810867411,
 'loss_function': 'Logloss',
 'eval_metric': 'AUC',
 'iterations': 3000,
 'task_type': 'GPU',
 'use_best_model': True,
 'train_dir': 'Single_CAT',
 'subsample': 0.8,
 'random_seed': 42,
 'early_stopping_rounds': 50,
 'bootstrap_type': 'Bernoulli'}

In [35]:
cat_oof_preds,cat_test_preds=cv_train(cat_best_params)
to_submission(cat_oof_preds,cat_test_preds,"cat_optuna")

< 1/5 > is training !


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.9061343	best: 0.9061343 (0)	total: 75.5ms	remaining: 3m 46s
200:	test: 0.9212995	best: 0.9212995 (200)	total: 10.1s	remaining: 2m 20s
400:	test: 0.9237339	best: 0.9237339 (400)	total: 20.5s	remaining: 2m 12s
600:	test: 0.9245930	best: 0.9245930 (600)	total: 30.5s	remaining: 2m 1s
800:	test: 0.9250381	best: 0.9250381 (800)	total: 40.3s	remaining: 1m 50s
1000:	test: 0.9253111	best: 0.9253111 (1000)	total: 49.7s	remaining: 1m 39s
1200:	test: 0.9254887	best: 0.9254887 (1200)	total: 58.9s	remaining: 1m 28s
1400:	test: 0.9256190	best: 0.9256190 (1400)	total: 1m 8s	remaining: 1m 18s
bestTest = 0.9256581068
bestIteration = 1525
Shrink model to first 1526 iterations.


CatBoostError: To employ param {'use_best_model': True} provide non-empty 'eval_set'.