In [1]:
import os
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

In [None]:
# import file

dir=r".\storage\data"
train=pd.read_csv(os.path.join(dir,"train.csv"))
test=pd.read_csv(os.path.join(dir,"test.csv"))
sub_=pd.read_csv(os.path.join(dir,"sample_submission.csv"))
orig=pd.read_csv(os.path.join(dir,"loan_dataset_20000.csv"))

print("train shape :", train.shape)
print("test shape :",test.shape)
print("orig shape :",orig.shape)
train.head()

train shape : (593994, 13)
test shape : (254569, 12)
orig shape : (20000, 22)


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [3]:
TARGET = 'loan_paid_back'
CATS = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
BASE = [col for col in train.columns if col not in ['id', TARGET]]

In [4]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,KFold
import random
import xgboost as xgb
from xgboost import XGBClassifier

In [5]:
#This took a lot of time to code, but I've already finished and saved it as a  file, 
# which you can directly import to get the mapping.
import joblib
os.makedirs("./storage",exist_ok=True)
# joblib.dump(mapping, "./storage/mapping.joblib")
mapping=joblib.load("./storage/mapping.joblib")

# Establish a baseline model

In [32]:
def cv_train(params,SEED_5=5,SEED=42,cv=5,train=train.copy(),test=test.copy()):
   
    oof_preds=np.zeros(len(train))
    test_preds=np.zeros(len(test))
    random.seed(SEED)
    seeds=random.sample(np.arange(len(train)).tolist(),SEED_5)
    kf=StratifiedKFold(n_splits=cv,random_state=SEED,shuffle=True)
    
    for i,(train_idx,val_idx) in enumerate(kf.split(train,train[TARGET])):
        print(f"< {i+1}/{cv} > is training !")
        (x_train,y_train),(x_val,y_val),x_test=mapping[i]
        x_train[CATS]=x_train[CATS].astype("category")
        x_val[CATS]=x_val[CATS].astype("category")
        x_test[CATS]=x_test[CATS].astype("category")
        
        # 5 seed trainings ‚Äî‚Äî Ensure stable results
        for _ in seeds:
            model=XGBClassifier(**params)
            model.fit(x_train,y_train,
                     eval_set=[(x_train,y_train),(x_val,y_val)],
                     verbose=200)
            val_preds=model.predict_proba(x_val)[:,1]
            preds=model.predict_proba(x_test)[:,1]
            
            oof_preds[val_idx]+=val_preds/len(seeds)
            test_preds+=preds/len(seeds)/cv
        print(f"AUC is {roc_auc_score(y_val,oof_preds[val_idx]):.5f}\n")
    print(f"üî•overall AUC is {roc_auc_score(train[TARGET],oof_preds):.5f}üöÄ")
    return oof_preds,test_preds

In [14]:
def to_submission(oof_preds,test_preds,desc):
    sub_["loan_paid_back"]=test_preds
    sub_.to_csv(f"./ensemble/{desc}_test.csv",index=False)

    df={"id":np.arange(len(oof_preds)),
       "oof_preds":oof_preds}
    df=pd.DataFrame(df)
    df.to_csv(f"./ensemble/{desc}_oof.csv",index=False)

# Optuna optimization

In [15]:
import optuna
import tqdm 

In [16]:
def objective(trial,SEED=42,cv=5):    
    params={
        "learning_rate":trial.suggest_float("learning_rate",1e-3,0.2),
        "colsample_bytree":trial.suggest_float("colsample_bytree",0.6,1),
        "subsample":trial.suggest_float("subsample",0.6,1),
        "seed":42,
        "reg_lambda":trial.suggest_float("reg_lambda",0,5),
        "reg_alpha":trial.suggest_float("reg_alpha",0,30),
        "max_depth":trial.suggest_int("max_depth",3,10),
        "n_estimators":1_000_00,
        "device":"cuda",
        "eval_metric":"auc",
        "objective":"binary:logistic",
        "enable_categorical":True,
        "early_stopping_rounds":50
            }  
    random.seed()  # ‰ΩøÁî®Á≥ªÁªüÊó∂Èó¥Ëá™Âä®ÈöèÊú∫Âåñ
    i=random.choice(range(cv))
    
    (x_train,y_train),(x_val,y_val),_=mapping[i]
    x_train[CATS]=x_train[CATS].astype("category")
    x_val[CATS]=x_val[CATS].astype("category")
    
    model=XGBClassifier(**params)
    model.fit(x_train,y_train,
             eval_set=[(x_val,y_val)],
             verbose=200
             )
    val_preds=model.predict_proba(x_val)[:,1]                
    return roc_auc_score(y_val,val_preds)

In [17]:
def study_start(N_TRIALS,study_stopping_rounds=20):
    pbar=tqdm.tqdm(total=N_TRIALS)
    
    def fixed_params():
        return {
                "seed":42,
                "n_estimators":1_000_00,
                "device":"cuda",
                "eval_metric":"auc",
                "objective":"binary:logistic",
                "enable_categorical":True,
                "early_stopping_rounds":50
            }
        
    def logging_callback(study,trial):
        """
        Display progress bar and log.
        """
        pbar.update(1)
        pbar.set_description(f"<{trial.number+1}/{N_TRIALS} > <best value>{study.best_value}")
   
    def stopping_callbacks(study,trial):
        """
        Set early stop for study
        """
        if trial.number>study.best_trial.number+study_stopping_rounds:
            print("\n\nStudy is Stopping!")
            study.stop()
        else:pass
    
    study=optuna.create_study(direction="maximize",load_if_exists=True,\
                              study_name="Single_XGB",
                             storage="sqlite:///./storage/Single__XGB.db")
    print("Study is starting...\n")
    study.optimize(objective,n_trials=N_TRIALS,callbacks=[logging_callback,stopping_callbacks])
    print(f"<best value> {study.best_value}")

    # The last parameter is the optimal parameter and the fixed value parameter.
    best_params=study.best_params | fixed_params()
    pbar.close()
    return best_params

In [28]:
# xgboost
xgb_best_params=study_start(200)
xgb_best_params

  0%|                                                                                          | 0/200 [00:00<?, ?it/s][I 2025-11-23 14:14:16,775] Using an existing study with name 'Single_XGB' instead of creating a new one.


Study is starting...

[0]	validation_0-auc:0.91022
[200]	validation_0-auc:0.92560
[241]	validation_0-auc:0.92556


[I 2025-11-23 14:14:26,772] Trial 43 finished with value: 0.92560066061866 and parameters: {'learning_rate': 0.08146750058905479, 'colsample_bytree': 0.8480736272227738, 'subsample': 0.729734507346444, 'reg_lambda': 4.619223824415076, 'reg_alpha': 17.807514449112208, 'max_depth': 5}. Best is trial 19 with value: 0.9270874152440687.
<44/200 > <best value>0.9270874152440687:   0%|‚ñè                                       | 1/200 [00:10<33:39, 10.15s/it]



Study is Stopping!
<best value> 0.9270874152440687





{'learning_rate': 0.006067721209091673,
 'colsample_bytree': 0.8447375405555225,
 'subsample': 0.7564771119470342,
 'reg_lambda': 2.271598084701006,
 'reg_alpha': 12.886802028668606,
 'max_depth': 7,
 'seed': 42,
 'n_estimators': 100000,
 'device': 'cuda',
 'eval_metric': 'auc',
 'objective': 'binary:logistic',
 'enable_categorical': True,
 'early_stopping_rounds': 50}

# CV-training

In [30]:
xgb_oof_preds,xgb_test_preds=cv_train(xgb_best_params,SEED_5=3)
to_submission(xgb_oof_preds,xgb_test_preds,"xgb_optuna")

< 1/5 > is training !
[0]	validation_0-auc:0.91882	validation_1-auc:0.91684
[200]	validation_0-auc:0.92469	validation_1-auc:0.92247
[400]	validation_0-auc:0.92619	validation_1-auc:0.92389
[600]	validation_0-auc:0.92740	validation_1-auc:0.92482
[800]	validation_0-auc:0.92827	validation_1-auc:0.92541
[1000]	validation_0-auc:0.92917	validation_1-auc:0.92579
[1200]	validation_0-auc:0.93011	validation_1-auc:0.92601
[1400]	validation_0-auc:0.93104	validation_1-auc:0.92616
[1600]	validation_0-auc:0.93193	validation_1-auc:0.92624
[1800]	validation_0-auc:0.93281	validation_1-auc:0.92629
[2000]	validation_0-auc:0.93365	validation_1-auc:0.92631
[2085]	validation_0-auc:0.93401	validation_1-auc:0.92631


ValueError: Must have at least 1 validation dataset for early stopping.