In [None]:
!pip install optuna catboost lightgbm xgboost 

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import optuna
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score



In [None]:
data = pd.read_csv('C:/Users/HP/Pictures/githubhoanchinh/data/processed/processed_scaling/train_data_attrition_scaling.csv')  # Đọc dữ liệu
X = data.drop('Attrition', axis=1)     
y = data['Attrition']                   

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print(y.value_counts())

Attrition
0    787
1    787
Name: count, dtype: int64


 Chú thích: 1: Yes, 0: No

# Tiến hành chọn hyperpara và training XGBoost model

In [21]:
def objective_xgb(trial):

    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'random_state': 42
    }
    

    scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
    params['scale_pos_weight'] = scale_pos_weight if scale_pos_weight > 1 else 1
    

    model = XGBClassifier(**params, eval_metric='logloss')
    
 
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    f1_mean = cv_scores.mean()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  
    
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
   
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('roc_auc', roc_auc)
    

    return f1_mean


study = optuna.create_study(direction='maximize')  
study.optimize(objective_xgb, n_trials=50) 

best_trial = study.best_trial
print("Best parameters for XGBoost:")
print(best_trial.params)
print(f"Best cross-validated F1 Score: {best_trial.value:.4f}")
print(f"Recall on test set: {best_trial.user_attrs['recall']:.4f}")
print(f"Precision on test set: {best_trial.user_attrs['precision']:.4f}")
print(f"ROC AUC Score on test set: {best_trial.user_attrs['roc_auc']:.4f}")

best_model = XGBClassifier(**best_trial.params, eval_metric='logloss')
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
print("\nFinal evaluation on test set:")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

[I 2025-06-12 23:40:57,207] A new study created in memory with name: no-name-baad6af0-0b5b-4087-9117-13e6a21035a1
[I 2025-06-12 23:41:01,529] Trial 0 finished with value: 0.8914400858698481 and parameters: {'max_depth': 4, 'learning_rate': 0.1567138171370458, 'n_estimators': 926, 'subsample': 0.6697550033904569, 'colsample_bytree': 0.5442989656155314, 'gamma': 2.672356167915623e-05, 'min_child_weight': 3.682583338972477}. Best is trial 0 with value: 0.8914400858698481.
[I 2025-06-12 23:41:04,885] Trial 1 finished with value: 0.8118984056907831 and parameters: {'max_depth': 14, 'learning_rate': 0.002299116424334187, 'n_estimators': 333, 'subsample': 0.6382220696912337, 'colsample_bytree': 0.864289602752361, 'gamma': 3.4421514558285297e-07, 'min_child_weight': 6.167706108729625}. Best is trial 0 with value: 0.8914400858698481.
[I 2025-06-12 23:41:08,425] Trial 2 finished with value: 0.8910456457005134 and parameters: {'max_depth': 11, 'learning_rate': 0.2767003676788397, 'n_estimators': 

Best parameters for XGBoost:
{'max_depth': 4, 'learning_rate': 0.22425354543748605, 'n_estimators': 703, 'subsample': 0.8149768225762151, 'colsample_bytree': 0.5962275923041719, 'gamma': 0.0022448401042904783, 'min_child_weight': 2.375640809749931}
Best cross-validated F1 Score: 0.9037
Recall on test set: 0.9161
Precision on test set: 0.9467
ROC AUC Score on test set: 0.9702

Final evaluation on test set:
F1 Score: 0.9139
Recall: 0.8903
Precision: 0.9388
ROC AUC Score: 0.9672


# Tiến hành tìm hyperpara và training CatBoost model

In [22]:

def objective_cb(trial):

    params = {
        'iterations': trial.suggest_int('iterations', 50, 500),  
        'depth': trial.suggest_int('depth', 4, 10),            
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),  
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 1.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 128),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-2, 1.0, log=True),
        'random_seed': 42,
        'verbose': 0
    }
    
  
    scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
    params['auto_class_weights'] = 'Balanced' if scale_pos_weight > 1 else None
    

    model = CatBoostClassifier(**params)
    
  
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    f1_mean = cv_scores.mean()
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('roc_auc', roc_auc)
    
  
    return f1_mean


study = optuna.create_study(direction='maximize')  
study.optimize(objective_cb, n_trials=50)  


best_trial = study.best_trial
print("Best parameters for CatBoost:")
print(best_trial.params)
print(f"Best cross-validated F1 Score: {best_trial.value:.4f}")
print(f"Recall on test set: {best_trial.user_attrs['recall']:.4f}")
print(f"Precision on test set: {best_trial.user_attrs['precision']:.4f}")
print(f"ROC AUC Score on test set: {best_trial.user_attrs['roc_auc']:.4f}")


best_model = CatBoostClassifier(**best_trial.params, verbose=0)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
print("\nFinal evaluation on test set:")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

[I 2025-06-12 23:44:51,475] A new study created in memory with name: no-name-bb800625-3396-4843-8660-000341517c2e


[I 2025-06-12 23:45:18,130] Trial 0 finished with value: 0.9045174348968429 and parameters: {'iterations': 483, 'depth': 10, 'learning_rate': 0.015554769122399913, 'l2_leaf_reg': 0.03406713463849547, 'border_count': 62, 'bagging_temperature': 0.36691527192511564, 'random_strength': 0.4180989334532747}. Best is trial 0 with value: 0.9045174348968429.
[I 2025-06-12 23:45:40,813] Trial 1 finished with value: 0.8926356601253297 and parameters: {'iterations': 457, 'depth': 9, 'learning_rate': 0.00537407430804316, 'l2_leaf_reg': 0.011178764098865021, 'border_count': 115, 'bagging_temperature': 0.0273566479377515, 'random_strength': 0.3875843687184475}. Best is trial 0 with value: 0.9045174348968429.
[I 2025-06-12 23:45:41,808] Trial 2 finished with value: 0.8615557155254193 and parameters: {'iterations': 60, 'depth': 6, 'learning_rate': 0.029253892058671016, 'l2_leaf_reg': 0.7389061003128428, 'border_count': 102, 'bagging_temperature': 0.08616084243967692, 'random_strength': 0.15866295474439

Best parameters for CatBoost:
{'iterations': 262, 'depth': 7, 'learning_rate': 0.08236791062507032, 'l2_leaf_reg': 0.028866546100752618, 'border_count': 88, 'bagging_temperature': 0.023709498305707677, 'random_strength': 0.36576988182805475}
Best cross-validated F1 Score: 0.9151
Recall on test set: 0.9290
Precision on test set: 0.9474
ROC AUC Score on test set: 0.9773

Final evaluation on test set:
F1 Score: 0.9311
Recall: 0.9161
Precision: 0.9467
ROC AUC Score: 0.9784


# Tiến hành tìm hyperpara và training LightGBM model

In [23]:
def objective_lgbm(trial):
    
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'random_state': 42,
        'verbose':-1
    }
    
 
    scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
    params['scale_pos_weight'] = scale_pos_weight if scale_pos_weight > 1 else 1
    

    model = LGBMClassifier(**params)
    
 
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    f1_mean = cv_scores.mean()
    
  
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  

    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('roc_auc', roc_auc)
    
    return f1_mean


study = optuna.create_study(direction='maximize') 
study.optimize(objective_lgbm, n_trials=50) 


best_trial = study.best_trial
print("Best parameters for LightGBM:")
print(best_trial.params)
print(f"Best cross-validated F1 Score: {best_trial.value:.4f}")
print(f"Recall on test set: {best_trial.user_attrs['recall']:.4f}")
print(f"Precision on test set: {best_trial.user_attrs['precision']:.4f}")
print(f"ROC AUC Score on test set: {best_trial.user_attrs['roc_auc']:.4f}")

best_model = LGBMClassifier(**best_trial.params)
best_model.fit(X_train, y_train)


y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
print("\nFinal evaluation on test set:")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

[I 2025-06-12 23:51:08,338] A new study created in memory with name: no-name-886a1e9f-d90c-4142-9da3-b8da9d3804cb
[I 2025-06-12 23:51:12,315] Trial 0 finished with value: 0.8919451554853588 and parameters: {'num_leaves': 31, 'max_depth': 13, 'learning_rate': 0.10413235301383492, 'n_estimators': 902, 'min_child_samples': 87, 'subsample': 0.586406119140335, 'colsample_bytree': 0.7866961180479204, 'reg_alpha': 6.252288002275009e-07, 'reg_lambda': 0.0254271440659008}. Best is trial 0 with value: 0.8919451554853588.
[I 2025-06-12 23:51:13,098] Trial 1 finished with value: 0.808877111083046 and parameters: {'num_leaves': 143, 'max_depth': 15, 'learning_rate': 0.0011890411251360149, 'n_estimators': 221, 'min_child_samples': 62, 'subsample': 0.723792966418547, 'colsample_bytree': 0.60183550838895, 'reg_alpha': 4.356573142049589e-07, 'reg_lambda': 4.6026991780377635}. Best is trial 0 with value: 0.8919451554853588.
[I 2025-06-12 23:51:14,837] Trial 2 finished with value: 0.7977080457810171 and 

Best parameters for LightGBM:
{'num_leaves': 79, 'max_depth': 5, 'learning_rate': 0.04192873391907352, 'n_estimators': 489, 'min_child_samples': 5, 'subsample': 0.8327911182602638, 'colsample_bytree': 0.5018194336436865, 'reg_alpha': 0.011666032820805907, 'reg_lambda': 2.064144481460455e-07}
Best cross-validated F1 Score: 0.9199
Recall on test set: 0.9032
Precision on test set: 0.9524
ROC AUC Score on test set: 0.9736

Final evaluation on test set:
F1 Score: 0.9302
Recall: 0.9032
Precision: 0.9589
ROC AUC Score: 0.9725
