In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import optuna
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, classification_report
optuna.logging.set_verbosity(optuna.logging.ERROR)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle

from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import warnings
import random



In [2]:
data = pd.read_csv('C:/Users/HP/Pictures/githubhoanchinh/data/processed/processed_scaling/train_data_attrition_scaling.csv')  # Đọc dữ liệu
X = data.drop('Attrition', axis=1)     
y = data['Attrition']                   

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
print(y.value_counts())

Attrition
0    787
1    787
Name: count, dtype: int64


 Chú thích: 1: Yes, 0: No

# Tiến hành chọn hyperpara và training XGBoost model

In [None]:

def objective_xgb(trial):

    params = {
        'classifier__max_depth': trial.suggest_int('max_depth', 3, 15),
        'classifier__learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'classifier__n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'classifier__subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'classifier__colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'classifier__gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'classifier__min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'classifier__random_state': 42
    }


    scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
    params['classifier__scale_pos_weight'] = scale_pos_weight if scale_pos_weight > 1 else 1


    pipeline = Pipeline([
      
        ('classifier', XGBClassifier(eval_metric='logloss'))  
    ])


    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='f1', error_score='raise')
    f1_mean = cv_scores.mean()

  
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]


    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('roc_auc', roc_auc)

    return f1_mean


study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=50)

best_trial_xgb = study_xgb.best_trial
print("Best parameters for XGBoost:")
print(best_trial_xgb.params)
print(f"Best cross-validated F1 Score: {best_trial_xgb.value:.4f}")
print(f"Recall on test set: {best_trial_xgb.user_attrs['recall']:.4f}")
print(f"Precision on test set: {best_trial_xgb.user_attrs['precision']:.4f}")
print(f"ROC AUC Score on test set: {best_trial_xgb.user_attrs['roc_auc']:.4f}")


best_params_xgb = best_trial_xgb.params
pipeline_xgb = Pipeline([

    ('classifier', XGBClassifier(**{k.replace('classifier__', ''): v for k, v in best_params_xgb.items()}, eval_metric='logloss'))
])
pipeline_xgb.fit(X_train, y_train)


y_pred = pipeline_xgb.predict(X_test)
y_pred_proba = pipeline_xgb.predict_proba(X_test)[:, 1]
print("\nFinal evaluation on test set:")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")


print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, digits=3))  


Best parameters for XGBoost:
{'max_depth': 8, 'learning_rate': 0.0574677910125613, 'n_estimators': 412, 'subsample': 0.81383944860593, 'colsample_bytree': 0.6033357329722178, 'gamma': 0.010177352273286784, 'min_child_weight': 7.2045708385481655}
Best cross-validated F1 Score: 0.8935
Recall on test set: 0.8968
Precision on test set: 0.9267
ROC AUC Score on test set: 0.9663

Final evaluation on test set:
F1 Score: 0.8963
Recall: 0.8645
Precision: 0.9306
ROC AUC Score: 0.9654

Detailed Classification Report:
              precision    recall  f1-score   support

           0      0.877     0.938     0.906       160
           1      0.931     0.865     0.896       155

    accuracy                          0.902       315
   macro avg      0.904     0.901     0.901       315
weighted avg      0.903     0.902     0.901       315



In [5]:
best_params_xgb = best_trial_xgb.params
pipeline_xgb = Pipeline([
    ('classifier', XGBClassifier(**{k.replace('classifier__', ''): v for k, v in best_params_xgb.items()}, eval_metric='logloss'))
])
pipeline_xgb.fit(X, y)

with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(pipeline_xgb, f)
print("\nModel saved as 'xgboost_model.pkl'")


Model saved as 'xgboost_model.pkl'


# Tiến hành tìm hyperpara và training CatBoost model

In [None]:
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

optuna.logging.set_verbosity(optuna.logging.ERROR)  
def objective_cb(trial):
    params = {
        'classifier__iterations': trial.suggest_int('iterations', 50, 500),
        'classifier__depth': trial.suggest_int('depth', 4, 10),
        'classifier__learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'classifier__l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 1.0, log=True),
        'classifier__border_count': trial.suggest_int('border_count', 32, 128),
        'classifier__bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'classifier__random_strength': trial.suggest_float('random_strength', 1e-2, 1.0, log=True),
        'classifier__random_seed': 42,
        'classifier__verbose': 0  
    }

    scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
    params['classifier__auto_class_weights'] = 'Balanced' if scale_pos_weight > 1 else None

 
    pipeline = Pipeline([
        ('classifier', CatBoostClassifier(**{k.replace('classifier__', ''): v for k, v in params.items() if k != 'classifier__verbose'}, verbose=0))
    ])

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random.randint(0, 100))
    try:
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='f1', error_score='raise')
        f1_mean = cv_scores.mean()
    except ValueError as e:
        raise ValueError(f"Error in cross-validation: {e}. Check if X_train and y_train are compatible.")

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]


    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('roc_auc', roc_auc)

    return f1_mean


random_seed = random.randint(0, 1000)
sampler = optuna.samplers.TPESampler(seed=random_seed)
study = optuna.create_study(direction='maximize', sampler=sampler)
try:
    study.optimize(objective_cb, n_trials=50)
except Exception as e:
    raise Exception(f"Optimization failed: {e}")


best_trial = study.best_trial
print("Best parameters for CatBoost:")
print(best_trial.params)
print(f"Best cross-validated F1 Score: {best_trial.value:.4f}")
print(f"Recall on test set: {best_trial.user_attrs['recall']:.4f}")
print(f"Precision on test set: {best_trial.user_attrs['precision']:.4f}")
print(f"ROC AUC Score on test set: {best_trial.user_attrs['roc_auc']:.4f}")


best_params = best_trial.params
pipeline = Pipeline([
    ('classifier', CatBoostClassifier(**{k.replace('classifier__', ''): v for k, v in best_params.items()}, verbose=0))
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
print("\nFinal evaluation on test set:")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")


print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, digits=3))

Best parameters for CatBoost:
{'iterations': 475, 'depth': 6, 'learning_rate': 0.02799727623735741, 'l2_leaf_reg': 0.019697363695872543, 'border_count': 116, 'bagging_temperature': 0.18398533764527536, 'random_strength': 0.13685399168008192}
Best cross-validated F1 Score: 0.9239
Recall on test set: 0.9032
Precision on test set: 0.9459
ROC AUC Score on test set: 0.9734

Final evaluation on test set:
F1 Score: 0.9262
Recall: 0.8903
Precision: 0.9650
ROC AUC Score: 0.9750

Detailed Classification Report:
              precision    recall  f1-score   support

           0      0.901     0.969     0.934       160
           1      0.965     0.890     0.926       155

    accuracy                          0.930       315
   macro avg      0.933     0.930     0.930       315
weighted avg      0.933     0.930     0.930       315



In [7]:
best_params = best_trial.params
pipeline_catb = Pipeline([
    ('classifier', CatBoostClassifier(**{k.replace('classifier__', ''): v for k, v in best_params.items()}, verbose=0))
])
pipeline_catb.fit(X, y)

with open('catboost_model.pkl', 'wb') as f:
    pickle.dump(pipeline_catb, f)
print("\nModel saved as 'catboost_model.pkl'")


Model saved as 'catboost_model.pkl'


# Tiến hành tìm hyperpara và training LightGBM model

In [8]:
def objective_lgb(trial):
    params = {
        'classifier__num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'classifier__max_depth': trial.suggest_int('max_depth', 3, 15),
        'classifier__learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'classifier__n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'classifier__min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'classifier__subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'classifier__colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'classifier__reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'classifier__reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'classifier__random_state': 42,
        'classifier__verbose': -1  
    }

   
    scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
    params['classifier__scale_pos_weight'] = scale_pos_weight if scale_pos_weight > 1 else 1

   
    pipeline = Pipeline([
        ('classifier', LGBMClassifier())
    ])

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='f1', error_score='raise')
    f1_mean = cv_scores.mean()

  
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

  
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

  
    trial.set_user_attr('recall', recall)
    trial.set_user_attr('precision', precision)
    trial.set_user_attr('roc_auc', roc_auc)

    return f1_mean

study = optuna.create_study(direction='maximize')
study.optimize(objective_lgb, n_trials=50)

best_trial = study.best_trial
print("Best parameters for LightGBM:")
print(best_trial.params)
print(f"Best cross-validated F1 Score: {best_trial.value:.4f}")
print(f"Recall on test set: {best_trial.user_attrs['recall']:.4f}")
print(f"Precision on test set: {best_trial.user_attrs['precision']:.4f}")
print(f"ROC AUC Score on test set: {best_trial.user_attrs['roc_auc']:.4f}")

best_params = best_trial.params
pipeline = Pipeline([
    ('classifier', LGBMClassifier(**{k.replace('classifier__', ''): v for k, v in best_params.items()}, verbose=-1))
])
pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
print("\nFinal evaluation on test set:")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, digits=3))

[LightGBM] [Info] Number of positive: 421, number of negative: 418
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000906 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3407
[LightGBM] [Info] Number of data points in the train set: 839, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501788 -> initscore=0.007151
[LightGBM] [Info] Start training from score 0.007151
[LightGBM] [Info] Number of positive: 421, number of negative: 418
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3386
[LightGBM] [Info] Number of data points in the train set: 839, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501788 -> initscore=0.007151
[LightGBM] [Info]

In [9]:
best_params = best_trial.params
pipeline_lightgbm = Pipeline([
    ('classifier', LGBMClassifier(**{k.replace('classifier__', ''): v for k, v in best_params.items()}, verbose=-1))
])
pipeline_lightgbm.fit(X, y)

with open('lightgbm_model.pkl', 'wb') as f:
    pickle.dump(pipeline_lightgbm, f)
print("\nModel saved as 'lightgbm_model.pkl'")


Model saved as 'lightgbm_model.pkl'
