In [1]:
# Mount Google Drive inside the Colab runtime; later cells read and write
# files under /content/drive/MyDrive.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [3]:
!pip install optuna



In [4]:
import pandas as pd
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [7]:
# Load the attrition training set from the mounted Drive.
train_csv_path = "/content/drive/MyDrive/train_data_attrition_scaling.csv"
df = pd.read_csv(train_csv_path)

In [8]:
# Split the loaded frame into the target column and the feature matrix.
target_col = 'Attrition'
y = df[target_col]
X = df.drop(columns=[target_col])

In [9]:
# Objective function optimised by Optuna
def objective(trial):
    """Score one Optuna trial with the cross-validated F1 of a random forest.

    Samples a random-forest configuration from ``trial``, collects out-of-fold
    class probabilities over a shuffled, stratified 3-fold split of the
    module-level ``X`` and ``y``, thresholds them at 0.5 and returns the
    resulting F1-score.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The value of ``f1_score(y, predictions)`` for this configuration.
    """
    # Keep the suggest calls in this order: it matches the original search space.
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        max_features=max_features,
        criterion=criterion,
        random_state=42,
        n_jobs=-1,
    )
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    oof_proba = cross_val_predict(forest, X, y, cv=folds, method='predict_proba')
    # Assumes column 1 holds the positive class and y is encoded as 0/1 -- TODO confirm.
    positive_proba = oof_proba[:, 1]
    hard_labels = (positive_proba >= 0.5).astype(int)
    return f1_score(y, hard_labels)

In [11]:
# Run the hyperparameter search.
# The sampler is seeded explicitly so that Restart & Run All replays the same
# sequence of trials; without a seed every run can report different best
# parameters. TPESampler is Optuna's default single-objective sampler, so only
# the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [14]:
# Evaluate the best hyperparameters with 5-fold out-of-fold predictions.
# NOTE(review): X and y are the same data the search was tuned on, so the
# metrics derived from these predictions are likely optimistic.
best_params = study.best_params
clf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **best_params,
)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cross_val_predict works on clones of the estimator, so `clf` itself is not
# fitted by this call.
oof_proba = cross_val_predict(clf, X, y, cv=skf, method='predict_proba')
y_proba = oof_proba[:, 1]  # assumes column 1 is the positive class -- TODO confirm
y_pred = (y_proba >= 0.5).astype(int)
print("Best F1-score:", study.best_value)
print("Best parameters:", study.best_params)

Best F1-score: 0.9130998702983139
Best parameters: {'n_estimators': 205, 'max_depth': 15, 'min_samples_split': 3, 'max_features': 'sqrt', 'criterion': 'gini'}


In [13]:
# Evaluation results for the out-of-fold predictions
print("Best parameters:", best_params)
# Each row is (label, metric function, scores passed to it); the metric is
# computed right before it is printed, one row at a time.
for label, metric, scores in (
    ("Precision:", precision_score, y_pred),
    ("Recall:", recall_score, y_pred),
    ("F1-score:", f1_score, y_pred),
    ("ROC AUC:", roc_auc_score, y_proba),
):
    print(label, metric(y, scores))

Best parameters: {'n_estimators': 205, 'max_depth': 15, 'min_samples_split': 3, 'max_features': 'sqrt', 'criterion': 'gini'}
Precision: 0.9308510638297872
Recall: 0.8894536213468869
F1-score: 0.9096816114359974
ROC AUC: 0.9660073397280136


In [15]:
import joblib
# Save the trained model.
# `clf` has only been passed to cross_val_predict, which fits clones and leaves
# the original estimator unfitted. Fit it on the full data set first, otherwise
# the pickle contains a model that cannot predict.
clf.fit(X, y)
joblib.dump(clf, '/content/drive/MyDrive/best_random_forest_model.pkl')


['/content/drive/MyDrive/best_random_forest_model.pkl']