In [114]:
!pip install optuna scikit-learn




[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [158]:
import optuna
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
import numpy as np
from sklearn.preprocessing import StandardScaler

Chuẩn bị dữ liệu

In [159]:
df = pd.read_csv(r"C:\Users\ntquy\Downloads\train_data_attrition_scaling.csv")
df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,-0.953774,1,-0.040465,2,-1.013473,2,1,1,1927,3,...,80,1,-0.527297,-0.613546,3,0.011597,0.772856,-0.67611,0.843327,0
1,0.241886,1,-0.878823,1,-0.154619,1,1,1,602,3,...,80,1,-0.399283,0.157319,3,0.175907,-0.33452,-0.67611,0.843327,0
2,0.024494,2,0.968031,1,-0.522699,2,3,1,460,4,...,80,2,-0.399283,-0.613546,2,-0.152713,-0.611364,-0.67611,-0.007719,0
3,-0.192899,2,0.844743,1,1.685783,4,3,1,75,3,...,80,1,-0.91134,0.157319,3,-0.809953,-0.611364,-0.034378,-0.575084,0
4,-0.084203,2,-0.72348,1,-0.768086,3,5,1,397,3,...,80,0,-0.143254,-0.613546,3,0.175907,-1.165051,1.569949,0.843327,0


In [160]:
X = df.drop(['Attrition', 'EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis=1)
y = df['Attrition']               

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [162]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [164]:
def objective(trial):
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)
    if penalty == 'l1':
        solver = trial.suggest_categorical('solver_l1', ['liblinear', 'saga'])
    else:
        solver = trial.suggest_categorical('solver_l2', ['liblinear', 'saga'])
    max_iter = trial.suggest_int('max_iter', 1000, 5000)

    model = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=max_iter,class_weight='balanced', random_state=42)
    f1_scorer = make_scorer(f1_score)
    roc_auc_scorer = make_scorer(roc_auc_score)
    f1_score_cv = cross_val_score(model, X_train, y_train, cv=5, scoring=f1_scorer).mean()
    roc_auc_cv = cross_val_score(model, X_train, y_train, cv=5, scoring=roc_auc_scorer).mean()
    return (f1_score_cv + roc_auc_cv) / 2

In [165]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

In [166]:
print("Best hyperparameters: ", study.best_params)
print("Best combined score: ", study.best_value)

best_params = study.best_params
final_model = LogisticRegression(
    penalty=best_params['penalty'],
    C=best_params['C'],
    solver=best_params['solver_l1'] if best_params['penalty'] == 'l1' else best_params['solver_l2'],
    max_iter=best_params['max_iter'],
    random_state=42
)
final_model.fit(X_train, y_train)

Best hyperparameters:  {'penalty': 'l1', 'C': 0.6996221390986981, 'solver_l1': 'liblinear', 'max_iter': 3089}
Best combined score:  0.8307653424513297


In [167]:
y_pred = final_model.predict(X_test)
f1_test = f1_score(y_test, y_pred)
y_pred_proba = final_model.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_pred_proba)
print("Test F1-score: ", f1_test)
print("Test ROC-AUC: ", roc_auc_test)

Test F1-score:  0.8343949044585988
Test ROC-AUC:  0.8997823107312747
