In [3]:
# Install Optuna (used below for the hyperparameter search) into the running
# kernel's environment; -q keeps pip's output quiet.
%pip install optuna -q

In [12]:
import optuna
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import warnings
optuna.logging.set_verbosity(optuna.logging.ERROR)
warnings.filterwarnings('ignore', category=FutureWarning)
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report
import logging
import pickle

In [6]:
# Location of the attrition training table (the file name suggests the numeric
# columns were already scaled upstream -- TODO confirm).
TRAIN_CSV_PATH = '/content/train_data_attrition_scaling.csv'

# Load the table and preview its first rows as the cell output.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,-0.953774,1,-0.040465,2,-1.013473,2,1,1,1927,3,...,80,1,-0.527297,-0.613546,3,0.011597,0.772856,-0.67611,0.843327,0
1,0.241886,1,-0.878823,1,-0.154619,1,1,1,602,3,...,80,1,-0.399283,0.157319,3,0.175907,-0.33452,-0.67611,0.843327,0
2,0.024494,2,0.968031,1,-0.522699,2,3,1,460,4,...,80,2,-0.399283,-0.613546,2,-0.152713,-0.611364,-0.67611,-0.007719,0
3,-0.192899,2,0.844743,1,1.685783,4,3,1,75,3,...,80,1,-0.91134,0.157319,3,-0.809953,-0.611364,-0.034378,-0.575084,0
4,-0.084203,2,-0.72348,1,-0.768086,3,5,1,397,3,...,80,0,-0.143254,-0.613546,3,0.175907,-1.165051,1.569949,0.843327,0


In [11]:
# Route INFO-level messages to the notebook output so the per-trial metrics
# logged by the search are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Split the frame into features and the 'Attrition' target.
# 'Unnamed: 0' is the row index that was written into the CSV (pandas gives that
# name to a header-less first column); it carries no information about an
# employee and must not be used as a feature, so it is dropped when present.
# NOTE(review): 'EmployeeNumber' looks like an identifier column -- confirm
# whether it should remain a feature.
X = df.drop(columns=['Attrition']).drop(columns=['Unnamed: 0'], errors='ignore')
y = df['Attrition']

# Stratified 70/30 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
def objective_svc(trial):
    """Optuna objective for a StandardScaler + SVC pipeline.

    Samples ``C`` (log-uniform in [1e-2, 1e2]), ``kernel`` ('linear' or 'rbf')
    and, for the RBF kernel only, ``gamma`` (log-uniform in [1e-3, 1]).
    The value returned to Optuna is the mean F1 over a 5-fold stratified
    cross-validation on ``X_train`` / ``y_train``.

    As a side effect the pipeline is refit on the full training split and its
    recall, precision and ROC AUC on ``X_test`` / ``y_test`` are stored on the
    trial as user attributes ``'recall'``, ``'precision'`` and ``'roc_auc'``.
    These are reporting-only and do not influence the returned score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyperparameters and to store metrics.

    Returns
    -------
    float
        Mean cross-validated F1, or ``0.0`` if anything in the trial raised
        (the exception is logged with its traceback).
    """
    try:
        # Sample each hyperparameter exactly once and reuse the value.
        c_value = trial.suggest_float('C', 1e-2, 1e2, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
        # gamma is only searched for the RBF kernel; otherwise keep 'scale'.
        gamma = trial.suggest_float('gamma', 1e-3, 1, log=True) if kernel == 'rbf' else 'scale'

        # Use balanced class weights only when negatives outnumber positives.
        # Comparing counts directly avoids dividing by a zero positive count.
        n_neg = int(np.sum(y_train == 0))
        n_pos = int(np.sum(y_train == 1))
        class_weight = 'balanced' if n_neg > n_pos else None

        # probability is left at its default (False): F1 scoring only needs
        # predict(), and enabling probability estimates makes every fit run an
        # additional internal cross-validation.
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('svc', SVC(
                C=c_value,
                kernel=kernel,
                gamma=gamma,
                class_weight=class_weight,
                random_state=42,
            )),
        ])

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='f1', error_score='raise')
        f1_mean = cv_scores.mean()

        # Reporting-only hold-out metrics for this configuration.
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        # ROC AUC depends only on the ranking of the scores, so the signed
        # distance to the decision boundary can be used instead of calibrated
        # probabilities (assumes a binary target, as the original
        # predict_proba(...)[:, 1] usage did).
        y_score = pipeline.decision_function(X_test)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_score)
        trial.set_user_attr('recall', recall)
        trial.set_user_attr('precision', precision)
        trial.set_user_attr('roc_auc', roc_auc)
        logger.info(f"Thử nghiệm {trial.number}: F1={f1_mean:.4f}, Recall={recall:.4f}, Precision={precision:.4f}, ROC AUC={roc_auc:.4f}")
        return f1_mean
    except Exception as e:
        # Best-effort: a failing configuration is scored 0.0 so the study keeps
        # running; logger.exception keeps the traceback for debugging.
        logger.exception(f"Thử nghiệm {trial.number} thất bại: {str(e)}")
        return 0.0
# Run the hyperparameter search. The sampler is seeded so that re-running the
# notebook explores the same sequence of trials and reports the same best
# parameters (TPE is Optuna's default single-objective sampler; only the seed
# is added here).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_svc, n_trials=50)

# Report the best trial: its sampled parameters, its cross-validated F1 (the
# objective value) and the hold-out metrics that the objective stored on it.
best_trial = study.best_trial
print("Tham số tốt nhất cho SVC:")
print(best_trial.params)
print(f"F1 Score cross-validated tốt nhất: {best_trial.value:.4f}")
print(f"Recall trên tập test: {best_trial.user_attrs['recall']:.4f}")
print(f"Precision trên tập test: {best_trial.user_attrs['precision']:.4f}")
print(f"ROC AUC Score trên tập test: {best_trial.user_attrs['roc_auc']:.4f}")

# Rebuild the pipeline with the best parameters. 'gamma' is only present in
# best_params when the RBF kernel was selected, hence the 'scale' fallback.
best_params = best_trial.params
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(
        C=best_params['C'],
        kernel=best_params['kernel'],
        gamma=best_params.get('gamma', 'scale'),
        class_weight='balanced' if np.sum(y_train == 0) / np.sum(y_train == 1) > 1 else None,
        probability=True,
        random_state=42
    ))
])

# Fit on the training split and evaluate once on the hold-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
print("\nĐánh giá cuối cùng trên tập kiểm tra:")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
print("\nBáo cáo phân loại chi tiết:")
print(classification_report(y_test, y_pred, digits=3))

Tham số tốt nhất cho SVC:
{'C': 35.454842884517525, 'kernel': 'rbf', 'gamma': 0.07607589059120864}
F1 Score cross-validated tốt nhất: 0.9244
Recall trên tập test: 0.9280
Precision trên tập test: 0.9280
ROC AUC Score trên tập test: 0.9823

Đánh giá cuối cùng trên tập kiểm tra:
F1 Score: 0.9280
Recall: 0.9280
Precision: 0.9280
ROC AUC Score: 0.9823

Báo cáo phân loại chi tiết:
              precision    recall  f1-score   support

           0      0.928     0.928     0.928       237
           1      0.928     0.928     0.928       236

    accuracy                          0.928       473
   macro avg      0.928     0.928     0.928       473
weighted avg      0.928     0.928     0.928       473



In [13]:
# Refit the tuned configuration on the complete dataset (training and hold-out
# rows together) and persist the fitted pipeline to disk.

# Ratio of class-0 to class-1 rows; balanced class weights are used only when
# class 0 is the larger class.
neg_to_pos_ratio = np.sum(y == 0) / np.sum(y == 1)

final_svc = SVC(
    C=best_params['C'],
    kernel=best_params['kernel'],
    # 'gamma' is absent from best_params unless it was sampled during the search.
    gamma=best_params.get('gamma', 'scale'),
    class_weight='balanced' if neg_to_pos_ratio > 1 else None,
    probability=True,
    random_state=42,
)
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('svc', final_svc)])
pipeline.fit(X, y)

# Serialize scaler and classifier together so they are loaded as one object.
with open('svc_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)
print("\nMô hình đã được lưu vào 'svc_model.pkl'")


Mô hình đã được lưu vào 'svc_model.pkl'
