In [7]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Set MLflow Experiment
mlflow.set_experiment("MLOps_MSE.HCM_SP25_Experiment")

# Step 1: Tạo dữ liệu
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    random_state=42
)

# Step 2: Data Augmentation + Noise + Chuẩn hóa
X_aug, y_aug = make_classification(
    n_samples=2000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    random_state=42
)
noise = np.random.normal(0, 0.1, X_aug.shape)
X_aug_noisy = X_aug + noise
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_aug_noisy)

# Step 3: Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_aug, test_size=0.3, random_state=42)

# Bảng tổng hợp kết quả
results = {}

# Step 4: Train các mô hình

# 4.1 Logistic Regression
with mlflow.start_run(run_name="Logistic Regression"):
    model_lr = LogisticRegression(random_state=42, max_iter=500)
    model_lr.fit(X_train, y_train)
    y_pred_lr = model_lr.predict(X_test)
    acc_lr = accuracy_score(y_test, y_pred_lr)
    results["Logistic Regression"] = acc_lr
    print(f"[Result] Logistic Regression Accuracy: {acc_lr:.4f}")
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("accuracy", acc_lr)
    mlflow.sklearn.log_model(model_lr, artifact_path="model")

# 4.2 Logistic Regression (Tuned)
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 300, 500]
}
grid_lr = GridSearchCV(LogisticRegression(random_state=42), param_grid_lr, scoring='accuracy', cv=5, verbose=1)
grid_lr.fit(X_train, y_train)

with mlflow.start_run(run_name="Logistic Regression Tuned"):
    best_lr = grid_lr.best_estimator_
    y_pred_lr_tuned = best_lr.predict(X_test)
    acc_lr_tuned = accuracy_score(y_test, y_pred_lr_tuned)
    results["Logistic Regression Tuned"] = acc_lr_tuned
    print(f"[Result] Logistic Regression Tuned Accuracy: {acc_lr_tuned:.4f}")
    mlflow.log_params(grid_lr.best_params_)
    mlflow.log_metric("accuracy", acc_lr_tuned)
    mlflow.sklearn.log_model(best_lr, artifact_path="model")

# 4.3 Random Forest
with mlflow.start_run(run_name="Random Forest"):
    model_rf = RandomForestClassifier(random_state=42, n_estimators=100)
    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)
    acc_rf = accuracy_score(y_test, y_pred_rf)
    results["Random Forest"] = acc_rf
    print(f"[Result] Random Forest Accuracy: {acc_rf:.4f}")
    mlflow.log_param("model", "Random Forest")
    mlflow.log_metric("accuracy", acc_rf)
    mlflow.sklearn.log_model(model_rf, artifact_path="model")

# 4.4 XGBoost
with mlflow.start_run(run_name="XGBoost"):
    model_xgb = XGBClassifier(
        random_state=42,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    model_xgb.fit(X_train, y_train)
    y_pred_xgb = model_xgb.predict(X_test)
    acc_xgb = accuracy_score(y_test, y_pred_xgb)
    results["XGBoost"] = acc_xgb
    print(f"[Result] XGBoost Accuracy: {acc_xgb:.4f}")
    mlflow.log_param("model", "XGBoost")
    mlflow.log_metric("accuracy", acc_xgb)
    mlflow.sklearn.log_model(model_xgb, artifact_path="model")

# 4.5 LightGBM
with mlflow.start_run(run_name="LightGBM"):
    model_lgb = LGBMClassifier(random_state=42, n_estimators=100)
    model_lgb.fit(X_train, y_train)
    y_pred_lgb = model_lgb.predict(X_test)
    acc_lgb = accuracy_score(y_test, y_pred_lgb)
    results["LightGBM"] = acc_lgb
    print(f"[Result] LightGBM Accuracy: {acc_lgb:.4f}")
    mlflow.log_param("model", "LightGBM")
    mlflow.log_metric("accuracy", acc_lgb)
    mlflow.sklearn.log_model(model_lgb, artifact_path="model")

# 4.6 CatBoost
with mlflow.start_run(run_name="CatBoost"):
    model_cat = CatBoostClassifier(random_state=42, verbose=0)
    model_cat.fit(X_train, y_train)
    y_pred_cat = model_cat.predict(X_test)
    acc_cat = accuracy_score(y_test, y_pred_cat)
    results["CatBoost"] = acc_cat
    print(f"[Result] CatBoost Accuracy: {acc_cat:.4f}")
    mlflow.log_param("model", "CatBoost")
    mlflow.log_metric("accuracy", acc_cat)
    mlflow.sklearn.log_model(model_cat, artifact_path="model")

# Tổng kết kết quả
print("\n==================== Summary Results ====================")
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}")

print("\nTraining completed. Check MLflow UI to choose the best model!")

[Result] Logistic Regression Accuracy: 0.7050




Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Result] Logistic Regression Tuned Accuracy: 0.7017




[Result] Random Forest Accuracy: 0.8517


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[Result] XGBoost Accuracy: 0.8617




[LightGBM] [Info] Number of positive: 695, number of negative: 705
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1400, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496429 -> initscore=-0.014286
[LightGBM] [Info] Start training from score -0.014286
[Result] LightGBM Accuracy: 0.8583




[Result] CatBoost Accuracy: 0.8817





Logistic Regression: 0.7050
Logistic Regression Tuned: 0.7017
Random Forest: 0.8517
XGBoost: 0.8617
LightGBM: 0.8583
CatBoost: 0.8817

Training completed. Check MLflow UI to choose the best model!


In [8]:
import joblib

# Sau khi train model_cat
joblib.dump(model_cat, "best_model.pkl")
print("[INFO] Best model (CatBoost) đã được lưu vào file best_model.pkl!")

[INFO] Best model (CatBoost) đã được lưu vào file best_model.pkl!
