# Step 6: Training model

In [3]:
import logging
import os
import wandb
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import optuna

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%d-%m-%Y %H:%M:%S')
logger = logging.getLogger()

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load data

In [4]:
def load_data():
    wandb.login()
    run = wandb.init(project="diabetes",entity="ngocnhi-p4work-national-economics-university", job_type="train")
    artifact = run.use_artifact("train.csv:latest")
    artifact_dir = artifact.download()
    df = pd.read_csv(os.path.join(artifact_dir, "train.csv"))
    logger.info(f"Tập dữ liệu đã load với shape: {df.shape}")
    return df, run

## 2. Prepare data

In [5]:
def prepare_data(df):
    # Split data into training and validation sets
    X = df.drop(columns=["OUTCOME"])
    y = df["OUTCOME"]
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    logger.info(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}") # log train and validation shapes
    # Handle class imbalance using SMOTE
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    logger.info(f"After SMOTE - Counts: {y_train_res.value_counts().to_dict()}") # log class counts after SMOTE
    # Standardize the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_val_scaled = scaler.transform(X_val)
    return X_train_scaled, X_val_scaled, y_train_res, y_val, scaler

In [6]:
def train_base_models(X_train, y_train, X_val, y_val):
    models = {
        'Logistic Regression': LogisticRegression(max_iter=500),
        'Random Forest': RandomForestClassifier(random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        metrics = {
            'accuracy': accuracy_score(y_val, preds),
            'precision': precision_score(y_val, preds),
            'recall': recall_score(y_val, preds),
            'f1': f1_score(y_val, preds)
        }
        logger.info(f"{name} - {metrics}")
        print(f"\n{name} Classification Report:\n", classification_report(y_val, preds))
        print(f"Confusion Matrix:\n{confusion_matrix(y_val, preds)}")
        results[name] = model
    return results

In [7]:
def tune_random_forest(X_train, y_train):
    def objective(trial):
        params = {
            'random_state': 42,
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 2, 10),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        }

        model = RandomForestClassifier(**params)
        return cross_val_score(model, X_train, y_train, scoring='accuracy', cv=3).mean()
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=500)
    logger.info(f"Best RF params: {study.best_params}")
    return RandomForestClassifier(**study.best_params)

In [8]:
def tune_xgboost(X_train, y_train):
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'random_state': 42,
            'eval_metric': 'logloss',
            'use_label_encoder': False
        }
        model = XGBClassifier(**params)
        return cross_val_score(model, X_train, y_train, scoring='accuracy', cv=3).mean()
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=500)
    logger.info(f"Best XGBoost params: {study.best_params}")
    return XGBClassifier(**study.best_params)

In [9]:
def save_and_log_model(model, scaler, run, filename='final_model.pkl'):
    joblib.dump((model, scaler), filename)
    artifact = wandb.Artifact(name=filename, type='model')
    artifact.add_file(filename)
    run.log_artifact(artifact)
    logger.info(f"Đã lưu mô hình và scaler vào W&B: {filename}")

In [10]:
# === RUN PIPELINE ===
df, run = load_data()
X_train, X_val, y_train, y_val, scaler = prepare_data(df)
train_base_models(X_train, y_train, X_val, y_val)

# Tune models
xgb_model = tune_xgboost(X_train, y_train)
rf_model = tune_random_forest(X_train, y_train)
xgb_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
xgb_auc = accuracy_score(y_val, xgb_model.predict(X_val))
rf_auc = accuracy_score(y_val, rf_model.predict(X_val))

best_model = xgb_model if xgb_auc >= rf_auc else rf_model
logger.info(f"Best model accuracy: {max(xgb_auc, rf_auc)}")
save_and_log_model(best_model, scaler, run)
run.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mmlcolongmay[0m ([33mmlcolongmay-neu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   1 of 1 files downloaded.  
07-05-2025 22:47:22 Tập dữ liệu đã load với shape: (537, 9)
07-05-2025 22:47:22 Train shape: (429, 8), Validation shape: (108, 8)
07-05-2025 22:47:22 After SMOTE - Counts: {0: 280, 1: 280}
07-05-2025 22:47:22 Logistic Regression - {'accuracy': 0.7592592592592593, 'precision': 0.6304347826086957, 'recall': 0.7631578947368421, 'f1': 0.6904761904761905}



Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.76      0.80        70
           1       0.63      0.76      0.69        38

    accuracy                           0.76       108
   macro avg       0.74      0.76      0.75       108
weighted avg       0.78      0.76      0.76       108

Confusion Matrix:
[[53 17]
 [ 9 29]]


07-05-2025 22:47:22 Random Forest - {'accuracy': 0.7222222222222222, 'precision': 0.5833333333333334, 'recall': 0.7368421052631579, 'f1': 0.6511627906976745}
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
07-05-2025 22:47:23 XGBoost - {'accuracy': 0.7592592592592593, 'precision': 0.6363636363636364, 'recall': 0.7368421052631579, 'f1': 0.6829268292682926}
[I 2025-05-07 22:47:23,179] A new study created in memory with name: no-name-8728c157-d6ad-4470-81cc-6cf0346c9da3



Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.71      0.77        70
           1       0.58      0.74      0.65        38

    accuracy                           0.72       108
   macro avg       0.71      0.73      0.71       108
weighted avg       0.75      0.72      0.73       108

Confusion Matrix:
[[50 20]
 [10 28]]

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.77      0.81        70
           1       0.64      0.74      0.68        38

    accuracy                           0.76       108
   macro avg       0.74      0.75      0.74       108
weighted avg       0.77      0.76      0.76       108

Confusion Matrix:
[[54 16]
 [10 28]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-07 22:47:23,542] Trial 0 finished with value: 0.7875721158453607 and parameters: {'n_estimators': 252, 'max_depth': 9, 'learning_rate': 0.2508906823072129, 'subsample': 0.6941207239532933, 'colsample_bytree': 0.923603638160894, 'gamma': 1.7649795532059276, 'min_child_weight': 7}. Best is trial 0 with value: 0.7875721158453607.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-07 22:47:23,795] Trial 1 finished with value: 0.7857512506468863 and parameters

In [17]:
best_model

In [21]:
import os
model_path = "../model/final_model.pkl"
if os.path.exists(model_path):
    os.remove(model_path)
    print(f"Đã xóa tệp {model_path}")
import joblib
joblib.dump([best_model, scaler], model_path)


Đã xóa tệp ../model/final_model.pkl


['../model/final_model.pkl']

In [22]:
loaded = joblib.load("../model/final_model.pkl")
model = loaded[0]
scaler = loaded[1]
print(type(model))  # Expect XGBClassifier
print(type(scaler))  # Expect StandardScaler


<class 'xgboost.sklearn.XGBClassifier'>
<class 'sklearn.preprocessing._data.StandardScaler'>


In [24]:
with open("../model/final_model.pkl", "rb") as f:
    content = f.read(20)
print(content)


b'\x80\x04\x95d\x03\x00\x00\x00\x00\x00\x00]\x94(\x8c\x0fxgbo'


In [13]:
import sklearn
print("scikit-learn version used to train:", sklearn.__version__)


scikit-learn version used to train: 1.1.3


In [14]:
import sys
print("Python location:", sys.executable)

import numpy as np
print("NumPy version:", np.__version__)


Python location: c:\Users\x-hp\OneDrive - National Economics University\Desktop\ML Ops\.venv\Scripts\python.exe
NumPy version: 1.23.5
