## Set up

In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import os
import sys

sys.path.insert(0, '..')

In [19]:
import mlflow

In [20]:
mlflow.set_tracking_uri('http://localhost:2002')

In [21]:
import numpy as np
import optuna
import pickle
import joblib
import dill
from optuna import Trial
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from tqdm.notebook import tqdm
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [22]:
from src.features.schedules import (
    reason_pipeline_steps,
    numeric_pipeline_steps
)
from src.data_prep_utils import chunk_transform, add_transformed_feature

In [23]:
features = pd.read_csv('../data/dataset.csv', index_col=0)

from sklearn.model_selection import train_test_split

In [24]:
features.head()

Unnamed: 0,partner_id,specialist_id,status,gender,province_id,age,reason_combind,specialist_name
0,2,22,2,,1,38,khám tiêu hóa,tiêu hoá
1,2,1,2,,1,29,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp
2,4,3,2,,40,29,"hở van tim 3 lá,ngoại tâm thu",tim mạch
3,17,3,2,,10,40,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch
4,17,3,2,,-1,70,cao huyết áp,tim mạch


In [25]:
features.isnull().sum()

partner_id            0
specialist_id         0
status                0
gender             1657
province_id           0
age                   0
reason_combind        1
specialist_name       0
dtype: int64

In [26]:
features.shape

(47447, 8)

In [27]:
features = features.dropna(subset=["reason_combind"])

In [28]:
features.shape

(47446, 8)

In [29]:
label_encoder = LabelEncoder()
features['target'] = label_encoder.fit_transform(features['specialist_name'])

In [30]:
with open('../models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [31]:
features['target'].unique()

array([16,  2, 15, 13, 12,  8, 10,  3,  9, 18,  7,  4, 11,  0,  1, 14, 17,
        6,  5, 19])

In [32]:
for idx, label in zip(features['target'].unique(), label_encoder.inverse_transform(features['target'].unique())):
    print(f"idx : {idx}, label: {label}")


idx : 16, label: tiêu hoá
idx : 2, label: cơ xương khớp
idx : 15, label: tim mạch
idx : 13, label: thần kinh
idx : 12, label: tai mũi họng
idx : 8, label: nhi khoa
idx : 10, label: sản phụ khoa
idx : 3, label: cột sống
idx : 9, label: nội khoa
idx : 18, label: ung bướu
idx : 7, label: nam học
idx : 4, label: da liễu
idx : 11, label: sức khỏe tâm thần
idx : 0, label: bệnh viêm gan
idx : 1, label: chuyên khoa mắt
idx : 14, label: thận - tiết niệu
idx : 17, label: tiểu đường - nội tiết
idx : 6, label: khám tổng quát
idx : 5, label: hô hấp - phổi
idx : 19, label: vô sinh - hiếm muộn


In [33]:
features

Unnamed: 0,partner_id,specialist_id,status,gender,province_id,age,reason_combind,specialist_name,target
0,2,22,2,,1,38,khám tiêu hóa,tiêu hoá,16
1,2,1,2,,1,29,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,2
2,4,3,2,,40,29,"hở van tim 3 lá,ngoại tâm thu",tim mạch,15
3,17,3,2,,10,40,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,15
4,17,3,2,,-1,70,cao huyết áp,tim mạch,15
...,...,...,...,...,...,...,...,...,...
50162,111,17,2,0.0,74,0,bệnh nhiều thứ . nên đi làm tổng quát,nội khoa,9
50163,125,1,2,0.0,74,0,"đau khớp vai, cử động tay phải nghe lụp cụp",cơ xương khớp,2
50164,410,26,2,0.0,82,0,bìu có nỗi mụt,nam học,7
50165,414,1,2,1.0,1,0,"trượt chân, ngã ngồi đau khu xương cụt",cơ xương khớp,2


In [34]:
X = features.drop(columns=["specialist_name", "target"], axis=1)
y = features['target']

In [39]:
text_col = "reason_combind"
# numeric_cols = ['partner_id']

tfm = [
    (
        "reason_combind",
        Pipeline(reason_pipeline_steps()),
        text_col
    ),
    # (
    #     "numeric_pipeline",
    #     Pipeline(numeric_pipeline_steps()),
    #     numeric_cols
    # )
]

preprocessor = ColumnTransformer(
    transformers=tfm, remainder="drop"
)

features_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        (
            "normalize",
            StandardScaler()
        ),
    ]
)

In [40]:
# fit the pipeline
fit_df = X.drop_duplicates(subset=["reason_combind"])
features_pipeline.fit(fit_df)

AttributeError: 'csr_matrix' object has no attribute '_reset'

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X_train = chunk_transform(
    X_train, features_pipeline, chunk_size=1000
)

Transforming chunks:   0%|          | 0/38 [00:00<?, ?it/s]

In [25]:
X_test = chunk_transform(
    X_test, features_pipeline, chunk_size=1000
)

Transforming chunks:   0%|          | 0/10 [00:00<?, ?it/s]

In [26]:
X_train[0], X_train.shape

(array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.46246315, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.55749125, 0.        ,
        0.48384422, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.49114776,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [27]:
features.shape, type(X)

((47446, 9), pandas.core.frame.DataFrame)

In [28]:
features

Unnamed: 0,partner_id,specialist_id,status,gender,province_id,age,reason_combind,specialist_name,target
0,2,22,2,,1,38,khám tiêu hóa,tiêu hoá,16
1,2,1,2,,1,29,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,2
2,4,3,2,,40,29,"hở van tim 3 lá,ngoại tâm thu",tim mạch,15
3,17,3,2,,10,40,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,15
4,17,3,2,,-1,70,cao huyết áp,tim mạch,15
...,...,...,...,...,...,...,...,...,...
50162,111,17,2,0.0,74,0,bệnh nhiều thứ . nên đi làm tổng quát,nội khoa,9
50163,125,1,2,0.0,74,0,"đau khớp vai, cử động tay phải nghe lụp cụp",cơ xương khớp,2
50164,410,26,2,0.0,82,0,bìu có nỗi mụt,nam học,7
50165,414,1,2,1.0,1,0,"trượt chân, ngã ngồi đau khu xương cụt",cơ xương khớp,2


In [29]:
SEED=42

In [30]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [31]:
X_train.shape, y_train.shape

((37956, 128), (37956,))

## Random Forest

In [34]:
mlflow.set_experiment("random_forest_v5")

2025/03/28 16:10:06 INFO mlflow.tracking.fluent: Experiment with name 'random_forest_v5' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/244051155133141666', creation_time=1743153006100, experiment_id='244051155133141666', last_update_time=1743153006100, lifecycle_stage='active', name='random_forest_v5', tags={}>

In [35]:
def optimize_random_forest(trial: Trial):
    n_estimators = trial.suggest_int(name="n_estimators", low=50, high=500, step=100)
    max_features = trial.suggest_categorical(name="max_features", choices=['log2', 'sqrt'])
    max_depth = trial.suggest_int(name="max_depth", low=5, high=20, step=5)
    min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=10, step=2)
    min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)

    params = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf
    }

    model = RandomForestClassifier(random_state=SEED, **params)

    with mlflow.start_run():
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "n_estimators": n_estimators,
            "max_depth": max_depth
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_random_forest, n_trials=10)

[I 2025-03-28 16:10:08,033] A new study created in memory with name: no-name-afeebf66-6a1a-4b2d-a75b-aac3e135dbe1
[I 2025-03-28 16:10:34,297] Trial 0 finished with value: 0.5814152175281715 and parameters: {'n_estimators': 450, 'max_features': 'log2', 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.5814152175281715.


🏃 View run likeable-fish-74 at: http://localhost:2002/#/experiments/244051155133141666/runs/e3394b22dded47ab9339713dcfaeb366
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-03-28 16:11:07,480] Trial 1 finished with value: 0.5860568939972082 and parameters: {'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.5860568939972082.


🏃 View run upbeat-perch-579 at: http://localhost:2002/#/experiments/244051155133141666/runs/e332f72b30f544999ab9c3e8ef5c5362
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-03-28 16:11:25,191] Trial 2 finished with value: 0.5897138807932535 and parameters: {'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.5897138807932535.


🏃 View run delightful-dove-760 at: http://localhost:2002/#/experiments/244051155133141666/runs/7099490ba7444b9ab198cf2f9892c72c
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-03-28 16:12:09,690] Trial 3 finished with value: 0.5892028665531723 and parameters: {'n_estimators': 450, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.5897138807932535.


🏃 View run whimsical-whale-841 at: http://localhost:2002/#/experiments/244051155133141666/runs/67a23375c3994366a5a09b16745e904b
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666




In [None]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

## Logistic Regression

In [22]:
mlflow.set_experiment("linear_regression_v1")

2025/03/20 15:29:42 INFO mlflow.tracking.fluent: Experiment with name 'linear_regression_v1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/123102627901971667', creation_time=1742459382516, experiment_id='123102627901971667', last_update_time=1742459382516, lifecycle_stage='active', name='linear_regression_v1', tags={}>

In [24]:
def optimize_logistic_regression(trial: Trial):
    X, y = X_train, y_train

    solver = trial.suggest_categorical("solver", ["liblinear", "newton-cg", "lbfgs", "sag", "saga"])
    C = trial.suggest_uniform("C", 0.001, 100)

    params = {
        "solver": solver,
        "C": C
    }

    model = LogisticRegression(**params, random_state=SEED)

    with mlflow.start_run():
        model.fit(X, y)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "solver": solver,
            "C": C
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1

In [25]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_logistic_regression, n_trials=10)

[I 2025-03-20 15:30:13,136] A new study created in memory with name: no-name-fc6fa582-9dc1-4139-a131-32e44f27ff54
  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:30:37,903] Trial 0 finished with value: 0.7148514225936348 and parameters: {'solver': 'sag', 'C': 6.092308429423362}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run judicious-auk-578 at: http://localhost:2002/#/experiments/123102627901971667/runs/8f0324f1b2b743559cd5257e1d46af50
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:31:07,601] Trial 1 finished with value: 0.7147658294599712 and parameters: {'solver': 'saga', 'C': 31.961504878103675}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run worried-pug-572 at: http://localhost:2002/#/experiments/123102627901971667/runs/eb222d4a981d4f56bf96d2a21667fdd4
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:35:09,245] Trial 2 finished with value: 0.7044437078145703 and parameters: {'solver': 'liblinear', 'C': 94.7585783003262}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run beautiful-moose-862 at: http://localhost:2002/#/experiments/123102627901971667/runs/a3291a0c909541439a2dfea085aa8e7b
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2025-03-20 15:35:24,471] Trial 3 finished with value: 0.7148176073413588 and parameters: {'solver': 'lbfgs', 'C': 4.824384293432776}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run mysterious-crane-798 at: http://localhost:2002/#/experiments/123102627901971667/runs/bed02c8374c144e7b80ae6b9eb83234e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:39:03,346] Trial 4 finished with value: 0.7045246460953708 and parameters: {'solver': 'liblinear', 'C': 52.26484015177877}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run bemused-carp-798 at: http://localhost:2002/#/experiments/123102627901971667/runs/4c695b79408b4d7587b03062b3b09a8c
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:39:14,589] Trial 5 finished with value: 0.714620199656842 and parameters: {'solver': 'newton-cg', 'C': 74.64038215834337}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run big-cow-603 at: http://localhost:2002/#/experiments/123102627901971667/runs/84f2cfcffbc24d9aa0090033840ea2b6
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:42:41,992] Trial 6 finished with value: 0.7045165754601793 and parameters: {'solver': 'liblinear', 'C': 26.97552465816906}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run capricious-elk-248 at: http://localhost:2002/#/experiments/123102627901971667/runs/21807d435fcd4dc2b1db8a0c70b6758c
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:03,222] Trial 7 finished with value: 0.70440663421201 and parameters: {'solver': 'liblinear', 'C': 24.080164430543526}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run traveling-moose-289 at: http://localhost:2002/#/experiments/123102627901971667/runs/340e81dd860a4fecbddf75377377076e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:13,369] Trial 8 finished with value: 0.7146802847625534 and parameters: {'solver': 'newton-cg', 'C': 67.38174142434916}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run learned-zebra-952 at: http://localhost:2002/#/experiments/123102627901971667/runs/29076cf6530147fb978378bfe187f93a
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:45,764] Trial 9 finished with value: 0.7149226275282914 and parameters: {'solver': 'saga', 'C': 2.2712615130224454}. Best is trial 9 with value: 0.7149226275282914.


🏃 View run blushing-cub-533 at: http://localhost:2002/#/experiments/123102627901971667/runs/550b877cec054f53a1aba272e1dcfb3e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


## Support Vector Machine

In [37]:
mlflow.set_experiment("support_vector_machine")

2025/03/20 09:47:20 INFO mlflow.tracking.fluent: Experiment with name 'support_vector_machine' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/862488884265769758', creation_time=1742438840703, experiment_id='862488884265769758', last_update_time=1742438840703, lifecycle_stage='active', name='support_vector_machine', tags={}>

In [70]:
def optimize_support_vector_machine(trial: Trial):
    X, y = X_train, y_train
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    C = trial.suggest_int("C", 1, 1000, step=99)  # Ensures divisibility

    params = {
        "kernel": kernel,
        "gamma": gamma,
        "C": C
    }

    model = SVC(kernel=kernel, gamma=gamma, C=C, random_state=SEED)
    model_pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", model)
        ]
    )

    with mlflow.start_run():
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "kernel": kernel,
            "gamma": gamma,
            "C": C
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1
        

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_support_vector_machine, n_trials=10)

[I 2025-03-20 10:58:39,341] A new study created in memory with name: no-name-09d0461b-a888-406c-8a7a-2603efe79ea1
[I 2025-03-20 11:05:46,413] Trial 0 finished with value: 0.7573041992698557 and parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}. Best is trial 0 with value: 0.7573041992698557.


🏃 View run shivering-wasp-674 at: http://localhost:2002/#/experiments/862488884265769758/runs/3b01b73235b546bfbcedea08581988bc
🧪 View experiment at: http://localhost:2002/#/experiments/862488884265769758




### XGboost

#### preparing dataset for xgboost

for multi-class classification, our target variable must take values in ${\{0,1, ..., K\}}$. However, from the histogram of cover type above, we see that it takes values not continous. to fix this we can use the `scikit-learn label encoder` to create a valid target column.

In [42]:
mlflow.set_experiment("xgboost_v5")

<Experiment: artifact_location='mlflow-artifacts:/430686376537508648', creation_time=1742959056915, experiment_id='430686376537508648', last_update_time=1742959056915, lifecycle_stage='active', name='xgboost_v5', tags={}>

In [43]:
def optimize_xgboost_model(trial: Trial):
    X, y = X_train, y_train
    X, X_valid, y, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    param = {
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }
    model = xgb.XGBClassifier(**param,
                             tree_method="hist")
    
    with mlflow.start_run():
        bst = model.fit(X, y, eval_set=[(X, y), (X_valid, y_valid)], verbose=2)
        preds = bst.predict(X_test)
        y_pred = np.rint(preds)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params(
            param
        )
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return accuracy

In [44]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_xgboost_model, n_trials=10, timeout=600)

[I 2025-03-27 17:22:35,270] A new study created in memory with name: no-name-cc47dac9-d8cd-488d-b591-256265539b89


[0]	validation_0-mlogloss:2.75895	validation_1-mlogloss:2.76435
[2]	validation_0-mlogloss:2.49104	validation_1-mlogloss:2.50883
[4]	validation_0-mlogloss:2.31478	validation_1-mlogloss:2.34121
[6]	validation_0-mlogloss:2.18680	validation_1-mlogloss:2.22312
[8]	validation_0-mlogloss:2.08746	validation_1-mlogloss:2.13078
[10]	validation_0-mlogloss:2.00568	validation_1-mlogloss:2.05610
[12]	validation_0-mlogloss:1.93999	validation_1-mlogloss:1.99633
[14]	validation_0-mlogloss:1.88518	validation_1-mlogloss:1.94739
[16]	validation_0-mlogloss:1.83820	validation_1-mlogloss:1.90574
[18]	validation_0-mlogloss:1.79888	validation_1-mlogloss:1.87071
[20]	validation_0-mlogloss:1.76441	validation_1-mlogloss:1.84015
[22]	validation_0-mlogloss:1.73445	validation_1-mlogloss:1.81383
[24]	validation_0-mlogloss:1.70818	validation_1-mlogloss:1.79073
[26]	validation_0-mlogloss:1.68573	validation_1-mlogloss:1.77126
[28]	validation_0-mlogloss:1.66493	validation_1-mlogloss:1.75356
[30]	validation_0-mlogloss:1.6

[I 2025-03-27 17:23:08,808] Trial 0 finished with value: 0.5574810459538914 and parameters: {'max_depth': 10, 'learning_rate': 0.09296082330060387, 'n_estimators': 699, 'subsample': 0.6077068687394, 'colsample_bytree': 0.9527188251289247, 'min_child_weight': 7, 'gamma': 4.909077883996953}. Best is trial 0 with value: 0.5574810459538914.


🏃 View run vaunted-snipe-711 at: http://localhost:2002/#/experiments/430686376537508648/runs/56432fdf163249dcbb4d724212e0b422
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.78110	validation_1-mlogloss:2.78924
[2]	validation_0-mlogloss:2.49689	validation_1-mlogloss:2.52189
[4]	validation_0-mlogloss:2.31364	validation_1-mlogloss:2.35522
[6]	validation_0-mlogloss:2.17428	validation_1-mlogloss:2.22882
[8]	validation_0-mlogloss:2.06440	validation_1-mlogloss:2.13051
[10]	validation_0-mlogloss:1.97500	validation_1-mlogloss:2.05147
[12]	validation_0-mlogloss:1.90159	validation_1-mlogloss:1.98778
[14]	validation_0-mlogloss:1.83948	validation_1-mlogloss:1.93422
[16]	validation_0-mlogloss:1.78691	validation_1-mlogloss:1.88918
[18]	validation_0-mlogloss:1.74084	validation_1-mlogloss:1.85050
[20]	validation_0-mlogloss:1.70069	validation_1-mlogloss:1.81785
[22]	validation_0-mlogloss:1.66551	validation_1-mlogloss:1.78902
[24]	validation_0-mlog

[I 2025-03-27 17:23:58,031] Trial 1 finished with value: 0.5645984836763113 and parameters: {'max_depth': 10, 'learning_rate': 0.09125946439152477, 'n_estimators': 940, 'subsample': 0.9401838401484515, 'colsample_bytree': 0.8487249738830647, 'min_child_weight': 7, 'gamma': 2.444731621509377}. Best is trial 1 with value: 0.5645984836763113.


🏃 View run suave-stoat-411 at: http://localhost:2002/#/experiments/430686376537508648/runs/387c42d159d2419ab8abb177f7ff9f4d
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.91981	validation_1-mlogloss:2.92430
[2]	validation_0-mlogloss:2.76895	validation_1-mlogloss:2.78038
[4]	validation_0-mlogloss:2.65348	validation_1-mlogloss:2.67243
[6]	validation_0-mlogloss:2.55834	validation_1-mlogloss:2.58383
[8]	validation_0-mlogloss:2.47717	validation_1-mlogloss:2.50821
[10]	validation_0-mlogloss:2.40756	validation_1-mlogloss:2.44397
[12]	validation_0-mlogloss:2.34494	validation_1-mlogloss:2.38608
[14]	validation_0-mlogloss:2.28984	validation_1-mlogloss:2.33548
[16]	validation_0-mlogloss:2.23916	validation_1-mlogloss:2.28894
[18]	validation_0-mlogloss:2.19240	validation_1-mlogloss:2.24646
[20]	validation_0-mlogloss:2.15058	validation_1-mlogloss:2.20820
[22]	validation_0-mlogloss:2.11209	validation_1-mlogloss:2.17314
[24]	validation_0-mloglo

[I 2025-03-27 17:25:03,796] Trial 2 finished with value: 0.571406467584713 and parameters: {'max_depth': 5, 'learning_rate': 0.03806264079322071, 'n_estimators': 879, 'subsample': 0.9603703387774565, 'colsample_bytree': 0.7288981779172787, 'min_child_weight': 2, 'gamma': 1.4707024094433974}. Best is trial 2 with value: 0.571406467584713.


🏃 View run sedate-skunk-854 at: http://localhost:2002/#/experiments/430686376537508648/runs/ce98c41b47024c98b8c7ebe25608d143
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.85654	validation_1-mlogloss:2.85965
[2]	validation_0-mlogloss:2.66197	validation_1-mlogloss:2.67186
[4]	validation_0-mlogloss:2.52089	validation_1-mlogloss:2.53689
[6]	validation_0-mlogloss:2.41017	validation_1-mlogloss:2.43267
[8]	validation_0-mlogloss:2.32045	validation_1-mlogloss:2.34832
[10]	validation_0-mlogloss:2.24415	validation_1-mlogloss:2.27688
[12]	validation_0-mlogloss:2.17860	validation_1-mlogloss:2.21575
[14]	validation_0-mlogloss:2.12210	validation_1-mlogloss:2.16351
[16]	validation_0-mlogloss:2.07143	validation_1-mlogloss:2.11739
[18]	validation_0-mlogloss:2.02620	validation_1-mlogloss:2.07608
[20]	validation_0-mlogloss:1.98607	validation_1-mlogloss:2.03935
[22]	validation_0-mlogloss:1.94888	validation_1-mlogloss:2.00588
[24]	validation_0-mlogl

[I 2025-03-27 17:25:50,760] Trial 3 finished with value: 0.5596472226520192 and parameters: {'max_depth': 6, 'learning_rate': 0.05348878488140414, 'n_estimators': 977, 'subsample': 0.9684682921340322, 'colsample_bytree': 0.9504551623738113, 'min_child_weight': 9, 'gamma': 3.5902080997138013}. Best is trial 2 with value: 0.571406467584713.


🏃 View run entertaining-bee-772 at: http://localhost:2002/#/experiments/430686376537508648/runs/c9c1e83dd4224c3b866db919fafd1071
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.81430	validation_1-mlogloss:2.82082
[2]	validation_0-mlogloss:2.57100	validation_1-mlogloss:2.59042
[4]	validation_0-mlogloss:2.40695	validation_1-mlogloss:2.43594
[6]	validation_0-mlogloss:2.28530	validation_1-mlogloss:2.32318
[8]	validation_0-mlogloss:2.18901	validation_1-mlogloss:2.23395
[10]	validation_0-mlogloss:2.10882	validation_1-mlogloss:2.16101
[12]	validation_0-mlogloss:2.04218	validation_1-mlogloss:2.10008
[14]	validation_0-mlogloss:1.98610	validation_1-mlogloss:2.04863
[16]	validation_0-mlogloss:1.93666	validation_1-mlogloss:2.00391
[18]	validation_0-mlogloss:1.89346	validation_1-mlogloss:1.96479
[20]	validation_0-mlogloss:1.85647	validation_1-mlogloss:1.93136
[22]	validation_0-mlogloss:1.82350	validation_1-mlogloss:1.90155
[24]	validation_0-m

[I 2025-03-27 17:26:06,184] Trial 4 finished with value: 0.5563979576048275 and parameters: {'max_depth': 6, 'learning_rate': 0.06962671131922156, 'n_estimators': 255, 'subsample': 0.8807988979518562, 'colsample_bytree': 0.941227527255226, 'min_child_weight': 4, 'gamma': 4.838012063793363}. Best is trial 2 with value: 0.571406467584713.


🏃 View run luminous-fowl-506 at: http://localhost:2002/#/experiments/430686376537508648/runs/edd35d1ecab34d76b138090e8190d67a
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.95658	validation_1-mlogloss:2.95821
[2]	validation_0-mlogloss:2.88630	validation_1-mlogloss:2.89140
[4]	validation_0-mlogloss:2.82407	validation_1-mlogloss:2.83230
[6]	validation_0-mlogloss:2.76762	validation_1-mlogloss:2.77887
[8]	validation_0-mlogloss:2.71646	validation_1-mlogloss:2.73086
[10]	validation_0-mlogloss:2.66969	validation_1-mlogloss:2.68689
[12]	validation_0-mlogloss:2.62647	validation_1-mlogloss:2.64612
[14]	validation_0-mlogloss:2.58637	validation_1-mlogloss:2.60848
[16]	validation_0-mlogloss:2.54912	validation_1-mlogloss:2.57363
[18]	validation_0-mlogloss:2.51420	validation_1-mlogloss:2.54096
[20]	validation_0-mlogloss:2.48135	validation_1-mlogloss:2.51045
[22]	validation_0-mlogloss:2.45043	validation_1-mlogloss:2.48157
[24]	validation_0-mlog

[I 2025-03-27 17:27:35,956] Trial 5 finished with value: 0.5599566764660374 and parameters: {'max_depth': 6, 'learning_rate': 0.013756914195516065, 'n_estimators': 708, 'subsample': 0.982148919899279, 'colsample_bytree': 0.9979171907348062, 'min_child_weight': 4, 'gamma': 2.1101125035723904}. Best is trial 2 with value: 0.571406467584713.


🏃 View run amazing-mouse-430 at: http://localhost:2002/#/experiments/430686376537508648/runs/95a5336b378b43b5a40b68414d27ba57
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.93621	validation_1-mlogloss:2.93881
[2]	validation_0-mlogloss:2.81746	validation_1-mlogloss:2.82293
[4]	validation_0-mlogloss:2.72056	validation_1-mlogloss:2.73077
[6]	validation_0-mlogloss:2.63691	validation_1-mlogloss:2.65168
[8]	validation_0-mlogloss:2.56478	validation_1-mlogloss:2.58366
[10]	validation_0-mlogloss:2.49975	validation_1-mlogloss:2.52272
[12]	validation_0-mlogloss:2.43989	validation_1-mlogloss:2.46726
[14]	validation_0-mlogloss:2.38725	validation_1-mlogloss:2.41841
[16]	validation_0-mlogloss:2.33847	validation_1-mlogloss:2.37319
[18]	validation_0-mlogloss:2.29259	validation_1-mlogloss:2.33119
[20]	validation_0-mlogloss:2.25059	validation_1-mlogloss:2.29278
[22]	validation_0-mlogloss:2.21211	validation_1-mlogloss:2.25761
[24]	validation_0-mlog

[I 2025-03-27 17:28:38,401] Trial 6 finished with value: 0.5616586724431378 and parameters: {'max_depth': 10, 'learning_rate': 0.031632333311692996, 'n_estimators': 327, 'subsample': 0.5110711323535759, 'colsample_bytree': 0.7222435337711299, 'min_child_weight': 6, 'gamma': 0.5673013409462041}. Best is trial 2 with value: 0.571406467584713.


🏃 View run incongruous-hare-658 at: http://localhost:2002/#/experiments/430686376537508648/runs/c16a6463a6bb46898031cd00b23361c1
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.84859	validation_1-mlogloss:2.85263
[2]	validation_0-mlogloss:2.63669	validation_1-mlogloss:2.64992
[4]	validation_0-mlogloss:2.48875	validation_1-mlogloss:2.50958
[6]	validation_0-mlogloss:2.37633	validation_1-mlogloss:2.40468
[8]	validation_0-mlogloss:2.28595	validation_1-mlogloss:2.31988
[10]	validation_0-mlogloss:2.20760	validation_1-mlogloss:2.24638
[12]	validation_0-mlogloss:2.14240	validation_1-mlogloss:2.18599
[14]	validation_0-mlogloss:2.08680	validation_1-mlogloss:2.13475
[16]	validation_0-mlogloss:2.03787	validation_1-mlogloss:2.08993
[18]	validation_0-mlogloss:1.99326	validation_1-mlogloss:2.04936
[20]	validation_0-mlogloss:1.95438	validation_1-mlogloss:2.01393
[22]	validation_0-mlogloss:1.91887	validation_1-mlogloss:1.98116
[24]	validation_0-m

[I 2025-03-27 17:29:13,127] Trial 7 finished with value: 0.5619681262571561 and parameters: {'max_depth': 5, 'learning_rate': 0.0600144841672613, 'n_estimators': 731, 'subsample': 0.9406985275532984, 'colsample_bytree': 0.9057520062817683, 'min_child_weight': 6, 'gamma': 3.0138718215115463}. Best is trial 2 with value: 0.571406467584713.


🏃 View run handsome-vole-27 at: http://localhost:2002/#/experiments/430686376537508648/runs/bab8e65ea4544b22a9571eea98f494f1
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.84027	validation_1-mlogloss:2.85249
[2]	validation_0-mlogloss:2.56321	validation_1-mlogloss:2.59624
[4]	validation_0-mlogloss:2.38503	validation_1-mlogloss:2.43579
[6]	validation_0-mlogloss:2.24783	validation_1-mlogloss:2.31209
[8]	validation_0-mlogloss:2.13981	validation_1-mlogloss:2.21678
[10]	validation_0-mlogloss:2.05006	validation_1-mlogloss:2.13708
[12]	validation_0-mlogloss:1.97519	validation_1-mlogloss:2.07187
[14]	validation_0-mlogloss:1.91233	validation_1-mlogloss:2.01692
[16]	validation_0-mlogloss:1.85969	validation_1-mlogloss:1.97063
[18]	validation_0-mlogloss:1.80947	validation_1-mlogloss:1.92700
[20]	validation_0-mlogloss:1.76740	validation_1-mlogloss:1.89097
[22]	validation_0-mlogloss:1.73018	validation_1-mlogloss:1.85817
[24]	validation_0-mlogl

[I 2025-03-27 17:29:51,818] Trial 8 finished with value: 0.5663004796534117 and parameters: {'max_depth': 9, 'learning_rate': 0.08405615853185568, 'n_estimators': 908, 'subsample': 0.9889040487551499, 'colsample_bytree': 0.5876408967073747, 'min_child_weight': 2, 'gamma': 3.695173622709081}. Best is trial 2 with value: 0.571406467584713.


🏃 View run whimsical-horse-254 at: http://localhost:2002/#/experiments/430686376537508648/runs/8fc92cec2fab4ca78596199121de5e83
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648
[0]	validation_0-mlogloss:2.84905	validation_1-mlogloss:2.85348
[2]	validation_0-mlogloss:2.60628	validation_1-mlogloss:2.61661
[4]	validation_0-mlogloss:2.44683	validation_1-mlogloss:2.46542
[6]	validation_0-mlogloss:2.31694	validation_1-mlogloss:2.34386
[8]	validation_0-mlogloss:2.21746	validation_1-mlogloss:2.25110
[10]	validation_0-mlogloss:2.13184	validation_1-mlogloss:2.17215
[12]	validation_0-mlogloss:2.06271	validation_1-mlogloss:2.10881
[14]	validation_0-mlogloss:2.00333	validation_1-mlogloss:2.05557
[16]	validation_0-mlogloss:1.95211	validation_1-mlogloss:2.00984
[18]	validation_0-mlogloss:1.90368	validation_1-mlogloss:1.96678
[20]	validation_0-mlogloss:1.86229	validation_1-mlogloss:1.93034
[22]	validation_0-mlogloss:1.82498	validation_1-mlogloss:1.89731
[24]	validation_0-ml

[I 2025-03-27 17:30:35,411] Trial 9 finished with value: 0.5650626643973387 and parameters: {'max_depth': 9, 'learning_rate': 0.085205776431595, 'n_estimators': 920, 'subsample': 0.5575782880891291, 'colsample_bytree': 0.6501762011797507, 'min_child_weight': 8, 'gamma': 2.072180701731323}. Best is trial 2 with value: 0.571406467584713.


🏃 View run nebulous-gnu-781 at: http://localhost:2002/#/experiments/430686376537508648/runs/050458377a0645849d0050a3f00ff02d
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648


In [35]:
 study.best_trial, study.best_params

(FrozenTrial(number=6, state=TrialState.COMPLETE, values=[0.6509121061359867], datetime_start=datetime.datetime(2025, 3, 26, 10, 40, 46, 495701), datetime_complete=datetime.datetime(2025, 3, 26, 10, 43, 2, 327263), params={'max_depth': 6, 'learning_rate': 0.09104989311752477, 'n_estimators': 905, 'subsample': 0.9962385437166579, 'colsample_bytree': 0.9380213894542341, 'min_child_weight': 3, 'gamma': 0.35543120411603457}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.01, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=No

### Error analysis 

In [36]:
param =  {
    'max_depth': 8,
    'learning_rate': 0.044353992283905075,
    'n_estimators': 962,
    'subsample': 0.9161256648971945,
    'colsample_bytree': 0.7752581060503564,
    'min_child_weight': 3,
    'gamma': 1.782516264856805
}

model = xgb.XGBClassifier(
    **study.best_params,
    tree_method="hist"
)

In [37]:
model.fit(X_train, y_train)

In [38]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.55      0.60       339
           1       0.76      0.79      0.77       708
           2       0.73      0.74      0.73      1161
           3       0.74      0.86      0.79      1143
           4       0.63      0.81      0.71      1251
           5       0.54      0.27      0.36       209
           6       0.63      0.64      0.63      1094
           7       0.66      0.70      0.68      1217
           8       0.38      0.10      0.16       321
           9       0.29      0.12      0.17      1021
          10       0.50      0.57      0.53       740
          11       0.75      0.79      0.77      1235
          12       0.74      0.83      0.78      1187
          13       0.56      0.56      0.56      1132
          14       0.66      0.59      0.62       544
          15       0.70      0.82      0.75      1175
          16       0.73      0.77      0.75      1286
          17       0.46    

In [70]:
model.save_model('../models/xgb_v4.json')

In [68]:
type(y_test), type(y_pred)

(pandas.core.series.Series, numpy.ndarray)

In [61]:
indices = np.arange(len(y_test))

y_test_array = y_test.to_numpy()

missclassified_indices = indices[y_test != y_pred]
missclassified_samples = X_test[missclassified_indices]

for idx, sample in zip(missclassified_indices, missclassified_samples):
    print(f"\n\n===========Sample Index: {idx}============\n\n")
    print(f"True Label:{y_test_array[idx]}")
    print(f"Predicted Label: {y_pred[idx]}")
    print(f"Sample Data: {sample}\n")





True Label:15
Predicted Label: 6
Sample Data: [-0.13908093 -0.14457898 -0.12797779 -0.14222392 -0.13319853 -0.18499567
 -0.20856976 -0.38567183 -0.17948182 -0.19680505 -0.28827239 -0.12771494
 -0.41168933 -0.12954803 -0.16320786 -0.16277165 -0.24196358 -0.15795148
 -0.14340706 -0.1993695  -0.1601961  -0.16085056 -0.17375158 -0.13267836
 -0.15199265 -0.1891086  -0.17132276 -0.18564885 -0.15913934 -0.16756606
 -0.18223925 -0.15341039  2.05708714  9.78784621 -0.19287774 -0.23321828
 -0.14815948 -0.24788729 -0.18898315 -0.21668874 -0.20668534 -0.17509282
  7.18642669 -0.15099312 -0.15676132 -0.17582942 -0.22426307 -0.1476794
 -0.14676746 -0.33496981 -0.15995228 -0.18609043 -0.16754128 -0.14266724
 -0.18758728 -0.21422371 -0.16980777 -0.16256739 -0.16947577 -0.17823464
 -0.13902994 -0.17875841 -0.16690793 -0.21287434 -0.19854208 -0.17436856
 -0.13713111 -0.23002467 -0.15211049 -0.1404188  -0.23067168 -0.18788621
 -0.14862178 -0.14570461 -0.1795044  -0.16704693 -0.23222594 -0.15440686
 -

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

