## Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.insert(0, '..')

In [3]:
import mlflow

In [4]:
mlflow.set_tracking_uri('http://localhost:2002')

In [5]:
import numpy as np
import optuna
from optuna import Trial
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from tqdm.notebook import tqdm
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [6]:
from src.features.schedules import (
    reason_pipeline_steps,
    numeric_pipeline_steps
)
from src.data_prep_utils import chunk_transform, add_transformed_feature

In [7]:
features = pd.read_csv('../data/data_label_balanced.csv', index_col=0)

from sklearn.model_selection import train_test_split

In [8]:
features.head()

Unnamed: 0,partner_id,reason_combind,specialist_name
0,2,khám tiêu hóa,tiêu hoá
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch
4,17,cao huyết áp,tim mạch


In [9]:
features.isnull().sum()

partner_id          0
reason_combind     10
specialist_name     0
dtype: int64

In [10]:
features.shape

(82743, 3)

In [11]:
features = features.dropna(subset=["reason_combind"])

In [12]:
features.shape

(82733, 3)

In [13]:
label_encoder = LabelEncoder()
features['target'] = label_encoder.fit_transform(features['specialist_name'])

In [14]:
features.head()

Unnamed: 0,partner_id,reason_combind,specialist_name,target
0,2,khám tiêu hóa,tiêu hoá,15
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,1
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch,14
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,14
4,17,cao huyết áp,tim mạch,14


In [15]:
X = features.drop(columns=["specialist_name", "target"], axis=1)
y = features['target']

In [16]:
text_col = "reason_combind"
numeric_cols = ['partner_id']

tfm = [
    (
        "reason_combind",
        Pipeline(reason_pipeline_steps()),
        text_col
    ),
    (
        "numeric_pipeline",
        Pipeline(numeric_pipeline_steps()),
        numeric_cols
    )
]

preprocessor = ColumnTransformer(
    transformers=tfm, remainder="drop"
)

features_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        (
            "normalize",
            StandardScaler()
        ),
    ]
)

In [17]:
# fit the pipeline
fit_df = X.drop_duplicates(subset=["reason_combind"])
features_pipeline.fit(fit_df)

In [18]:
X = chunk_transform(
    X, features_pipeline, chunk_size=1000
)

Transforming chunks:   0%|          | 0/83 [00:00<?, ?it/s]

In [19]:
X[0], X.shape

(array([-0.13908093, -0.14457898, -0.12797779, -0.14222392, -0.13319853,
        -0.18499567, -0.20856976, -0.38567183, -0.17948182, -0.19680505,
        -0.28827239, -0.12771494, -0.41168933, -0.12954803, -0.16320786,
        -0.16277165, -0.24196358, -0.15795148, -0.14340706, -0.1993695 ,
        -0.1601961 , -0.16085056, -0.17375158, -0.13267836, -0.15199265,
        -0.1891086 , -0.17132276, -0.18564885,  8.84033458, -0.16756606,
        -0.18223925, -0.15341039,  2.1420994 , -0.14359169, -0.19287774,
        -0.23321828, -0.14815948, -0.24788729, -0.18898315, -0.21668874,
        -0.20668534, -0.17509282, -0.20703585, -0.15099312, -0.15676132,
        -0.17582942, -0.22426307, -0.1476794 , -0.14676746, -0.33496981,
        -0.15995228, -0.18609043, -0.16754128, -0.14266724, -0.18758728,
        -0.21422371, -0.16980777, -0.16256739, -0.16947577, -0.17823464,
        -0.13902994, -0.17875841, -0.16690793, -0.21287434, -0.19854208,
        -0.17436856, -0.13713111, -0.23002467, -0.1

In [21]:
features.shape, type(X)

((82733, 4), numpy.ndarray)

In [22]:
features

Unnamed: 0,partner_id,reason_combind,specialist_name,target
0,2,khám tiêu hóa,tiêu hoá,15
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,1
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch,14
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,14
4,17,cao huyết áp,tim mạch,14
...,...,...,...,...
154449,111,khám tuyến giáp,tiểu đường - nội tiết,16
154450,448,khám bằng lái xe b1,khám tổng quát,5
154451,111,bệnh nhiều thứ . nên đi làm tổng quát,nội khoa,8
154457,111,khám tuyến giáp,ung bướu,17


In [23]:
SEED=42

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [25]:
X_train.shape, y_train.shape

((66186, 129), (66186,))

## Random Forest

In [60]:
mlflow.set_experiment("random_forest_v3")

<Experiment: artifact_location='mlflow-artifacts:/929452482496634215', creation_time=1742457014915, experiment_id='929452482496634215', last_update_time=1742457014915, lifecycle_stage='active', name='random_forest_v3', tags={}>

In [61]:
def optimize_random_forest(trial: Trial):
    n_estimators = trial.suggest_int(name="n_estimators", low=50, high=500, step=100)
    max_features = trial.suggest_categorical(name="max_features", choices=['log2', 'sqrt'])
    max_depth = trial.suggest_int(name="max_depth", low=5, high=20, step=5)
    min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=10, step=2)
    min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)

    params = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf
    }

    model = RandomForestClassifier(random_state=SEED, **params)

    with mlflow.start_run():
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "n_estimators": n_estimators,
            "max_depth": max_depth
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1

In [62]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_random_forest, n_trials=10)

[I 2025-03-24 17:48:41,896] A new study created in memory with name: no-name-4c9d3227-212f-44f8-9317-85e62cfad700
[I 2025-03-24 17:49:57,971] Trial 0 finished with value: 0.7936751441203204 and parameters: {'n_estimators': 450, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run fortunate-eel-774 at: http://localhost:2002/#/experiments/929452482496634215/runs/542864bf7d8b4e41b6fc2558a68614aa
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:50:25,338] Trial 1 finished with value: 0.7340954240264566 and parameters: {'n_estimators': 250, 'max_features': 'log2', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run trusting-shrew-990 at: http://localhost:2002/#/experiments/929452482496634215/runs/068066c36c4d4c2e8c718dd8d731047c
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:51:15,266] Trial 2 finished with value: 0.7607901419291607 and parameters: {'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run resilient-hare-839 at: http://localhost:2002/#/experiments/929452482496634215/runs/805ca4edfb054fcc9c67724bbab3f267
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:52:20,146] Trial 3 finished with value: 0.7705462188120129 and parameters: {'n_estimators': 450, 'max_features': 'sqrt', 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run bald-robin-421 at: http://localhost:2002/#/experiments/929452482496634215/runs/a9e3bb29b365454da08811a6f6574795
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:53:00,247] Trial 4 finished with value: 0.7535308905275904 and parameters: {'n_estimators': 350, 'max_features': 'log2', 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run glamorous-loon-565 at: http://localhost:2002/#/experiments/929452482496634215/runs/b464ef4b9f9a45648848e708d7ce18c7
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:53:17,732] Trial 5 finished with value: 0.7531376048184056 and parameters: {'n_estimators': 150, 'max_features': 'log2', 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run industrious-tern-884 at: http://localhost:2002/#/experiments/929452482496634215/runs/9c2ea078b7bf4340a0f625a9d6fee3d5
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:54:22,524] Trial 6 finished with value: 0.7713007412144229 and parameters: {'n_estimators': 450, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run vaunted-crab-870 at: http://localhost:2002/#/experiments/929452482496634215/runs/d572f1a0993940b08b983232f82da918
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:54:29,555] Trial 7 finished with value: 0.765687653636794 and parameters: {'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run unleashed-shrike-770 at: http://localhost:2002/#/experiments/929452482496634215/runs/742c73f545194b6d8989b53940aee1a1
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:54:54,301] Trial 8 finished with value: 0.7944789991093926 and parameters: {'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 8 with value: 0.7944789991093926.


🏃 View run crawling-carp-784 at: http://localhost:2002/#/experiments/929452482496634215/runs/046ddb07237c4e93beef9d6602edc0dc
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:55:53,287] Trial 9 finished with value: 0.7919618985586342 and parameters: {'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 8 with value: 0.7944789991093926.


🏃 View run respected-sloth-569 at: http://localhost:2002/#/experiments/929452482496634215/runs/650a71bb94164a1e925e76ba79ea0364
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


In [30]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=4, state=TrialState.COMPLETE, values=[0.814727760781682], datetime_start=datetime.datetime(2025, 3, 24, 12, 51, 59, 801696), datetime_complete=datetime.datetime(2025, 3, 24, 12, 52, 30, 22030), params={'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=450, log=False, low=50, step=100), 'max_features': CategoricalDistribution(choices=('log2', 'sqrt')), 'max_depth': IntDistribution(high=20, log=False, low=5, step=5), 'min_samples_split': IntDistribution(high=10, log=False, low=2, step=2), 'min_samples_leaf': IntDistribution(high=4, log=False, low=1, step=1)}, trial_id=4, value=None)
Best hyperparameters: {'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}


## Logistic Regression

In [22]:
mlflow.set_experiment("linear_regression_v1")

2025/03/20 15:29:42 INFO mlflow.tracking.fluent: Experiment with name 'linear_regression_v1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/123102627901971667', creation_time=1742459382516, experiment_id='123102627901971667', last_update_time=1742459382516, lifecycle_stage='active', name='linear_regression_v1', tags={}>

In [24]:
def optimize_logistic_regression(trial: Trial):
    X, y = X_train, y_train

    solver = trial.suggest_categorical("solver", ["liblinear", "newton-cg", "lbfgs", "sag", "saga"])
    C = trial.suggest_uniform("C", 0.001, 100)

    params = {
        "solver": solver,
        "C": C
    }

    model = LogisticRegression(**params, random_state=SEED)

    with mlflow.start_run():
        model.fit(X, y)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "solver": solver,
            "C": C
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1

In [25]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_logistic_regression, n_trials=10)

[I 2025-03-20 15:30:13,136] A new study created in memory with name: no-name-fc6fa582-9dc1-4139-a131-32e44f27ff54
  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:30:37,903] Trial 0 finished with value: 0.7148514225936348 and parameters: {'solver': 'sag', 'C': 6.092308429423362}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run judicious-auk-578 at: http://localhost:2002/#/experiments/123102627901971667/runs/8f0324f1b2b743559cd5257e1d46af50
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:31:07,601] Trial 1 finished with value: 0.7147658294599712 and parameters: {'solver': 'saga', 'C': 31.961504878103675}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run worried-pug-572 at: http://localhost:2002/#/experiments/123102627901971667/runs/eb222d4a981d4f56bf96d2a21667fdd4
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:35:09,245] Trial 2 finished with value: 0.7044437078145703 and parameters: {'solver': 'liblinear', 'C': 94.7585783003262}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run beautiful-moose-862 at: http://localhost:2002/#/experiments/123102627901971667/runs/a3291a0c909541439a2dfea085aa8e7b
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2025-03-20 15:35:24,471] Trial 3 finished with value: 0.7148176073413588 and parameters: {'solver': 'lbfgs', 'C': 4.824384293432776}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run mysterious-crane-798 at: http://localhost:2002/#/experiments/123102627901971667/runs/bed02c8374c144e7b80ae6b9eb83234e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:39:03,346] Trial 4 finished with value: 0.7045246460953708 and parameters: {'solver': 'liblinear', 'C': 52.26484015177877}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run bemused-carp-798 at: http://localhost:2002/#/experiments/123102627901971667/runs/4c695b79408b4d7587b03062b3b09a8c
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:39:14,589] Trial 5 finished with value: 0.714620199656842 and parameters: {'solver': 'newton-cg', 'C': 74.64038215834337}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run big-cow-603 at: http://localhost:2002/#/experiments/123102627901971667/runs/84f2cfcffbc24d9aa0090033840ea2b6
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:42:41,992] Trial 6 finished with value: 0.7045165754601793 and parameters: {'solver': 'liblinear', 'C': 26.97552465816906}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run capricious-elk-248 at: http://localhost:2002/#/experiments/123102627901971667/runs/21807d435fcd4dc2b1db8a0c70b6758c
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:03,222] Trial 7 finished with value: 0.70440663421201 and parameters: {'solver': 'liblinear', 'C': 24.080164430543526}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run traveling-moose-289 at: http://localhost:2002/#/experiments/123102627901971667/runs/340e81dd860a4fecbddf75377377076e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:13,369] Trial 8 finished with value: 0.7146802847625534 and parameters: {'solver': 'newton-cg', 'C': 67.38174142434916}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run learned-zebra-952 at: http://localhost:2002/#/experiments/123102627901971667/runs/29076cf6530147fb978378bfe187f93a
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:45,764] Trial 9 finished with value: 0.7149226275282914 and parameters: {'solver': 'saga', 'C': 2.2712615130224454}. Best is trial 9 with value: 0.7149226275282914.


🏃 View run blushing-cub-533 at: http://localhost:2002/#/experiments/123102627901971667/runs/550b877cec054f53a1aba272e1dcfb3e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


## Support Vector Machine

In [37]:
mlflow.set_experiment("support_vector_machine")

2025/03/20 09:47:20 INFO mlflow.tracking.fluent: Experiment with name 'support_vector_machine' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/862488884265769758', creation_time=1742438840703, experiment_id='862488884265769758', last_update_time=1742438840703, lifecycle_stage='active', name='support_vector_machine', tags={}>

In [70]:
def optimize_support_vector_machine(trial: Trial):
    X, y = X_train, y_train
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    C = trial.suggest_int("C", 1, 1000, step=99)  # Ensures divisibility

    params = {
        "kernel": kernel,
        "gamma": gamma,
        "C": C
    }

    model = SVC(kernel=kernel, gamma=gamma, C=C, random_state=SEED)
    model_pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", model)
        ]
    )

    with mlflow.start_run():
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "kernel": kernel,
            "gamma": gamma,
            "C": C
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1
        

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_support_vector_machine, n_trials=10)

[I 2025-03-20 10:58:39,341] A new study created in memory with name: no-name-09d0461b-a888-406c-8a7a-2603efe79ea1
[I 2025-03-20 11:05:46,413] Trial 0 finished with value: 0.7573041992698557 and parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}. Best is trial 0 with value: 0.7573041992698557.


🏃 View run shivering-wasp-674 at: http://localhost:2002/#/experiments/862488884265769758/runs/3b01b73235b546bfbcedea08581988bc
🧪 View experiment at: http://localhost:2002/#/experiments/862488884265769758




### XGboost

#### preparing dataset for xgboost

for multi-class classification, our target variable must take values in ${\{0,1, ..., K\}}$. However, from the histogram of cover type above, we see that it takes values not continous. to fix this we can use the `scikit-learn label encoder` to create a valid target column.

In [23]:
mlflow.set_experiment("xgboost")

<Experiment: artifact_location='mlflow-artifacts:/608416825633838197', creation_time=1742467850396, experiment_id='608416825633838197', last_update_time=1742467850396, lifecycle_stage='active', name='xgboost', tags={}>

In [40]:
def optimize_xgboost_model(trial: Trial):
    X, y = X_train, y_train
    X, X_valid, y, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    param = {
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }
    model = xgb.XGBClassifier(**param,
                             tree_method="hist")
    
    with mlflow.start_run():
        bst = model.fit(X, y, eval_set=[(X, y), (X_valid, y_valid)], verbose=2)
        preds = bst.predict(X_test)
        y_pred = np.rint(preds)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params(
            param
        )
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return accuracy

In [41]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_xgboost_model, n_trials=10, timeout=600)

[I 2025-03-25 09:01:59,735] A new study created in memory with name: no-name-fdd34d5f-be1b-4e8d-8b80-bd1aae578200


[0]	validation_0-mlogloss:2.76523	validation_1-mlogloss:2.76501
[2]	validation_0-mlogloss:2.50275	validation_1-mlogloss:2.50071
[4]	validation_0-mlogloss:2.32166	validation_1-mlogloss:2.31999
[6]	validation_0-mlogloss:2.17952	validation_1-mlogloss:2.17757
[8]	validation_0-mlogloss:2.06658	validation_1-mlogloss:2.06459
[10]	validation_0-mlogloss:1.96894	validation_1-mlogloss:1.96643
[12]	validation_0-mlogloss:1.88638	validation_1-mlogloss:1.88428
[14]	validation_0-mlogloss:1.81166	validation_1-mlogloss:1.81023
[16]	validation_0-mlogloss:1.74714	validation_1-mlogloss:1.74623
[18]	validation_0-mlogloss:1.68881	validation_1-mlogloss:1.68784
[20]	validation_0-mlogloss:1.63579	validation_1-mlogloss:1.63454
[22]	validation_0-mlogloss:1.58822	validation_1-mlogloss:1.58704
[24]	validation_0-mlogloss:1.54471	validation_1-mlogloss:1.54365
[26]	validation_0-mlogloss:1.50487	validation_1-mlogloss:1.50424
[28]	validation_0-mlogloss:1.46652	validation_1-mlogloss:1.46626
[30]	validation_0-mlogloss:1.4

[I 2025-03-25 09:02:12,767] Trial 0 finished with value: 0.7722850063455611 and parameters: {'max_depth': 3, 'learning_rate': 0.053449299114390864, 'n_estimators': 131, 'subsample': 0.5928599689610272, 'colsample_bytree': 0.8629166414873133, 'min_child_weight': 10, 'gamma': 2.7911131800973066}. Best is trial 0 with value: 0.7722850063455611.


🏃 View run unique-fox-163 at: http://localhost:2002/#/experiments/608416825633838197/runs/5f5dce9a26ac4db38086f0e90a7445f5
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.73874	validation_1-mlogloss:2.74176
[2]	validation_0-mlogloss:2.43742	validation_1-mlogloss:2.44291
[4]	validation_0-mlogloss:2.23371	validation_1-mlogloss:2.24242
[6]	validation_0-mlogloss:2.06473	validation_1-mlogloss:2.07541
[8]	validation_0-mlogloss:1.92420	validation_1-mlogloss:1.93725
[10]	validation_0-mlogloss:1.80606	validation_1-mlogloss:1.82088
[12]	validation_0-mlogloss:1.70595	validation_1-mlogloss:1.72274
[14]	validation_0-mlogloss:1.61797	validation_1-mlogloss:1.63686
[16]	validation_0-mlogloss:1.54290	validation_1-mlogloss:1.56290
[18]	validation_0-mlogloss:1.47101	validation_1-mlogloss:1.49195
[20]	validation_0-mlogloss:1.40905	validation_1-mlogloss:1.43125
[22]	validation_0-mlogloss:1.35057	validation_1-mlogloss:1.37412
[24]	validation_0-mloglos

[I 2025-03-25 09:03:58,540] Trial 1 finished with value: 0.837855804677585 and parameters: {'max_depth': 8, 'learning_rate': 0.044353992283905075, 'n_estimators': 962, 'subsample': 0.9161256648971945, 'colsample_bytree': 0.7752581060503564, 'min_child_weight': 3, 'gamma': 1.782516264856805}. Best is trial 1 with value: 0.837855804677585.


🏃 View run capricious-eel-160 at: http://localhost:2002/#/experiments/608416825633838197/runs/9974a6d6857845908afda466274389f7
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.66277	validation_1-mlogloss:2.66451
[2]	validation_0-mlogloss:2.29807	validation_1-mlogloss:2.29878
[4]	validation_0-mlogloss:2.07265	validation_1-mlogloss:2.07325
[6]	validation_0-mlogloss:1.90297	validation_1-mlogloss:1.90423
[8]	validation_0-mlogloss:1.76601	validation_1-mlogloss:1.76772
[10]	validation_0-mlogloss:1.65499	validation_1-mlogloss:1.65703
[12]	validation_0-mlogloss:1.56299	validation_1-mlogloss:1.56557
[14]	validation_0-mlogloss:1.48269	validation_1-mlogloss:1.48574
[16]	validation_0-mlogloss:1.41562	validation_1-mlogloss:1.41929
[18]	validation_0-mlogloss:1.35633	validation_1-mlogloss:1.36046
[20]	validation_0-mlogloss:1.30307	validation_1-mlogloss:1.30767
[22]	validation_0-mlogloss:1.25583	validation_1-mlogloss:1.26068
[24]	validation_0-mlo

[I 2025-03-25 09:05:06,046] Trial 2 finished with value: 0.8238955701939928 and parameters: {'max_depth': 4, 'learning_rate': 0.07545709490030106, 'n_estimators': 698, 'subsample': 0.9209904645816166, 'colsample_bytree': 0.8592852448815524, 'min_child_weight': 8, 'gamma': 3.5932998321841407}. Best is trial 1 with value: 0.837855804677585.


🏃 View run upset-doe-211 at: http://localhost:2002/#/experiments/608416825633838197/runs/d841e01065be4753b90b68d224c5ac69
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.72424	validation_1-mlogloss:2.72648
[2]	validation_0-mlogloss:2.41574	validation_1-mlogloss:2.41931
[4]	validation_0-mlogloss:2.19541	validation_1-mlogloss:2.20055
[6]	validation_0-mlogloss:2.02849	validation_1-mlogloss:2.03534
[8]	validation_0-mlogloss:1.89504	validation_1-mlogloss:1.90311
[10]	validation_0-mlogloss:1.78112	validation_1-mlogloss:1.79028
[12]	validation_0-mlogloss:1.68373	validation_1-mlogloss:1.69360
[14]	validation_0-mlogloss:1.59757	validation_1-mlogloss:1.60857
[16]	validation_0-mlogloss:1.52345	validation_1-mlogloss:1.53497
[18]	validation_0-mlogloss:1.45560	validation_1-mlogloss:1.46776
[20]	validation_0-mlogloss:1.39474	validation_1-mlogloss:1.40759
[22]	validation_0-mlogloss:1.34005	validation_1-mlogloss:1.35348
[24]	validation_0-mlogloss

[I 2025-03-25 09:06:10,586] Trial 3 finished with value: 0.824862512842207 and parameters: {'max_depth': 7, 'learning_rate': 0.04687774527635741, 'n_estimators': 575, 'subsample': 0.7636293955268851, 'colsample_bytree': 0.9319601118832399, 'min_child_weight': 5, 'gamma': 4.30409226520721}. Best is trial 1 with value: 0.837855804677585.


🏃 View run crawling-sow-224 at: http://localhost:2002/#/experiments/608416825633838197/runs/f25e5209df1240a996c0d2d360611431
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.73398	validation_1-mlogloss:2.73604
[2]	validation_0-mlogloss:2.42629	validation_1-mlogloss:2.42973
[4]	validation_0-mlogloss:2.21840	validation_1-mlogloss:2.22382
[6]	validation_0-mlogloss:2.04717	validation_1-mlogloss:2.05360
[8]	validation_0-mlogloss:1.90599	validation_1-mlogloss:1.91331
[10]	validation_0-mlogloss:1.78951	validation_1-mlogloss:1.79802
[12]	validation_0-mlogloss:1.69113	validation_1-mlogloss:1.70041
[14]	validation_0-mlogloss:1.60572	validation_1-mlogloss:1.61587
[16]	validation_0-mlogloss:1.53297	validation_1-mlogloss:1.54363
[18]	validation_0-mlogloss:1.46305	validation_1-mlogloss:1.47400
[20]	validation_0-mlogloss:1.40300	validation_1-mlogloss:1.41438
[22]	validation_0-mlogloss:1.34526	validation_1-mlogloss:1.35740
[24]	validation_0-mlogl

[I 2025-03-25 09:07:15,992] Trial 4 finished with value: 0.8246207771801535 and parameters: {'max_depth': 8, 'learning_rate': 0.04853352516671868, 'n_estimators': 652, 'subsample': 0.5511228688622793, 'colsample_bytree': 0.7871137696422328, 'min_child_weight': 5, 'gamma': 4.9480156015984464}. Best is trial 1 with value: 0.837855804677585.


🏃 View run kindly-toad-667 at: http://localhost:2002/#/experiments/608416825633838197/runs/1eab873fb3c44d92af1fbb157761c3bc
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.77364	validation_1-mlogloss:2.77486
[2]	validation_0-mlogloss:2.50867	validation_1-mlogloss:2.51037
[4]	validation_0-mlogloss:2.32237	validation_1-mlogloss:2.32541
[6]	validation_0-mlogloss:2.16145	validation_1-mlogloss:2.16579
[8]	validation_0-mlogloss:2.03226	validation_1-mlogloss:2.03712
[10]	validation_0-mlogloss:1.92092	validation_1-mlogloss:1.92614
[12]	validation_0-mlogloss:1.82689	validation_1-mlogloss:1.83266
[14]	validation_0-mlogloss:1.74351	validation_1-mlogloss:1.74992
[16]	validation_0-mlogloss:1.67161	validation_1-mlogloss:1.67871
[18]	validation_0-mlogloss:1.60345	validation_1-mlogloss:1.61081
[20]	validation_0-mlogloss:1.54313	validation_1-mlogloss:1.55083
[22]	validation_0-mlogloss:1.48522	validation_1-mlogloss:1.49358
[24]	validation_0-mloglo

[I 2025-03-25 09:09:10,017] Trial 5 finished with value: 0.8311476400555992 and parameters: {'max_depth': 7, 'learning_rate': 0.04142047888752727, 'n_estimators': 882, 'subsample': 0.5289378454288168, 'colsample_bytree': 0.808775654846073, 'min_child_weight': 10, 'gamma': 2.6683732728119445}. Best is trial 1 with value: 0.837855804677585.


🏃 View run learned-finch-983 at: http://localhost:2002/#/experiments/608416825633838197/runs/d1b67abce16746499d2e4ec5315291e5
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.73034	validation_1-mlogloss:2.73213
[2]	validation_0-mlogloss:2.43335	validation_1-mlogloss:2.43508
[4]	validation_0-mlogloss:2.22705	validation_1-mlogloss:2.22928
[6]	validation_0-mlogloss:2.07112	validation_1-mlogloss:2.07408
[8]	validation_0-mlogloss:1.94239	validation_1-mlogloss:1.94561
[10]	validation_0-mlogloss:1.83083	validation_1-mlogloss:1.83466
[12]	validation_0-mlogloss:1.73689	validation_1-mlogloss:1.74181
[14]	validation_0-mlogloss:1.65191	validation_1-mlogloss:1.65765
[16]	validation_0-mlogloss:1.58124	validation_1-mlogloss:1.58793
[18]	validation_0-mlogloss:1.51606	validation_1-mlogloss:1.52348
[20]	validation_0-mlogloss:1.45623	validation_1-mlogloss:1.46437
[22]	validation_0-mlogloss:1.40274	validation_1-mlogloss:1.41151
[24]	validation_0-mlog

[I 2025-03-25 09:09:37,604] Trial 6 finished with value: 0.8048588868072762 and parameters: {'max_depth': 5, 'learning_rate': 0.04908513540697597, 'n_estimators': 112, 'subsample': 0.9278001353470964, 'colsample_bytree': 0.9119347637921873, 'min_child_weight': 2, 'gamma': 1.913223708661096}. Best is trial 1 with value: 0.837855804677585.


🏃 View run likeable-stork-654 at: http://localhost:2002/#/experiments/608416825633838197/runs/21e898fb71ef4471a937f2827a8ca1d5
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.70241	validation_1-mlogloss:2.70426
[2]	validation_0-mlogloss:2.35401	validation_1-mlogloss:2.35666
[4]	validation_0-mlogloss:2.14081	validation_1-mlogloss:2.14517
[6]	validation_0-mlogloss:1.96876	validation_1-mlogloss:1.97352
[8]	validation_0-mlogloss:1.82287	validation_1-mlogloss:1.82840
[10]	validation_0-mlogloss:1.70868	validation_1-mlogloss:1.71481
[12]	validation_0-mlogloss:1.60726	validation_1-mlogloss:1.61385
[14]	validation_0-mlogloss:1.52641	validation_1-mlogloss:1.53370
[16]	validation_0-mlogloss:1.45334	validation_1-mlogloss:1.46157
[18]	validation_0-mlogloss:1.38273	validation_1-mlogloss:1.39175
[20]	validation_0-mlogloss:1.32029	validation_1-mlogloss:1.33035
[22]	validation_0-mlogloss:1.26174	validation_1-mlogloss:1.27285
[24]	validation_0-mlo

[I 2025-03-25 09:10:51,577] Trial 7 finished with value: 0.8324771861968937 and parameters: {'max_depth': 7, 'learning_rate': 0.06132738041800587, 'n_estimators': 558, 'subsample': 0.6436122774026433, 'colsample_bytree': 0.677933784141419, 'min_child_weight': 9, 'gamma': 2.59055661956616}. Best is trial 1 with value: 0.837855804677585.


🏃 View run smiling-tern-609 at: http://localhost:2002/#/experiments/608416825633838197/runs/f5bd031e415c45b4ac276755266aa896
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.55398	validation_1-mlogloss:2.55577
[2]	validation_0-mlogloss:2.11426	validation_1-mlogloss:2.11650
[4]	validation_0-mlogloss:1.86727	validation_1-mlogloss:1.87171
[6]	validation_0-mlogloss:1.67967	validation_1-mlogloss:1.68508
[8]	validation_0-mlogloss:1.52961	validation_1-mlogloss:1.53583
[10]	validation_0-mlogloss:1.41216	validation_1-mlogloss:1.41946
[12]	validation_0-mlogloss:1.31926	validation_1-mlogloss:1.32687
[14]	validation_0-mlogloss:1.24148	validation_1-mlogloss:1.24959
[16]	validation_0-mlogloss:1.17637	validation_1-mlogloss:1.18522
[18]	validation_0-mlogloss:1.11587	validation_1-mlogloss:1.12490
[20]	validation_0-mlogloss:1.06511	validation_1-mlogloss:1.07481
[22]	validation_0-mlogloss:1.01644	validation_1-mlogloss:1.02735
[24]	validation_0-mlogl

[I 2025-03-25 09:11:40,407] Trial 8 finished with value: 0.8233516649543724 and parameters: {'max_depth': 6, 'learning_rate': 0.09991888378805232, 'n_estimators': 667, 'subsample': 0.5806190621130847, 'colsample_bytree': 0.7233936027656257, 'min_child_weight': 7, 'gamma': 4.88071790820351}. Best is trial 1 with value: 0.837855804677585.


🏃 View run upset-yak-453 at: http://localhost:2002/#/experiments/608416825633838197/runs/c404026906094231a03b593ece6f11df
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197
[0]	validation_0-mlogloss:2.89983	validation_1-mlogloss:2.90006
[2]	validation_0-mlogloss:2.81446	validation_1-mlogloss:2.81469
[4]	validation_0-mlogloss:2.74242	validation_1-mlogloss:2.74280
[6]	validation_0-mlogloss:2.67509	validation_1-mlogloss:2.67547
[8]	validation_0-mlogloss:2.61077	validation_1-mlogloss:2.61090
[10]	validation_0-mlogloss:2.55233	validation_1-mlogloss:2.55248
[12]	validation_0-mlogloss:2.49904	validation_1-mlogloss:2.49935
[14]	validation_0-mlogloss:2.45045	validation_1-mlogloss:2.45056
[16]	validation_0-mlogloss:2.40480	validation_1-mlogloss:2.40514
[18]	validation_0-mlogloss:2.35972	validation_1-mlogloss:2.36027
[20]	validation_0-mlogloss:2.31843	validation_1-mlogloss:2.31885
[22]	validation_0-mlogloss:2.27758	validation_1-mlogloss:2.27825
[24]	validation_0-mlogloss

[I 2025-03-25 09:13:16,656] Trial 9 finished with value: 0.8014141536230133 and parameters: {'max_depth': 4, 'learning_rate': 0.013344432832836838, 'n_estimators': 647, 'subsample': 0.5012484317183987, 'colsample_bytree': 0.648807703955902, 'min_child_weight': 9, 'gamma': 0.5993926562032714}. Best is trial 1 with value: 0.837855804677585.


🏃 View run traveling-wren-433 at: http://localhost:2002/#/experiments/608416825633838197/runs/7b8e4d7dddd0406aaa714ce93aa0e873
🧪 View experiment at: http://localhost:2002/#/experiments/608416825633838197


In [43]:
 study.best_trial, study.best_params

(FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.837855804677585], datetime_start=datetime.datetime(2025, 3, 25, 9, 2, 12, 768021), datetime_complete=datetime.datetime(2025, 3, 25, 9, 3, 58, 540158), params={'max_depth': 8, 'learning_rate': 0.044353992283905075, 'n_estimators': 962, 'subsample': 0.9161256648971945, 'colsample_bytree': 0.7752581060503564, 'min_child_weight': 3, 'gamma': 1.782516264856805}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.01, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None)},

### Error analysis 

In [26]:
param =  {
    'max_depth': 8,
    'learning_rate': 0.044353992283905075,
    'n_estimators': 962,
    'subsample': 0.9161256648971945,
    'colsample_bytree': 0.7752581060503564,
    'min_child_weight': 3,
    'gamma': 1.782516264856805
}

model = xgb.XGBClassifier(
    **study.best_params,
    tree_method="hist"
)

NameError: name 'study' is not defined

In [48]:
model.fit(X_train, y_train)

In [57]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       713
           1       0.89      0.87      0.88      1198
           2       0.90      0.89      0.90      1165
           3       0.98      0.97      0.97      1211
           4       0.76      0.74      0.75       214
           5       0.87      0.86      0.87      1091
           6       0.80      0.89      0.84      1250
           7       0.63      0.36      0.46       305
           8       0.65      0.67      0.66       990
           9       0.81      0.77      0.79       816
          10       0.93      0.94      0.93      1166
          11       0.90      0.95      0.92      1195
          12       0.76      0.78      0.77      1219
          13       0.78      0.65      0.71       482
          14       0.85      0.89      0.87      1220
          15       0.90      0.90      0.90      1245
          16       0.77      0.57      0.66       422
          17       0.62    

In [59]:
type(y_test), type(y_pred)

(pandas.core.series.Series, numpy.ndarray)

In [61]:
indices = np.arange(len(y_test))

y_test_array = y_test.to_numpy()

missclassified_indices = indices[y_test != y_pred]
missclassified_samples = X_test[missclassified_indices]

for idx, sample in zip(missclassified_indices, missclassified_samples):
    print(f"\n\n===========Sample Index: {idx}============\n\n")
    print(f"True Label:{y_test_array[idx]}")
    print(f"Predicted Label: {y_pred[idx]}")
    print(f"Sample Data: {sample}\n")





True Label:15
Predicted Label: 6
Sample Data: [-0.13908093 -0.14457898 -0.12797779 -0.14222392 -0.13319853 -0.18499567
 -0.20856976 -0.38567183 -0.17948182 -0.19680505 -0.28827239 -0.12771494
 -0.41168933 -0.12954803 -0.16320786 -0.16277165 -0.24196358 -0.15795148
 -0.14340706 -0.1993695  -0.1601961  -0.16085056 -0.17375158 -0.13267836
 -0.15199265 -0.1891086  -0.17132276 -0.18564885 -0.15913934 -0.16756606
 -0.18223925 -0.15341039  2.05708714  9.78784621 -0.19287774 -0.23321828
 -0.14815948 -0.24788729 -0.18898315 -0.21668874 -0.20668534 -0.17509282
  7.18642669 -0.15099312 -0.15676132 -0.17582942 -0.22426307 -0.1476794
 -0.14676746 -0.33496981 -0.15995228 -0.18609043 -0.16754128 -0.14266724
 -0.18758728 -0.21422371 -0.16980777 -0.16256739 -0.16947577 -0.17823464
 -0.13902994 -0.17875841 -0.16690793 -0.21287434 -0.19854208 -0.17436856
 -0.13713111 -0.23002467 -0.15211049 -0.1404188  -0.23067168 -0.18788621
 -0.14862178 -0.14570461 -0.1795044  -0.16704693 -0.23222594 -0.15440686
 -

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

