## Set up

In [32]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
import os
import sys

sys.path.insert(0, '..')

In [34]:
import mlflow

In [35]:
mlflow.set_tracking_uri('http://localhost:2002')

In [36]:
import numpy as np
import optuna
from optuna import Trial
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from tqdm.notebook import tqdm
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [37]:
from src.features.schedules import (
    reason_pipeline_steps,
    numeric_pipeline_steps
)
from src.data_prep_utils import chunk_transform, add_transformed_feature

In [38]:
features = pd.read_csv('../data/data_label_balanced.csv', index_col=0)

from sklearn.model_selection import train_test_split

In [39]:
features.head()

Unnamed: 0,partner_id,reason_combind,specialist_name
0,2,khám tiêu hóa,tiêu hoá
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch
4,17,cao huyết áp,tim mạch


In [40]:
features.isnull().sum()

partner_id          0
reason_combind     10
specialist_name     0
dtype: int64

In [41]:
features.shape

(84429, 3)

In [42]:
features = features.dropna(subset=["reason_combind"])

In [43]:
features.shape

(84419, 3)

In [44]:
label_encoder = LabelEncoder()
features['target'] = label_encoder.fit_transform(features['specialist_name'])

In [45]:
 0.59      0.38      0.46       321features['target'].unique()

array([16,  2, 15, 13, 12,  8, 10,  3,  9, 18,  7,  4, 11,  0,  1, 14,  6,
       17,  5, 19])

In [46]:
for idx, label in zip(features['target'].unique(), label_encoder.inverse_transform(features['target'].unique())):
    print(f"idx : {idx}, label: {label}")


idx : 16, label: tiêu hoá
idx : 2, label: cơ xương khớp
idx : 15, label: tim mạch
idx : 13, label: thần kinh
idx : 12, label: tai mũi họng
idx : 8, label: nhi khoa
idx : 10, label: sản phụ khoa
idx : 3, label: cột sống
idx : 9, label: nội khoa
idx : 18, label: ung bướu
idx : 7, label: nam học
idx : 4, label: da liễu
idx : 11, label: sức khỏe tâm thần
idx : 0, label: bệnh viêm gan
idx : 1, label: chuyên khoa mắt
idx : 14, label: thận - tiết niệu
idx : 6, label: khám tổng quát
idx : 17, label: tiểu đường - nội tiết
idx : 5, label: hô hấp - phổi
idx : 19, label: vô sinh - hiếm muộn


In [47]:
features

Unnamed: 0,partner_id,reason_combind,specialist_name,target
0,2,khám tiêu hóa,tiêu hoá,16
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,2
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch,15
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,15
4,17,cao huyết áp,tim mạch,15
...,...,...,...,...
154449,111,khám tuyến giáp,tiểu đường - nội tiết,17
154450,448,khám bằng lái xe b1,khám tổng quát,6
154451,111,bệnh nhiều thứ . nên đi làm tổng quát,nội khoa,9
154457,111,khám tuyến giáp,ung bướu,18


In [48]:
X = features.drop(columns=["specialist_name", "target"], axis=1)
y = features['target']

In [50]:
text_col = "reason_combind"
numeric_cols = ['partner_id']

tfm = [
    (
        "reason_combind",
        Pipeline(reason_pipeline_steps()),
        text_col
    ),
    (
        "numeric_pipeline",
        Pipeline(numeric_pipeline_steps()),
        numeric_cols
    )
]

preprocessor = ColumnTransformer(
    transformers=tfm, remainder="drop"
)

features_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        (
            "normalize",
            StandardScaler()
        ),
    ]
)

In [51]:
# fit the pipeline
fit_df = X.drop_duplicates(subset=["reason_combind"])
features_pipeline.fit(fit_df)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
X_train = chunk_transform(
    X_train, features_pipeline, chunk_size=1000
)

Transforming chunks:   0%|          | 0/68 [00:00<?, ?it/s]

In [54]:
X_test = chunk_transform(
    X_test, features_pipeline, chunk_size=1000
)

Transforming chunks:   0%|          | 0/17 [00:00<?, ?it/s]

In [55]:
X_train[0], X_train.shape

(array([-0.13890339, -0.14448854, -0.12819779, -0.14149086, -0.18393909,
        -0.21103964,  2.65117188, -0.18033488, -0.19534263, -0.13772154,
         3.85699349, -0.12693845, -0.40978609, -0.12894015, -0.16174901,
        -0.16131506,  4.54061158, -0.15769074, -0.14248021, -0.19866107,
        -0.16062212, -0.19021617, -0.1725176 , -0.13170129, -0.15073873,
        -0.18780466, -0.17122757, -0.18431416, -0.15930095, -0.16687182,
        -0.18105289, -0.15320894, -0.42703218, -0.14414422, -0.19201712,
        -0.23201843, -0.14758675, -0.2465885 , -0.18750779, -0.21842241,
        -0.208667  , -0.1735596 , -0.20701925, -0.15002035, -0.15532614,
        -0.17446311, -0.2225547 , -0.14685893, -0.14846019, -0.3348588 ,
        -0.15881073, -0.18634897, -0.1676147 , -0.14297763, -0.18855394,
        -0.21360297, -0.16873792, -0.16135206, -0.16995917, -0.17702909,
        -0.13969238, -0.17799106, -0.16742549, -0.21131531,  5.57187561,
        -0.17311749, -0.13748692, -0.22854313, -0.1

In [56]:
features.shape, type(X)

((84419, 4), pandas.core.frame.DataFrame)

In [57]:
features

Unnamed: 0,partner_id,reason_combind,specialist_name,target
0,2,khám tiêu hóa,tiêu hoá,16
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,2
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch,15
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,15
4,17,cao huyết áp,tim mạch,15
...,...,...,...,...
154449,111,khám tuyến giáp,tiểu đường - nội tiết,17
154450,448,khám bằng lái xe b1,khám tổng quát,6
154451,111,bệnh nhiều thứ . nên đi làm tổng quát,nội khoa,9
154457,111,khám tuyến giáp,ung bướu,18


In [58]:
SEED=42

In [59]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [60]:
X_train.shape, y_train.shape

((67535, 129), (67535,))

## Random Forest

In [60]:
mlflow.set_experiment("random_forest_v3")

<Experiment: artifact_location='mlflow-artifacts:/929452482496634215', creation_time=1742457014915, experiment_id='929452482496634215', last_update_time=1742457014915, lifecycle_stage='active', name='random_forest_v3', tags={}>

In [61]:
def optimize_random_forest(trial: Trial):
    n_estimators = trial.suggest_int(name="n_estimators", low=50, high=500, step=100)
    max_features = trial.suggest_categorical(name="max_features", choices=['log2', 'sqrt'])
    max_depth = trial.suggest_int(name="max_depth", low=5, high=20, step=5)
    min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=10, step=2)
    min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)

    params = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf
    }

    model = RandomForestClassifier(random_state=SEED, **params)

    with mlflow.start_run():
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "n_estimators": n_estimators,
            "max_depth": max_depth
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1

In [62]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_random_forest, n_trials=10)

[I 2025-03-24 17:48:41,896] A new study created in memory with name: no-name-4c9d3227-212f-44f8-9317-85e62cfad700
[I 2025-03-24 17:49:57,971] Trial 0 finished with value: 0.7936751441203204 and parameters: {'n_estimators': 450, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run fortunate-eel-774 at: http://localhost:2002/#/experiments/929452482496634215/runs/542864bf7d8b4e41b6fc2558a68614aa
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:50:25,338] Trial 1 finished with value: 0.7340954240264566 and parameters: {'n_estimators': 250, 'max_features': 'log2', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run trusting-shrew-990 at: http://localhost:2002/#/experiments/929452482496634215/runs/068066c36c4d4c2e8c718dd8d731047c
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:51:15,266] Trial 2 finished with value: 0.7607901419291607 and parameters: {'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run resilient-hare-839 at: http://localhost:2002/#/experiments/929452482496634215/runs/805ca4edfb054fcc9c67724bbab3f267
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:52:20,146] Trial 3 finished with value: 0.7705462188120129 and parameters: {'n_estimators': 450, 'max_features': 'sqrt', 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run bald-robin-421 at: http://localhost:2002/#/experiments/929452482496634215/runs/a9e3bb29b365454da08811a6f6574795
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:53:00,247] Trial 4 finished with value: 0.7535308905275904 and parameters: {'n_estimators': 350, 'max_features': 'log2', 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run glamorous-loon-565 at: http://localhost:2002/#/experiments/929452482496634215/runs/b464ef4b9f9a45648848e708d7ce18c7
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:53:17,732] Trial 5 finished with value: 0.7531376048184056 and parameters: {'n_estimators': 150, 'max_features': 'log2', 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run industrious-tern-884 at: http://localhost:2002/#/experiments/929452482496634215/runs/9c2ea078b7bf4340a0f625a9d6fee3d5
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:54:22,524] Trial 6 finished with value: 0.7713007412144229 and parameters: {'n_estimators': 450, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run vaunted-crab-870 at: http://localhost:2002/#/experiments/929452482496634215/runs/d572f1a0993940b08b983232f82da918
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:54:29,555] Trial 7 finished with value: 0.765687653636794 and parameters: {'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7936751441203204.


🏃 View run unleashed-shrike-770 at: http://localhost:2002/#/experiments/929452482496634215/runs/742c73f545194b6d8989b53940aee1a1
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:54:54,301] Trial 8 finished with value: 0.7944789991093926 and parameters: {'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 8 with value: 0.7944789991093926.


🏃 View run crawling-carp-784 at: http://localhost:2002/#/experiments/929452482496634215/runs/046ddb07237c4e93beef9d6602edc0dc
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


[I 2025-03-24 17:55:53,287] Trial 9 finished with value: 0.7919618985586342 and parameters: {'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 8 with value: 0.7944789991093926.


🏃 View run respected-sloth-569 at: http://localhost:2002/#/experiments/929452482496634215/runs/650a71bb94164a1e925e76ba79ea0364
🧪 View experiment at: http://localhost:2002/#/experiments/929452482496634215


In [30]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=4, state=TrialState.COMPLETE, values=[0.814727760781682], datetime_start=datetime.datetime(2025, 3, 24, 12, 51, 59, 801696), datetime_complete=datetime.datetime(2025, 3, 24, 12, 52, 30, 22030), params={'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=450, log=False, low=50, step=100), 'max_features': CategoricalDistribution(choices=('log2', 'sqrt')), 'max_depth': IntDistribution(high=20, log=False, low=5, step=5), 'min_samples_split': IntDistribution(high=10, log=False, low=2, step=2), 'min_samples_leaf': IntDistribution(high=4, log=False, low=1, step=1)}, trial_id=4, value=None)
Best hyperparameters: {'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}


## Logistic Regression

In [22]:
mlflow.set_experiment("linear_regression_v1")

2025/03/20 15:29:42 INFO mlflow.tracking.fluent: Experiment with name 'linear_regression_v1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/123102627901971667', creation_time=1742459382516, experiment_id='123102627901971667', last_update_time=1742459382516, lifecycle_stage='active', name='linear_regression_v1', tags={}>

In [24]:
def optimize_logistic_regression(trial: Trial):
    X, y = X_train, y_train

    solver = trial.suggest_categorical("solver", ["liblinear", "newton-cg", "lbfgs", "sag", "saga"])
    C = trial.suggest_uniform("C", 0.001, 100)

    params = {
        "solver": solver,
        "C": C
    }

    model = LogisticRegression(**params, random_state=SEED)

    with mlflow.start_run():
        model.fit(X, y)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "solver": solver,
            "C": C
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1

In [25]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_logistic_regression, n_trials=10)

[I 2025-03-20 15:30:13,136] A new study created in memory with name: no-name-fc6fa582-9dc1-4139-a131-32e44f27ff54
  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:30:37,903] Trial 0 finished with value: 0.7148514225936348 and parameters: {'solver': 'sag', 'C': 6.092308429423362}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run judicious-auk-578 at: http://localhost:2002/#/experiments/123102627901971667/runs/8f0324f1b2b743559cd5257e1d46af50
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:31:07,601] Trial 1 finished with value: 0.7147658294599712 and parameters: {'solver': 'saga', 'C': 31.961504878103675}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run worried-pug-572 at: http://localhost:2002/#/experiments/123102627901971667/runs/eb222d4a981d4f56bf96d2a21667fdd4
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:35:09,245] Trial 2 finished with value: 0.7044437078145703 and parameters: {'solver': 'liblinear', 'C': 94.7585783003262}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run beautiful-moose-862 at: http://localhost:2002/#/experiments/123102627901971667/runs/a3291a0c909541439a2dfea085aa8e7b
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2025-03-20 15:35:24,471] Trial 3 finished with value: 0.7148176073413588 and parameters: {'solver': 'lbfgs', 'C': 4.824384293432776}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run mysterious-crane-798 at: http://localhost:2002/#/experiments/123102627901971667/runs/bed02c8374c144e7b80ae6b9eb83234e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:39:03,346] Trial 4 finished with value: 0.7045246460953708 and parameters: {'solver': 'liblinear', 'C': 52.26484015177877}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run bemused-carp-798 at: http://localhost:2002/#/experiments/123102627901971667/runs/4c695b79408b4d7587b03062b3b09a8c
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:39:14,589] Trial 5 finished with value: 0.714620199656842 and parameters: {'solver': 'newton-cg', 'C': 74.64038215834337}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run big-cow-603 at: http://localhost:2002/#/experiments/123102627901971667/runs/84f2cfcffbc24d9aa0090033840ea2b6
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:42:41,992] Trial 6 finished with value: 0.7045165754601793 and parameters: {'solver': 'liblinear', 'C': 26.97552465816906}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run capricious-elk-248 at: http://localhost:2002/#/experiments/123102627901971667/runs/21807d435fcd4dc2b1db8a0c70b6758c
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:03,222] Trial 7 finished with value: 0.70440663421201 and parameters: {'solver': 'liblinear', 'C': 24.080164430543526}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run traveling-moose-289 at: http://localhost:2002/#/experiments/123102627901971667/runs/340e81dd860a4fecbddf75377377076e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:13,369] Trial 8 finished with value: 0.7146802847625534 and parameters: {'solver': 'newton-cg', 'C': 67.38174142434916}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run learned-zebra-952 at: http://localhost:2002/#/experiments/123102627901971667/runs/29076cf6530147fb978378bfe187f93a
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:45,764] Trial 9 finished with value: 0.7149226275282914 and parameters: {'solver': 'saga', 'C': 2.2712615130224454}. Best is trial 9 with value: 0.7149226275282914.


🏃 View run blushing-cub-533 at: http://localhost:2002/#/experiments/123102627901971667/runs/550b877cec054f53a1aba272e1dcfb3e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


## Support Vector Machine

In [37]:
mlflow.set_experiment("support_vector_machine")

2025/03/20 09:47:20 INFO mlflow.tracking.fluent: Experiment with name 'support_vector_machine' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/862488884265769758', creation_time=1742438840703, experiment_id='862488884265769758', last_update_time=1742438840703, lifecycle_stage='active', name='support_vector_machine', tags={}>

In [70]:
def optimize_support_vector_machine(trial: Trial):
    X, y = X_train, y_train
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    C = trial.suggest_int("C", 1, 1000, step=99)  # Ensures divisibility

    params = {
        "kernel": kernel,
        "gamma": gamma,
        "C": C
    }

    model = SVC(kernel=kernel, gamma=gamma, C=C, random_state=SEED)
    model_pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", model)
        ]
    )

    with mlflow.start_run():
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "kernel": kernel,
            "gamma": gamma,
            "C": C
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1
        

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_support_vector_machine, n_trials=10)

[I 2025-03-20 10:58:39,341] A new study created in memory with name: no-name-09d0461b-a888-406c-8a7a-2603efe79ea1
[I 2025-03-20 11:05:46,413] Trial 0 finished with value: 0.7573041992698557 and parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}. Best is trial 0 with value: 0.7573041992698557.


🏃 View run shivering-wasp-674 at: http://localhost:2002/#/experiments/862488884265769758/runs/3b01b73235b546bfbcedea08581988bc
🧪 View experiment at: http://localhost:2002/#/experiments/862488884265769758




### XGboost

#### preparing dataset for xgboost

for multi-class classification, our target variable must take values in ${\{0,1, ..., K\}}$. However, from the histogram of cover type above, we see that it takes values not continous. to fix this we can use the `scikit-learn label encoder` to create a valid target column.

In [61]:
mlflow.set_experiment("xgboost_v4")

2025/03/25 11:59:41 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_v4' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/698682542899927327', creation_time=1742878781620, experiment_id='698682542899927327', last_update_time=1742878781620, lifecycle_stage='active', name='xgboost_v4', tags={}>

In [62]:
def optimize_xgboost_model(trial: Trial):
    X, y = X_train, y_train
    X, X_valid, y, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    param = {
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }
    model = xgb.XGBClassifier(**param,
                             tree_method="hist")
    
    with mlflow.start_run():
        bst = model.fit(X, y, eval_set=[(X, y), (X_valid, y_valid)], verbose=2)
        preds = bst.predict(X_test)
        y_pred = np.rint(preds)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params(
            param
        )
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return accuracy

In [63]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_xgboost_model, n_trials=10, timeout=600)

[I 2025-03-25 11:59:44,710] A new study created in memory with name: no-name-7d7d195e-f9a0-439d-b665-3795d1affe63


[0]	validation_0-mlogloss:2.75549	validation_1-mlogloss:2.75619
[2]	validation_0-mlogloss:2.42394	validation_1-mlogloss:2.42685
[4]	validation_0-mlogloss:2.20706	validation_1-mlogloss:2.21189
[6]	validation_0-mlogloss:2.04375	validation_1-mlogloss:2.05051
[8]	validation_0-mlogloss:1.91542	validation_1-mlogloss:1.92358
[10]	validation_0-mlogloss:1.80377	validation_1-mlogloss:1.81338
[12]	validation_0-mlogloss:1.70743	validation_1-mlogloss:1.71824
[14]	validation_0-mlogloss:1.62015	validation_1-mlogloss:1.63260
[16]	validation_0-mlogloss:1.54752	validation_1-mlogloss:1.56099
[18]	validation_0-mlogloss:1.48094	validation_1-mlogloss:1.49543
[20]	validation_0-mlogloss:1.41916	validation_1-mlogloss:1.43460
[22]	validation_0-mlogloss:1.36483	validation_1-mlogloss:1.38137
[24]	validation_0-mlogloss:1.31533	validation_1-mlogloss:1.33263
[26]	validation_0-mlogloss:1.27196	validation_1-mlogloss:1.29015
[28]	validation_0-mlogloss:1.23304	validation_1-mlogloss:1.25197
[30]	validation_0-mlogloss:1.1

[I 2025-03-25 12:00:04,822] Trial 0 finished with value: 0.7904524994077233 and parameters: {'max_depth': 5, 'learning_rate': 0.05631365776312349, 'n_estimators': 108, 'subsample': 0.7535822046465062, 'colsample_bytree': 0.8450664076819554, 'min_child_weight': 10, 'gamma': 0.4239641853752346}. Best is trial 0 with value: 0.7904524994077233.


🏃 View run spiffy-hare-448 at: http://localhost:2002/#/experiments/698682542899927327/runs/525c2c5668c24b1f87418768e30177ba
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327
[0]	validation_0-mlogloss:2.93735	validation_1-mlogloss:2.93790
[2]	validation_0-mlogloss:2.82750	validation_1-mlogloss:2.82868
[4]	validation_0-mlogloss:2.72660	validation_1-mlogloss:2.72930
[6]	validation_0-mlogloss:2.63446	validation_1-mlogloss:2.63802
[8]	validation_0-mlogloss:2.55184	validation_1-mlogloss:2.55606
[10]	validation_0-mlogloss:2.48087	validation_1-mlogloss:2.48631
[12]	validation_0-mlogloss:2.41732	validation_1-mlogloss:2.42367
[14]	validation_0-mlogloss:2.35653	validation_1-mlogloss:2.36356
[16]	validation_0-mlogloss:2.29795	validation_1-mlogloss:2.30589
[18]	validation_0-mlogloss:2.24248	validation_1-mlogloss:2.25139
[20]	validation_0-mlogloss:2.19006	validation_1-mlogloss:2.19963
[22]	validation_0-mlogloss:2.14215	validation_1-mlogloss:2.15231
[24]	validation_0-mloglo

[I 2025-03-25 12:01:33,433] Trial 1 finished with value: 0.8132551528073916 and parameters: {'max_depth': 10, 'learning_rate': 0.013980339041229407, 'n_estimators': 234, 'subsample': 0.9570854735992347, 'colsample_bytree': 0.598885108333887, 'min_child_weight': 8, 'gamma': 0.5130763205257222}. Best is trial 1 with value: 0.8132551528073916.


🏃 View run serious-bat-89 at: http://localhost:2002/#/experiments/698682542899927327/runs/10b2491f645e465399c6710af5e80497
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327
[0]	validation_0-mlogloss:2.66383	validation_1-mlogloss:2.66715
[2]	validation_0-mlogloss:2.27753	validation_1-mlogloss:2.28599
[4]	validation_0-mlogloss:2.02745	validation_1-mlogloss:2.03950
[6]	validation_0-mlogloss:1.84577	validation_1-mlogloss:1.85960
[8]	validation_0-mlogloss:1.70239	validation_1-mlogloss:1.71857
[10]	validation_0-mlogloss:1.58311	validation_1-mlogloss:1.60144
[12]	validation_0-mlogloss:1.48045	validation_1-mlogloss:1.50113
[14]	validation_0-mlogloss:1.39165	validation_1-mlogloss:1.41436
[16]	validation_0-mlogloss:1.31644	validation_1-mlogloss:1.34027
[18]	validation_0-mlogloss:1.24940	validation_1-mlogloss:1.27458
[20]	validation_0-mlogloss:1.19019	validation_1-mlogloss:1.21670
[22]	validation_0-mlogloss:1.13669	validation_1-mlogloss:1.16479
[24]	validation_0-mloglos

[I 2025-03-25 12:03:18,116] Trial 2 finished with value: 0.8210139777303956 and parameters: {'max_depth': 8, 'learning_rate': 0.06338069351533333, 'n_estimators': 466, 'subsample': 0.8830350761562729, 'colsample_bytree': 0.964513299858103, 'min_child_weight': 8, 'gamma': 2.0071651813380003}. Best is trial 2 with value: 0.8210139777303956.


🏃 View run whimsical-hare-555 at: http://localhost:2002/#/experiments/698682542899927327/runs/44afc14504fd4cbb8a13f3a5253acd85
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327
[0]	validation_0-mlogloss:2.84089	validation_1-mlogloss:2.83976
[2]	validation_0-mlogloss:2.62288	validation_1-mlogloss:2.61974
[4]	validation_0-mlogloss:2.43168	validation_1-mlogloss:2.43092
[6]	validation_0-mlogloss:2.30703	validation_1-mlogloss:2.30631
[8]	validation_0-mlogloss:2.19537	validation_1-mlogloss:2.19471
[10]	validation_0-mlogloss:2.09524	validation_1-mlogloss:2.09603
[12]	validation_0-mlogloss:2.00661	validation_1-mlogloss:2.00839
[14]	validation_0-mlogloss:1.92890	validation_1-mlogloss:1.93187
[16]	validation_0-mlogloss:1.86387	validation_1-mlogloss:1.86749
[18]	validation_0-mlogloss:1.79807	validation_1-mlogloss:1.80227
[20]	validation_0-mlogloss:1.74196	validation_1-mlogloss:1.74664
[22]	validation_0-mlogloss:1.68815	validation_1-mlogloss:1.69331
[24]	validation_0-mlo

[I 2025-03-25 12:03:41,841] Trial 3 finished with value: 0.7769485903814262 and parameters: {'max_depth': 3, 'learning_rate': 0.054986150789731056, 'n_estimators': 201, 'subsample': 0.72541428011233, 'colsample_bytree': 0.5110831244944094, 'min_child_weight': 4, 'gamma': 4.97259084031535}. Best is trial 2 with value: 0.8210139777303956.


🏃 View run abundant-vole-330 at: http://localhost:2002/#/experiments/698682542899927327/runs/116bcb9bf7344875aeff46587f51893d
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327
[0]	validation_0-mlogloss:2.65741	validation_1-mlogloss:2.66037
[2]	validation_0-mlogloss:2.23836	validation_1-mlogloss:2.24455
[4]	validation_0-mlogloss:1.96736	validation_1-mlogloss:1.97863
[6]	validation_0-mlogloss:1.76907	validation_1-mlogloss:1.78293
[8]	validation_0-mlogloss:1.61270	validation_1-mlogloss:1.62850
[10]	validation_0-mlogloss:1.49143	validation_1-mlogloss:1.50943
[12]	validation_0-mlogloss:1.39086	validation_1-mlogloss:1.41082
[14]	validation_0-mlogloss:1.30502	validation_1-mlogloss:1.32686
[16]	validation_0-mlogloss:1.22649	validation_1-mlogloss:1.24986
[18]	validation_0-mlogloss:1.15911	validation_1-mlogloss:1.18387
[20]	validation_0-mlogloss:1.10006	validation_1-mlogloss:1.12643
[22]	validation_0-mlogloss:1.04677	validation_1-mlogloss:1.07426
[24]	validation_0-mlog

[I 2025-03-25 12:05:49,013] Trial 4 finished with value: 0.8248637763563137 and parameters: {'max_depth': 8, 'learning_rate': 0.08719031698829072, 'n_estimators': 994, 'subsample': 0.5515173291833572, 'colsample_bytree': 0.6046911345278895, 'min_child_weight': 6, 'gamma': 1.6579563885629078}. Best is trial 4 with value: 0.8248637763563137.


🏃 View run learned-sheep-261 at: http://localhost:2002/#/experiments/698682542899927327/runs/146a352f44584e43b1e80d0f7765a54f
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327
[0]	validation_0-mlogloss:2.73864	validation_1-mlogloss:2.73996
[2]	validation_0-mlogloss:2.41208	validation_1-mlogloss:2.41571
[4]	validation_0-mlogloss:2.18243	validation_1-mlogloss:2.18962
[6]	validation_0-mlogloss:2.00761	validation_1-mlogloss:2.01669
[8]	validation_0-mlogloss:1.86691	validation_1-mlogloss:1.87684
[10]	validation_0-mlogloss:1.75136	validation_1-mlogloss:1.76315
[12]	validation_0-mlogloss:1.65781	validation_1-mlogloss:1.67045
[14]	validation_0-mlogloss:1.57457	validation_1-mlogloss:1.58842
[16]	validation_0-mlogloss:1.49978	validation_1-mlogloss:1.51444
[18]	validation_0-mlogloss:1.43183	validation_1-mlogloss:1.44747
[20]	validation_0-mlogloss:1.37078	validation_1-mlogloss:1.38723
[22]	validation_0-mlogloss:1.31591	validation_1-mlogloss:1.33304
[24]	validation_0-mlog

[I 2025-03-25 12:07:07,707] Trial 5 finished with value: 0.8231461738924426 and parameters: {'max_depth': 5, 'learning_rate': 0.06569840460718585, 'n_estimators': 544, 'subsample': 0.7227181895543373, 'colsample_bytree': 0.6737514152417964, 'min_child_weight': 7, 'gamma': 0.42927040641113856}. Best is trial 4 with value: 0.8248637763563137.


🏃 View run stylish-frog-899 at: http://localhost:2002/#/experiments/698682542899927327/runs/6ab1d62d63374a10b41babbfcc904659
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327
[0]	validation_0-mlogloss:2.85308	validation_1-mlogloss:2.85200
[2]	validation_0-mlogloss:2.63365	validation_1-mlogloss:2.63194
[4]	validation_0-mlogloss:2.46350	validation_1-mlogloss:2.46258
[6]	validation_0-mlogloss:2.33558	validation_1-mlogloss:2.33578
[8]	validation_0-mlogloss:2.22909	validation_1-mlogloss:2.22954
[10]	validation_0-mlogloss:2.13306	validation_1-mlogloss:2.13418
[12]	validation_0-mlogloss:2.05542	validation_1-mlogloss:2.05709
[14]	validation_0-mlogloss:1.98186	validation_1-mlogloss:1.98428
[16]	validation_0-mlogloss:1.91634	validation_1-mlogloss:1.91940
[18]	validation_0-mlogloss:1.85719	validation_1-mlogloss:1.86095
[20]	validation_0-mlogloss:1.80100	validation_1-mlogloss:1.80560
[22]	validation_0-mlogloss:1.75002	validation_1-mlogloss:1.75516
[24]	validation_0-mlogl

[I 2025-03-25 12:07:50,361] Trial 6 finished with value: 0.7868988391376451 and parameters: {'max_depth': 3, 'learning_rate': 0.04268879900539477, 'n_estimators': 334, 'subsample': 0.7989288418342256, 'colsample_bytree': 0.7427055311862705, 'min_child_weight': 10, 'gamma': 0.7368135218854172}. Best is trial 4 with value: 0.8248637763563137.


🏃 View run tasteful-skunk-392 at: http://localhost:2002/#/experiments/698682542899927327/runs/a5229a7eb65b4573a6f64d5ccc9c9de3
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327
[0]	validation_0-mlogloss:2.62552	validation_1-mlogloss:2.62804
[2]	validation_0-mlogloss:2.20364	validation_1-mlogloss:2.21199
[4]	validation_0-mlogloss:1.94525	validation_1-mlogloss:1.95847
[6]	validation_0-mlogloss:1.75770	validation_1-mlogloss:1.77399
[8]	validation_0-mlogloss:1.61440	validation_1-mlogloss:1.63252
[10]	validation_0-mlogloss:1.49697	validation_1-mlogloss:1.51761
[12]	validation_0-mlogloss:1.39508	validation_1-mlogloss:1.41788
[14]	validation_0-mlogloss:1.30987	validation_1-mlogloss:1.33474
[16]	validation_0-mlogloss:1.23522	validation_1-mlogloss:1.26172
[18]	validation_0-mlogloss:1.16891	validation_1-mlogloss:1.19683
[20]	validation_0-mlogloss:1.11207	validation_1-mlogloss:1.14087
[22]	validation_0-mlogloss:1.06120	validation_1-mlogloss:1.09118
[24]	validation_0-mlo

[I 2025-03-25 12:09:06,728] Trial 7 finished with value: 0.8145581615730869 and parameters: {'max_depth': 7, 'learning_rate': 0.07803730187120754, 'n_estimators': 835, 'subsample': 0.7098511507614664, 'colsample_bytree': 0.804857417418011, 'min_child_weight': 2, 'gamma': 4.54660536083502}. Best is trial 4 with value: 0.8248637763563137.


🏃 View run angry-mule-983 at: http://localhost:2002/#/experiments/698682542899927327/runs/a92dfc4b8fa94b96a3ea614d79737a1d
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327
[0]	validation_0-mlogloss:2.72652	validation_1-mlogloss:2.72936
[2]	validation_0-mlogloss:2.36659	validation_1-mlogloss:2.37510
[4]	validation_0-mlogloss:2.12826	validation_1-mlogloss:2.14079
[6]	validation_0-mlogloss:1.94466	validation_1-mlogloss:1.96151
[8]	validation_0-mlogloss:1.80498	validation_1-mlogloss:1.82436
[10]	validation_0-mlogloss:1.68241	validation_1-mlogloss:1.70435
[12]	validation_0-mlogloss:1.58359	validation_1-mlogloss:1.60799
[14]	validation_0-mlogloss:1.49308	validation_1-mlogloss:1.51980
[16]	validation_0-mlogloss:1.41230	validation_1-mlogloss:1.44135
[18]	validation_0-mlogloss:1.34044	validation_1-mlogloss:1.37171
[20]	validation_0-mlogloss:1.27639	validation_1-mlogloss:1.30940
[22]	validation_0-mlogloss:1.21764	validation_1-mlogloss:1.25296
[24]	validation_0-mloglos

[I 2025-03-25 12:13:01,949] Trial 8 finished with value: 0.8291281686804075 and parameters: {'max_depth': 8, 'learning_rate': 0.055959809711760226, 'n_estimators': 853, 'subsample': 0.7971966693412211, 'colsample_bytree': 0.7663498339026278, 'min_child_weight': 3, 'gamma': 0.7810942563197587}. Best is trial 8 with value: 0.8291281686804075.


🏃 View run whimsical-stork-823 at: http://localhost:2002/#/experiments/698682542899927327/runs/4b637e5c1b4144a1b65d192266487f2e
🧪 View experiment at: http://localhost:2002/#/experiments/698682542899927327


In [64]:
 study.best_trial, study.best_params

(FrozenTrial(number=8, state=TrialState.COMPLETE, values=[0.8291281686804075], datetime_start=datetime.datetime(2025, 3, 25, 12, 9, 6, 728679), datetime_complete=datetime.datetime(2025, 3, 25, 12, 13, 1, 948769), params={'max_depth': 8, 'learning_rate': 0.055959809711760226, 'n_estimators': 853, 'subsample': 0.7971966693412211, 'colsample_bytree': 0.7663498339026278, 'min_child_weight': 3, 'gamma': 0.7810942563197587}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.01, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None

### Error analysis 

In [65]:
param =  {
    'max_depth': 8,
    'learning_rate': 0.044353992283905075,
    'n_estimators': 962,
    'subsample': 0.9161256648971945,
    'colsample_bytree': 0.7752581060503564,
    'min_child_weight': 3,
    'gamma': 1.782516264856805
}

model = xgb.XGBClassifier(
    **study.best_params,
    tree_method="hist"
)

In [66]:
model.fit(X_train, y_train)

In [67]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.63      0.67       339
           1       0.97      0.96      0.96       708
           2       0.86      0.84      0.85      1161
           3       0.87      0.90      0.89      1143
           4       0.98      0.97      0.98      1251
           5       0.80      0.70      0.74       209
           6       0.86      0.90      0.88      1094
           7       0.80      0.88      0.84      1217
           8       0.59      0.38      0.46       321
           9       0.63      0.62      0.63      1021
          10       0.80      0.79      0.80       740
          11       0.93      0.95      0.94      1235
          12       0.90      0.94      0.92      1187
          13       0.74      0.77      0.75      1132
          14       0.79      0.66      0.72       544
          15       0.85      0.87      0.86      1175
          16       0.89      0.88      0.88      1286
          17       0.75    

In [70]:
model.save_model('../models/xgb_v4.json')

In [68]:
type(y_test), type(y_pred)

(pandas.core.series.Series, numpy.ndarray)

In [61]:
indices = np.arange(len(y_test))

y_test_array = y_test.to_numpy()

missclassified_indices = indices[y_test != y_pred]
missclassified_samples = X_test[missclassified_indices]

for idx, sample in zip(missclassified_indices, missclassified_samples):
    print(f"\n\n===========Sample Index: {idx}============\n\n")
    print(f"True Label:{y_test_array[idx]}")
    print(f"Predicted Label: {y_pred[idx]}")
    print(f"Sample Data: {sample}\n")





True Label:15
Predicted Label: 6
Sample Data: [-0.13908093 -0.14457898 -0.12797779 -0.14222392 -0.13319853 -0.18499567
 -0.20856976 -0.38567183 -0.17948182 -0.19680505 -0.28827239 -0.12771494
 -0.41168933 -0.12954803 -0.16320786 -0.16277165 -0.24196358 -0.15795148
 -0.14340706 -0.1993695  -0.1601961  -0.16085056 -0.17375158 -0.13267836
 -0.15199265 -0.1891086  -0.17132276 -0.18564885 -0.15913934 -0.16756606
 -0.18223925 -0.15341039  2.05708714  9.78784621 -0.19287774 -0.23321828
 -0.14815948 -0.24788729 -0.18898315 -0.21668874 -0.20668534 -0.17509282
  7.18642669 -0.15099312 -0.15676132 -0.17582942 -0.22426307 -0.1476794
 -0.14676746 -0.33496981 -0.15995228 -0.18609043 -0.16754128 -0.14266724
 -0.18758728 -0.21422371 -0.16980777 -0.16256739 -0.16947577 -0.17823464
 -0.13902994 -0.17875841 -0.16690793 -0.21287434 -0.19854208 -0.17436856
 -0.13713111 -0.23002467 -0.15211049 -0.1404188  -0.23067168 -0.18788621
 -0.14862178 -0.14570461 -0.1795044  -0.16704693 -0.23222594 -0.15440686
 -

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

