## Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.insert(0, '..')

In [3]:
import mlflow

In [4]:
mlflow.set_tracking_uri('http://localhost:2002')

In [5]:
import numpy as np
import optuna
import pickle
import joblib
import dill
from optuna import Trial
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer, MaxAbsScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from tqdm.notebook import tqdm
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [6]:
from src.features.schedules import (
    reason_pipeline_steps,
    numeric_pipeline_steps
)
from src.data_prep_utils import chunk_transform, add_transformed_feature

In [7]:
features = pd.read_csv('../data/data_label_balanced.csv', index_col=0)

from sklearn.model_selection import train_test_split

In [8]:
features.head()

Unnamed: 0,partner_id,reason_combind,specialist_name
37839,10,gãy sương,cơ xương khớp
21257,10,mổ lấy lấy nẹp gãy xương đòn,cơ xương khớp
784,7,đau mắt cá trong,cơ xương khớp
19678,10,"đau nhức khuỷu tay, khó cầm nắm duỗi tay",cơ xương khớp
1163,3,"thóa vị đĩa đệm, gai khớp gối, tràn dịch khớp gối",cơ xương khớp


In [9]:
features.isnull().sum()

partner_id         0
reason_combind     1
specialist_name    0
dtype: int64

In [10]:
features.shape

(29100, 3)

In [11]:
features = features.dropna(subset=["reason_combind"])

In [12]:
features.shape

(29099, 3)

In [13]:
label_encoder = LabelEncoder()
features['target'] = label_encoder.fit_transform(features['specialist_name'])

In [14]:
with open('../models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [15]:
features['target'].unique()

array([ 2, 11,  3, 14, 10,  9,  7, 13,  5,  1,  8,  6, 16, 12, 15,  4,  0,
       17])

In [16]:
for idx, label in zip(features['target'].unique(), label_encoder.inverse_transform(features['target'].unique())):
    print(f"idx : {idx}, label: {label}")


idx : 2, label: cơ xương khớp
idx : 11, label: thần kinh
idx : 3, label: da liễu
idx : 14, label: tiêu hoá
idx : 10, label: tai mũi họng
idx : 9, label: sức khỏe tâm thần
idx : 7, label: nội khoa
idx : 13, label: tim mạch
idx : 5, label: nam học
idx : 1, label: chuyên khoa mắt
idx : 8, label: sản phụ khoa
idx : 6, label: nhi khoa
idx : 16, label: ung bướu
idx : 12, label: thận - tiết niệu
idx : 15, label: tiểu đường - nội tiết
idx : 4, label: hô hấp - phổi
idx : 0, label: bệnh viêm gan
idx : 17, label: vô sinh - hiếm muộn


In [17]:
features

Unnamed: 0,partner_id,reason_combind,specialist_name,target
37839,10,gãy sương,cơ xương khớp,2
21257,10,mổ lấy lấy nẹp gãy xương đòn,cơ xương khớp,2
784,7,đau mắt cá trong,cơ xương khớp,2
19678,10,"đau nhức khuỷu tay, khó cầm nắm duỗi tay",cơ xương khớp,2
1163,3,"thóa vị đĩa đệm, gai khớp gối, tràn dịch khớp gối",cơ xương khớp,2
...,...,...,...,...
49637,23,có dịch sẹo mổ lấy thai em bé trước,vô sinh - hiếm muộn,17
49778,23,"buồng trứng đa nang, khám để làm ivf",vô sinh - hiếm muộn,17
49969,23,kiểm tra và tư vấn mong con,vô sinh - hiếm muộn,17
50096,23,khám sinh con chọn giới tính,vô sinh - hiếm muộn,17


In [18]:
X = features.drop(columns=["specialist_name", "target"], axis=1)
y = features['target']

In [19]:
text_col = "reason_combind"
# numeric_cols = ['partner_id']

tfm = [
    (
        "reason_combind",
        Pipeline(reason_pipeline_steps()),
        text_col
    ),
    # (
    #     "numeric_pipeline",
    #     Pipeline(numeric_pipeline_steps()),
    #     numeric_cols
    # )
]

preprocessor = ColumnTransformer(
    transformers=tfm, remainder="drop"
)

features_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        # (
        #     "normalize",
        #     MaxAbsScaler()
        # ),
    ]
)

In [20]:
# fit the pipeline
fit_df = X.drop_duplicates(subset=["reason_combind"])
features_pipeline.fit(fit_df)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [22]:
X_train = chunk_transform(
    X_train, features_pipeline, chunk_size=1000
)

Transforming chunks:   0%|          | 0/24 [00:00<?, ?it/s]

In [23]:
X_test = chunk_transform(
    X_test, features_pipeline, chunk_size=1000
)

Transforming chunks:   0%|          | 0/6 [00:00<?, ?it/s]

In [24]:
X_train[0], X_train.shape

(array([0., 0., 0., ..., 0., 0., 0.], shape=(5000,)), (23279, 5000))

In [25]:
features.shape, type(X)

((29099, 4), pandas.core.frame.DataFrame)

In [26]:
features

Unnamed: 0,partner_id,reason_combind,specialist_name,target
37839,10,gãy sương,cơ xương khớp,2
21257,10,mổ lấy lấy nẹp gãy xương đòn,cơ xương khớp,2
784,7,đau mắt cá trong,cơ xương khớp,2
19678,10,"đau nhức khuỷu tay, khó cầm nắm duỗi tay",cơ xương khớp,2
1163,3,"thóa vị đĩa đệm, gai khớp gối, tràn dịch khớp gối",cơ xương khớp,2
...,...,...,...,...
49637,23,có dịch sẹo mổ lấy thai em bé trước,vô sinh - hiếm muộn,17
49778,23,"buồng trứng đa nang, khám để làm ivf",vô sinh - hiếm muộn,17
49969,23,kiểm tra và tư vấn mong con,vô sinh - hiếm muộn,17
50096,23,khám sinh con chọn giới tính,vô sinh - hiếm muộn,17


In [27]:
SEED=42

In [28]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [29]:
X_train.shape, y_train.shape

((23279, 5000), (23279,))

## Random Forest

In [30]:
mlflow.set_experiment("random_forest_v5")

<Experiment: artifact_location='mlflow-artifacts:/244051155133141666', creation_time=1743153006100, experiment_id='244051155133141666', last_update_time=1743153006100, lifecycle_stage='active', name='random_forest_v5', tags={}>

In [31]:
def optimize_random_forest(trial: Trial):
    n_estimators = trial.suggest_int(name="n_estimators", low=50, high=500, step=100)
    max_features = trial.suggest_categorical(name="max_features", choices=['log2', 'sqrt'])
    max_depth = trial.suggest_int(name="max_depth", low=5, high=20, step=5)
    min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=10, step=2)
    min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)

    params = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf
    }

    model = RandomForestClassifier(random_state=SEED, **params)

    with mlflow.start_run():
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "n_estimators": n_estimators,
            "max_depth": max_depth
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1

In [32]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_random_forest, n_trials=10)

[I 2025-04-02 13:36:18,074] A new study created in memory with name: no-name-348ea54e-d940-420b-96d9-dee3e6b6bc61
[I 2025-04-02 13:37:11,129] Trial 0 finished with value: 0.7258750571881245 and parameters: {'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7258750571881245.


🏃 View run loud-crane-708 at: http://localhost:2002/#/experiments/244051155133141666/runs/fd7e323d66cc4b6c9756c68f46bb59c8
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 13:40:47,712] Trial 1 finished with value: 0.7421657073656472 and parameters: {'n_estimators': 450, 'max_features': 'log2', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run sneaky-whale-152 at: http://localhost:2002/#/experiments/244051155133141666/runs/67c36a3c1b664a9d899441ecc3d5560b
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 13:42:03,525] Trial 2 finished with value: 0.7341732548075377 and parameters: {'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run bustling-goose-771 at: http://localhost:2002/#/experiments/244051155133141666/runs/7eca38ee072e417dadee03742fd098bd
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 13:47:36,279] Trial 3 finished with value: 0.7279875419937346 and parameters: {'n_estimators': 250, 'max_features': 'sqrt', 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run abundant-goat-749 at: http://localhost:2002/#/experiments/244051155133141666/runs/8ef8dc30317c49f3abeffd53e17d36f6
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 13:49:43,030] Trial 4 finished with value: 0.6870500101867663 and parameters: {'n_estimators': 350, 'max_features': 'log2', 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run fearless-flea-254 at: http://localhost:2002/#/experiments/244051155133141666/runs/dd8b5797c4574592866efc91d16edeb0
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 13:54:38,097] Trial 5 finished with value: 0.712092696982715 and parameters: {'n_estimators': 250, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run hilarious-carp-746 at: http://localhost:2002/#/experiments/244051155133141666/runs/b41625ad92694aa696fbd4d376962b47
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 13:57:42,979] Trial 6 finished with value: 0.7122749158327497 and parameters: {'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run funny-wren-16 at: http://localhost:2002/#/experiments/244051155133141666/runs/d4233beac37c45e68fb93783312e7f2e
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 13:58:45,762] Trial 7 finished with value: 0.7324993053983228 and parameters: {'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run fearless-kit-389 at: http://localhost:2002/#/experiments/244051155133141666/runs/83ab21b1f9b545639e6687133a1e4de0
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 13:59:37,581] Trial 8 finished with value: 0.6891615630937643 and parameters: {'n_estimators': 150, 'max_features': 'log2', 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run zealous-goose-97 at: http://localhost:2002/#/experiments/244051155133141666/runs/f787dd3dce5140c8802d8be433782eb1
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


[I 2025-04-02 14:07:10,162] Trial 9 finished with value: 0.7125037600251134 and parameters: {'n_estimators': 450, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.7421657073656472.


🏃 View run ambitious-swan-833 at: http://localhost:2002/#/experiments/244051155133141666/runs/a39b4a02cbab465284c7599396c65056
🧪 View experiment at: http://localhost:2002/#/experiments/244051155133141666


In [33]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.7421657073656472], datetime_start=datetime.datetime(2025, 4, 2, 13, 37, 11, 131495), datetime_complete=datetime.datetime(2025, 4, 2, 13, 40, 47, 712658), params={'n_estimators': 450, 'max_features': 'log2', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=450, log=False, low=50, step=100), 'max_features': CategoricalDistribution(choices=('log2', 'sqrt')), 'max_depth': IntDistribution(high=20, log=False, low=5, step=5), 'min_samples_split': IntDistribution(high=10, log=False, low=2, step=2), 'min_samples_leaf': IntDistribution(high=4, log=False, low=1, step=1)}, trial_id=1, value=None)
Best hyperparameters: {'n_estimators': 450, 'max_features': 'log2', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}


## Logistic Regression

In [22]:
mlflow.set_experiment("linear_regression_v1")

2025/03/20 15:29:42 INFO mlflow.tracking.fluent: Experiment with name 'linear_regression_v1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/123102627901971667', creation_time=1742459382516, experiment_id='123102627901971667', last_update_time=1742459382516, lifecycle_stage='active', name='linear_regression_v1', tags={}>

In [24]:
def optimize_logistic_regression(trial: Trial):
    X, y = X_train, y_train

    solver = trial.suggest_categorical("solver", ["liblinear", "newton-cg", "lbfgs", "sag", "saga"])
    C = trial.suggest_uniform("C", 0.001, 100)

    params = {
        "solver": solver,
        "C": C
    }

    model = LogisticRegression(**params, random_state=SEED)

    with mlflow.start_run():
        model.fit(X, y)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "solver": solver,
            "C": C
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1

In [25]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_logistic_regression, n_trials=10)

[I 2025-03-20 15:30:13,136] A new study created in memory with name: no-name-fc6fa582-9dc1-4139-a131-32e44f27ff54
  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:30:37,903] Trial 0 finished with value: 0.7148514225936348 and parameters: {'solver': 'sag', 'C': 6.092308429423362}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run judicious-auk-578 at: http://localhost:2002/#/experiments/123102627901971667/runs/8f0324f1b2b743559cd5257e1d46af50
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:31:07,601] Trial 1 finished with value: 0.7147658294599712 and parameters: {'solver': 'saga', 'C': 31.961504878103675}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run worried-pug-572 at: http://localhost:2002/#/experiments/123102627901971667/runs/eb222d4a981d4f56bf96d2a21667fdd4
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:35:09,245] Trial 2 finished with value: 0.7044437078145703 and parameters: {'solver': 'liblinear', 'C': 94.7585783003262}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run beautiful-moose-862 at: http://localhost:2002/#/experiments/123102627901971667/runs/a3291a0c909541439a2dfea085aa8e7b
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2025-03-20 15:35:24,471] Trial 3 finished with value: 0.7148176073413588 and parameters: {'solver': 'lbfgs', 'C': 4.824384293432776}. Best is trial 0 with value: 0.7148514225936348.
  C = trial.suggest_uniform("C", 0.001, 100)


🏃 View run mysterious-crane-798 at: http://localhost:2002/#/experiments/123102627901971667/runs/bed02c8374c144e7b80ae6b9eb83234e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


[I 2025-03-20 15:39:03,346] Trial 4 finished with value: 0.7045246460953708 and parameters: {'solver': 'liblinear', 'C': 52.26484015177877}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run bemused-carp-798 at: http://localhost:2002/#/experiments/123102627901971667/runs/4c695b79408b4d7587b03062b3b09a8c
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:39:14,589] Trial 5 finished with value: 0.714620199656842 and parameters: {'solver': 'newton-cg', 'C': 74.64038215834337}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run big-cow-603 at: http://localhost:2002/#/experiments/123102627901971667/runs/84f2cfcffbc24d9aa0090033840ea2b6
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:42:41,992] Trial 6 finished with value: 0.7045165754601793 and parameters: {'solver': 'liblinear', 'C': 26.97552465816906}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run capricious-elk-248 at: http://localhost:2002/#/experiments/123102627901971667/runs/21807d435fcd4dc2b1db8a0c70b6758c
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:03,222] Trial 7 finished with value: 0.70440663421201 and parameters: {'solver': 'liblinear', 'C': 24.080164430543526}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run traveling-moose-289 at: http://localhost:2002/#/experiments/123102627901971667/runs/340e81dd860a4fecbddf75377377076e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:13,369] Trial 8 finished with value: 0.7146802847625534 and parameters: {'solver': 'newton-cg', 'C': 67.38174142434916}. Best is trial 0 with value: 0.7148514225936348.


🏃 View run learned-zebra-952 at: http://localhost:2002/#/experiments/123102627901971667/runs/29076cf6530147fb978378bfe187f93a
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


  C = trial.suggest_uniform("C", 0.001, 100)
[I 2025-03-20 15:46:45,764] Trial 9 finished with value: 0.7149226275282914 and parameters: {'solver': 'saga', 'C': 2.2712615130224454}. Best is trial 9 with value: 0.7149226275282914.


🏃 View run blushing-cub-533 at: http://localhost:2002/#/experiments/123102627901971667/runs/550b877cec054f53a1aba272e1dcfb3e
🧪 View experiment at: http://localhost:2002/#/experiments/123102627901971667


## Support Vector Machine

In [37]:
mlflow.set_experiment("support_vector_machine")

2025/03/20 09:47:20 INFO mlflow.tracking.fluent: Experiment with name 'support_vector_machine' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/862488884265769758', creation_time=1742438840703, experiment_id='862488884265769758', last_update_time=1742438840703, lifecycle_stage='active', name='support_vector_machine', tags={}>

In [70]:
def optimize_support_vector_machine(trial: Trial):
    X, y = X_train, y_train
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    C = trial.suggest_int("C", 1, 1000, step=99)  # Ensures divisibility

    params = {
        "kernel": kernel,
        "gamma": gamma,
        "C": C
    }

    model = SVC(kernel=kernel, gamma=gamma, C=C, random_state=SEED)
    model_pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", model)
        ]
    )

    with mlflow.start_run():
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params({
            "kernel": kernel,
            "gamma": gamma,
            "C": C
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return f1
        

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_support_vector_machine, n_trials=10)

[I 2025-03-20 10:58:39,341] A new study created in memory with name: no-name-09d0461b-a888-406c-8a7a-2603efe79ea1
[I 2025-03-20 11:05:46,413] Trial 0 finished with value: 0.7573041992698557 and parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}. Best is trial 0 with value: 0.7573041992698557.


🏃 View run shivering-wasp-674 at: http://localhost:2002/#/experiments/862488884265769758/runs/3b01b73235b546bfbcedea08581988bc
🧪 View experiment at: http://localhost:2002/#/experiments/862488884265769758




### XGboost

#### preparing dataset for xgboost

for multi-class classification, our target variable must take values in ${\{0,1, ..., K\}}$. However, from the histogram of cover type above, we see that it takes values not continous. to fix this we can use the `scikit-learn label encoder` to create a valid target column.

In [30]:
mlflow.set_experiment("xgboost_v5.1")

<Experiment: artifact_location='mlflow-artifacts:/628056210663196112', creation_time=1743153615504, experiment_id='628056210663196112', last_update_time=1743153615504, lifecycle_stage='active', name='xgboost_v5.1', tags={}>

In [31]:
def optimize_xgboost_model(trial: Trial):
    X, y = X_train, y_train
    X, X_valid, y, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    param = {
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }
    model = xgb.XGBClassifier(
        objective="multi:softmax",
        num_class=20,
        **param,
        tree_method="hist"
    )
    
    with mlflow.start_run():
        bst = model.fit(X, y, eval_set=[(X, y), (X_valid, y_valid)], verbose=2)
        preds = bst.predict(X_test)
        y_pred = np.rint(preds)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params(
            param
        )
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return accuracy

In [32]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_xgboost_model, n_trials=10, timeout=600)

[I 2025-04-02 15:14:04,090] A new study created in memory with name: no-name-0d354447-a86c-4330-ba5b-4d8a8f182a16


[0]	validation_0-mlogloss:2.74433	validation_1-mlogloss:2.75063
[2]	validation_0-mlogloss:2.54349	validation_1-mlogloss:2.56012
[4]	validation_0-mlogloss:2.39941	validation_1-mlogloss:2.42379
[6]	validation_0-mlogloss:2.28551	validation_1-mlogloss:2.31297
[8]	validation_0-mlogloss:2.19542	validation_1-mlogloss:2.22798
[10]	validation_0-mlogloss:2.12026	validation_1-mlogloss:2.15699
[12]	validation_0-mlogloss:2.05325	validation_1-mlogloss:2.09343
[14]	validation_0-mlogloss:1.99571	validation_1-mlogloss:2.03928
[16]	validation_0-mlogloss:1.94509	validation_1-mlogloss:1.99106
[18]	validation_0-mlogloss:1.89953	validation_1-mlogloss:1.94843
[20]	validation_0-mlogloss:1.85827	validation_1-mlogloss:1.90999
[22]	validation_0-mlogloss:1.82122	validation_1-mlogloss:1.87432
[24]	validation_0-mlogloss:1.78613	validation_1-mlogloss:1.84155
[26]	validation_0-mlogloss:1.75473	validation_1-mlogloss:1.81266
[28]	validation_0-mlogloss:1.72596	validation_1-mlogloss:1.78509
[30]	validation_0-mlogloss:1.6

[I 2025-04-02 15:27:59,978] Trial 0 finished with value: 0.684020618556701 and parameters: {'max_depth': 3, 'learning_rate': 0.054473540938745756, 'n_estimators': 665, 'subsample': 0.6086580458074587, 'colsample_bytree': 0.9903501769644968, 'min_child_weight': 6, 'gamma': 2.933659166289077}. Best is trial 0 with value: 0.684020618556701.


🏃 View run secretive-stork-150 at: http://localhost:2002/#/experiments/628056210663196112/runs/51efd28d4145428e8d563ff4d0add87b
🧪 View experiment at: http://localhost:2002/#/experiments/628056210663196112


In [33]:
 study.best_trial, study.best_params

(FrozenTrial(number=0, state=TrialState.COMPLETE, values=[0.684020618556701], datetime_start=datetime.datetime(2025, 4, 2, 15, 14, 4, 91417), datetime_complete=datetime.datetime(2025, 4, 2, 15, 27, 59, 978235), params={'max_depth': 3, 'learning_rate': 0.054473540938745756, 'n_estimators': 665, 'subsample': 0.6086580458074587, 'colsample_bytree': 0.9903501769644968, 'min_child_weight': 6, 'gamma': 2.933659166289077}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.01, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None)},

### Error analysis 

In [36]:
param =  {
    'max_depth': 8,
    'learning_rate': 0.044353992283905075,
    'n_estimators': 962,
    'subsample': 0.9161256648971945,
    'colsample_bytree': 0.7752581060503564,
    'min_child_weight': 3,
    'gamma': 1.782516264856805
}

model = xgb.XGBClassifier(
    **study.best_params,
    tree_method="hist"
)

In [37]:
model.fit(X_train, y_train)

In [38]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.55      0.60       339
           1       0.76      0.79      0.77       708
           2       0.73      0.74      0.73      1161
           3       0.74      0.86      0.79      1143
           4       0.63      0.81      0.71      1251
           5       0.54      0.27      0.36       209
           6       0.63      0.64      0.63      1094
           7       0.66      0.70      0.68      1217
           8       0.38      0.10      0.16       321
           9       0.29      0.12      0.17      1021
          10       0.50      0.57      0.53       740
          11       0.75      0.79      0.77      1235
          12       0.74      0.83      0.78      1187
          13       0.56      0.56      0.56      1132
          14       0.66      0.59      0.62       544
          15       0.70      0.82      0.75      1175
          16       0.73      0.77      0.75      1286
          17       0.46    

In [70]:
model.save_model('../models/xgb_v4.json')

In [68]:
type(y_test), type(y_pred)

(pandas.core.series.Series, numpy.ndarray)

In [61]:
indices = np.arange(len(y_test))

y_test_array = y_test.to_numpy()

missclassified_indices = indices[y_test != y_pred]
missclassified_samples = X_test[missclassified_indices]

for idx, sample in zip(missclassified_indices, missclassified_samples):
    print(f"\n\n===========Sample Index: {idx}============\n\n")
    print(f"True Label:{y_test_array[idx]}")
    print(f"Predicted Label: {y_pred[idx]}")
    print(f"Sample Data: {sample}\n")





True Label:15
Predicted Label: 6
Sample Data: [-0.13908093 -0.14457898 -0.12797779 -0.14222392 -0.13319853 -0.18499567
 -0.20856976 -0.38567183 -0.17948182 -0.19680505 -0.28827239 -0.12771494
 -0.41168933 -0.12954803 -0.16320786 -0.16277165 -0.24196358 -0.15795148
 -0.14340706 -0.1993695  -0.1601961  -0.16085056 -0.17375158 -0.13267836
 -0.15199265 -0.1891086  -0.17132276 -0.18564885 -0.15913934 -0.16756606
 -0.18223925 -0.15341039  2.05708714  9.78784621 -0.19287774 -0.23321828
 -0.14815948 -0.24788729 -0.18898315 -0.21668874 -0.20668534 -0.17509282
  7.18642669 -0.15099312 -0.15676132 -0.17582942 -0.22426307 -0.1476794
 -0.14676746 -0.33496981 -0.15995228 -0.18609043 -0.16754128 -0.14266724
 -0.18758728 -0.21422371 -0.16980777 -0.16256739 -0.16947577 -0.17823464
 -0.13902994 -0.17875841 -0.16690793 -0.21287434 -0.19854208 -0.17436856
 -0.13713111 -0.23002467 -0.15211049 -0.1404188  -0.23067168 -0.18788621
 -0.14862178 -0.14570461 -0.1795044  -0.16704693 -0.23222594 -0.15440686
 -

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

