### Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.insert(0, '..')

In [3]:
import mlflow

In [4]:
# Tracking server for all runs in this notebook. Set the MLFLOW_TRACKING_URI
# environment variable to point at another server; otherwise the local
# development server is used.
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI', 'http://localhost:2002'))

In [5]:
import dill
import numpy as np
import optuna
import pandas as pd
from optuna import Trial
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from src.features.embeddings import get_embeddings
from tqdm.notebook import tqdm
import xgboost as xgb

In [19]:
features = pd.read_csv('../data/data_label_balanced.csv', index_col=0)

In [20]:
features.head()

Unnamed: 0,partner_id,reason_combind,specialist_name
0,2,khám tiêu hóa,tiêu hoá
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch
4,17,cao huyết áp,tim mạch


In [21]:
features.isnull().sum()

partner_id          0
reason_combind     10
specialist_name     0
dtype: int64

In [22]:
features = features.dropna(subset=['reason_combind'])

In [23]:
features.shape

(84419, 3)

In [24]:
label_encoder = LabelEncoder()
features['target'] = label_encoder.fit_transform(features['specialist_name'])

In [25]:
features

Unnamed: 0,partner_id,reason_combind,specialist_name,target
0,2,khám tiêu hóa,tiêu hoá,16
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,2
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch,15
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,15
4,17,cao huyết áp,tim mạch,15
...,...,...,...,...
154449,111,khám tuyến giáp,tiểu đường - nội tiết,17
154450,448,khám bằng lái xe b1,khám tổng quát,6
154451,111,bệnh nhiều thứ . nên đi làm tổng quát,nội khoa,9
154457,111,khám tuyến giáp,ung bướu,18


In [26]:
def prepare_embedding(row):
    """Return the embedding for one record's ``reason_combind`` text.

    Written to be passed to ``DataFrame.apply(..., axis=1)``: ``row`` is a
    single record, and its ``reason_combind`` value is handed to
    ``get_embeddings`` unchanged.
    """
    reason_text = row['reason_combind']
    return get_embeddings(reason_text)

In [28]:
features['reason_embedding'] = features.apply(prepare_embedding, axis=1)

In [29]:
features.head()

Unnamed: 0,partner_id,reason_combind,specialist_name,target,reason_embedding
0,2,khám tiêu hóa,tiêu hoá,16,"[-0.06037424877285957, 0.07001789659261703, -0..."
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,2,"[-0.01998063735663891, 0.03326348960399628, -0..."
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch,15,"[-0.019692258909344673, 0.0619027316570282, 0...."
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,15,"[-0.03724019601941109, 0.07348008453845978, 0...."
4,17,cao huyết áp,tim mạch,15,"[-0.0712781697511673, 0.026695098727941513, -0..."


In [30]:
# Persist the frame, including the `reason_embedding` column.
# NOTE(review): CSV has no list type, so each embedding is written as its text
# representation and will come back as a string when this file is read again;
# it has to be parsed before it can be used as a numeric vector.
features.to_csv('../data/features_with_embeddings.csv')

In [35]:
features.head()

Unnamed: 0,partner_id,reason_combind,specialist_name,target,reason_embedding
0,2,khám tiêu hóa,tiêu hoá,16,"[-0.06037424877285957, 0.07001789659261703, -0..."
1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,2,"[-0.01998063735663891, 0.03326348960399628, -0..."
2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch,15,"[-0.019692258909344673, 0.0619027316570282, 0...."
3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,15,"[-0.03724019601941109, 0.07348008453845978, 0...."
4,17,cao huyết áp,tim mạch,15,"[-0.0712781697511673, 0.026695098727941513, -0..."


In [70]:
X = features['reason_embedding'].apply(np.array)
y = features['target']

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
X_train = np.stack(X_train.values)
X_test = np.stack(X_test.values)

In [73]:
X_train.shape, X_test.shape

((67535, 768), (16884, 768))

### XGBoost

In [74]:
mlflow.set_experiment('xgboost_v4.1')

<Experiment: artifact_location='mlflow-artifacts:/565468804605150420', creation_time=1742897414876, experiment_id='565468804605150420', last_update_time=1742897414876, lifecycle_stage='active', name='xgboost_v4.1', tags={}>

In [75]:
def optimize_xgboost_model(trial: Trial):
    """Optuna objective: fit one XGBoost classifier and return its validation accuracy.

    A hold-out validation split is carved out of the module-level
    ``X_train`` / ``y_train``. The model is fitted on the remaining part and
    scored on the validation split only. The test set is deliberately not
    used here: choosing hyperparameters on test-set accuracy would leak the
    test set into model selection and make the final test score optimistic.

    Each call opens its own MLflow run and logs the sampled hyperparameters
    together with accuracy and weighted precision / recall / F1 computed on
    the validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy on the validation split (the value the study maximizes).
    """
    # Fixed seed so every trial is compared on the same fit/validation split.
    X_fit, X_valid, y_fit, y_valid = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    param = {
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }

    model = xgb.XGBClassifier(
        **param,
        tree_method="hist"
    )

    with mlflow.start_run():
        # Log the parameters first so a run that fails during fitting still
        # records which configuration it was trying.
        mlflow.log_params(param)
        # Record which split the metrics below were computed on.
        mlflow.set_tag("eval_split", "validation")

        model.fit(X_fit, y_fit, eval_set=[(X_fit, y_fit), (X_valid, y_valid)], verbose=2)

        # predict() already returns class labels, so no rounding is needed.
        y_pred = model.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        precision = precision_score(y_valid, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_valid, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_valid, y_pred, average="weighted", zero_division=0)

        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })
    return accuracy

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run. TPESampler is the sampler Optuna uses by default; only the seed
# is new. The search stops after 10 trials or 600 seconds, whichever comes
# first, so the number of completed trials can still vary between runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(optimize_xgboost_model, n_trials=10, timeout=600)

[I 2025-03-25 17:46:59,707] A new study created in memory with name: no-name-a6bd3a74-d2e1-4284-96a1-b2c8c4656491


[0]	validation_0-mlogloss:2.71514	validation_1-mlogloss:2.71912
[2]	validation_0-mlogloss:2.38049	validation_1-mlogloss:2.38764
[4]	validation_0-mlogloss:2.15980	validation_1-mlogloss:2.17020
[6]	validation_0-mlogloss:1.99569	validation_1-mlogloss:2.00914
[8]	validation_0-mlogloss:1.86690	validation_1-mlogloss:1.88317
[10]	validation_0-mlogloss:1.76021	validation_1-mlogloss:1.77946


In [6]:
# Load the serialized feature pipeline from disk.
# NOTE(review): dill, like pickle, can execute arbitrary code while
# deserializing. Only load this file if it comes from a trusted source.
with open('../data/features_pipeline.dill', 'rb') as f:
    featues_pipeline = dill.load(f)
    
# NOTE(review): `featues_pipeline` looks like a misspelling of
# `features_pipeline`; the name is kept because other cells may refer to it.
featues_pipeline