### Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.insert(0, '..')

In [3]:
import mlflow

In [42]:
mlflow.set_tracking_uri('http://localhost:2002')

In [9]:
import dill
import numpy as np
import pickle
import optuna
import pandas as pd
from ast import literal_eval
from optuna import Trial
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from src.features.embeddings import get_embeddings
from tqdm.notebook import tqdm
import xgboost as xgb
from src.features.embeddings import get_embeddings

In [18]:
def embed_text(row):
    return get_embeddings(row['reason_combind'])

In [19]:
features = pd.read_csv("../data/data_label_balanced.csv")

In [20]:
features.head()

Unnamed: 0.1,Unnamed: 0,partner_id,reason_combind,specialist_name
0,0,2,khám tiêu hóa,tiêu hoá
1,1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp
2,2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch
3,3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch
4,4,17,cao huyết áp,tim mạch


In [21]:
features.isnull().sum()

Unnamed: 0         0
partner_id         0
reason_combind     1
specialist_name    0
dtype: int64

In [22]:
features = features.dropna(subset=['reason_combind'])

In [23]:
features.shape

(32313, 4)

In [None]:
features['embedding'] = features.apply(embed_text, axis=1)

In [50]:
features = pd.read_csv('../data/features_with_embeddings.csv')

In [29]:
label_encoder = LabelEncoder()
features['target'] = label_encoder.fit_transform(features['specialist_name'])

In [55]:
features

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,partner_id,reason_combind,specialist_name,embedding,target
0,0,0,2,khám tiêu hóa,tiêu hoá,"[-0.06037424877285957, 0.07001789659261703, -0...",17
1,1,1,2,"đau lưng nhiều,ngồi lâu cứng lưng",cơ xương khớp,"[-0.01998063735663891, 0.03326348960399628, -0...",3
2,2,2,4,"hở van tim 3 lá,ngoại tâm thu",tim mạch,"[-0.019692258909344673, 0.0619027316570282, 0....",16
3,3,3,17,"đau tức ngực bên trái,cảm giác hồi hộp",tim mạch,"[-0.03724019601941109, 0.07348008453845978, 0....",16
4,4,4,17,cao huyết áp,tim mạch,"[-0.0712781697511673, 0.026695098727941513, -0...",16
...,...,...,...,...,...,...,...
32308,32309,52118,259,nóng người chảy máu cam,nhi khoa,"[-0.02294689044356346, 0.008230608887970448, -...",9
32309,32310,52119,10,"đau cột sống thời gian dài, đi lại rất khó khă...",cột sống,"[-0.021012093871831894, 0.009305012412369251, ...",4
32310,32311,52124,111,"u gan đang điều trị bị khô miệng, cổ",nội khoa,"[0.0461127869784832, 0.04775919020175934, 0.00...",10
32311,32312,52127,111,bệnh nhiều thứ . nên đi làm tổng quát,nội khoa,"[-0.05708944797515869, 0.0195799320936203, -0....",10


In [31]:
features.to_csv('../data/features_with_embeddings.csv')

In [56]:
X = features.embedding.apply(literal_eval).apply(np.array)
y = features['target']

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [57]:
X_test.shape, X_train.shape

((6463,), (25850,))

In [58]:
X_train = np.stack(X_train.values)
X_test = np.stack(X_test.values)

In [59]:
X_train.shape, X_test.shape

((25850, 768), (6463, 768))

In [60]:
X_train[0]

array([-7.08632469e-02,  3.74969840e-02,  2.44817790e-02, -1.16183851e-02,
       -3.01834457e-02, -1.50782615e-02, -3.11173536e-02, -3.12731899e-02,
        1.76047941e-03, -7.25224093e-02, -1.55369556e-02, -6.70778230e-02,
       -3.51327397e-02, -1.09755155e-02, -2.72858180e-02,  1.24692339e-02,
       -8.26989394e-03, -5.53887188e-02,  8.96749794e-02,  9.49457213e-02,
        9.45641398e-02,  3.60710248e-02,  4.12633456e-02,  8.26143622e-02,
       -3.49782035e-02,  1.37828425e-01, -1.19609144e-02, -3.92535217e-02,
       -1.19602389e-03,  2.86216885e-02, -4.30080518e-02, -6.04646541e-02,
        3.92017812e-02, -9.59550887e-02,  1.07488878e-01,  3.18234861e-02,
       -1.98474135e-02,  3.28300111e-02, -1.27583593e-01,  5.53066358e-02,
        6.54343097e-03,  2.00293418e-02,  1.06230313e-02,  4.69815098e-02,
       -2.89551746e-02, -2.82978490e-02, -1.16342455e-01, -1.97907221e-02,
       -4.19620797e-02, -4.52913307e-02,  3.00126616e-02,  2.40988489e-02,
        9.12798569e-03,  

### Random forest

In [49]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)

In [50]:
print(report)

              precision    recall  f1-score   support

           0       0.68      0.58      0.62       339
           1       0.92      0.94      0.93       708
           2       0.75      0.78      0.76      1161
           3       0.73      0.87      0.79      1143
           4       0.84      0.90      0.87      1251
           5       0.59      0.48      0.53       209
           6       0.68      0.64      0.66      1094
           7       0.77      0.81      0.79      1217
           8       0.58      0.17      0.26       321
           9       0.40      0.17      0.24      1021
          10       0.79      0.78      0.79       740
          11       0.84      0.87      0.85      1235
          12       0.81      0.91      0.86      1187
          13       0.61      0.68      0.64      1132
          14       0.69      0.75      0.72       544
          15       0.73      0.83      0.78      1175
          16       0.77      0.86      0.81      1286
          17       0.69    

### XGboost

In [43]:
mlflow.set_experiment('xgboost_v5')

<Experiment: artifact_location='mlflow-artifacts:/430686376537508648', creation_time=1742959056915, experiment_id='430686376537508648', last_update_time=1742959056915, lifecycle_stage='active', name='xgboost_v5', tags={}>

In [44]:
def optimize_xgboost_model(trial: Trial):
    X, y = X_train, y_train
    X, X_valid, y, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    param = {
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }

    model = xgb.XGBClassifier(
        **param,
        tree_method="hist"
    )

    with mlflow.start_run():
        bst = model.fit(X, y, eval_set=[(X, y), (X_valid, y_valid)], verbose=2)
        preds = bst.predict(X_test)
        y_pred = np.rint(preds)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params(
            param
        )
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return accuracy

In [45]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_xgboost_model, n_trials=10, timeout=600)

[I 2025-03-27 15:47:45,370] A new study created in memory with name: no-name-0bb9ebc6-f961-41ba-94d7-f3e2afa140d5


[0]	validation_0-mlogloss:2.93434	validation_1-mlogloss:2.94002
[2]	validation_0-mlogloss:2.82434	validation_1-mlogloss:2.84172
[4]	validation_0-mlogloss:2.72696	validation_1-mlogloss:2.75528
[6]	validation_0-mlogloss:2.63960	validation_1-mlogloss:2.67950
[8]	validation_0-mlogloss:2.56004	validation_1-mlogloss:2.61030
[10]	validation_0-mlogloss:2.48697	validation_1-mlogloss:2.54758
[12]	validation_0-mlogloss:2.41942	validation_1-mlogloss:2.49007
[14]	validation_0-mlogloss:2.35652	validation_1-mlogloss:2.43683
[16]	validation_0-mlogloss:2.29777	validation_1-mlogloss:2.38721
[18]	validation_0-mlogloss:2.24245	validation_1-mlogloss:2.34164
[20]	validation_0-mlogloss:2.19084	validation_1-mlogloss:2.29920
[22]	validation_0-mlogloss:2.14187	validation_1-mlogloss:2.25838
[24]	validation_0-mlogloss:2.09531	validation_1-mlogloss:2.22020
[26]	validation_0-mlogloss:2.05118	validation_1-mlogloss:2.18415
[28]	validation_0-mlogloss:2.00938	validation_1-mlogloss:2.15032
[30]	validation_0-mlogloss:1.9

[I 2025-03-27 16:19:04,995] Trial 0 finished with value: 0.6848212904224045 and parameters: {'max_depth': 8, 'learning_rate': 0.016895246867747123, 'n_estimators': 856, 'subsample': 0.6811946620266709, 'colsample_bytree': 0.7808773563578378, 'min_child_weight': 6, 'gamma': 4.597246961119394}. Best is trial 0 with value: 0.6848212904224045.


🏃 View run grandiose-wren-63 at: http://localhost:2002/#/experiments/430686376537508648/runs/cb96329df15546508d29b858beec02ed
🧪 View experiment at: http://localhost:2002/#/experiments/430686376537508648


In [46]:
model = xgb.XGBClassifier()
model.load_model('../models/xgb_v4.1.json')

In [47]:
y_preds = model.predict(X_test)

In [48]:
print(classification_report(y_preds, y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        94
           1       0.00      0.00      0.00       341
           2       0.00      0.00      0.00       496
           3       0.05      0.06      0.05       388
           4       0.00      0.00      0.00       536
           5       0.00      0.00      0.00       106
           6       0.04      0.02      0.02       246
           7       0.01      0.00      0.01       432
           8       0.00      0.01      0.00       122
           9       0.01      0.01      0.01       317
          10       0.01      0.01      0.01       258
          11       0.00      0.00      0.00       521
          12       0.00      0.00      0.00       515
          13       0.01      0.01      0.01       487
          14       0.00      0.00      0.00       225
          15       0.00      0.00      0.00       491
          16       0.00      0.00      0.00       520
          17       0.00    

In [61]:
model.save_model('../models/xgb_v4.1.json')

In [62]:
model = xgb.XGBClassifier()
model.load_model('../models/xgb_v4.1.json')

In [63]:
model