### Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.insert(0, '..')

In [3]:
import mlflow

In [4]:
mlflow.set_tracking_uri('http://localhost:2002')

In [16]:
import dill
import numpy as np
import pickle
import optuna
import pandas as pd
from ast import literal_eval
from optuna import Trial
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from src.features.embeddings import get_embeddings
from tqdm.notebook import tqdm
import xgboost as xgb
from src.features.embeddings import get_embeddings

In [8]:
def embed_text(text):
    return get_embeddings(text)

In [9]:
features = pd.read_csv("../data/data_version1/combined_data.csv")

In [10]:
features.head()

Unnamed: 0,reason_combind,specialist_name,source_file
0,mất ngủ,thần kinh,reason_specialist - thần kinh.csv
1,rối loạn thần kinh thực vật,thần kinh,reason_specialist - thần kinh.csv
2,đau đầu,thần kinh,reason_specialist - thần kinh.csv
3,"đau đầu,đau sau ngực gần phổi",thần kinh,reason_specialist - thần kinh.csv
4,co giật 3 lần,thần kinh,reason_specialist - thần kinh.csv


In [11]:
features.isnull().sum()

reason_combind     2
specialist_name    0
source_file        0
dtype: int64

In [12]:
features = features.dropna(subset=['reason_combind'])

In [13]:
features.shape

(53056, 3)

In [14]:
features['specialist_name'].unique(), len(features.specialist_name.unique())

(array(['thần kinh', 'nhi khoa', 'thận - tiết niệu', 'ung bướu',
        'hô hấp - phổi', 'chuyên khoa mắt', 'cơ xương khớp', 'tim mạch',
        'tiêu hoá', 'sức khỏe tâm thần', 'nội khoa',
        'tiểu đường - nội tiết', 'tai mũi họng', 'nam học', 'da liễu',
        'sản phụ khoa'], dtype=object),
 16)

In [15]:
# Apply embeddings in batches to avoid memory issues
batch_size = 100
num_batches = len(features) // batch_size + (1 if len(features) % batch_size > 0 else 0)
embeddings = []

for i in tqdm(range(num_batches), desc="Generating embeddings"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(features))
    batch = features.iloc[start_idx:end_idx]
    batch_embeddings = batch.apply(embed_text, axis=1)
    embeddings.extend(batch_embeddings)

features['embedding'] = embeddings

ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))

In [26]:
len(features.specialist_name.unique())

17

In [27]:
label_encoder = LabelEncoder()
features['target'] = label_encoder.fit_transform(features['specialist_name'])

In [62]:
with open('../models/xgboost-v5.1/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [28]:
features

Unnamed: 0.1,Unnamed: 0,partner_id,reason_combind,specialist_name,embedding,target
0,69524,135,khám chẩn do bị ngã xe.,cơ xương khớp,"[-0.06507885456085205, -0.0011571341892704368,...",1
1,147559,299,đau đầu gối,cơ xương khớp,"[-0.060912344604730606, 0.043031957000494, 0.0...",1
2,77536,299,ở xa,cơ xương khớp,"[-0.051097575575113297, 0.03897751122713089, -...",1
3,130089,124,"đau khớp gối. phía sau gối, sợi gân nó căng, n...",cơ xương khớp,"[-0.06421235203742981, 0.06187708303332329, 0....",1
4,67621,10,đau thắt lưng,cơ xương khớp,"[-0.03485986590385437, 0.05823908746242523, 0....",1
...,...,...,...,...,...,...
75363,154161,23,khám hiếm muộn 2 vợ chồng,vô sinh - hiếm muộn,"[-0.03511172905564308, 0.07142797857522964, -0...",16
75364,154175,23,đăng ký làm ivf,vô sinh - hiếm muộn,"[-0.060410574078559875, 0.008837147615849972, ...",16
75365,154338,23,khám sinh sản,vô sinh - hiếm muộn,"[-0.04723597317934036, 0.054028306156396866, 0...",16
75366,154339,23,khám nam khoa,vô sinh - hiếm muộn,"[0.019004328176379204, 0.04893968999385834, -0...",16


In [29]:
features.to_csv('../data/features_with_embeddings.csv')

In [30]:
len(features.target.unique()), features.target.unique(), features.specialist_name.unique()

(17,
 array([ 1, 13, 10,  2,  9,  6,  8,  4, 12,  7,  0, 11, 15,  5, 14,  3, 16]),
 array(['cơ xương khớp', 'tiêu hoá', 'thần kinh', 'da liễu',
        'tai mũi họng', 'nội khoa', 'sức khỏe tâm thần', 'nam học',
        'tim mạch', 'sản phụ khoa', 'chuyên khoa mắt', 'thận - tiết niệu',
        'ung bướu', 'nhi khoa', 'tiểu đường - nội tiết', 'hô hấp - phổi',
        'vô sinh - hiếm muộn'], dtype=object))

In [39]:
X = features['embedding']
y = features['target']

In [41]:
len(X[0])# Convert X from pandas Series to numpy array with proper shape
# Extracting the embeddings and reshaping them into a 2D array
X = np.stack(X.values)
print(f"X shape after conversion: {X.shape}")

X shape after conversion: (75360, 768)


In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [43]:
X_test.shape, X_train.shape

((15072, 768), (60288, 768))

In [46]:
X_train[0]

array([-3.38968411e-02,  9.50052515e-02, -5.87758049e-02,  1.55183794e-02,
       -1.27889812e-02, -1.29017040e-01, -2.08470086e-03,  5.36970049e-02,
       -3.07082501e-03, -1.84312277e-02, -9.51915607e-03,  2.65663136e-02,
       -7.66303614e-02,  3.66557389e-02,  1.19604329e-02,  1.44758716e-01,
        2.94893309e-02,  6.67265989e-03,  4.46442105e-02,  7.24730790e-02,
        3.33665386e-02,  9.04534832e-02,  2.36086845e-02,  3.62921581e-02,
        3.24841170e-03,  1.54997529e-02,  4.14913110e-02, -2.20373850e-02,
        2.30262913e-02, -4.53358255e-02,  5.14138080e-02,  1.15484651e-03,
        1.63491885e-03, -5.21597154e-02,  3.71023975e-02, -2.58964300e-03,
       -3.24356481e-02, -9.94582660e-03, -2.53100470e-02,  4.22050953e-02,
        1.94744542e-02,  4.73139621e-03,  2.99521126e-02, -4.54868153e-02,
        3.50760150e-04, -4.87898523e-03, -3.48111689e-02,  8.34187046e-02,
       -2.25870255e-02, -4.12159637e-02,  5.51442280e-02,  4.30859700e-02,
       -2.32969150e-02,  

### Random forest

In [47]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)

In [48]:
print(report)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94       697
           1       0.76      0.92      0.84      1250
           2       0.84      0.92      0.88      1250
           3       0.68      0.44      0.54       228
           4       0.78      0.81      0.80      1250
           5       0.61      0.21      0.31       428
           6       0.62      0.47      0.54      1250
           7       0.86      0.78      0.82       907
           8       0.86      0.88      0.87      1251
           9       0.82      0.91      0.86      1251
          10       0.68      0.69      0.69      1251
          11       0.73      0.73      0.73       505
          12       0.77      0.85      0.81      1250
          13       0.78      0.87      0.82      1250
          14       0.72      0.61      0.66       421
          15       0.59      0.46      0.52       493
          16       0.58      0.69      0.63       140

    accuracy              

### XGboost

In [50]:
mlflow.set_experiment('xgboost_v6')

2025/04/04 14:32:29 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_v6' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/398793586970850389', creation_time=1743751949295, experiment_id='398793586970850389', last_update_time=1743751949295, lifecycle_stage='active', name='xgboost_v6', tags={}>

In [51]:
def optimize_xgboost_model(trial: Trial):
    X, y = X_train, y_train
    X, X_valid, y, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    param = {
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }

    model = xgb.XGBClassifier(
        **param,
        tree_method="hist"
    )

    with mlflow.start_run():
        bst = model.fit(X, y, eval_set=[(X, y), (X_valid, y_valid)], verbose=2)
        preds = bst.predict(X_test)
        y_pred = np.rint(preds)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        mlflow.log_params(
            param
        )
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })    
    return accuracy

In [52]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_xgboost_model, n_trials=10, timeout=600)

[I 2025-04-04 14:32:32,938] A new study created in memory with name: no-name-05d55664-fc80-49f4-8e15-98bc6173f738


[0]	validation_0-mlogloss:2.51993	validation_1-mlogloss:2.53981
[2]	validation_0-mlogloss:2.13285	validation_1-mlogloss:2.18573
[4]	validation_0-mlogloss:1.87298	validation_1-mlogloss:1.95501
[6]	validation_0-mlogloss:1.67457	validation_1-mlogloss:1.78280
[8]	validation_0-mlogloss:1.51535	validation_1-mlogloss:1.64821
[10]	validation_0-mlogloss:1.38281	validation_1-mlogloss:1.53741
[12]	validation_0-mlogloss:1.27062	validation_1-mlogloss:1.44495
[14]	validation_0-mlogloss:1.17292	validation_1-mlogloss:1.36661
[16]	validation_0-mlogloss:1.08782	validation_1-mlogloss:1.29950
[18]	validation_0-mlogloss:1.01331	validation_1-mlogloss:1.24126
[20]	validation_0-mlogloss:0.94695	validation_1-mlogloss:1.19129
[22]	validation_0-mlogloss:0.88750	validation_1-mlogloss:1.14691
[24]	validation_0-mlogloss:0.83463	validation_1-mlogloss:1.10798
[26]	validation_0-mlogloss:0.78714	validation_1-mlogloss:1.07411
[28]	validation_0-mlogloss:0.74404	validation_1-mlogloss:1.04402
[30]	validation_0-mlogloss:0.7

[I 2025-04-04 14:59:00,989] Trial 0 finished with value: 0.7772027600849257 and parameters: {'max_depth': 10, 'learning_rate': 0.06836604602332978, 'n_estimators': 444, 'subsample': 0.9587309838050999, 'colsample_bytree': 0.8254282954697382, 'min_child_weight': 6, 'gamma': 0.9661669511820042}. Best is trial 0 with value: 0.7772027600849257.


🏃 View run placid-bat-51 at: http://localhost:2002/#/experiments/398793586970850389/runs/89863b91ae17420ca6bc61b7216f9b03
🧪 View experiment at: http://localhost:2002/#/experiments/398793586970850389


In [54]:
study.best_params

{'max_depth': 10,
 'learning_rate': 0.06836604602332978,
 'n_estimators': 444,
 'subsample': 0.9587309838050999,
 'colsample_bytree': 0.8254282954697382,
 'min_child_weight': 6,
 'gamma': 0.9661669511820042}

In [53]:
model= xgb.XGBClassifier(
    **study.best_params,
    tree_method="hist"
)

In [55]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [56]:
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=2)

[0]	validation_0-mlogloss:2.51993	validation_1-mlogloss:2.53888
[2]	validation_0-mlogloss:2.13285	validation_1-mlogloss:2.18447
[4]	validation_0-mlogloss:1.87298	validation_1-mlogloss:1.95367
[6]	validation_0-mlogloss:1.67457	validation_1-mlogloss:1.78036
[8]	validation_0-mlogloss:1.51535	validation_1-mlogloss:1.64447
[10]	validation_0-mlogloss:1.38281	validation_1-mlogloss:1.53294
[12]	validation_0-mlogloss:1.27062	validation_1-mlogloss:1.44048
[14]	validation_0-mlogloss:1.17292	validation_1-mlogloss:1.36191
[16]	validation_0-mlogloss:1.08782	validation_1-mlogloss:1.29468
[18]	validation_0-mlogloss:1.01331	validation_1-mlogloss:1.23628
[20]	validation_0-mlogloss:0.94695	validation_1-mlogloss:1.18529
[22]	validation_0-mlogloss:0.88750	validation_1-mlogloss:1.14112
[24]	validation_0-mlogloss:0.83463	validation_1-mlogloss:1.10222
[26]	validation_0-mlogloss:0.78714	validation_1-mlogloss:1.06812
[28]	validation_0-mlogloss:0.74404	validation_1-mlogloss:1.03802
[30]	validation_0-mlogloss:0.7

In [57]:
y_preds = model.predict(X_test)

In [58]:
print(classification_report(y_preds, y_test))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94       689
           1       0.90      0.79      0.84      1421
           2       0.91      0.88      0.89      1295
           3       0.50      0.68      0.58       168
           4       0.81      0.78      0.79      1284
           5       0.31      0.53      0.39       245
           6       0.50      0.60      0.55      1046
           7       0.79      0.85      0.82       836
           8       0.87      0.85      0.86      1271
           9       0.89      0.84      0.87      1333
          10       0.71      0.70      0.70      1265
          11       0.72      0.70      0.71       517
          12       0.85      0.78      0.81      1362
          13       0.86      0.81      0.83      1326
          14       0.66      0.70      0.68       401
          15       0.51      0.57      0.54       443
          16       0.69      0.56      0.62       170

    accuracy              

In [59]:
model.save_model('../models/xgb_v5.1.json')

In [60]:
model = xgb.XGBClassifier()
model.load_model('../models/xgb_v5.1.json')

In [61]:
model