## Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.insert(0, '..')

In [3]:
import pickle
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from src.model.model import MedicalSpecialistClassifer
from src.model.data import MedicalDataFrameDataset
from src.model.train import train

In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load encoder artifacts that come from a trusted source.
specialist_encoder = _load_pickle('../models/nampham1106/hybid-model-classification/specialist_encoder.pkl')
age_encoder = _load_pickle('../models/nampham1106/hybid-model-classification/age_encoder.pkl')
gender_encoder = _load_pickle('../models/nampham1106/hybid-model-classification/gender_encoder.pkl')

In [5]:
# Size the classification head from the label encoder so the model has one
# output per specialist class known to the encoder.
specialist_count = len(specialist_encoder.classes_)

# user_feature_dim=2 matches the two user features built later in this notebook
# (encoded gender and encoded age category).
# NOTE(review): load_pretrained=False presumably skips fetching base weights
# because the fine-tuned checkpoint is loaded afterwards — confirm in src.model.
model = MedicalSpecialistClassifer(
    num_specialists=specialist_count,
    user_feature_dim=2,
    load_pretrained=False,
    trust_remote_code=True,
)

In [6]:
model

MedicalSpecialistClassifer(
  (reason_encoder): NewModel(
    (embeddings): NewEmbeddings(
      (word_embeddings): Embedding(250048, 768, padding_idx=1)
      (rotary_emb): NTKScalingRotaryEmbedding()
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): NewEncoder(
      (layer): ModuleList(
        (0-11): 12 x NewLayer(
          (attention): NewSdpaAttention(
            (qkv_proj): Linear(in_features=768, out_features=2304, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (o_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (mlp): NewGatedMLP(
            (up_gate_proj): Linear(in_features=768, out_features=6144, bias=False)
            (down_proj): Linear(in_features=3072, out_features=768, bias=True)
            (act_fn): GELUActivation()
            (hidden_dropout): Dropout(p=0.1, inp

In [7]:
# Load the fine-tuned checkpoint into the freshly constructed model.
# map_location='cpu' lets the checkpoint load on machines that lack the device
# it was saved from, and avoids staging the weights on an accelerator; the rest
# of this notebook feeds CPU tensors to the model.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints,
# and consider weights_only=True if the installed torch version supports it.
model.load_state_dict(
    torch.load(
        '../models/nampham1106/hybid-model-classification/best_model.pt',
        map_location='cpu',
    )
)

<All keys matched successfully>

In [8]:
# Load the tokenizer from the same local directory that holds the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained('../models/nampham1106/hybid-model-classification/')

In [9]:
# Sample patient input used for a single-example sanity check.
reason_text = "mệt mỏi kéo dài"
gender = "male"
age_category = "adult"

# Put the model in evaluation mode before running inference.
model.eval()

# Tokenize the reason text: pad/truncate to a fixed length of 128 tokens and
# return PyTorch tensors.
tokenizer_options = dict(
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='pt',
)
data = tokenizer(reason_text, **tokenizer_options)

In [10]:
# Encode the categorical user attributes with the loaded encoders; each
# transform call receives a one-element list, so take the single result.
gender_code = gender_encoder.transform([gender])[0]
age_code = age_encoder.transform([age_category])[0]

# One row of [gender, age] as float32, i.e. a (1, 2) tensor.
user_info = torch.tensor([[gender_code, age_code]], dtype=torch.float32)

In [11]:
data, user_info

({'input_ids': tensor([[     0, 114132, 138058,  44577,  19018,      2,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
  

In [12]:
# Single-example inference. The displayed result is a pair of tensors; the first
# holds the predicted class index (decoded with specialist_encoder in the next
# cell). NOTE(review): the second presumably holds the confidence — confirm.
model.predict(data['input_ids'], data['attention_mask'], user_info=user_info)

(tensor([7]), tensor([0.2358]))

In [13]:
# Run inference and keep the predicted class indices (first element of the
# tuple returned by model.predict).
predicted_ids = model.predict(data['input_ids'], data['attention_mask'], user_info=user_info)[0]

# Hand the encoder a flat 1-D array of class indices. Wrapping the tensor in a
# list would produce a (1, 1) column vector and trigger sklearn's
# DataConversionWarning from column_or_1d.
res = specialist_encoder.inverse_transform(predicted_ids.detach().cpu().numpy().reshape(-1))
res.tolist()

  y = column_or_1d(y, warn=True)


['nội khoa']

In [14]:
# Load the booking records (reason text, gender, age, specialist label).
# NOTE(review): the loaded frame shows an 'Unnamed: 0' column, which usually
# means a saved index — confirm whether index_col=0 is wanted here.
df = pd.read_csv('../data/data_version 3 - Sheet1(4).csv')

In [15]:
df.head()

Unnamed: 0,partner_id,specialist_id,status,gender,province_id,age,reason_combind,specialist_name
0,50.0,29.0,2.0,1,25.0,30,bệnh thiên đầu thống tái phát,chuyên khoa mắt
1,50.0,29.0,2.0,0,1.0,11,nghi ngờ glocom góc mở,chuyên khoa mắt
2,50.0,29.0,2.0,1,1.0,34,"khám mắt, viêm kết giác mạc",chuyên khoa mắt
3,50.0,29.0,2.0,1,1.0,29,khám mắt,chuyên khoa mắt
4,50.0,29.0,2.0,1,1.0,45,mắt bị nhòe,chuyên khoa mắt


In [16]:
# Label missing genders explicitly, then translate the numeric codes into
# string labels (1 -> female, 0 -> male).
gender_labels = {1.0: 'female', 0.0: 'male'}
df['gender'] = df['gender'].fillna('unknown').replace(gender_labels)

In [17]:
def categorize_age(age):
    """Bucket a numeric age into ``'unknown'``, ``'child'`` or ``'adult'``.

    Args:
        age: Age in years; may be missing (NaN/None).

    Returns:
        ``'unknown'`` for missing, zero or negative ages (zero is the fill
        value used for missing ages and negative values are invalid),
        ``'child'`` for ages in (0, 15], and ``'adult'`` otherwise.
    """
    if pd.isna(age) or age <= 0:
        return 'unknown'
    if age <= 15:
        return 'child'
    return 'adult'


# Missing ages are filled with 0, which categorize_age reports as 'unknown'.
df['age'] = df['age'].fillna(0)
df['age_category'] = df['age'].apply(categorize_age)
# Display the results
print(df['age_category'].value_counts())

age_category
adult    46913
child     6479
Name: count, dtype: int64


In [18]:
df.head()

Unnamed: 0,partner_id,specialist_id,status,gender,province_id,age,reason_combind,specialist_name,age_category
0,50.0,29.0,2.0,female,25.0,30,bệnh thiên đầu thống tái phát,chuyên khoa mắt,adult
1,50.0,29.0,2.0,male,1.0,11,nghi ngờ glocom góc mở,chuyên khoa mắt,child
2,50.0,29.0,2.0,female,1.0,34,"khám mắt, viêm kết giác mạc",chuyên khoa mắt,adult
3,50.0,29.0,2.0,female,1.0,29,khám mắt,chuyên khoa mắt,adult
4,50.0,29.0,2.0,female,1.0,45,mắt bị nhòe,chuyên khoa mắt,adult


In [26]:
from tqdm import tqdm
from sklearn.exceptions import DataConversionWarning
import warnings

# Register tqdm's pandas integration so DataFrame.progress_apply is available.
tqdm.pandas()

# Silence sklearn's DataConversionWarning (the column_or_1d message shown earlier
# in this notebook when the label encoder receives a column-shaped input).
warnings.filterwarnings("ignore", category=DataConversionWarning)

def apply_model_predict(df, model, tokenizer, specialist_encoder, age_encoder, gender_encoder):
    """Predict a specialist label for every row of ``df``.

    Each row is scored on its own: the ``reason_combind`` text is tokenized
    (padded/truncated to 128 tokens), ``gender`` and ``age_category`` are
    encoded into a (1, 2) float tensor, and the class index returned by
    ``model.predict`` is decoded back to a label with ``specialist_encoder``.

    Args:
        df: DataFrame with ``reason_combind``, ``age_category`` and ``gender``
            columns.
        model: Object exposing ``predict(input_ids, attention_mask, user_info=...)``
            that returns a tuple whose first element is a tensor of class indices.
            The model's train/eval mode is not changed here.
        tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask``.
        specialist_encoder: Encoder whose ``inverse_transform`` maps class
            indices to specialist labels.
        age_encoder: Encoder whose ``transform`` maps age categories to codes.
        gender_encoder: Encoder whose ``transform`` maps gender labels to codes.

    Returns:
        A new DataFrame with the contents of ``df`` plus a
        ``predicted_specialist`` column. The input frame is not modified, so
        passing a slice such as ``df.head(100)`` is safe.
    """
    def predict_row(row):
        """Return the decoded specialist label predicted for a single row."""
        encoded_text = tokenizer(
            row['reason_combind'],
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        user_info = torch.tensor(
            [[
                gender_encoder.transform([row['gender']])[0],
                age_encoder.transform([row['age_category']])[0],
            ]],
            dtype=torch.float32
        )
        # Inference only: make sure no autograd graph is built per row.
        with torch.no_grad():
            predicted_ids = model.predict(
                encoded_text['input_ids'],
                encoded_text['attention_mask'],
                user_info=user_info
            )[0]
        # Decode from a flat 1-D array; a list-wrapped tensor becomes a column
        # vector and makes sklearn emit a DataConversionWarning.
        label_ids = predicted_ids.detach().cpu().numpy().reshape(-1)
        return specialist_encoder.inverse_transform(label_ids)[0]

    # A row-wise apply on an empty frame does not yield a Series, so build the
    # empty result column explicitly.
    if len(df) == 0:
        return df.assign(predicted_specialist=pd.Series(dtype=object, index=df.index))

    # assign() returns a new frame, leaving the caller's object untouched.
    return df.assign(predicted_specialist=df.progress_apply(predict_row, axis=1))


In [None]:
# Score only the first 100 rows (presumably to keep this exploratory run short).
# Note that this rebinds `df` to the scored 100-row frame.
df = apply_model_predict(
    df.head(100),
    model,
    tokenizer,
    specialist_encoder,
    age_encoder,
    gender_encoder,
)

 20%|█████████████████████▏                                                                                    | 20/100 [00:05<00:24,  3.32it/s]

In [25]:
df

Unnamed: 0,partner_id,specialist_id,status,gender,province_id,age,reason_combind,specialist_name,age_category,predicted_specialist
0,50.0,29.0,2.0,female,25.0,30,bệnh thiên đầu thống tái phát,chuyên khoa mắt,adult,chuyên khoa mắt
1,50.0,29.0,2.0,male,1.0,11,nghi ngờ glocom góc mở,chuyên khoa mắt,child,thần kinh
2,50.0,29.0,2.0,female,1.0,34,"khám mắt, viêm kết giác mạc",chuyên khoa mắt,adult,nội khoa
3,50.0,29.0,2.0,female,1.0,29,khám mắt,chuyên khoa mắt,adult,nội khoa
4,50.0,29.0,2.0,female,1.0,45,mắt bị nhòe,chuyên khoa mắt,adult,thần kinh
...,...,...,...,...,...,...,...,...,...,...
95,50.0,29.0,2.0,female,22.0,43,"khô, mỏi mắt.",chuyên khoa mắt,adult,thần kinh
96,65.0,29.0,2.0,female,1.0,19,mắt có máu bầm bên trong,chuyên khoa mắt,adult,ung bướu
97,57.0,29.0,2.0,female,1.0,33,"mắt nhìn mờ, có dử, ngứa",chuyên khoa mắt,adult,nội khoa
98,50.0,29.0,2.0,female,38.0,60,bị đau mắt và chữa nhiều nơi không khỏi,chuyên khoa mắt,adult,thần kinh
