In [1]:
import pandas as pd
import pyvi
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
import json

In [2]:
def train_ner_model(train_df, n_iter=30, seed=None):
    """
    Train a blank Vietnamese spaCy NER pipeline from an annotated DataFrame.

    Args:
        train_df: DataFrame with a 'text' column (the raw sentence) and an
            'entities' column. Each 'entities' value is a list of spans whose
            third element (index 2) is the label; the spans are handed to
            ``Example.from_dict`` unchanged, which expects
            (start_char, end_char, label) triples.
        n_iter: Number of passes (epochs) over the training examples.
        seed: Optional integer. When given, ``spacy.util.fix_random_seed`` is
            called before the pipeline is created so that shuffling and
            weight initialisation are repeatable. ``None`` (the default)
            leaves all random state untouched.

    Returns:
        The trained spaCy ``Language`` object containing a single 'ner' pipe.
    """
    if seed is not None:
        # Must happen before the model is created so initial weights are seeded too.
        spacy.util.fix_random_seed(seed)

    # --- 1. Convert the DataFrame to a list of (text, annotations) pairs ---
    # Zipping the two columns avoids the per-row Series that iterrows() builds.
    TRAIN_DATA = [
        (text, {"entities": entities})
        for text, entities in zip(train_df['text'], train_df['entities'])
    ]

    # --- 2. Create the model and pipeline ---
    # spacy.blank("vi") requires the 'pyvi' package to be installed.
    nlp = spacy.blank("vi")
    print("ƒê√£ t·∫°o m√¥ h√¨nh 'vi' tr·ªëng.")

    # A blank pipeline has no components yet, so 'ner' is always added fresh.
    ner = nlp.add_pipe("ner")

    # --- 3. Register every label that occurs in the data ---
    all_labels = set()
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            all_labels.add(ent[2])  # ent[2] is the label, e.g. 'SYMPTOM', 'DISEASE'

    for label in all_labels:
        ner.add_label(label)

    print(f"ƒê√£ th√™m c√°c nh√£n v√†o pipeline: {all_labels}")

    # --- 4. Training ---
    print("üöÄ B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán NER model...")

    # initialize() is the spaCy v3 replacement for the deprecated begin_training().
    optimizer = nlp.initialize()

    # Tokenisation and alignment do not change between epochs, so the Example
    # objects are built once. A row with a bad annotation is reported a single
    # time and skipped instead of being re-reported on every epoch.
    examples = []
    for text, annotations in TRAIN_DATA:
        try:
            examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        except Exception as e:
            # Best effort: skip rows whose annotations cannot be turned into an Example.
            print(f"L·ªói khi t·∫°o Example: {e} - D·ªØ li·ªáu: {text[:50]}...")

    # select_pipes() is the spaCy v3 replacement for the deprecated disable_pipes();
    # only the 'ner' component is updated inside this block.
    with nlp.select_pipes(enable=["ner"]):
        for iteration in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # The batch-size schedule is re-created every epoch, so each epoch
            # starts again from batches of 4 and grows towards at most 32.
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(batch, drop=0.35, sgd=optimizer, losses=losses)

            # 'ner' is absent from losses when no example was usable.
            if (iteration + 1) % 5 == 0 and 'ner' in losses:
                print(f"Iteration {iteration + 1}/{n_iter} - Loss: {losses['ner']:.4f}")

    print("‚úì Ho√†n th√†nh training!")
    return nlp

In [3]:
# Load the annotated training sentences (JSON Lines: one record per line).
df = pd.read_json("../data/processed/train_data.spacy.jsonl", lines=True)

# Fail fast with a readable message: train_ner_model reads exactly these two columns.
assert {"text", "entities"} <= set(df.columns), (
    f"train data must contain 'text' and 'entities' columns, got {list(df.columns)}"
)

print(df.head())

                                               text  \
0                         T√¥i b·ªã ƒëau ƒë·∫ßu v√† s·ªët nh·∫π   
1               M·∫•y h√¥m nay t√¥i ho khan v√† ƒëau h·ªçng   
2           T√¥i b·ªã s·ªï m≈©i v√† ngh·∫πt m≈©i m·∫•y ng√†y nay   
3                  C∆° th·ªÉ t√¥i r·∫•t m·ªát m·ªèi v√† u·ªÉ o·∫£i   
4  T√¥i c√≥ tri·ªáu ch·ª©ng ·ªõn l·∫°nh v√† ƒëau nh·ª©c to√†n th√¢n   

                                 entities  
0   [[7, 14, SYMPTOM], [18, 25, SYMPTOM]]  
1  [[16, 23, SYMPTOM], [27, 35, SYMPTOM]]  
2   [[7, 13, SYMPTOM], [17, 26, SYMPTOM]]  
3  [[15, 22, SYMPTOM], [26, 32, SYMPTOM]]  
4  [[19, 26, SYMPTOM], [30, 48, SYMPTOM]]  


In [4]:
# Train the NER pipeline on the annotated DataFrame loaded in the previous cell.
model = train_ner_model(df, n_iter=30)

# Save the trained pipeline. The path is defined once so the confirmation
# message always reports the directory that was actually written.
MODEL_DIR = "../models/spacy_ner_model"
model.to_disk(MODEL_DIR)
print(f"‚úì ƒê√£ l∆∞u model v√†o th∆∞ m·ª•c '{MODEL_DIR}'")

ƒê√£ t·∫°o m√¥ h√¨nh 'vi' tr·ªëng.
ƒê√£ th√™m c√°c nh√£n v√†o pipeline: {'DISEASE', 'SYMPTOM'}
üöÄ B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán NER model...
Iteration 5/30 - Loss: 57.3066
Iteration 10/30 - Loss: 40.6455
Iteration 15/30 - Loss: 14.1750
Iteration 20/30 - Loss: 18.7759
Iteration 25/30 - Loss: 8.9134
Iteration 30/30 - Loss: 4.1295
‚úì Ho√†n th√†nh training!
‚úì ƒê√£ l∆∞u model v√†o th∆∞ m·ª•c 'ner_model'


In [5]:
# Smoke-test the trained pipeline on a few hand-written sentences and list
# every entity it predicts for each one.
print("\nüìã Test model v·ªõi c√¢u m·∫´u:")
test_texts = [
    "T√¥i b·ªã s·ªët v√† ho nhi·ªÅu",
    "Em b·ªã ƒëau b·ª•ng d·ªØ d·ªôi",
    "B·ªánh nh√¢n c√≥ tri·ªáu ch·ª©ng ch√≥ng m·∫∑t v√† bu·ªìn n√¥n",
]

for text in test_texts:
    doc = model(text)

    # Header: the input sentence followed by the results caption.
    print(
        f"\nüìù Input: {text}",
        "üîç Tri·ªáu ch·ª©ng t√¨m ƒë∆∞·ª£c:",
        sep="\n",
    )

    # Nothing predicted for this sentence: print the placeholder and move on.
    if not doc.ents:
        print("   (Kh√¥ng t√¨m th·∫•y)")
        continue

    for ent in doc.ents:
        print(f"   - {ent.text} [{ent.label_}]")


üìã Test model v·ªõi c√¢u m·∫´u:

üìù Input: T√¥i b·ªã s·ªët v√† ho nhi·ªÅu
üîç Tri·ªáu ch·ª©ng t√¨m ƒë∆∞·ª£c:
   - s·ªët [SYMPTOM]
   - ho nhi·ªÅu [SYMPTOM]

üìù Input: Em b·ªã ƒëau b·ª•ng d·ªØ d·ªôi
üîç Tri·ªáu ch·ª©ng t√¨m ƒë∆∞·ª£c:
   - ƒëau b·ª•ng d·ªØ d·ªôi [SYMPTOM]

üìù Input: B·ªánh nh√¢n c√≥ tri·ªáu ch·ª©ng ch√≥ng m·∫∑t v√† bu·ªìn n√¥n
üîç Tri·ªáu ch·ª©ng t√¨m ƒë∆∞·ª£c:
   - bu·ªìn n√¥n [SYMPTOM]
