## Install lirabries

In [1]:
! pip install spacy spacy-transformers pyvi --quiet

In [2]:
! pip freeze | grep spacy > req_spacy.txt
! cat req_spacy.txt

en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
spacy @ file:///home/conda/feedstock_root/build_artifacts/spacy_1722252006613/work
spacy-alignments==0.9.1
spacy-legacy @ file:///home/conda/feedstock_root/build_artifacts/spacy-legacy_1674550301837/work
spacy-loggers @ file:///home/conda/feedstock_root/build_artifacts/spacy-loggers_1694527114282/work
spacy-transformers==1.3.5


In [3]:
# ! pip uninstall -r req_spacy.txt -y

## Download dataset

In [4]:
! wget https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/test_syllable.json

--2024-09-12 07:33:03--  https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/test_syllable.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1806209 (1.7M) [text/plain]
Saving to: 'test_syllable.json'


2024-09-12 07:33:04 (7.79 MB/s) - 'test_syllable.json' saved [1806209/1806209]



## Mine training data

In [5]:
import json

data = []
with open("/kaggle/working/test_syllable.json", "r") as f:
    for line in f:
        data.append(json.loads(line))

len(data), data[0]

(3000,
 {'words': ['Từ',
   '24',
   '-',
   '7',
   'đến',
   '31',
   '-',
   '7',
   ',',
   'bệnh',
   'nhân',
   'được',
   'mẹ',
   'là',
   'bà',
   'H.T.P',
   '(',
   '47',
   'tuổi',
   ')',
   'đón',
   'về',
   'nhà',
   'ở',
   'phường',
   'Phước',
   'Hoà',
   '(',
   'bằng',
   'xe',
   'máy',
   ')',
   ',',
   'không',
   'đi',
   'đâu',
   'chỉ',
   'ra',
   'Tạp',
   'hoá',
   'Phượng',
   ',',
   'chợ',
   'Vườn',
   'Lài',
   ',',
   'phường',
   'An',
   'Sơn',
   'cùng',
   'mẹ',
   'bán',
   'tạp',
   'hoá',
   'ở',
   'đây',
   '.'],
  'tags': ['O',
   'B-DATE',
   'I-DATE',
   'I-DATE',
   'O',
   'B-DATE',
   'I-DATE',
   'I-DATE',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-NAME',
   'O',
   'B-AGE',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOCATION',
   'I-LOCATION',
   'I-LOCATION',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOCATION',
   'I-LOCATION',
   'I-LOCATION',
   'O

In [6]:
training_data = []
for example in data:
    temp_dict = {}
    temp_dict["text"] = " ".join(example["words"])
    temp_dict["entities"] = []
    
    start = None
    label = None
    current_pos = 0  
    
    for i, (word, tag) in enumerate(zip(example["words"], example["tags"])):
        word_len = len(word)
        
        if tag.startswith("B-"):
            if start is not None:
                temp_dict["entities"].append((start, current_pos - 1, label))
                
            start = current_pos
            label = tag[2:]
        elif tag.startswith("I-"):
            if start is not None and tag[2:] == label:
                pass
            else:
                raise ValueError("Error in annotation.")
        else:
            if start is not None:
                temp_dict["entities"].append((start, current_pos - 1, label))
                start = None
                label = None
                
        current_pos += word_len + 1
        
    # If the last entity hasn't been appended yet, append it
    if start is not None:
        temp_dict["entities"].append((start, current_pos - 1, label))
        
    training_data.append(temp_dict)
    
training_data[0]

{'text': 'Từ 24 - 7 đến 31 - 7 , bệnh nhân được mẹ là bà H.T.P ( 47 tuổi ) đón về nhà ở phường Phước Hoà ( bằng xe máy ) , không đi đâu chỉ ra Tạp hoá Phượng , chợ Vườn Lài , phường An Sơn cùng mẹ bán tạp hoá ở đây .',
 'entities': [(3, 9, 'DATE'),
  (14, 20, 'DATE'),
  (47, 52, 'NAME'),
  (55, 57, 'AGE'),
  (78, 94, 'LOCATION'),
  (133, 147, 'LOCATION'),
  (150, 162, 'LOCATION'),
  (165, 178, 'LOCATION'),
  (187, 198, 'JOB')]}

## Process training data

In [7]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

In [8]:
nlp = spacy.blank("vi")
doc_bin = DocBin()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [9]:
for example in tqdm(training_data):
    text = example["text"]
    labels = example["entities"]
    doc = nlp.make_doc(text)
    ents = []
    
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if not span:
            print("Empty entry. Skip.")
        else:
            ents.append(span)
        
        filtered_ents = filter_spans(ents)
        doc.ents = filtered_ents
        doc_bin.add(doc)
        
    

  3%|▎         | 97/3000 [00:00<00:08, 337.99it/s]

Empty entry. Skip.
Empty entry. Skip.


 10%|█         | 310/3000 [00:00<00:06, 391.61it/s]

Empty entry. Skip.


 15%|█▍        | 442/3000 [00:01<00:06, 422.23it/s]

Empty entry. Skip.


 22%|██▏       | 673/3000 [00:01<00:05, 428.85it/s]

Empty entry. Skip.


 33%|███▎      | 995/3000 [00:02<00:04, 432.50it/s]

Empty entry. Skip.


 38%|███▊      | 1138/3000 [00:02<00:04, 451.56it/s]

Empty entry. Skip.
Empty entry. Skip.


 46%|████▌     | 1370/3000 [00:03<00:02, 545.16it/s]

Empty entry. Skip.
Empty entry. Skip.


 58%|█████▊    | 1729/3000 [00:03<00:02, 535.93it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 69%|██████▉   | 2068/3000 [00:04<00:01, 604.17it/s]

Empty entry. Skip.
Empty entry. Skip.


 79%|███████▉  | 2378/3000 [00:04<00:01, 609.13it/s]

Empty entry. Skip.


 88%|████████▊ | 2636/3000 [00:05<00:00, 608.88it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


100%|██████████| 3000/3000 [00:05<00:00, 513.23it/s]

Empty entry. Skip.
Empty entry. Skip.





In [10]:
doc_bin.to_disk("/kaggle/working/train.spacy")

## Train data using command line

Follow official docs at [spacy.io](https://spacy.io/usage/training#quickstart)

Download and modify config file locally. Then upload it into kaggle datasets 

In [11]:
!python -m spacy init fill-config /kaggle/input/spacy-config/ner_config.cfg config.cfg

  pid, fd = os.forkpty()


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


Command to train the model. Here we use dev as train also for learning purpose only.

In [12]:
!mkdir ./result
!python -m spacy train ./config.cfg --output ./result --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: result[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[2024-09-12 07:33:34,916] [INFO] Set up nlp object from config
[2024-09-12 07:33:34,956] [INFO] Pipeline: ['transformer', 'ner']
[2024-09-12 07:33:34,962] [INFO] Created vocabulary
[2024-09-12 07:33:34,962] [INFO] Finished initializing nlp object
tokenizer_config.json: 100%|██████████████████| 48.0/48.0 [00:00<00:00, 331kB/s]
config.json: 100%|█████████████████████████████| 625/625 [00:00<00:00, 4.83MB/s]
vocab.txt: 100%|█████████████████████████████| 872k/872k [00:00<00:00, 4.20MB/s]
tokenizer.json: 100%|██████████████████████| 1.72M/1.72M [00:00<00:00, 8.39MB/s]
  _torch_pytree._register_pytree_node(
model.safetensors: 100%|█████████████████████| 672M/672M [00:09<00:00, 68.6MB/s]
  with torch.cuda.amp.autocast(self._mixed_precision):
  jitify._init_module()
[2024-09-12 07:35:07,662] [INFO] Initialized pipelin

In [13]:
nlp_ner = spacy.load("./result/model-best")

  _torch_pytree._register_pytree_node(
  self._model.load_state_dict(torch.load(filelike, map_location=device))


In [14]:
from random import choice

text = choice([data["text"] for data in training_data])
doc = nlp_ner(text)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [15]:
colors = [
    '#ff0000',  # Red
    '#00ff00',  # Green
    '#0000ff',  # Blue
    '#ffff00',  # Yellow
    '#ff00ff',  # Magenta
    '#00ffff',  # Cyan
    '#c0c0c0',  # Silver
    '#800000',  # Maroon
    '#808000',  # Olive
    '#008080'   # Teal
]
ents = [
    "PATIENT_ID", 
    "PERSON_NAME", 
    "AGE", 
    "GENDER", 
    "OCCUPATION", 
    "LOCATION", 
    "ORGANIZATION", 
    "SYMPTOM&DISEASE", 
    "TRANSPORTATION", 
    "DATE"
]
colors = {key: value for key, value in zip(ents, colors)}
colors

{'PATIENT_ID': '#ff0000',
 'PERSON_NAME': '#00ff00',
 'AGE': '#0000ff',
 'GENDER': '#ffff00',
 'OCCUPATION': '#ff00ff',
 'LOCATION': '#00ffff',
 'ORGANIZATION': '#c0c0c0',
 'SYMPTOM&DISEASE': '#800000',
 'TRANSPORTATION': '#808000',
 'DATE': '#008080'}

In [16]:
spacy.displacy.render(doc, style="ent", options={"colors": colors}, jupyter=True)