## Install lirabries

In [1]:
! pip install pyvi spacy-transformers --quiet

In [2]:
! pip freeze | grep transformers

spacy-transformers==1.3.5
transformers==4.36.2


In [3]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
import transformers

  _torch_pytree._register_pytree_node(


## Download dataset

In [4]:
! wget https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/train_syllable.json
! wget https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/test_syllable.json

  pid, fd = os.forkpty()


--2024-09-12 08:37:33--  https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/train_syllable.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2710448 (2.6M) [text/plain]
Saving to: 'train_syllable.json.1'


2024-09-12 08:37:34 (48.8 MB/s) - 'train_syllable.json.1' saved [2710448/2710448]

--2024-09-12 08:37:35--  https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/test_syllable.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1806209 (1.7M) [text/plain]
Saving to: 

## Mine training data

In [5]:
import json

def extract_data(json_file: str) -> list:
    data = []
    with open(json_file, "r") as f:
        for line in f:
            data.append(json.loads(line))
    return data

In [6]:
def convert_data(data: list) -> dict:
    training_data = []
    for example in data:
        temp_dict = {}
        temp_dict["text"] = " ".join(example["words"])
        temp_dict["entities"] = []

        start = None
        label = None
        current_pos = 0  

        for i, (word, tag) in enumerate(zip(example["words"], example["tags"])):
            word_len = len(word)

            if tag.startswith("B-"):
                if start is not None:
                    temp_dict["entities"].append((start, current_pos - 1, label))

                start = current_pos
                label = tag[2:]
            elif tag.startswith("I-"):
                if start is not None and tag[2:] == label:
                    pass
                else:
                    raise ValueError("Error in annotation.")
            else:
                if start is not None:
                    temp_dict["entities"].append((start, current_pos - 1, label))
                    start = None
                    label = None

            current_pos += word_len + 1

        # If the last entity hasn't been appended yet, append it
        if start is not None:
            temp_dict["entities"].append((start, current_pos - 1, label))

        training_data.append(temp_dict)
    return training_data

In [7]:
train_json_file = "/kaggle/working/train_syllable.json"
test_json_file = "/kaggle/working/test_syllable.json"

train_data = convert_data(extract_data(train_json_file))
test_data = convert_data(extract_data(test_json_file))

## Process training data

In [8]:
nlp = spacy.blank("vi")
doc_bin = DocBin()

  _torch_pytree._register_pytree_node(


In [9]:
def save_data(data: dict, save_path: str):
    for example in tqdm(data):
        text = example["text"]
        labels = example["entities"]
        doc = nlp.make_doc(text)
        ents = []

        for start, end, label in labels:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if not span:
                print("Empty entry. Skip.")
            else:
                ents.append(span)

            filtered_ents = filter_spans(ents)
            doc.ents = filtered_ents
            doc_bin.add(doc)
    
    doc_bin.to_disk(save_path)

In [10]:
save_data(train_data, "/kaggle/working/train.spacy")
save_data(test_data, "/kaggle/working/test.spacy")

  4%|▎         | 186/5027 [00:00<00:07, 637.86it/s]

Empty entry. Skip.


  8%|▊         | 383/5027 [00:00<00:07, 628.13it/s]

Empty entry. Skip.


 17%|█▋        | 840/5027 [00:01<00:06, 628.16it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 26%|██▌       | 1318/5027 [00:02<00:05, 678.40it/s]

Empty entry. Skip.
Empty entry. Skip.


 33%|███▎      | 1652/5027 [00:02<00:05, 592.39it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 42%|████▏     | 2132/5027 [00:03<00:04, 673.60it/s]

Empty entry. Skip.
Empty entry. Skip.


 51%|█████     | 2540/5027 [00:03<00:03, 650.93it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 55%|█████▍    | 2757/5027 [00:04<00:03, 670.95it/s]

Empty entry. Skip.


 59%|█████▉    | 2968/5027 [00:04<00:03, 676.99it/s]

Empty entry. Skip.
Empty entry. Skip.


 67%|██████▋   | 3378/5027 [00:05<00:02, 670.60it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 74%|███████▍  | 3743/5027 [00:05<00:01, 678.46it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 77%|███████▋  | 3878/5027 [00:05<00:01, 654.08it/s]

Empty entry. Skip.


 85%|████████▌ | 4290/5027 [00:06<00:01, 649.78it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 89%|████████▉ | 4489/5027 [00:06<00:00, 641.49it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 96%|█████████▋| 4851/5027 [00:07<00:00, 686.43it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


100%|██████████| 5027/5027 [00:07<00:00, 654.88it/s]


Empty entry. Skip.
Empty entry. Skip.


  4%|▍         | 124/3000 [00:00<00:06, 427.57it/s]

Empty entry. Skip.
Empty entry. Skip.


 11%|█         | 317/3000 [00:00<00:06, 430.59it/s]

Empty entry. Skip.


 15%|█▌        | 454/3000 [00:01<00:05, 439.04it/s]

Empty entry. Skip.


 20%|█▉        | 599/3000 [00:01<00:05, 463.21it/s]

Empty entry. Skip.


 33%|███▎      | 984/3000 [00:02<00:04, 455.90it/s]

Empty entry. Skip.


 39%|███▉      | 1180/3000 [00:02<00:04, 422.44it/s]

Empty entry. Skip.
Empty entry. Skip.


 46%|████▌     | 1382/3000 [00:02<00:02, 584.77it/s]

Empty entry. Skip.
Empty entry. Skip.


 59%|█████▉    | 1775/3000 [00:03<00:01, 614.46it/s]

Empty entry. Skip.
Empty entry. Skip.


 66%|██████▌   | 1981/3000 [00:03<00:01, 649.32it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


 79%|███████▉  | 2378/3000 [00:04<00:00, 646.82it/s]

Empty entry. Skip.


 88%|████████▊ | 2655/3000 [00:04<00:00, 669.56it/s]

Empty entry. Skip.
Empty entry. Skip.
Empty entry. Skip.


100%|█████████▉| 2995/3000 [00:05<00:00, 642.74it/s]

Empty entry. Skip.
Empty entry. Skip.


100%|██████████| 3000/3000 [00:05<00:00, 544.97it/s]


## Train data using command line

Follow official docs at [spacy.io](https://spacy.io/usage/training#quickstart)

Download and modify config file locally. Then upload it into kaggle datasets 

In [21]:
!python -m spacy init fill-config /kaggle/input/spacy-ner-config/ner_config.cfg config.cfg

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


Command to train the model. Here we use dev as train also for learning purpose only.

In [22]:
!mkdir ./result5
!python -m spacy train ./config.cfg --output ./result5 --paths.train ./train.spacy --paths.dev ./test.spacy --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Saving to output directory: result5[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[2024-09-12 08:46:21,209] [INFO] Set up nlp object from config
[2024-09-12 08:46:21,249] [INFO] Pipeline: ['transformer', 'ner']
[2024-09-12 08:46:21,255] [INFO] Created vocabulary
[2024-09-12 08:46:21,255] [INFO] Finished initializing nlp object
  _torch_pytree._register_pytree_node(
  with torch.cuda.amp.autocast(self._mixed_precision):
[2024-09-12 08:47:02,103] [INFO] Initialized pipeline components: ['transformer', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  with torch.cuda.amp.autocast(self._mixed_precision):
  0       0         541.16    529.41    0.28    0.17    0.68    0.00
  0     200       

In [31]:
nlp_ner = spacy.load("./result5/model-best")

In [33]:
colors = [
    '#ff0000',  # Red
    '#00ff00',  # Green
    '#0000ff',  # Blue
    '#ffff00',  # Yellow
    '#ff00ff',  # Magenta
    '#00ffff',  # Cyan
    '#c0c0c0',  # Silver
    '#800000',  # Maroon
    '#808000',  # Olive
    '#008080'   # Teal
]
ents = [
    "PATIENT_ID", 
    "PERSON_NAME", 
    "AGE", 
    "GENDER", 
    "OCCUPATION", 
    "LOCATION", 
    "ORGANIZATION", 
    "SYMPTOM_AND_DISEASE", 
    "TRANSPORTATION", 
    "DATE"
]
colors = {key: value for key, value in zip(ents, colors)}
colors

{'PATIENT_ID': '#ff0000',
 'PERSON_NAME': '#00ff00',
 'AGE': '#0000ff',
 'GENDER': '#ffff00',
 'OCCUPATION': '#ff00ff',
 'LOCATION': '#00ffff',
 'ORGANIZATION': '#c0c0c0',
 'SYMPTOM_AND_DISEASE': '#800000',
 'TRANSPORTATION': '#808000',
 'DATE': '#008080'}

In [75]:
from random import choice

text = sorted([data["text"] for data in test_data], key=len, reverse=True)[17]
doc = nlp_ner(text)

In [76]:
spacy.displacy.render(doc, style="ent", options={"colors": colors}, jupyter=True)