<a href="https://colab.research.google.com/github/nguyenducminh2206/NLP-Projects/blob/main/NER_CMC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification


In [4]:
train_data = pd.read_csv('/content/train_processed.csv')
val_data = pd.read_csv('/content/dev_processed.csv')

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

In [6]:
unique_labels_train = set(label for labels in train_data['labels'].apply(eval) for label in labels)

In [7]:
unique_labels_val = set(label for labels in val_data['labels'].apply(eval) for label in labels)

In [8]:
all_unique_labels = unique_labels_train.union(unique_labels_val)

In [9]:
all_unique_labels

{'B-LOCATION',
 'B-MISCELLANEOUS',
 'B-ORGANIZATION',
 'B-PERSON',
 'I-LOCATION',
 'I-MISCELLANEOUS',
 'I-ORGANIZATION',
 'I-PERSON',
 'O'}

In [10]:
label_to_id = {label: idx for idx, label in enumerate(sorted(all_unique_labels))}
id_to_label = {id: label for label, id in label_to_id.items()}

In [28]:
id_to_label

{0: 'B-LOCATION',
 1: 'B-MISCELLANEOUS',
 2: 'B-ORGANIZATION',
 3: 'B-PERSON',
 4: 'I-LOCATION',
 5: 'I-MISCELLANEOUS',
 6: 'I-ORGANIZATION',
 7: 'I-PERSON',
 8: 'O'}

In [11]:
label_to_id

{'B-LOCATION': 0,
 'B-MISCELLANEOUS': 1,
 'B-ORGANIZATION': 2,
 'B-PERSON': 3,
 'I-LOCATION': 4,
 'I-MISCELLANEOUS': 5,
 'I-ORGANIZATION': 6,
 'I-PERSON': 7,
 'O': 8}

In [12]:
def encode_examples(texts, token_labels, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    labels = []

    for i, (text, label) in enumerate(zip(texts, token_labels)):
        # Tokenize input and get the corresponding labels
        tokenized_inputs = tokenizer(text, truncation=True, padding='max_length', max_length=max_length, is_split_into_words=True)
        word_ids = tokenized_inputs.word_ids(batch_index=0)  # Map tokens to their corresponding word ids.
        previous_word_idx = None
        label_ids = []

        # Align label ids with word ids
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                label_ids.append(-100)  # Pad tokens
            previous_word_idx = word_idx

        input_ids.append(tokenized_inputs['input_ids'])
        attention_masks.append(tokenized_inputs['attention_mask'])
        labels.append(label_ids)

    return input_ids, attention_masks, labels

# Prepare the training and validation data
train_texts = train_data['tokens'].apply(eval).tolist()
train_labels = train_data['labels'].apply(eval).tolist()
val_texts = val_data['tokens'].apply(eval).tolist()
val_labels = val_data['labels'].apply(eval).tolist()

# Encode the training and validation datasets
train_input_ids, train_attention_masks, train_labels = encode_examples(train_texts, train_labels, tokenizer)
val_input_ids, val_attention_masks, val_labels = encode_examples(val_texts, val_labels, tokenizer)

In [13]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForTokenClassification, Trainer, TrainingArguments

In [14]:
class NERDataset(Dataset):
  def __init__(self, input_ids, attention_masks, labels):
    self.input_ids = input_ids
    self.attention_masks = attention_masks
    self.labels = labels

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return {
        'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
        'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
        'labels': torch.tensor(self.labels[idx], dtype=torch.long)
    }

In [15]:
train_dataset = NERDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = NERDataset(val_input_ids, val_attention_masks, val_labels)

In [None]:
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_to_id))

In [19]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)



In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0521,0.060771
2,0.039,0.051667
3,0.0364,0.051765
4,0.0256,0.051298
5,0.007,0.060964
6,0.0109,0.06488
7,0.008,0.064653
8,0.0012,0.074028
9,0.0008,0.071247
10,0.0006,0.072417


TrainOutput(global_step=5860, training_loss=0.01891537663255065, metrics={'train_runtime': 2882.2969, 'train_samples_per_second': 32.488, 'train_steps_per_second': 2.033, 'total_flos': 6117344196433920.0, 'train_loss': 0.01891537663255065, 'epoch': 10.0})

In [21]:
from transformers import pipeline

In [22]:
text = 'tôi tên là Nguyễn Đức Minh, sinh năm 2004, học ở Phần Lan, sinh ra và lớn lên ở Hà Nội'

In [35]:
label_mapping = {
    'LABEL_0': 'B-LOCATION',
    'LABEL_1': 'B-MISCELLANEOUS',
    'LABEL_2': 'B-ORGANIZATION',
    'LABEL_3': 'B-PERSON',
    'LABEL_4': 'I-LOCATION',
    'LABEL_5': 'I-MISCELLANEOUS',
    'LABEL_6': 'I-ORGANIZATION',
    'LABEL_7': 'I-PERSON',
    'LABEL_8': 'O'
}

In [37]:
ner = pipeline('ner', model=model, tokenizer=tokenizer)
results = ner(text)
for result in results:
  if result['entity'] in label_mapping:
    result['entity'] = label_mapping[result['entity']]
results

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'O',
  'score': 0.99998426,
  'index': 1,
  'word': 'tôi',
  'start': 0,
  'end': 3},
 {'entity': 'O',
  'score': 0.999984,
  'index': 2,
  'word': 'tên',
  'start': 4,
  'end': 7},
 {'entity': 'O',
  'score': 0.99997616,
  'index': 3,
  'word': 'là',
  'start': 8,
  'end': 10},
 {'entity': 'B-PERSON',
  'score': 0.9998267,
  'index': 4,
  'word': 'Nguyễn',
  'start': 11,
  'end': 17},
 {'entity': 'I-PERSON',
  'score': 0.9998534,
  'index': 5,
  'word': 'Đức',
  'start': 18,
  'end': 21},
 {'entity': 'I-PERSON',
  'score': 0.9998493,
  'index': 6,
  'word': 'Minh',
  'start': 22,
  'end': 26},
 {'entity': 'O',
  'score': 0.99996984,
  'index': 7,
  'word': ',',
  'start': 26,
  'end': 27},
 {'entity': 'O',
  'score': 0.9999813,
  'index': 8,
  'word': 'sinh',
  'start': 28,
  'end': 32},
 {'entity': 'O',
  'score': 0.99998534,
  'index': 9,
  'word': 'năm',
  'start': 33,
  'end': 36},
 {'entity': 'O',
  'score': 0.99998236,
  'index': 10,
  'word': '2004',
  'start': 37,
