<a href="https://colab.research.google.com/github/nguyenanhtienabcd/AIO2024_EXERCISE/blob/feature%2FMODULE8-WEEK1/m08w01_ex01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Part-of-Speech Tagging (POS Tagging)

In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [2]:

# Import libraries
from typing import List
from sklearn. model_selection import train_test_split

import numpy as np
import torch
import evaluate
import nltk
# Fetch the NLTK "treebank" corpus so it can be read below.
nltk.download('treebank')

# The corpus is already POS-annotated; it is used further down to fine-tune the model.
# Each item of tagged_sents() is unpacked later as a sequence of (word, tag) pairs.
treebank_corpus = nltk.corpus.treebank
tagged_sentences = treebank_corpus.tagged_sents()
print("Number of samples", len(tagged_sentences))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


Number of samples 3914


In [3]:
# Separate each tagged sentence into two parallel lists:
# lowercased words in `sentences`, their POS tags in `sentence_tags`.
sentences = []
sentence_tags = []
for tagged_sentence in tagged_sentences:
  words, pos_tags = zip(*tagged_sentence)
  sentences.append(list(map(str.lower, words)))
  sentence_tags.append(list(pos_tags))


In [4]:
# Prepare the data: 70% for training, then the remaining 30% is halved into
# a test set and a validation set (same seed for both splits).
train_sentences, holdout_sentences, train_tags, holdout_tags = train_test_split(
    sentences, sentence_tags, test_size=0.3, random_state=42)
test_sentences, val_sentences, test_tags, val_tags = train_test_split(
    holdout_sentences, holdout_tags, test_size=0.5, random_state=42)

In [14]:
# Tokenization
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

# Name of the pretrained POS-tagging checkpoint; the tokenizer is loaded from it here.
model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Length (in words) of the longest training sentence; used as the default
# padded length of the dataset class defined below.
MAX_LEN = max(map(len, train_sentences))

class PosTagging_Dataset(Dataset):
  """Dataset yielding fixed-length id tensors for token classification.

  Every sample is a list of already-split words with one tag per word. Words are
  looked up in the tokenizer vocabulary as-is (no sub-word splitting is applied
  here), tags are looked up in ``label2id``, and all three outputs are padded or
  truncated to ``max_len``.

  Args:
    sentences: One list of word strings per sample.
    tags: One list of tag strings per sample, parallel to ``sentences``.
    tokenizer: Object providing ``convert_tokens_to_ids`` and ``pad_token_id``.
    label2id: Mapping from tag string to integer id. Must resolve ``"O"`` when
      ``label_pad_id`` is left as ``None``.
    max_len: Length of every returned tensor.
    label_pad_id: Id written into padded label positions. ``None`` (default)
      pads with ``label2id["O"]``. Passing ``-100`` matches the default
      ``ignore_index`` of ``torch.nn.CrossEntropyLoss``, so a loss built with
      that default skips the padded positions.
  """

  def __init__(self, sentences: List[List[str]],
               tags: List[List[str]],
               tokenizer,
               label2id,
               max_len = MAX_LEN,
               label_pad_id = None):
    super().__init__()
    self.sentences = sentences
    self.tags = tags
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.label2id = label2id
    self.label_pad_id = label_pad_id

  def __len__(self):
    """Return the number of sentences."""
    return len(self.sentences)

  def __getitem__(self, idx):
    """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for sample ``idx``."""
    words = self.sentences[idx]
    word_tags = self.tags[idx]

    # Map each whitespace-separated word to its vocabulary id.
    input_ids = self.tokenizer.convert_tokens_to_ids(words)
    # Real tokens get 1; padding added below gets 0.
    attention_mask = [1] * len(input_ids)
    # Map each tag to its integer id.
    labels = [self.label2id[tag] for tag in word_tags]

    # Resolve the padding label from the mapping held by this instance rather
    # than from a module-level variable of the same name.
    label_pad_id = self.label_pad_id
    if label_pad_id is None:
      label_pad_id = self.label2id["O"]

    return {
        "input_ids": self.pad_and_truncate(input_ids, pad_id=self.tokenizer.pad_token_id),
        "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0),
        "labels": self.pad_and_truncate(labels, pad_id=label_pad_id)
    }

  def pad_and_truncate(self, tokens: List[int], pad_id: int):
    """Right-pad ``tokens`` with ``pad_id`` or cut it so it has exactly ``max_len`` items.

    Returns the result as a tensor built with ``torch.as_tensor``.
    """
    missing = self.max_len - len(tokens)
    if missing > 0:
      fixed_length = tokens + [pad_id] * missing
    else:
      fixed_length = tokens[:self.max_len]
    return torch.as_tensor(fixed_length)


### Modeling

In [6]:
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer

# Load the checkpoint named by `model_name` as a token-classification model.
model = AutoModelForTokenClassification.from_pretrained(model_name)


pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
from collections import defaultdict

# Start from the checkpoint's own tag -> id table. Because this is a
# defaultdict(int), looking up a tag that is missing returns 0 and also stores
# that tag in the mapping.
label2id = defaultdict(int)
label2id.update(model.config.label2id)
# Reverse table (id -> tag), used to decode predicted ids.
id2label = {label_id: label for label, label_id in label2id.items()}

In [16]:
# Wrap each split in a PosTagging_Dataset sharing the same tokenizer and label map.
train_dataset, val_dataset, test_dataset = (
    PosTagging_Dataset(split_sentences, split_tags, tokenizer, label2id)
    for split_sentences, split_tags in (
        (train_sentences, train_tags),
        (val_sentences, val_tags),
        (test_sentences, test_tags),
    )
)

### Metric

In [17]:
import evaluate
accuracy = evaluate.load("accuracy")
# Sentinel label id excluded from the metric (kept for backward compatibility).
ignore_label = len(label2id)
# Default `ignore_index` of torch.nn.CrossEntropyLoss; labels padded with this
# value are excluded from the metric as well.
LOSS_IGNORE_INDEX = -100

def compute_metric(eval_pred):
  """Token-level accuracy over positions whose label is not an ignore id.

  Args:
    eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds per-class
      scores on its last axis and ``labels`` the reference ids.

  Returns:
    The result of ``accuracy.compute`` on the non-ignored positions.

  NOTE(review): PosTagging_Dataset, as instantiated in this notebook, pads
  labels with ``label2id["O"]``. Unless that id equals one of the ignore ids,
  padded positions are still scored; pad labels with -100 to exclude them.
  """
  predictions, labels = eval_pred
  predicted_ids = np.argmax(predictions, axis=-1)
  # Keep a position only if its label is neither sentinel.
  mask = (labels != ignore_label) & (labels != LOSS_IGNORE_INDEX)
  return accuracy.compute(predictions=predicted_ids[mask], references=labels[mask])

In [18]:
from transformers import TrainingArguments, Trainer
# Fine-tuning configuration: evaluate and save a checkpoint once per epoch so
# the best checkpoint can be reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-5,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Train on the training split, validate on the validation split, and report
# the accuracy computed by `compute_metric` at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metric,
)

trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.043952,0.987411
2,No log,0.036562,0.989344
3,0.144700,0.032963,0.990436
4,0.144700,0.031557,0.990769
5,0.144700,0.030572,0.990995
6,0.029800,0.029991,0.991434
7,0.029800,0.029871,0.991377
8,0.029800,0.029495,0.991616
9,0.023800,0.029326,0.991666
10,0.023800,0.029277,0.99166


TrainOutput(global_step=1720, training_loss=0.06044566520424776, metrics={'train_runtime': 1673.3867, 'train_samples_per_second': 16.368, 'train_steps_per_second': 1.028, 'total_flos': 3789641345256360.0, 'train_loss': 0.06044566520424776, 'epoch': 10.0})

In [20]:
# Tokenization: split on whitespace and lowercase each word, mirroring the
# preprocessing applied to the training sentences, then map words to vocabulary ids.
test_sentence = "We have been learning math since the grade 1"
words = [word.lower() for word in test_sentence.split()]
input_tokens = tokenizer.convert_tokens_to_ids(words)
# Build the batch on whatever device the model lives on instead of assuming CUDA.
input_tensor = torch.tensor([input_tokens], device=model.device)

# Prediction: eval mode disables dropout; no_grad avoids tracking gradients.
model.eval()
with torch.no_grad():
    outputs = model(input_tensor)
# Highest-scoring label id for every token of the single sentence in the batch.
preds = outputs.logits.argmax(dim=-1)[0].tolist()

# Decode: map each predicted id back to its POS tag.
pred_tags = " ".join(id2label[pred] for pred in preds)

print(pred_tags)  # Print the predicted tag sequence

PRP VBP VBN VBG NN IN DT NN CD 
