# **Part-of-Speech Tagging**

In [None]:
# 1. Load Dataset
# Install library
%pip install evaluate

# Import library
from typing import List # type: ignore
import numpy as np # type: ignore
import torch # type: ignore
import evaluate # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
import nltk # type: ignore
nltk.download('treebank') # type: ignore

# Fetch the tagged sentences of the NLTK treebank corpus
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print("Number of samples: ", len(tagged_sentences))

# Split every (word, tag) sequence into two parallel lists:
# lowercased words on one side, their tags on the other.
sentences, sentence_tags = [], []
for tagged in tagged_sentences:
    words, pos_tags = zip(*tagged)
    sentences.append(list(map(str.lower, words)))
    sentence_tags.append(list(pos_tags))

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


Number of samples:  3914


In [None]:
# 2. Preprocessing
# Split dataset
# Step 1: keep 70% for training and set 30% aside as a hold-out pool.
train_sentences, holdout_sentences, train_tags, holdout_tags = train_test_split(
    sentences,
    sentence_tags,
    test_size=0.3,
    random_state=42,
)

# Step 2: cut the hold-out pool in half -> validation and test sets.
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    holdout_sentences,
    holdout_tags,
    test_size=0.5,
    random_state=42,
)

# Build dataset
# Tokenization
from transformers import AutoTokenizer # type: ignore
from torch.utils.data import Dataset # type: ignore

model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Fixed length every sample is padded / truncated to
MAX_LEN = 256

# Create label2id dictionary
# This dictionary maps each unique tag to a numerical ID.
unique_tags = {tag for sublist in sentence_tags for tag in sublist}  # Get all unique tags
# Enumerate the tags in sorted order: iteration order of a set of strings
# changes between interpreter runs (hash randomisation), which would give
# every run a different tag <-> id mapping and make saved checkpoints
# impossible to decode reliably.
label2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}  # Assign IDs
label2id['O'] = len(label2id) # Add 'O' tag for padding

class PosTaggingDataset(Dataset):
    """Dataset that serves fixed-length id tensors for POS tagging.

    Each item is a dict with three tensors of length ``max_len``:
    ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        sentences: One list of word strings per sample.
        tags: One list of tag strings per sample, parallel to ``sentences``.
        tokenizer: Object providing ``convert_tokens_to_ids`` and
            ``pad_token_id``.
        label2id: Mapping from tag string to integer id; must contain the
            key ``"O"``, whose id is used to pad the labels.
        max_len: Length every returned sequence is padded or cut to.
    """

    def __init__(self,
                 sentences: List[List[str]],
                 tags: List[List[str]],
                 tokenizer,
                 label2id,
                 max_len=MAX_LEN):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences held by the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the padded ``input_ids``, ``attention_mask`` and ``labels`` of sample ``idx``."""
        words = self.sentences[idx]
        word_tags = self.tags[idx]

        # NOTE(review): every word is looked up as a single vocabulary entry,
        # no sub-word splitting happens here — presumably unknown words end
        # up as the tokenizer's unknown-token id; confirm this is intended.
        token_ids = self.tokenizer.convert_tokens_to_ids(words)
        real_token_mask = [1] * len(token_ids)
        tag_ids = [self.label2id[tag] for tag in word_tags]

        pad = self.pad_and_truncate
        return {
            "input_ids": pad(token_ids, pad_id=self.tokenizer.pad_token_id),
            "attention_mask": pad(real_token_mask, pad_id=0),
            "labels": pad(tag_ids, pad_id=self.label2id["O"]),
        }

    def pad_and_truncate(self,
                         inputs: List[int],
                         pad_id: int):
        """Return ``inputs`` as a tensor of exactly ``max_len`` entries.

        Shorter lists are right-padded with ``pad_id``; lists of length
        ``max_len`` or more are cut down to their first ``max_len`` entries.
        """
        shortfall = self.max_len - len(inputs)
        if shortfall > 0:
            return torch.as_tensor(inputs + [pad_id] * shortfall)
        return torch.as_tensor(inputs[:self.max_len])

# Dataset loader: one PosTaggingDataset per split, all sharing the same
# tokenizer and tag -> id mapping.
train_dataset = PosTaggingDataset(
    sentences=train_sentences, tags=train_tags, tokenizer=tokenizer, label2id=label2id
)
val_dataset = PosTaggingDataset(
    sentences=valid_sentences, tags=valid_tags, tokenizer=tokenizer, label2id=label2id
)
test_dataset = PosTaggingDataset(
    sentences=test_sentences, tags=test_tags, tokenizer=tokenizer, label2id=label2id
)

In [None]:
# 3. Modeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification # type: ignore

# POS tagging assigns one label per token, so the model needs a
# token-classification head; a sequence-classification head emits a single
# prediction per sentence and cannot be trained against per-token labels.
from transformers import AutoModelForTokenClassification # type: ignore

model_name = "QCRI/bert-base-multilingual-cased-pos-english"

# Size the classification head to the tag set built from the dataset. The
# checkpoint ships with its own label inventory; ignore_mismatched_sizes
# lets the head be re-initialised when the two sizes differ.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label={idx: tag for tag, idx in label2id.items()},
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# 4. Metric
accuracy = evaluate.load("accuracy")
# Id used to pad the label sequences. 'O' was appended to label2id last, so
# its id is len(label2id) - 1; using len(label2id) here would match no label
# and padded positions would be counted in the accuracy.
ignore_label = label2id["O"]

def compute_metrics(eval_pred):
    """Compute accuracy over the positions that are not label padding.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds the
            per-class scores (arg-maxed over the last axis) and ``labels``
            the reference label ids.

    Returns:
        The result of ``accuracy.compute`` on the unpadded positions.
    """
    predictions, labels = eval_pred
    # Keep only positions whose reference label is a real tag
    mask = labels != ignore_label
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

# 5. Trainer
from transformers import TrainingArguments, Trainer # type: ignore

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Train on the training split; the validation split is scored with
# compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# 7. Inference
# Tokenization
test_sentence = "We are exploring the topic of deep learning"
# Lowercase the words so they match the training data, which was lowercased
# when the corpus was loaded.
words = test_sentence.lower().split()
input_ids = torch.as_tensor([tokenizer.convert_tokens_to_ids(words)])
# Run on whatever device the model lives on instead of assuming a GPU
input_ids = input_ids.to(model.device)

# prediction
model.eval()  # switch off dropout for inference
with torch.no_grad():  # no gradients needed for a forward pass
    outputs = model(input_ids)
preds = outputs.logits.argmax(dim=-1)[0].cpu().numpy()

# decode
# label2id maps tag -> id; predictions are ids, so the inverse map is needed
id2label = {idx: tag for tag, idx in label2id.items()}
pred_tags = " ".join(id2label[int(pred)] for pred in preds)

print(pred_tags)