# Part-of-Speech Tagging

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [None]:
from typing import List
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split
import nltk
# Download the "treebank" corpus through NLTK, then read its tagged sentences.
nltk.download("treebank")
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print("Number of samples:", len(tagged_sentences))

# Two parallel lists: the lower-cased words of every sentence and the tags of every sentence.
sentences = []
sentence_tags = []
for tagged_sentence in tagged_sentences:
  # Each tagged sentence is a sequence of (word, tag) pairs; unzip it into two tuples.
  sentence, tags = zip(*tagged_sentence)
  sentences.append(list(map(str.lower, sentence)))
  sentence_tags.append(list(tags))



[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


Number of samples: 3914


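## Split the dataset

Hold out 30% of the sentences, then split that hold-out in half to obtain the validation and test sets.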

In [None]:
# Fixed seed so the train/validation/test partition is the same on every run.
SEED = 42

# First cut: 70% train, 30% held out.
train_sentences, holdout_sentences, train_tags, holdout_tags = train_test_split(
    sentences, sentence_tags, test_size=0.3, random_state=SEED
)
# Second cut: divide the held-out part evenly into validation and test.
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    holdout_sentences, holdout_tags, test_size=0.5, random_state=SEED
)



## Build the dataset

In [None]:
from transformers import AutoTokenizer , AutoModelForTokenClassification
from torch.utils.data import Dataset
from collections import defaultdict
# Hugging Face Hub checkpoint used for both the tokenizer and the token-classification model.
model_name = "QCRI/bert-base-multilingual-cased-pos-english"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Tag -> id mapping taken from the model config. Because it is a defaultdict(int),
# looking up a tag that is absent from the config returns 0 and stores that tag in the mapping.
label2id = defaultdict(int, model.config.label2id)

# Reverse mapping (id -> tag), built from the entries present at this point.
id2label = {index: name for name, index in label2id.items()}
MAX_LEN = 256  # default length every example is padded or truncated to

class PosTagging_Dataset(Dataset):
  """Dataset that turns pre-split sentences and their POS tags into fixed-length tensors.

  Each item is a dict with three 1-D tensors of length ``max_len``:
  ``input_ids``, ``labels`` and ``attention_mask``.

  Args:
    sentences: one list of word strings per sentence.
    tags: one list of tag strings per sentence, parallel to ``sentences``.
    tokenizer: object providing ``convert_tokens_to_ids`` and ``pad_token_id``.
    label2id: mapping from tag string to integer class id.
    max_len: length to pad/truncate every sequence to.
    label_pad_id: value written into ``labels`` at padded positions. The default,
      -100, is the default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``, so a loss
      using that default skips padded positions. Metric code must skip them as well.

  NOTE(review): words are passed whole to ``convert_tokens_to_ids``; no sub-word
  splitting is done and no special tokens are added here — confirm this is intended.
  """

  def __init__(self,
               sentences: List[List[str]],
               tags: List[List[str]],
               tokenizer,
               label2id,
               max_len=MAX_LEN,
               label_pad_id: int = -100,
  ):
    super().__init__()
    self.sentences = sentences
    self.tags = tags
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.label2id = label2id
    self.label_pad_id = label_pad_id

  def __len__(self):
    """Return the number of sentences."""
    return len(self.sentences)

  def __getitem__(self, idx):
    """Return the padded/truncated tensors for the sentence at position ``idx``."""
    words = self.sentences[idx]
    word_tags = self.tags[idx]

    input_ids = self.tokenizer.convert_tokens_to_ids(words)
    attention_mask = [1] * len(input_ids)
    # Use the mapping given to this instance rather than a notebook global.
    labels = [self.label2id[tag] for tag in word_tags]

    return {
        "input_ids": self.pad_and_truncate(input_ids, pad_id=self.tokenizer.pad_token_id),
        # Padding must not reuse a real class id, otherwise padded positions are
        # trained on and scored as if they carried that tag.
        "labels": self.pad_and_truncate(labels, pad_id=self.label_pad_id),
        "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0),
    }

  def pad_and_truncate(self, inputs: List[int], pad_id: int):
    """Return ``inputs`` as a tensor of exactly ``max_len`` items.

    Shorter inputs are right-padded with ``pad_id``; longer inputs are cut at ``max_len``.
    """
    if len(inputs) < self.max_len:
      padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
    else:
      padded_inputs = inputs[:self.max_len]
    return torch.as_tensor(padded_inputs)

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Wrap each split in a PosTagging_Dataset that shares the same tokenizer and label mapping.
train_dataset, val_dataset, test_dataset = (
    PosTagging_Dataset(split_sentences, split_tags, tokenizer, label2id)
    for split_sentences, split_tags in (
        (train_sentences, train_tags),
        (valid_sentences, valid_tags),
        (test_sentences, test_tags),
    )
)

In [None]:
# Accuracy metric object loaded through the `evaluate` library; compute_metrics calls its .compute().
accuracy = evaluate.load("accuracy")

# Label value that compute_metrics excludes when scoring. It equals the number of
# entries held by label2id at the moment this line runs.
ignore_label = len(label2id)

def compute_metrics(eval_pred):
  """Compute token-level accuracy, skipping positions that carry no real label.

  Args:
    eval_pred: pair ``(predictions, labels)``; ``predictions`` holds per-class scores
      on its last axis and ``labels`` holds the reference class ids.

  Returns:
    Whatever ``accuracy.compute`` returns for the kept positions.
  """
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=-1)
  # Besides ignore_label, also drop -100: it is the default ignore_index of
  # torch.nn.CrossEntropyLoss and the usual marker for padded label positions.
  mask = (labels != ignore_label) & (labels != -100)
  return accuracy.compute(predictions=predictions[mask], references=labels[mask])


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
!pip install wandb



In [None]:
import wandb
# Authenticate with Weights & Biases. In the recorded run this prompted for an API key
# interactively, so no key is stored in the notebook.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mphuochuunguyen2009[0m ([33mphuochuunguyen2009-localcompany[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluate and checkpoint once per epoch so the best
# checkpoint can be reloaded when training ends.
training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.04602,0.986917
2,No log,0.037834,0.98908
3,0.146100,0.033884,0.990224
4,0.146100,0.032332,0.99071
5,0.146100,0.031017,0.990903
6,0.031300,0.030074,0.991216
7,0.031300,0.030214,0.991189
8,0.031300,0.029908,0.991449
9,0.025200,0.029715,0.991402
10,0.025200,0.029657,0.991396


TrainOutput(global_step=1720, training_loss=0.061838517632595326, metrics={'train_runtime': 1547.1333, 'train_samples_per_second': 17.704, 'train_steps_per_second': 1.112, 'total_flos': 3579882599208960.0, 'train_loss': 0.061838517632595326, 'epoch': 10.0})