# Trabajo: POS tagging using Transformers.

The objective of this project is to develop and evaluate a POS tagger for
Spanish, leveraging the pre-trained transformer model MarIA
("PlanTL-GOB-ES/roberta-large-bne"). This model will be fine-tuned using the
CESS-ESP corpus, and its performance will be compared against traditional
HMM and TNT taggers. This comparison aims to highlight the
advantages and limitations of modern deep learning approaches relative
to traditional techniques.

## 1. Classical POS-Tagging
First, the classical POS taggers are implemented.
### 1.1 Preprocessing of the corpus
The corpus is processed as described in practical exercise 2.

In [None]:
import nltk
from nltk.corpus import cess_esp
import numpy as np
# Make sure the CESS-ESP corpus is available locally before reading it.
nltk.download('cess_esp')

# Each sentence is a sequence of (word, tag) pairs.
corpus_sentences = cess_esp.tagged_sents()
number_sentences = len(corpus_sentences)
print('number sentences:', number_sentences)

# Corpus size in tokens: add up the length of every sentence.
nw = sum(len(sentence) for sentence in corpus_sentences)
print('number words:', nw)

# Show the first two tagged sentences as a quick sanity check.
print(corpus_sentences[:2])

[nltk_data] Downloading package cess_esp to /root/nltk_data...
[nltk_data]   Unzipping corpora/cess_esp.zip.


number sentences: 6030
number words: 192686
[[('El', 'da0ms0'), ('grupo', 'ncms000'), ('estatal', 'aq0cs0'), ('Electricité_de_France', 'np00000'), ('-Fpa-', 'Fpa'), ('EDF', 'np00000'), ('-Fpt-', 'Fpt'), ('anunció', 'vmis3s0'), ('hoy', 'rg'), (',', 'Fc'), ('jueves', 'W'), (',', 'Fc'), ('la', 'da0fs0'), ('compra', 'ncfs000'), ('del', 'spcms'), ('51_por_ciento', 'Zp'), ('de', 'sps00'), ('la', 'da0fs0'), ('empresa', 'ncfs000'), ('mexicana', 'aq0fs0'), ('Electricidad_Águila_de_Altamira', 'np00000'), ('-Fpa-', 'Fpa'), ('EAA', 'np00000'), ('-Fpt-', 'Fpt'), (',', 'Fc'), ('creada', 'aq0fsp'), ('por', 'sps00'), ('el', 'da0ms0'), ('japonés', 'aq0ms0'), ('Mitsubishi_Corporation', 'np00000'), ('para', 'sps00'), ('poner_en_marcha', 'vmn0000'), ('una', 'di0fs0'), ('central', 'ncfs000'), ('de', 'sps00'), ('gas', 'ncms000'), ('de', 'sps00'), ('495', 'Z'), ('megavatios', 'ncmp000'), ('.', 'Fp')], [('Una', 'di0fs0'), ('portavoz', 'nccs000'), ('de', 'sps00'), ('EDF', 'np00000'), ('explicó', 'vmis3s0'), ('

Preprocess the sentences of the corpus (similar to exercise 2):

In [None]:
def transform_tag(tag):
    """Reduce a full corpus tag to the coarse tag used in this project.

    Tags beginning with 'v' (e.g. 'vmis3s0') or 'F' (e.g. 'Fpa') keep their
    first three characters; every other tag keeps its first two.  Tags
    shorter than the cut length are returned unchanged.
    """
    keep = 3 if tag.startswith(('v', 'F')) else 2
    return tag[:keep]

def process_sentence(sentence):
    """Return a cleaned copy of one tagged sentence.

    Every (word, tag) pair gets its tag reduced with ``transform_tag``.
    Pairs whose word is the placeholder '*0*' and whose *reduced* tag is
    'sn' are dropped.

    The filter is applied to the reduced tag rather than the raw one so that
    it still matches when the raw corpus tag carries extra characters after
    the 'sn' category.
    NOTE(review): CESS-ESP is believed to tag these placeholders with
    suffixed forms such as 'sn.e-SUJ' -- confirm against the corpus.  A raw
    tag of exactly 'sn' is still removed, as before.

    Args:
        sentence: iterable of (word, tag) pairs.

    Returns:
        A new list of (word, reduced_tag) pairs; the input is not modified.
    """
    processed_sentence = []
    for word, tag in sentence:
        reduced_tag = transform_tag(tag)
        if word == '*0*' and reduced_tag == 'sn':
            continue
        processed_sentence.append((word, reduced_tag))
    return processed_sentence

# Clean every sentence of the corpus; the result is a plain list of sentences.
processed_corpus = list(map(process_sentence, corpus_sentences))

Split the corpus into training and test sets:

In [None]:
import random

# Seed the generator so the shuffle, and therefore the split, is repeatable.
random.seed(42)
random.shuffle(processed_corpus)  # shuffles the list in place

# The first 90 % of the shuffled sentences are for training, the rest for testing.
s_index = int(0.9 * number_sentences)
train_sentences, test_sentences = processed_corpus[:s_index], processed_corpus[s_index:]

# Report the size of each split and its share of the corpus.
for label, subset in (('train sentences:', train_sentences), ('test sentences:', test_sentences)):
    print(label, len(subset), ', split: ', len(subset) / number_sentences * 100, '%')

train sentences: 5427 , split:  90.0 %
test sentences: 603 , split:  10.0 %


In [None]:
# Turn each sentence into a tuple of (word, tag) pairs so it can be stored in a set.
train_sentences_set = {tuple(sentence) for sentence in train_sentences}
test_sentences_set = {tuple(sentence) for sentence in test_sentences}

# Sentences that occur in both splits.
overlap = train_sentences_set & test_sentences_set

# Report how many there are and show up to five of them.
print(f"Number of overlapping sentences: {len(overlap)}")
if overlap:
    print("Example of overlapping sentences:", list(overlap)[:5])

Number of overlapping sentences: 2
Example of overlapping sentences: [(('La', 'da'), ('legislación', 'nc'), ('electoral', 'aq'), ('dominicana', 'aq'), (',', 'Fc'), ('que', 'pr'), ('desde', 'sp'), ('1996', 'W'), ('dispone', 'vmi'), ('la', 'da'), ('modalidad', 'nc'), ('de', 'sp'), ('colegios', 'nc'), ('cerrados', 'aq'), ('de', 'sp'), ('votación', 'nc'), (',', 'Fc'), ('establece', 'vmi'), ('que', 'cs'), ('las', 'da'), ('mujeres', 'nc'), ('votan', 'vmi'), ('en', 'sp'), ('las', 'da'), ('primeras', 'ao'), ('horas', 'nc'), ('de', 'sp'), ('la', 'da'), ('mañana', 'nc'), ('y', 'cc'), ('los', 'da'), ('hombres', 'nc'), ('lo', 'pp'), ('hacen', 'vmi'), ('por', 'sp'), ('la', 'da'), ('tarde', 'nc'), ('.', 'Fp')), (('.', 'Fp'),)]


### 1.2 Train HMM and TNT Tagger

The accuracy of the transformer-based tagger will be compared with that of the HMM and TNT taggers. To facilitate this comparison, the HMM and TNT taggers are first trained on the training set and then evaluated on the test set.

In [None]:
from nltk.tag import hmm, tnt

def _macro_average(per_tag_scores):
    """Return the unweighted mean of a {tag: score} mapping."""
    return sum(per_tag_scores.values()) / len(per_tag_scores)


# HMM tagger
hmm_tagger = hmm.HiddenMarkovModelTagger.train(train_sentences)

# TNT tagger
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sentences)

# Each per-tag metric is requested once per tagger and then macro-averaged;
# asking for it twice (once for the sum, once for the length) would repeat
# the whole evaluation on the test sentences for no benefit.
hmm_accuracy = hmm_tagger.accuracy(test_sentences)
hmm_precision = _macro_average(hmm_tagger.precision(test_sentences))
hmm_recall = _macro_average(hmm_tagger.recall(test_sentences))
hmm_f1 = _macro_average(hmm_tagger.f_measure(test_sentences))

tnt_accuracy = tnt_tagger.accuracy(test_sentences)
tnt_precision = _macro_average(tnt_tagger.precision(test_sentences))
tnt_recall = _macro_average(tnt_tagger.recall(test_sentences))
tnt_f1 = _macro_average(tnt_tagger.f_measure(test_sentences))


print(f'HMM Tagger Accuracy: {hmm_accuracy:.4f}, Precision: {hmm_precision:.4f}, Recall: {hmm_recall:.4f}, F1: {hmm_f1:.4f}')

print(f'TNT Tagger Accuracy: {tnt_accuracy:.4f}, Precision: {tnt_precision:.4f}, Recall: {tnt_recall:.4f}, F1: {tnt_f1:.4f}')

HMM Tagger Accuracy: 0.9307, Precision: 0.8892, Recall: 0.8274, F1: 0.8373
TNT Tagger Accuracy: 0.9010, Precision: 0.9162, Recall: 0.8392, F1: 0.8675


## 2. Transformer model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DataCollatorForTokenClassification
import torch
import gc

### 2.1 Preprocessing
For the Transformer model, a portion of the training data is reserved as a validation set to monitor and optimize performance during the training process, helping prevent overfitting and ensuring model generalization. Additionally, the data requires specific preprocessing steps tailored to the Transformer architecture. This includes specialized tokenization, where words are broken down into subword units to handle complex or rare words effectively and align with the pre-trained Transformer’s vocabulary. These steps ensure that the data is processed in a way that maximizes the model’s ability to learn nuanced patterns in language structure.

In [None]:
# Cut points on the corpus: 80 % training | 10 % validation | 10 % test.
train_index = int(0.8 * number_sentences)
val_index = int(0.9 * number_sentences)

# Slice three consecutive, non-overlapping partitions.
train_sentences, val_sentences, test_sentences = (
    processed_corpus[:train_index],
    processed_corpus[train_index:val_index],
    processed_corpus[val_index:],
)

In [None]:
def create_tag_mappings(processed_corpus):
    """Build the two lookup tables between POS tags and integer ids.

    Ids are assigned by enumerating the distinct tags of ``processed_corpus``
    in sorted order, so the numbering is the same for the same set of tags.

    Args:
        processed_corpus: iterable of sentences, each an iterable of
            (word, tag) pairs.

    Returns:
        (tag2id, id2tag): a dict from tag to id and its inverse.
    """
    distinct_tags = {tag for sentence in processed_corpus for _, tag in sentence}
    id2tag = dict(enumerate(sorted(distinct_tags)))
    tag2id = {tag: idx for idx, tag in id2tag.items()}
    return tag2id, id2tag

In [None]:
def preprocess_data(examples, tokenizer, tag2id):
    """Tokenize one tagged sentence and align its POS labels with the sub-word tokens.

    Only the first sub-word token of each word receives that word's label id.
    Positions without a word (``word_ids()`` gives None) and the remaining
    sub-word tokens of a word are labelled -100, the value the metric code in
    this notebook filters out.

    Args:
        examples: one sentence as a sequence of (word, tag) pairs.
        tokenizer: callable tokenizer accepting pre-split words; its result
            must provide ``word_ids()`` and item assignment.
        tag2id: dict mapping tag -> label id.

    Returns:
        The tokenizer output with an added "labels" entry, a list holding one
        label id per token.
    """
    words = [word for word, _ in examples]
    tokenized_inputs = tokenizer(words, is_split_into_words=True, truncation=True, padding=True)

    # Label for tags missing from tag2id: the '[UNK]' id when the mapping
    # defines one, otherwise -100.
    fallback_id = tag2id.get('[UNK]', -100)

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # No word at this position, or a continuation piece of the previous word.
            label_ids.append(-100)
        else:
            # First piece of a new word: look up the id of its (reduced) tag.
            tag = transform_tag(examples[word_idx][1])
            label_ids.append(tag2id.get(tag, fallback_id))
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

In [None]:
# Build the label inventory from the whole corpus, not only the training
# split.  A tag that occurs only in validation/test sentences would otherwise
# be missing from tag2id, be labelled -100 during preprocessing and silently
# drop out of the transformer's metrics, while the HMM and TNT taggers are
# scored on every test token.
tag2id, id2tag = create_tag_mappings(processed_corpus)

# The loop variables are named tag_id (not id) so the builtin id() is not
# overwritten at notebook scope.
print("Tag to ID Mapping (tag2id):")
for tag, tag_id in list(tag2id.items())[:5]:
    print(f"{tag}: {tag_id}")

print("\nID to Tag Mapping (id2tag):")
for tag_id, tag in list(id2tag.items())[:5]:
    print(f"{tag_id}: {tag}")

Tag to ID Mapping (tag2id):
Faa: 0
Fat: 1
Fc: 2
Fd: 3
Fe: 4

ID to Tag Mapping (id2tag):
0: Faa
1: Fat
2: Fc
3: Fd
4: Fe


In [None]:
# Tokenizer that belongs to the MarIA checkpoint.
tokenizer_maria = AutoTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-large-bne")

# Encode the three splits the same way: one preprocess_data call per sentence.
train_data_maria, val_data_maria, test_data_maria = (
    [preprocess_data(sentence, tokenizer_maria, tag2id) for sentence in split]
    for split in (train_sentences, val_sentences, test_sentences)
)

# Print one encoded example from the training and the test split.
print("Sample from train_data_maria:")
print(train_data_maria[0])


print("\nSample from test_data_maria:")
print(test_data_maria[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/858k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/516k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]



Sample from train_data_maria:
{'input_ids': [0, 32734, 567, 577, 2900, 70, 64, 590, 1218, 5818, 341, 1207, 362, 16343, 1205, 851, 342, 844, 4043, 512, 2900, 70, 64, 6621, 408, 5519, 16468, 275, 68, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 56, -100, -100, 46, -100, -100, 47, 29, 34, 41, 44, 36, 55, 44, 23, 24, 55, 57, 25, 46, -100, -100, 55, 29, 34, -100, 9, -100, -100]}

Sample from test_data_maria:
{'input_ids': [0, 477, 1154, 2945, 371, 1081, 383, 977, 22345, 320, 365, 12116, 313, 390, 18998, 320, 8832, 632, 2906, 67, 313, 897, 5934, 632, 2906, 67, 313, 365, 9256, 28138, 342, 320, 407, 41248, 334, 390, 9177, 313, 332, 5175, 2967, 275, 68, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 26, 22, 34, 47, 35, 49, 64, 57, 47, 26, 34, 47, 26, 34, 47, 57, 4, -100, -100, 47, 34, 2

### 2.2 Train the transformer

In [None]:
# Checkpoint used for both the tokenizer and the pre-trained weights.
model_name = "PlanTL-GOB-ES/roberta-large-bne"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Collator handed to the Trainer to assemble batches of tokenized sentences.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Token-classification model with one output label per POS tag in tag2id.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    label2id=tag2id,
    id2label=id2tag,
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-large-bne and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    # Output location.  NOTE: model_name contains a '/', so this path is a
    # nested directory rather than a single folder.
    output_dir=f"./results_{model_name}",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batching: 8 sentences per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    # Evaluate, log and save a checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
)

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred, id2tag):
    """Compute token-level tagging metrics for one evaluation pass.

    Positions labelled -100 are excluded.  The predicted class of every
    remaining position is the argmax over the last axis of the predictions.

    Args:
        pred: object with ``label_ids`` (2-D integer array of gold label ids)
            and ``predictions`` (3-D array of scores, classes on axis 2).
        id2tag: dict mapping label id -> tag string.

    Returns:
        dict with "accuracy" and the support-weighted "precision", "recall"
        and "f1" (unchanged keys), plus "macro_precision", "macro_recall"
        and "macro_f1".  The macro values are unweighted per-tag means, the
        same averaging this notebook applies to the HMM and TNT taggers, so
        those are the figures to compare across models.
    """
    labels = np.asarray(pred.label_ids)
    preds = np.argmax(pred.predictions, axis=2)

    # One boolean mask selects the scored positions in both arrays; boolean
    # indexing flattens in row-major order, so gold and predicted tags stay
    # aligned.
    scored = labels != -100
    true_labels = [id2tag[i] for i in labels[scored].tolist()]
    true_preds = [id2tag[i] for i in preds[scored].tolist()]

    # zero_division=0 keeps the value sklearn already uses for tags that are
    # never predicted or never occur, but without emitting a warning on
    # every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_preds, average="weighted", zero_division=0
    )
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        true_labels, true_preds, average="macro", zero_division=0
    )
    accuracy = accuracy_score(true_labels, true_preds)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
    }

# The lambda binds id2tag so that the Trainer can call the metric function
# with the prediction object as its only argument.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data_maria,
    eval_dataset=val_data_maria,
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, id2tag),
)

# Fine-tune the model; the validation split is the Trainer's eval_dataset.
trainer.train()

# Score the fine-tuned model on the held-out test split.
eval_results = trainer.evaluate(eval_dataset=test_data_maria)
print(eval_results)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-large-bne and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.3506,0.043673,0.98892,0.98863,0.98892,0.988615
1,0.0244,0.037645,0.990259,0.990303,0.990259,0.990132
2,0.0104,0.034684,0.99149,0.991551,0.99149,0.991399


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.04045610874891281, 'eval_accuracy': 0.9903781836892205, 'eval_precision': 0.9904031923019176, 'eval_recall': 0.9903781836892205, 'eval_f1': 0.990353592597733, 'eval_runtime': 4.8468, 'eval_samples_per_second': 124.413, 'eval_steps_per_second': 15.681, 'epoch': 2.9850746268656714}


{'eval_loss': 0.00651115458458662, 'eval_accuracy': 0.9988513708513709, 'eval_precision': 0.9987735856976382, 'eval_recall': 0.9988513708513709, 'eval_f1': 0.9987943435519481, 'eval_runtime': 46.7682, 'eval_samples_per_second': 116.04, 'eval_steps_per_second': 14.518, 'epoch': 2.9867452135493373}