In [1]:
# Disable jedi autocompleter
%config Completer.use_jedi = False

In [2]:
%%writefile requirements_custom.txt
transformers==4.6.1
torch==1.8.1
torchvision==0.9.1
pandas==1.1.5
sklearn==0.0
matplotlib==3.4.2
ipywidgets==7.6.3
datasets==1.6.2
seqeval==1.2.2

Overwriting requirements_custom.txt


In [3]:
!pip install -r requirements_custom.txt



In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
from pathlib import Path
from sklearn.model_selection import train_test_split

# NER BETO

A modo de ejemplo, usaremos la implementación mostrada en [Token Classification](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb), ajustada al caso de BETO [publicación asociada](https://users.dcc.uchile.cl/~jperez/papers/pml4dc2020.pdf). Otro ejemplo interesante a revisar es [TinyBERT](https://huggingface.co/mrm8488/TinyBERT-spanish-uncased-finetuned-ner)

In [5]:
import torch
# from transformers import BertForMaskedLM, BertTokenizer
# from transformers import BertForTokenClassification
from transformers import DistilBertTokenizerFast

In [6]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "distilbert-base-cased"
batch_size = 16

Definimos el dispositivo sobre el cual se hará entrenamiento, en caso de tener disponible una GPU

In [7]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [8]:
model = DistilBertTokenizerFast.from_pretrained(model_checkpoint)

## Carga de dataset

In [9]:
import os

In [10]:
root_folder = os.getcwd().split('ner_tests')[0]
project_folder = '{}ner_tests'.format(root_folder)
# project_folder
data_path = '{}/data/raw/archive/ner_dataset.csv'.format(project_folder)
data_path

'/Users/iacastro/PycharmProjects/ner_tests/data/raw/archive/ner_dataset.csv'

In [11]:
df_ner = pd.read_csv(data_path,delimiter=',',encoding='latin1',header=0)

In [12]:
df_ner['Sentence #'] = df_ner['Sentence #'].ffill()

In [13]:
df_ner.head(50)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [14]:
unique_texts = df_ner['Sentence #'].unique()

In [15]:
texts = df_ner.groupby('Sentence #')['Word'].apply(list).to_list()#.groupby(level=0).apply(list)
tags = df_ner.groupby('Sentence #')['Tag'].apply(list).to_list()#.groupby(level=0).apply(list)

In [16]:
print(texts[0][10:17], tags[0][10:17], sep='\n')

['war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal']
['O', 'O', 'B-geo', 'O', 'O', 'O', 'O']


In [17]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

Generaremos encodings

In [18]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

Se toman pads para tokenizar

In [19]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

Se codifican los tags

In [20]:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    count = 0
    error_encodings = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # print(count)
        # set labels whose first offset position is 0 and the second is not 0
        # print(arr_offset[:,0])
        try:
            doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
            encoded_labels.append(doc_enc_labels.tolist())
        except:
            error_encodings.append(count)
        count += 1

    return encoded_labels,error_encodings

train_labels, train_errors = encode_tags(train_tags, train_encodings)
val_labels, val_errors = encode_tags(val_tags, val_encodings)

In [21]:
train_errors

[340,
 4077,
 12605,
 14686,
 15798,
 16941,
 22161,
 22350,
 23028,
 24107,
 25383,
 28715,
 28828,
 31176,
 31703,
 33963,
 34123,
 35047]

In [22]:
val_errors

[699, 4864, 5388]

In [23]:
import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = NERDataset(train_encodings, train_labels)
val_dataset = NERDataset(val_encodings, val_labels)

## Ajuste fino de modelo

Con los datos listos, descargamos el modelo pre-entrenado y lo ajustamos. Dado que todas las tareas a realizar son para clasificación de tokens, usamos la clase ```AutoModelForTokenClassification```. Como con el tokenizado, usamos la función ```from_pretrained``` para descargar y cachear el modelo. Lo único que debemos especificar para el problema es la cantidad de etiquetas

In [31]:
batch_size = 128

In [32]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint)
model = DistilBertForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(unique_tags))

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

In [None]:
trainer.train()

Step,Training Loss


In [None]:
trainer.evaluate()

La advertencia dice que perdemos algunos pesos asociados a las capas de vocabulario y otras inicializadas aleatoriamente (capas del clasificador). Lo anterior es esperado, ya que estamos removiento las capas finales para ajustarlas con los datos que tenemos

Para instanciar un ```Trainer```, necesitamos definir 3 cosas más. La más importante son los ```TrainingArgumnts```, la cual es una clase que contiene todos los atributos para personalizar el entrenamiento. Ésta requiere un nombre de carpeta, el cual se usará para guardar los checkpoints del modelo y los otros argumentos son opcionales.

Aquí vemos que la evaluación se hace en cada epoch, se manipula la tasa de aprendizaje, se usa el tamaño del batch y se define el número de epochs para el entrenamiento, así como el decaimiento de los pesos.

Luego necesitaremos un ```data_collator``` para que tomen los ejemplos por lotes mientras le aplica padding del mismo tamaño (el padding será del largo del tamaño del más largo ejemplo). Existe un collator para esta tarea en la librería de Transformers, que no sólo aplica el padding sobre las entradas, pero también en las etiquetas:

Finalmente definimos la métrica sobre la cual se evaluará el entrenamiento, en este caso representada por el framework ```seqeval``` [documentación asociada](https://github.com/chakki-works/seqeval)

In [104]:
from datasets import load_metric

metric = load_metric("seqeval")

La métrica toma una lista de etiquetas para las predicciones y referencias

In [116]:
# labels = [label_list[i] for i in example[f"{task}_tags"]]
labels = [tag_list for tag_list in tags]
metric.compute(predictions=[labels], references=[labels])

ArrowTypeError: Expected bytes, got a 'list' object

Necesitaremos post procesar las predicciones

* Seleccionar el índice predicho (salida con máximo logit) para cada token
* Convertir el token a su etiqueta en string asociada
* Ignorar todo lo que tenga la etiqueta -100

La siguiente función realiza todo este post procesamiento sobre el resultado ```Trainer.evaluate``` antes de aplicar la métrica

In [36]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Notar que se botan las métricas por etiqueta (la agregaremos luego) y se focalizan sobre los globales. Con lo anterior está listo para entregarlo al ```Trainer```

In [37]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [38]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2106,0.14608,0.796673,0.774286,0.78532,0.957432
2,0.092,0.137706,0.81202,0.83079,0.821298,0.962588
3,0.0601,0.143024,0.822331,0.835625,0.828925,0.964022


TrainOutput(global_step=1563, training_loss=0.11845848412370347, metrics={'train_runtime': 28845.8315, 'train_samples_per_second': 0.054, 'total_flos': 51706117436472.0, 'epoch': 3.0})

In [40]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.8322932917316692,
  'recall': 0.8508771929824561,
  'f1': 0.8414826498422713,
  'number': 1254},
 'MISC': {'precision': 0.5792682926829268,
  'recall': 0.521978021978022,
  'f1': 0.5491329479768786,
  'number': 728},
 'ORG': {'precision': 0.8139450867052023,
  'recall': 0.8553530751708428,
  'f1': 0.8341355053683821,
  'number': 2634},
 'PER': {'precision': 0.9064356435643565,
  'recall': 0.9141288067898152,
  'f1': 0.9102659706686551,
  'number': 2003},
 'overall_precision': 0.8223312518584597,
 'overall_recall': 0.8356247167245807,
 'overall_f1': 0.8289246908954664,
 'overall_accuracy': 0.9640221699481197}