<a href="https://colab.research.google.com/github/orlandxrf/curso-dl/blob/main/notebooks/10f_FineTuning_NamedEntityRecognition_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning para Reconocimiento de Entidades Nombradas

In [None]:
# Parameters used to download the dataset from the GitHub repository and store it locally
import os

URL = 'https://raw.githubusercontent.com/orlandxrf/spanish-ner/master/data/ensemble.mx-news.txt'  # raw dataset file on GitHub
data_folder = 'data'  # local folder that holds the downloaded dataset
filepath = os.path.join(data_folder, 'named_entities_mx.txt')  # local path of the downloaded file

In [None]:
# crear carpeta para almacenar el conjunto de datos
! mkdir {data_folder}
# descargar conjunto de datos y alamcenar
! wget -nc {URL} -O {filepath}

mkdir: cannot create directory ‘data’: File exists
File ‘data/named_entities_mx.txt’ already there; not retrieving.


In [None]:
! ls -lh data/*

-rw-r--r-- 1 root root 3.9M Apr 21 12:45 data/named_entities_mx.txt

data/dataset_entities:
total 6.2M
-rw-r--r-- 1 root root  24K Apr 21 12:53 cache-01861b5339be91a2.arrow
-rw-r--r-- 1 root root 853K Apr 21 12:53 cache-05e2262bc418a9dc.arrow
-rw-r--r-- 1 root root  20K Apr 21 12:53 cache-93c42c61d7b1828e.arrow
-rw-r--r-- 1 root root 3.4M Apr 21 12:53 cache-db95e8f095de578c.arrow
-rw-r--r-- 1 root root 5.1K Apr 21 12:53 cache-e6a7b9f07ab256a8.arrow
-rw-r--r-- 1 root root 1.9M Apr 21 12:53 dataset.arrow
-rw-r--r-- 1 root root  771 Apr 21 12:53 dataset_info.json
-rw-r--r-- 1 root root  253 Apr 21 12:53 state.json


In [None]:
!pip install transformers
!pip install datasets



In [None]:
def changeLabelling(triple):
  """Convert the IOBES tag stored in ``triple[2]`` to IOB, in place.

  ``S-<type>`` becomes ``B-<type>`` and ``E-<type>`` becomes ``I-<type>``.
  Any other tag (``B-``, ``I-``, ``O``, or a tag without a ``-`` separator)
  is left untouched.

  Args:
    triple: mutable sequence whose third element is the tag string.

  Returns:
    The same ``triple`` object, with ``triple[2]`` possibly rewritten.
  """
  # Split on the FIRST hyphen only, so entity types that themselves contain
  # a hyphen (e.g. "E-GPE-LOC") keep their full name.
  prefix, separator, entity_type = triple[2].partition('-')
  iob_prefix = {'S': 'B', 'E': 'I'}.get(prefix)
  if separator and iob_prefix is not None:
    triple[2] = f"{iob_prefix}-{entity_type}"
  return triple

def loadConllData(path, header=False, encoding='utf-8'):
  """Read a tab-separated, CoNLL-style file and return its entity-bearing sentences.

  Each non-blank line is split on tabs and its first column is discarded;
  the third remaining column (index 2) is read as the tag. Tags whose prefix
  does not contain 'O' are passed through ``changeLabelling`` (IOBES -> IOB).
  A line with at most one column marks the end of a sentence. Only sentences
  containing at least one tag with ``B-`` are kept.

  Args:
    path: path of the file to read.
    header: when True, the first line of the file is skipped.
    encoding: text encoding used to open the file.
      NOTE(review): UTF-8 is assumed for the dataset — confirm.

  Returns:
    A list of sentences; each sentence is a list of rows (lists of strings)
    with the first file column removed.
  """
  data, tmp = [], []

  def has_entity(sentence):
    # a sentence is kept only if some tag marks the beginning of an entity
    return any('B-' in triple[2] for triple in sentence)

  # the context manager closes the file; no explicit close() is needed
  with open(path, 'r', encoding=encoding) as f:
    for i, row in enumerate(f):
      if header and i == 0: continue # skip the header line
      row = row.rstrip('\n').split('\t')
      if len(row) <= 1: # blank line between sentences
        if has_entity(tmp):
          data.append(tmp)
        tmp = []
      else:
        row.pop(0) # drop the first column ('Sentence #')
        if 'O' not in row[2].split('-')[0]: row = changeLabelling(row)
        tmp.append(row)

  # the file may not end with a blank line: flush the sentence still pending
  if has_entity(tmp):
    data.append(tmp)
  return data

def convertData2Dataset(data, path=''):
  """Build a ``datasets.Dataset`` with 'tokens' and 'ner_tags' columns from parsed sentences.

  Args:
    data: list of sentences; each sentence is a list of rows where index 0
      is the token and index 2 is the tag.
    path: folder where the dataset is saved with ``save_to_disk``; nothing
      is written when it is the empty string.

  Returns:
    The created ``Dataset`` (previously nothing was returned, which made the
    function useless when ``path`` was empty).
  """
  from datasets import Dataset
  import pandas as pd

  tokens = [[triple[0] for triple in sentence] for sentence in data]
  labels = [[triple[2] for triple in sentence] for sentence in data]

  df = pd.DataFrame({'tokens':tokens, 'ner_tags':labels})
  result = Dataset.from_pandas(df)
  if path != '': result.save_to_disk(path)
  return result

In [None]:
# Reuse the locations configured in the download cell instead of repeating the literals,
# so that changing `data_folder` / `filepath` there is enough.
path = filepath
dataset_folder_path = os.path.join(data_folder, 'dataset_entities')

In [None]:
from datasets import load_from_disk

# Seed shared by the shuffle and by the train/test split so that both are reproducible
SPLIT_SEED = 42

# Parse the raw file into sentences (the first line of the file is a header)
sentences = loadConllData(path, header=True)

print (f"{len(sentences):,} oraciones en el conjunto de datos")

# Persist the sentences as a dataset on disk, then reload it from there
convertData2Dataset(sentences, dataset_folder_path)

full_dataset = load_from_disk(dataset_folder_path)

full_dataset.cleanup_cache_files()

full_dataset = full_dataset.shuffle(seed=SPLIT_SEED)
# Without an explicit seed the split differs on every run, even after a seeded shuffle
splits = full_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

train_dataset = splits['train']
test_dataset = splits['test']
# Only the two splits are needed from here on
del sentences, full_dataset, splits

2,966 oraciones en el conjunto de datos


In [None]:
# Summary of each split, framed by blank lines
print("\nTrain:", train_dataset, "", sep="\n")

print("\nTest:", test_dataset, "", sep="\n")

# First training example: its tokens and the matching tags
print(train_dataset['tokens'][0])
print(train_dataset['ner_tags'][0])


Train:
Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 2372
})


Test:
Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 594
})

['Sin', 'embargo', ',', 'será', 'hasta', 'que', 'este', 'documento', 'esté', 'en', 'manos', 'del', 'presidente', 'Trump', ',', 'cuando', 'se', 'dé', 'a', 'conocer', 'al', 'pueblo', 'mexicano', 'siguiendo', 'la', 'política', 'de', 'transparencia', ',', 'detalló', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TIT', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DEM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


## Codificar Train y Test

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch, os
import numpy as np

In [None]:
# Pick the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f'Hay {torch.cuda.device_count()} GPU(s) disponibles para el entrenamiento')
    print(f'Se usará la GPU: {torch.cuda.get_device_name(0)}')
else:
    print('No hay GPU disponible, así que se usará CPU')

Hay 1 GPU(s) disponibles para el entrenamiento
Se usará la GPU: Tesla T4


Modelo pre-entrenado para hacer el Fine-Tuning

In [None]:
# Hub identifier of the pretrained checkpoint that is fine-tuned in this notebook
checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"
# Tokenizer of that checkpoint; model_max_length=512 sets the maximum sequence length it reports
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)

Función para alinear y tokenizar.<br>
Crear 'input_ids', 'token_type_ids', 'attention_mask', 'labels'

In [None]:
def tokenize_and_align(samples):
    """Tokenize a batch of pre-split sentences and align the tags with the sub-tokens.

    Reads ``samples["tokens"]`` and ``samples["ner_tags"]``. Every sub-token
    receives the encoded tag of the word it comes from (looked up in the
    global ``label_encoding_dict``); positions with no source word get -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per sentence.
    """
    encoded = tokenizer(list(samples["tokens"]), truncation=True, is_split_into_words=True)
    aligned_labels = []
    for batch_pos, sentence_tags in enumerate(samples["ner_tags"]):
        aligned_labels.append([
            -100 if word_pos is None else label_encoding_dict[sentence_tags[word_pos]]
            for word_pos in encoded.word_ids(batch_index=batch_pos)
        ])
    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Folder where the fine-tuned model is saved at the end of the notebook
my_model_name = "data/ner_model"
print (f"Guardar Modelo en:\t{my_model_name}")

Guardar Modelo en:	data/ner_model


In [None]:
# Collect the tags of BOTH splits: a tag that only occurs in the test split would
# otherwise be missing from the mapping and raise KeyError in tokenize_and_align.
unique_tags = {
    tag
    for split in (train_dataset, test_dataset)
    for sentence in split['ner_tags']
    for tag in sentence
}
# Reverse alphabetical order, as before, so the ids are deterministic
label_list = sorted(unique_tags, reverse=True)
label_encoding_dict = {tag: i for i, tag in enumerate(label_list)}

# Add 'input_ids', 'token_type_ids', 'attention_mask' and 'labels' to each split
train_tokenized_dataset = train_dataset.map(tokenize_and_align, batched=True)
test_tokenized_dataset = test_dataset.map(tokenize_and_align, batched=True)


print (train_tokenized_dataset)

print (test_tokenized_dataset)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 2372
})
Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 594
})


## Modelo pre-entrenado


In [None]:
# Load the pretrained checkpoint with a classification head sized for this tag set.
# ignore_mismatched_sizes=True is required because the checkpoint's head has a
# different number of labels, so the head is re-initialized.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels = len(label_list),
    ignore_mismatched_sizes = True
)
# Store the real tag names in the config so the saved model reports them
# instead of generic label names.
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = dict(label_encoding_dict)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-ner-hrl and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([35, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([35]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Move the model to the device selected earlier; as the last expression of the cell,
# the returned value is also displayed as the cell output
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [None]:
# Hyperparameters passed to TrainingArguments below
batch_size = 16  # per-device batch size, used for both training and evaluation
epochs = 1  # number of training epochs

In [None]:
# Training configuration; "my-modelo-ner" is the output directory of the run.
# NOTE(review): `evaluation_strategy` and `no_cuda` are the argument names used by the
# transformers release this notebook was written for — confirm against the installed version.
args = TrainingArguments(
    "my-modelo-ner",
    evaluation_strategy = "epoch",  # evaluate at the end of every epoch
    learning_rate = 1e-4,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = epochs,
    weight_decay = 1e-5,
    push_to_hub = False,
    # Was hard-coded to True, which forces CPU training even though the notebook
    # detects and selects a GPU; only disable CUDA when no GPU is available.
    no_cuda = not torch.cuda.is_available(),
)

In [None]:
# Collator that assembles the tokenized examples into batches for token classification
# (presumably pads inputs and labels per batch, since tokenize_and_align does not pad — confirm)
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# biblioteca para la evaluación
!pip install seqeval



In [None]:
from datasets import load_metric

# seqeval metric object used by compute_metrics for the overall precision / recall / F1 / accuracy
# NOTE(review): `load_metric` may be deprecated or removed in newer `datasets` releases — confirm against the installed version
metric = load_metric("seqeval")

In [None]:
def compute_metrics(p):
    """Print per-class reports and return the overall seqeval scores for one evaluation.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds the per-label
            scores (argmax is taken over axis 2); ``labels`` holds the gold
            label ids, with -100 marking positions to ignore.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the global seqeval ``metric``.

    Side effects:
        Prints a token-level report (scikit-learn) and an entity-level report
        (seqeval), and silences warnings for the whole process.
    """
    from sklearn.metrics import classification_report as cr_all
    from seqeval.metrics import classification_report as cr_single
    import warnings

    # NOTE: this filter is process-wide, not limited to this function
    warnings.filterwarnings('ignore')

    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop the ignored positions (-100) and map the ids back to tag names
    true_predictions = [
        [label_list[pred] for (pred, gold) in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
    ]
    true_labels = [
        [label_list[gold] for (pred, gold) in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
    ]

    # ----------------------------------------------------------------------------------------------
    # per-class evaluation reports
    print (f"\n\n\t{'-'*100}")
    # every tag except 'O' (no error if 'O' is absent from label_list)
    n_labels = [tag for tag in label_list if tag != 'O']
    # Both reports take the gold labels FIRST and the predictions second; the two
    # were previously passed swapped, which exchanged precision and recall.
    y_true = [tag for sentence in true_labels for tag in sentence]
    y_pred = [tag for sentence in true_predictions for tag in sentence]
    metric1 = cr_all(y_true, y_pred, labels=n_labels)
    print (metric1)
    print (f"\t{'-'*100}")
    metric2 = cr_single(true_labels, true_predictions)
    print (metric2)
    # ----------------------------------------------------------------------------------------------

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

In [None]:
# Trainer that ties together the model, the training arguments, the tokenized train split,
# the tokenized test split (used as evaluation set), the batch collator and the metric function
trainer = Trainer(
	model,
	args,
	train_dataset = train_tokenized_dataset,
	eval_dataset = test_tokenized_dataset,
	data_collator = data_collator,
	tokenizer = tokenizer,
	compute_metrics = compute_metrics
)

In [None]:
# Run the fine-tuning with the configuration in `args` (evaluation is set to run every epoch)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2372
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 149


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate the model with the trainer; presumably uses the eval_dataset and compute_metrics given to Trainer — confirm
trainer.evaluate()

In [None]:
# Save the model to the folder stored in `my_model_name`
trainer.save_model(my_model_name)