In [3]:
# Install required packages
!pip install pytorch-crf datasets spacy fasttext seqeval ipdb

# Import the os module to modify environment variables
import os

# Set the CUDA_LAUNCH_BLOCKING environment variable to "1"
# This variable is specific to the CUDA library used for GPU acceleration
# Setting it to "1" enables synchronization mode, causing the program to wait for GPU kernel completion before proceeding
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipdb
  Downloading ipdb-0.13.13-py3-none-any.whl (12 kB)
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-

In [4]:
# Import the load_dataset function from the datasets module
from datasets import load_dataset

# Load the "conll2002" dataset with the language set to Spanish ('es')
dataset = load_dataset("conll2002", 'es')

# Print the number of examples in the training split of the dataset
print("Number of examples in the training split:", len(dataset['train']))

# Print the number of examples in the validation split of the dataset
print("Number of examples in the validation split:", len(dataset['validation']))

# Print the number of examples in the test split of the dataset
print("Number of examples in the test split:", len(dataset['test']))


Downloading builder script:   0%|          | 0.00/9.23k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading and preparing dataset conll2002/es to /root/.cache/huggingface/datasets/conll2002/es/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/713k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/138k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8324 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1916 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1518 [00:00<?, ? examples/s]

Dataset conll2002 downloaded and prepared to /root/.cache/huggingface/datasets/conll2002/es/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Number of examples in the training split: 8324
Number of examples in the validation split: 1916
Number of examples in the test split: 1518


In [5]:
# Get the sentences from the dataset
sentences = dataset["train"]["tokens"] + dataset["validation"]["tokens"] + dataset["test"]["tokens"]

# Create a unique set of tokens
unique_tokens = set(token for sentence in sentences for token in sentence)

# Calculate the number of unique tokens
num_unique_tokens = len(unique_tokens)

print("Number of tokens in the vocabulary:", num_unique_tokens)


Cantidad de tokens en el vocabulario: 31405


In [6]:
# Example of entities in a train sample
ejemplo = dataset['train'][2]
# Detokenized sentence
' '.join(ejemplo['tokens']).replace(' ,', ',').replace(' .', '.')

'El Abogado General del Estado, Daryl Williams, subrayó hoy la necesidad de tomar medidas para proteger al sistema judicial australiano frente a una página de internet que imposibilita el cumplimiento de los principios básicos de la Ley.'

In [7]:
ner_lista = dataset["train"].features["ner_tags"].feature.names
for indice, elem in enumerate(ejemplo['ner_tags']):
  print("TOKEN: {:<15} Entity: {}".format(ejemplo['tokens'][indice], ner_lista[elem]))

TOKEN: El              Entity: O
TOKEN: Abogado         Entity: B-PER
TOKEN: General         Entity: I-PER
TOKEN: del             Entity: I-PER
TOKEN: Estado          Entity: I-PER
TOKEN: ,               Entity: O
TOKEN: Daryl           Entity: B-PER
TOKEN: Williams        Entity: I-PER
TOKEN: ,               Entity: O
TOKEN: subrayó         Entity: O
TOKEN: hoy             Entity: O
TOKEN: la              Entity: O
TOKEN: necesidad       Entity: O
TOKEN: de              Entity: O
TOKEN: tomar           Entity: O
TOKEN: medidas         Entity: O
TOKEN: para            Entity: O
TOKEN: proteger        Entity: O
TOKEN: al              Entity: O
TOKEN: sistema         Entity: O
TOKEN: judicial        Entity: O
TOKEN: australiano     Entity: O
TOKEN: frente          Entity: O
TOKEN: a               Entity: O
TOKEN: una             Entity: O
TOKEN: página          Entity: O
TOKEN: de              Entity: O
TOKEN: internet        Entity: O
TOKEN: que             Entity: O
TOKEN: imposibilita

In [8]:
# Obtain the mapping from numerical labels to named labels
id2label = dataset["train"].features["ner_tags"].feature.names

# Create a mapping from named labels to numerical labels
label2id = {v: i for i, v in enumerate(id2label)}

In [9]:
# Import required modules
from collections import Counter
from torchtext.vocab import vocab as Vocab
from collections import OrderedDict

# Initialize a counter to keep track of token frequencies
counter = Counter()

# Iterate over each dataset split (train, validation, test)
for dataset_part in ['train', 'validation', 'test']:
    # Get the tokens from the current dataset split
    textos = dataset[dataset_part]['tokens']
    # Update the counter with the tokens from the current dataset split
    for texto in textos:
        counter.update(texto)

# Define special tokens for the vocabulary
specials = ["<unk>", "<pad>", "<bos>", "<eos>"]

# Create a vocabulary object based on the token frequencies
vocab = Vocab(counter, min_freq=1, specials=["<unk>", "<pad>", "<bos>", "<eos>"])

# Get the index-to-token (itos) and token-to-index (stoi) mappings from the vocabulary
itos = vocab.get_itos()
stoi = vocab.get_stoi()

# Get the index of the special tokens in the stoi mapping
UNK_IDX = stoi["<unk>"]
PAD_IDX = stoi["<pad>"]
BOS_IDX = stoi["<bos>"]
EOS_IDX = stoi["<eos>"]

# Print the size of the vocabulary
print("Vocabulary Size:", len(vocab))

Tamaño del vocabulario: 31409


In [10]:
def tokenize_and_format(example):
    """
    Tokenizes and formats an example.
    Arguments:
    - example: An input data example in the form of a dictionary with 'tokens' and 'ner_tags' keys.
    Returns:
    - A new dictionary with 'input_ids' and 'labels' keys containing the tokens converted to IDs and NER tags respectively.
    """
    tokens = example['tokens']
    ner_tags = example['ner_tags']

    # Add BOS token at the beginning and EOS token at the end
    tokens = ['<bos>'] + tokens + ['<eos>']
    token_ids = [stoi.get(token, UNK_IDX) for token in tokens]

    # Add 0 to the left and right of the NER tags
    ner_tags = [0] + ner_tags + [0]

    return {'input_ids': token_ids, 'labels': ner_tags}

# Apply the tokenize_and_format function to the dataset
dataset = dataset.map(tokenize_and_format, batched=False)

Map:   0%|          | 0/8324 [00:00<?, ? examples/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1518 [00:00<?, ? examples/s]

In [11]:
# Download the FastText word vectors for Spanish language
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz

# Decompress the downloaded file
!gunzip cc.es.300.bin.gz

--2023-06-26 17:59:32--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.8.51, 13.35.8.29, 13.35.8.35, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.8.51|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4500107671 (4.2G) [application/octet-stream]
Saving to: ‘cc.es.300.bin.gz’


2023-06-26 18:02:52 (21.6 MB/s) - ‘cc.es.300.bin.gz’ saved [4500107671/4500107671]



In [12]:
# Import the fasttext module
import fasttext

# Load the FastText model for Spanish language
ft = fasttext.load_model('cc.es.300.bin')



In [13]:
import torch

# Calculate the dimension of the embeddings
DIM = ft["random"].shape[0]

# Create an embedding matrix of random values with dimensions (vocab_size, DIM)
emb_matrix = torch.randn(len(vocab), DIM)

# Set the embedding for the PAD_IDX to zero
emb_matrix[PAD_IDX] = 0

# Build the embedding matrix

# Load all the embeddings for our vocabulary

for i, word in enumerate(itos):
    """
    Complete the embedding matrix
    """
    if i == UNK_IDX or i == PAD_IDX or i == BOS_IDX or i == EOS_IDX:
        # Skip UNK, PAD, BOS, and EOS embeddings
        pass
    else:
        # Fill in the embedding matrix
        word_vector = ft.get_word_vector(word)
        emb_matrix[i] = torch.tensor(word_vector)

In [14]:
del ft

torch.Size([31409, 300])

In [15]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """
    Function used to collate a list of examples into a batch.
    Args:
        batch (list): List of examples, where each example is a dictionary with 'labels' and 'input_ids' keys.
    Returns:
        tuple: Tuple containing the tensors of input_ids and labels after applying padding.
    """
    # Extract the 'labels' tensors from each example in the batch
    labels = [torch.tensor(example["labels"]) for example in batch]

    # Extract the 'input_ids' tensors from each example in the batch
    input_ids = [torch.tensor(example["input_ids"]) for example in batch]

    # Pad the input_ids tensors with the PAD_IDX as the padding value
    input_ids_padded = pad_sequence(input_ids, padding_value=PAD_IDX, batch_first=True)

    # Pad the labels tensors with -100 as the padding value
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

    return input_ids_padded, labels_padded

# Create DataLoaders for the training, validation, and test data.
# The batch_size will be set to 32 or 16 depending on the case.
# collate_fn: Function to collate examples into batches, using the collate_batch function.
train_dataloader = DataLoader(dataset["train"], batch_size=32, collate_fn=collate_batch)
dev_dataloader = DataLoader(dataset["validation"], batch_size=16, collate_fn=collate_batch)
test_dataloader = DataLoader(dataset["test"], batch_size=16, collate_fn=collate_batch)

In [16]:
from tqdm.auto import tqdm
import numpy as np
import seqeval
import torch.nn.functional as F
from torch import nn
from datasets import load_metric

# Load the seqeval metric for sequence labeling evaluation
metric = load_metric("seqeval")

def validate_step(model, dataloader):
    """
    Validate step

    Calculates F1 and other metrics.
    """
    device = next(model.parameters()).device
    with torch.no_grad():
        # Disable gradient calculation
        all_labels = []
        all_preds = []
        all_losses = []
        for text, labels in tqdm(dataloader):
            text = text.to(device)
            labels = labels.to(device)
            logits = model(text)

            # Compute the cross-entropy loss
            loss = F.cross_entropy(
                logits.view(-1, 9),
                labels.view(-1),
            )
            all_losses.append(loss.detach().item())

            # Instead of softmax, directly get the maximum value
            preds = logits.argmax(-1)

            # Convert label indices to corresponding labels
            true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
            # Ignore the -100 values
            true_predictions = [
                [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(preds, labels)
            ]

            all_labels += true_labels
            all_preds += true_predictions

        # Compute evaluation metrics
        metrics = metric.compute(predictions=all_preds, references=all_labels)

        metrics["loss"] = np.array(all_losses).mean()
        metrics["micro_f1"] = seqeval.metrics.sequence_labeling.f1_score(
            all_labels, all_preds, average="micro",
        )
        metrics["macro_f1"] = seqeval.metrics.sequence_labeling.f1_score(
            all_labels, all_preds, average="macro",
        )
        return metrics

def log_metrics(writer, metrics):
    """
    Log metrics to tensorboard
    """
    for k, v in metrics.items():
        if type(v) is dict:
            # Handle metrics with sub-categories (e.g., LOC, PER)
            for sub_k, sub_v in v.items():
                if sub_k == "number":
                    continue
                writer.add_scalar(f"dev/{k} {sub_k}", sub_v, global_step=step)
        else:
            writer.add_scalar(f"dev/{k}", sub_v, global_step=step)

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [45]:
from google.protobuf.reflection import ParseMessage
import torch.nn.functional as F
from torch import nn
from datasets import load_metric

# Load the seqeval metric for sequence labeling evaluation
metric = load_metric("seqeval")

class MyNERModel(nn.Module):
    """
    Custom NER model class.
    Args:
        vocab_size (int): Vocabulary size.
        embedding_dim (int): Dimension of embeddings.
        pad_idx (int): Padding index.
        rnn_units (int): Number of units in the LSTM layer.
        num_labels (int): Number of entity labels.
        num_layers (int, optional): Number of LSTM layers. Default: 1.
        dropout (float, optional): Dropout rate. Default: 0.25.
        embedding_matrix (torch.Tensor, optional): Pre-trained embedding matrix. Default: None.
        freeze_embeddings (bool, optional): Indicator of whether to freeze embeddings during training. Default: True.
    """
    def __init__(self, vocab_size, embedding_dim, pad_idx, rnn_units, num_labels, num_layers=1,
                 dropout=0.25, embedding_matrix=None, freeze_embeddings=True):
        """
        Constructor of the MyNERModel class.
        Creates the necessary layers for the model.
        """
        super().__init__()
        # Embedding layer
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=pad_idx, freeze=freeze_embeddings)
        self.lstm = nn.LSTM(embedding_dim, rnn_units, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

        self.fc = nn.Linear(2*rnn_units, num_labels)

    def forward(self, text):
        """
        Performs a forward pass in the model.
        Args:
            text (torch.Tensor): Encoded text sequence.
        Returns:
            torch.Tensor: Logits produced by the model.
        """
        # Embedding layer
        embedded = self.embedding(text)
        # LSTM layer
        output, _ = self.lstm(embedded)
        # Dropout layer
        output = self.dropout(output)
        logits = self.fc(output)
        return logits


In [46]:
# CHECK!
# Obtain the number of classes for the NER tags
num_classes = dataset["train"].features["ner_tags"].feature.num_classes

# Create an instance of the MyNERModel
model = MyNERModel(
    vocab_size=len(vocab), embedding_dim=DIM, pad_idx=PAD_IDX, rnn_units=512, embedding_matrix=emb_matrix, num_layers=3,
    freeze_embeddings=True, num_labels=num_classes, dropout=0.25
)

# Get a batch of text and labels from the training dataloader
text, labels = next(iter(train_dataloader))

# Forward pass through the model to get predictions
preds = model(text)

# Calculate the loss using cross-entropy
loss = F.cross_entropy(
    preds.view(-1, num_classes),
    labels.view(-1),
)


In [19]:
loss

tensor(2.1664, grad_fn=<NllLossBackward0>)

In [47]:
import torch
from tqdm.auto import tqdm
from pprint import pprint as pp
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter

# Initialize a SummaryWriter for logging training progress
writer = SummaryWriter()

# Determine the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create an instance of the MyNERModel
model = MyNERModel(
    vocab_size=len(vocab), embedding_dim=DIM, pad_idx=PAD_IDX, rnn_units=512, embedding_matrix=emb_matrix,
    num_layers=3, freeze_embeddings=True, num_labels=num_classes,
)

# Set the number of epochs and initialize the step counter
num_epochs = 5
step = 0

# Set the learning rate for the optimizer
lr = 1e-3

# Define the optimizer (Adam) for updating model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Move the model to the appropriate device
model = model.to(device)

# Training loop
for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        step += 1

        # Get the text and labels from the batch and move them to the device
        text, labels = batch
        text = text.to(device)
        labels = labels.to(device)

        # Reset the gradients
        optimizer.zero_grad()

        # Forward pass through the model to get logits
        logits = model(text)

        # Calculate the loss using cross-entropy
        loss = F.cross_entropy(logits.view(-1, num_classes), labels.view(-1))

        # Backpropagation to calculate gradients
        loss.backward()

        # Update the model parameters
        optimizer.step()

        # Calculate the gradient norm
        total_norm = sum(param.grad.detach().norm(2) ** 2 for param in model.parameters() if param.requires_grad) ** (0.5)

        # Log the loss and gradient norm to TensorBoard
        writer.add_scalar("train/loss", loss, global_step=step)
        writer.add_scalar("train/gradient_norm", total_norm, global_step=step)

    # Perform validation on the development set
    metrics = validate_step(model, dev_dataloader)
    pp(metrics)

    # Log the metrics to TensorBoard
    log_metrics(writer, metrics)


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.29314420803782504,
         'number': 985,
         'precision': 0.2743362831858407,
         'recall': 0.3147208121827411},
 'MISC': {'f1': 0.003669724770642202,
          'number': 445,
          'precision': 0.01,
          'recall': 0.0022471910112359553},
 'ORG': {'f1': 0.5199029126213592,
         'number': 1700,
         'precision': 0.4425619834710744,
         'recall': 0.63},
 'PER': {'f1': 0.5874200426439233,
         'number': 1222,
         'precision': 0.8425076452599388,
         'recall': 0.4509001636661211},
 'loss': 0.22357289713496964,
 'macro_f1': 0.3510342220184375,
 'micro_f1': 0.446626617375231,
 'overall_accuracy': 0.9266320148004581,
 'overall_f1': 0.446626617375231,
 'overall_precision': 0.44911710037174724,
 'overall_recall': 0.44416360294117646}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.5512416928996152,
         'number': 985,
         'precision': 0.4204909284951974,
         'recall': 0.8},
 'MISC': {'f1': 0.31662870159453305,
          'number': 445,
          'precision': 0.3210161662817552,
          'recall': 0.31235955056179776},
 'ORG': {'f1': 0.6501114294810569,
         'number': 1700,
         'precision': 0.7085357390700903,
         'recall': 0.6005882352941176},
 'PER': {'f1': 0.619023653749371,
         'number': 1222,
         'precision': 0.803921568627451,
         'recall': 0.5032733224222586},
 'loss': 0.16518078629548352,
 'macro_f1': 0.534251369431144,
 'micro_f1': 0.5782289904117315,
 'overall_accuracy': 0.946612633248172,
 'overall_f1': 0.5782289904117315,
 'overall_precision': 0.567914912475072,
 'overall_recall': 0.5889246323529411}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.6968204209583521,
         'number': 985,
         'precision': 0.6233974358974359,
         'recall': 0.7898477157360406},
 'MISC': {'f1': 0.396732788798133,
          'number': 445,
          'precision': 0.41262135922330095,
          'recall': 0.38202247191011235},
 'ORG': {'f1': 0.7326073805202662,
         'number': 1700,
         'precision': 0.7540473225404732,
         'recall': 0.7123529411764706},
 'PER': {'f1': 0.843167701863354,
         'number': 1222,
         'precision': 0.8020679468242246,
         'recall': 0.8887070376432079},
 'loss': 0.12064227676019072,
 'macro_f1': 0.6673320730350263,
 'micro_f1': 0.7233615693267945,
 'overall_accuracy': 0.9628050392035944,
 'overall_f1': 0.7233615693267945,
 'overall_precision': 0.7023809523809523,
 'overall_recall': 0.7456341911764706}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7224901359053046,
         'number': 985,
         'precision': 0.6358024691358025,
         'recall': 0.8365482233502538},
 'MISC': {'f1': 0.4251101321585903,
          'number': 445,
          'precision': 0.4168466522678186,
          'recall': 0.4337078651685393},
 'ORG': {'f1': 0.756822316555488,
         'number': 1700,
         'precision': 0.7809762202753442,
         'recall': 0.7341176470588235},
 'PER': {'f1': 0.8731707317073171,
         'number': 1222,
         'precision': 0.8675282714054927,
         'recall': 0.8788870703764321},
 'loss': 0.11161816575719664,
 'macro_f1': 0.694398329081675,
 'micro_f1': 0.7463954398122277,
 'overall_accuracy': 0.9660470443132764,
 'overall_f1': 0.7463954398122277,
 'overall_precision': 0.7266594124047878,
 'overall_recall': 0.7672334558823529}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7487131492746841,
         'number': 985,
         'precision': 0.6944444444444444,
         'recall': 0.8121827411167513},
 'MISC': {'f1': 0.4332925336597307,
          'number': 445,
          'precision': 0.47580645161290325,
          'recall': 0.39775280898876403},
 'ORG': {'f1': 0.775946805435097,
         'number': 1700,
         'precision': 0.762933484934622,
         'recall': 0.7894117647058824},
 'PER': {'f1': 0.8812749003984065,
         'number': 1222,
         'precision': 0.8586956521739131,
         'recall': 0.9050736497545008},
 'loss': 0.10976623414705197,
 'macro_f1': 0.7098068471919795,
 'micro_f1': 0.7676790317157907,
 'overall_accuracy': 0.9692890494229583,
 'overall_f1': 0.7676790317157907,
 'overall_precision': 0.7492889958433603,
 'overall_recall': 0.7869944852941176}


In [48]:
validate_step(model, test_dataloader)

  0%|          | 0/95 [00:00<?, ?it/s]

{'LOC': {'precision': 0.7517361111111112,
  'recall': 0.7988929889298892,
  'f1': 0.774597495527728,
  'number': 1084},
 'MISC': {'precision': 0.4740484429065744,
  'recall': 0.40294117647058825,
  'f1': 0.4356120826709062,
  'number': 340},
 'ORG': {'precision': 0.7529335071707953,
  'recall': 0.825,
  'f1': 0.787321063394683,
  'number': 1400},
 'PER': {'precision': 0.9036939313984169,
  'recall': 0.9319727891156463,
  'f1': 0.9176155391828533,
  'number': 735},
 'overall_precision': 0.7615858558799893,
 'overall_recall': 0.7988198932284349,
 'overall_f1': 0.7797586396050465,
 'overall_accuracy': 0.976195275705987,
 'loss': 0.08199278261071365,
 'micro_f1': 0.7797586396050465,
 'macro_f1': 0.7287865451940426}

In [50]:
from spacy.lang.es import Spanish
import spacy
nlp = Spanish()
tokenizer = nlp.tokenizer

def simplify_entities(entities):
    """
    Simplifies the identified entities by combining consecutive elements of a phrase.

    Parameters:
        entities (list): A list of dictionaries representing the identified entities.
                         Each dictionary should contain two keys: 'entidad' (entity type) and 'texto' (entity text).
    Returns:
        list: A list of dictionaries representing the simplified entities.
              Each dictionary contains two keys: 'entidad' (entity type) and 'texto' (simplified entity text).
    Example:
        entities = [{'entidad': 'B-PER', 'texto': 'Juan'},
                    {'entidad': 'I-PER', 'texto': 'Manuel'},
                    {'entidad': 'I-PER', 'texto': 'Pérez'},
                    {'entidad': 'B-ORG', 'texto': 'Universidad'},
                    {'entidad': 'I-ORG', 'texto': 'de'},
                    {'entidad': 'I-ORG', 'texto': 'San'},
                    {'entidad': 'I-ORG', 'texto': 'Andrés'}]

        simplified_entities = simplify_entities(entities)
        print(simplified_entities)
        Output: [{'entidad': 'PER', 'texto': 'Juan Manuel Pérez'},
                 {'entidad': 'ORG', 'texto': 'Universidad de San Andrés'}]
    """
    simplified_entities = []
    current_entity = None
    current_text = ""

    for entity in entities:
        if entity["entidad"].startswith("B-"):
            if current_entity is not None:
                simplified_entities.append({"entidad": current_entity[2:], "texto": current_text})
            current_entity = entity["entidad"]
            current_text = str(entity["texto"])
        elif entity["entidad"].startswith("I-"):
            if current_entity is not None:
                current_text += " " + str(entity["texto"])

    if current_entity is not None:
        simplified_entities.append({"entidad": current_entity[2:], "texto": current_text})

    return simplified_entities

def identificar_entidades(model, tokenizer, text):
    """
    Identifies named entities in the given text using a trained model.

    Parameters:
        model (torch.nn.Module): The trained model for named entity recognition.
        tokenizer: The tokenizer object used to tokenize the input text.
        text (str): The input text from which entities are to be identified.

    Returns:
        list: A list of dictionaries representing the identified entities.
              Each dictionary contains two keys: 'entidad' (entity type) and 'texto' (entity text).
    """
    # Tokenize text and obtain tokens and token IDs
    tokens = tokenizer(text)
    token_ids = [stoi.get(token.text, stoi["<unk>"]) for token in tokens]
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(next(model.parameters()).device)
    output = model(input_ids)
    predicted_labels = output.argmax(dim=-1)[0]
    labels = [id2label[label_id.item()] for label_id in predicted_labels]
    entities = [{"entidad": label, "texto": token} for token, label in zip(tokens,labels) if label != 'O']
    return simplify_entities(entities)

In [51]:
text = 'Las denominaciones adoptadas sucesivamente desde 1810 hasta el presente, a saber: Provincias Unidas del Río de la \
Plata, República Argentina, Confederación Argentina, serán en adelante nombres oficiales indistintamente para la \
designación del Gobierno y territorio de las provincias, empleándose las palabras Nación Argentina en la formación \
y sanción de las leyes.'
identificar_entidades(model, tokenizer, text)

[{'entidad': 'ORG', 'texto': 'sucesivamente'},
 {'entidad': 'ORG', 'texto': '1810 Unidas'},
 {'entidad': 'LOC', 'texto': 'Río de la Plata'},
 {'entidad': 'LOC', 'texto': 'República Argentina'},
 {'entidad': 'ORG', 'texto': 'Confederación Argentina'},
 {'entidad': 'ORG', 'texto': 'Gobierno'},
 {'entidad': 'ORG', 'texto': 'empleándose'},
 {'entidad': 'ORG', 'texto': 'Nación Argentina'}]

In [52]:
identificar_entidades(model, tokenizer, "Tim Cook es el presidente de Apple")

[{'entidad': 'ORG', 'texto': 'Cook'}, {'entidad': 'ORG', 'texto': 'Apple'}]

In [53]:
identificar_entidades(model, tokenizer, "Juan Manuel Pérez es el profesor de NLP de la Universidad de San Andrés")

[{'entidad': 'PER', 'texto': 'Juan Manuel Pérez'},
 {'entidad': 'ORG', 'texto': 'NLP'},
 {'entidad': 'ORG', 'texto': 'Universidad de San Andrés'}]

In [54]:
text = "El sitio Aires de los Lagos es un lugar mágico, fui con Juan Pérez"
identificar_entidades(model, tokenizer, text)

[{'entidad': 'LOC', 'texto': 'Aires de los Lagos'},
 {'entidad': 'ORG', 'texto': 'mágico'},
 {'entidad': 'PER', 'texto': 'Juan Pérez'}]

In [60]:
text = "Lionel es un jugador de fútbol profesional de la liga MLS."
identificar_entidades(model, tokenizer, text)

[{'entidad': 'PER', 'texto': 'Lionel'}, {'entidad': 'ORG', 'texto': 'MLS'}]