In [1]:
# Install required packages
!pip install pytorch-crf datasets spacy fasttext seqeval ipdb

# Import the os module to modify environment variables
import os

# Set the CUDA_LAUNCH_BLOCKING environment variable to "1"
# This variable is specific to the CUDA library used for GPU acceleration
# Setting it to "1" enables synchronization mode, causing the program to wait for GPU kernel completion before proceeding
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"




In [2]:
# Import the load_dataset function from the datasets module
from datasets import load_dataset

# Load the "conll2002" dataset with the language set to Spanish ('es')
dataset = load_dataset("conll2002", 'es')

# Print the number of examples in the training split of the dataset
print("Number of examples in the training split:", len(dataset['train']))

# Print the number of examples in the validation split of the dataset
print("Number of examples in the validation split:", len(dataset['validation']))

# Print the number of examples in the test split of the dataset
print("Number of examples in the test split:", len(dataset['test']))




  0%|          | 0/3 [00:00<?, ?it/s]

Number of examples in the training split: 8324
Number of examples in the validation split: 1916
Number of examples in the test split: 1518


In [3]:
# Get the sentences from the dataset
sentences = dataset["train"]["tokens"] + dataset["validation"]["tokens"] + dataset["test"]["tokens"]

# Create a unique set of tokens
unique_tokens = set(token for sentence in sentences for token in sentence)

# Calculate the number of unique tokens
num_unique_tokens = len(unique_tokens)

print("Number of tokens in the vocabulary:", num_unique_tokens)


Number of tokens in the vocabulary: 31405


In [4]:
# Example of entities in a train sample
ejemplo = dataset['train'][2]
# Detokenized sentence
' '.join(ejemplo['tokens']).replace(' ,', ',').replace(' .', '.')

'El Abogado General del Estado, Daryl Williams, subrayó hoy la necesidad de tomar medidas para proteger al sistema judicial australiano frente a una página de internet que imposibilita el cumplimiento de los principios básicos de la Ley.'

In [5]:
ner_lista = dataset["train"].features["ner_tags"].feature.names
for indice, elem in enumerate(ejemplo['ner_tags']):
  print("TOKEN: {:<15} Entity: {}".format(ejemplo['tokens'][indice], ner_lista[elem]))

TOKEN: El              Entity: O
TOKEN: Abogado         Entity: B-PER
TOKEN: General         Entity: I-PER
TOKEN: del             Entity: I-PER
TOKEN: Estado          Entity: I-PER
TOKEN: ,               Entity: O
TOKEN: Daryl           Entity: B-PER
TOKEN: Williams        Entity: I-PER
TOKEN: ,               Entity: O
TOKEN: subrayó         Entity: O
TOKEN: hoy             Entity: O
TOKEN: la              Entity: O
TOKEN: necesidad       Entity: O
TOKEN: de              Entity: O
TOKEN: tomar           Entity: O
TOKEN: medidas         Entity: O
TOKEN: para            Entity: O
TOKEN: proteger        Entity: O
TOKEN: al              Entity: O
TOKEN: sistema         Entity: O
TOKEN: judicial        Entity: O
TOKEN: australiano     Entity: O
TOKEN: frente          Entity: O
TOKEN: a               Entity: O
TOKEN: una             Entity: O
TOKEN: página          Entity: O
TOKEN: de              Entity: O
TOKEN: internet        Entity: O
TOKEN: que             Entity: O
TOKEN: imposibilita

In [6]:
# Obtain the mapping from numerical labels to named labels
id2label = dataset["train"].features["ner_tags"].feature.names

# Create a mapping from named labels to numerical labels
label2id = {v: i for i, v in enumerate(id2label)}

In [7]:
# Import required modules
from collections import Counter
from torchtext.vocab import vocab as Vocab
from collections import OrderedDict

# Initialize a counter to keep track of token frequencies
counter = Counter()

# Iterate over each dataset split (train, validation, test)
for dataset_part in ['train', 'validation', 'test']:
    # Get the tokens from the current dataset split
    textos = dataset[dataset_part]['tokens']
    # Update the counter with the tokens from the current dataset split
    for texto in textos:
        counter.update(texto)

# Define special tokens for the vocabulary
specials = ["<unk>", "<pad>", "<bos>", "<eos>"]

# Create a vocabulary object based on the token frequencies
vocab = Vocab(counter, min_freq=1, specials=["<unk>", "<pad>", "<bos>", "<eos>"])

# Get the index-to-token (itos) and token-to-index (stoi) mappings from the vocabulary
itos = vocab.get_itos()
stoi = vocab.get_stoi()

# Get the index of the special tokens in the stoi mapping
UNK_IDX = stoi["<unk>"]
PAD_IDX = stoi["<pad>"]
BOS_IDX = stoi["<bos>"]
EOS_IDX = stoi["<eos>"]

# Print the size of the vocabulary
print("Vocabulary Size:", len(vocab))

Vocabulary Size: 31409


In [8]:
def tokenize_and_format(example):
    """
    Tokenizes and formats an example.
    Arguments:
    - example: An input data example in the form of a dictionary with 'tokens' and 'ner_tags' keys.
    Returns:
    - A new dictionary with 'input_ids' and 'labels' keys containing the tokens converted to IDs and NER tags respectively.
    """
    tokens = example['tokens']
    ner_tags = example['ner_tags']

    # Add BOS token at the beginning and EOS token at the end
    tokens = ['<bos>'] + tokens + ['<eos>']
    token_ids = [stoi.get(token, UNK_IDX) for token in tokens]

    # Add 0 to the left and right of the NER tags
    ner_tags = [0] + ner_tags + [0]

    return {'input_ids': token_ids, 'labels': ner_tags}

# Apply the tokenize_and_format function to the dataset
dataset = dataset.map(tokenize_and_format, batched=False)



In [9]:
# Download the FastText word vectors for Spanish language
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz

# Decompress the downloaded file
!gunzip cc.es.300.bin.gz

--2023-07-05 01:32:57--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 99.84.160.46, 99.84.160.108, 99.84.160.80, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|99.84.160.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4500107671 (4.2G) [application/octet-stream]
Saving to: ‘cc.es.300.bin.gz.1’


2023-07-05 01:33:29 (134 MB/s) - ‘cc.es.300.bin.gz.1’ saved [4500107671/4500107671]



In [10]:
# Import the fasttext module
import fasttext

# Load the FastText model for Spanish language
ft = fasttext.load_model('cc.es.300.bin')



In [11]:
import torch

# Calculate the dimension of the embeddings
DIM = ft["random"].shape[0]

# Create an embedding matrix of random values with dimensions (vocab_size, DIM)
emb_matrix = torch.randn(len(vocab), DIM)

# Set the embedding for the PAD_IDX to zero
emb_matrix[PAD_IDX] = 0

# Build the embedding matrix

# Load all the embeddings for our vocabulary

for i, word in enumerate(itos):
    """
    Complete the embedding matrix
    """
    if i == UNK_IDX or i == PAD_IDX or i == BOS_IDX or i == EOS_IDX:
        # Skip UNK, PAD, BOS, and EOS embeddings
        pass
    else:
        # Fill in the embedding matrix
        word_vector = ft.get_word_vector(word)
        emb_matrix[i] = torch.tensor(word_vector)

In [12]:
del ft

In [13]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """
    Function used to collate a list of examples into a batch.
    Args:
        batch (list): List of examples, where each example is a dictionary with 'labels' and 'input_ids' keys.
    Returns:
        tuple: Tuple containing the tensors of input_ids and labels after applying padding.
    """
    # Extract the 'labels' tensors from each example in the batch
    labels = [torch.tensor(example["labels"]) for example in batch]

    # Extract the 'input_ids' tensors from each example in the batch
    input_ids = [torch.tensor(example["input_ids"]) for example in batch]

    # Pad the input_ids tensors with the PAD_IDX as the padding value
    input_ids_padded = pad_sequence(input_ids, padding_value=PAD_IDX, batch_first=True)

    # Pad the labels tensors with -100 as the padding value
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-1)#-100)

    return input_ids_padded, labels_padded

# Create DataLoaders for the training, validation, and test data.
# The batch_size will be set to 32 or 16 depending on the case.
# collate_fn: Function to collate examples into batches, using the collate_batch function.
train_dataloader = DataLoader(dataset["train"], batch_size=32, collate_fn=collate_batch)
dev_dataloader = DataLoader(dataset["validation"], batch_size=16, collate_fn=collate_batch)
test_dataloader = DataLoader(dataset["test"], batch_size=16, collate_fn=collate_batch)

In [14]:
from tqdm.auto import tqdm
import numpy as np
import seqeval
import torch.nn.functional as F
from torch import nn
from datasets import load_metric

# Load the seqeval metric for sequence labeling evaluation
metric = load_metric("seqeval")

def validate_step(model, dataloader):
    """
    Validate step

    Calculates F1 and other metrics.
    """
    device = next(model.parameters()).device
    with torch.no_grad():
        # Disable gradient calculation
        all_labels = []
        all_preds = []
        all_losses = []
        for text, labels in tqdm(dataloader):
            text = text.to(device)
            labels = labels.to(device)
            logits = model(text)

            # Compute the cross-entropy loss
            loss = -model.crf(logits[:,:-1], labels[:,:-1])
            #loss = F.cross_entropy(
            #    logits.view(-1, 9),
            #    labels.view(-1),
            #)
            all_losses.append(loss.detach().item())

            # Instead of softmax, directly get the maximum value
            preds = logits.argmax(-1)

            # Convert label indices to corresponding labels
            true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
            # Ignore the -100 values
            true_predictions = [
                [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(preds, labels)
            ]

            all_labels += true_labels
            all_preds += true_predictions

        # Compute evaluation metrics
        metrics = metric.compute(predictions=all_preds, references=all_labels)

        metrics["loss"] = np.array(all_losses).mean()
        metrics["micro_f1"] = seqeval.metrics.sequence_labeling.f1_score(
            all_labels, all_preds, average="micro",
        )
        metrics["macro_f1"] = seqeval.metrics.sequence_labeling.f1_score(
            all_labels, all_preds, average="macro",
        )
        return metrics

def log_metrics(writer, metrics):
    """
    Log metrics to tensorboard
    """
    for k, v in metrics.items():
        if type(v) is dict:
            # Handle metrics with sub-categories (e.g., LOC, PER)
            for sub_k, sub_v in v.items():
                if sub_k == "number":
                    continue
                writer.add_scalar(f"dev/{k} {sub_k}", sub_v, global_step=step)
        else:
            writer.add_scalar(f"dev/{k}", sub_v, global_step=step)

  metric = load_metric("seqeval")


In [15]:
from google.protobuf.reflection import ParseMessage
import torch.nn.functional as F
from torch import nn
from datasets import load_metric
from torchcrf import CRF

# Load the seqeval metric for sequence labeling evaluation
metric = load_metric("seqeval")

class MyNERModel(nn.Module):
    """
    Custom NER model class.
    Args:
        vocab_size (int): Vocabulary size.
        embedding_dim (int): Dimension of embeddings.
        pad_idx (int): Padding index.
        rnn_units (int): Number of units in the LSTM layer.
        num_labels (int): Number of entity labels.
        num_layers (int, optional): Number of LSTM layers. Default: 1.
        dropout (float, optional): Dropout rate. Default: 0.25.
        embedding_matrix (torch.Tensor, optional): Pre-trained embedding matrix. Default: None.
        freeze_embeddings (bool, optional): Indicator of whether to freeze embeddings during training. Default: True.
    """
    def __init__(self, vocab_size, embedding_dim, pad_idx, rnn_units, num_labels, num_layers=1,
                 dropout=0.25, embedding_matrix=None, freeze_embeddings=True):
        """
        Constructor of the MyNERModel class.
        Creates the necessary layers for the model.
        """
        super().__init__()
        # Embedding layer
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=pad_idx, freeze=freeze_embeddings)
        self.lstm = nn.LSTM(embedding_dim, rnn_units, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2*rnn_units, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, text):
        """
        Performs a forward pass in the model.
        Args:
            text (torch.Tensor): Encoded text sequence.
        Returns:
            torch.Tensor: Logits produced by the model.
        """
        # Embedding layer
        embedded = self.embedding(text)
        # LSTM layer
        output, _ = self.lstm(embedded)
        # Dropout layer
        output = self.dropout(output)
        logits = self.fc(output)
        return logits


In [16]:
# CHECK!
# Obtain the number of classes for the NER tags
num_classes = dataset["train"].features["ner_tags"].feature.num_classes

# Create an instance of the MyNERModel
model = MyNERModel(
    vocab_size=len(vocab), embedding_dim=DIM, pad_idx=PAD_IDX, rnn_units=512, #512,
    embedding_matrix=emb_matrix, num_layers=16,
    freeze_embeddings=True, num_labels=num_classes, dropout=0.25
)

# Get a batch of text and labels from the training dataloader
text, labels = next(iter(train_dataloader))

# Forward pass through the model to get predictions
preds = model(text)

# Calculate the loss using cross-entropy
loss = -model.crf(preds[:,:-1], labels[:,:-1])
#loss = -model.crf(
#    preds.view(-1, num_classes),
#    labels.view(-1),
#)


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


In [17]:
loss

tensor(4845.5732, grad_fn=<NegBackward0>)

In [32]:
import torch
from tqdm.auto import tqdm
from pprint import pprint as pp
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter

# Initialize a SummaryWriter for logging training progress
writer = SummaryWriter()

# Determine the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create an instance of the MyNERModel
model = MyNERModel(
    vocab_size=len(vocab), embedding_dim=DIM, pad_idx=PAD_IDX, rnn_units=512, embedding_matrix=emb_matrix,
    num_layers=3, freeze_embeddings=True, num_labels=num_classes,
)

# Set the number of epochs and initialize the step counter
num_epochs = 10
step = 0

# Set the learning rate for the optimizer
lr = 1e-3

# Define the optimizer (Adam) for updating model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Move the model to the appropriate device
model = model.to(device)

# Training loop
for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        step += 1

        # Get the text and labels from the batch and move them to the device
        text, labels = batch
        text = text.to(device)
        labels = labels.to(device)

        # Reset the gradients
        optimizer.zero_grad()

        # Forward pass through the model to get logits
        logits = model(text)

        # Calculate the loss using cross-entropy
        #loss = F.cross_entropy(logits.view(-1, num_classes), labels.view(-1))
        loss = -model.crf(logits[:,:-1], labels[:,:-1])
        # Backpropagation to calculate gradients
        loss.backward()

        # Update the model parameters
        optimizer.step()

        # Calculate the gradient norm
        total_norm = sum(param.grad.detach().norm(2) ** 2 for param in model.parameters() if param.requires_grad) ** (0.5)

        # Log the loss and gradient norm to TensorBoard
        writer.add_scalar("train/loss", loss, global_step=step)
        writer.add_scalar("train/gradient_norm", total_norm, global_step=step)

    # Perform validation on the development set
    metrics = validate_step(model, dev_dataloader)
    pp(metrics)

    # Log the metrics to TensorBoard
    log_metrics(writer, metrics)


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.18181818181818182,
         'number': 985,
         'precision': 0.4888888888888889,
         'recall': 0.1116751269035533},
 'MISC': {'f1': 0.8341474761572459,
          'number': 2235,
          'precision': 0.8687015503875969,
          'recall': 0.8022371364653244},
 'ORG': {'f1': 0.41625573881992856,
         'number': 1700,
         'precision': 0.2927529299210715,
         'recall': 0.72},
 'PER': {'f1': 0.39009600808489137,
         'number': 1222,
         'precision': 0.5099075297225891,
         'recall': 0.3158756137479542},
 'loss': 149.57934869130452,
 'macro_f1': 0.45557935122006193,
 'micro_f1': 0.5255441693469968,
 'overall_accuracy': 0.9504243305382742,
 'overall_f1': 0.5255441693469968,
 'overall_precision': 0.48609381486093817,
 'overall_recall': 0.5719635297948551}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.3413926499032882,
         'number': 985,
         'precision': 0.32594644506001846,
         'recall': 0.3583756345177665},
 'MISC': {'f1': 0.8444964871194379,
          'number': 2235,
          'precision': 0.885995085995086,
          'recall': 0.8067114093959732},
 'ORG': {'f1': 0.5546539379474941,
         'number': 1700,
         'precision': 0.4666666666666667,
         'recall': 0.6835294117647058},
 'PER': {'f1': 0.7013595874355368,
         'number': 1222,
         'precision': 0.8210757409440176,
         'recall': 0.6121112929623568},
 'loss': 80.50880381266276,
 'macro_f1': 0.6104756656014392,
 'micro_f1': 0.6422873390727432,
 'overall_accuracy': 0.9701109007303219,
 'overall_f1': 0.6422873390727432,
 'overall_precision': 0.6237152937567112,
 'overall_recall': 0.6619993487463367}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.626005929690809,
         'number': 985,
         'precision': 0.5370639534883721,
         'recall': 0.750253807106599},
 'MISC': {'f1': 0.8582905601428252,
          'number': 2235,
          'precision': 0.8561887800534284,
          'recall': 0.8604026845637583},
 'ORG': {'f1': 0.6670644391408115,
         'number': 1700,
         'precision': 0.6767554479418886,
         'recall': 0.6576470588235294},
 'PER': {'f1': 0.8030546623794212,
         'number': 1222,
         'precision': 0.7890995260663507,
         'recall': 0.8175122749590835},
 'loss': 60.44261636336645,
 'macro_f1': 0.7386038978384668,
 'micro_f1': 0.7536666140987226,
 'overall_accuracy': 0.9778114011360562,
 'overall_f1': 0.7536666140987226,
 'overall_precision': 0.7307339449541285,
 'overall_recall': 0.7780853142298926}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7144778126400717,
         'number': 985,
         'precision': 0.6396468699839486,
         'recall': 0.8091370558375635},
 'MISC': {'f1': 0.8759707122254271,
          'number': 2235,
          'precision': 0.8688380281690141,
          'recall': 0.8832214765100671},
 'ORG': {'f1': 0.7049375371802499,
         'number': 1700,
         'precision': 0.7129963898916968,
         'recall': 0.6970588235294117},
 'PER': {'f1': 0.8629283489096573,
         'number': 1222,
         'precision': 0.8231797919762258,
         'recall': 0.9067103109656302},
 'loss': 51.18187313874562,
 'macro_f1': 0.7895786027388515,
 'micro_f1': 0.799494790022103,
 'overall_accuracy': 0.9817588585339464,
 'overall_f1': 0.799494790022103,
 'overall_precision': 0.7759730309531107,
 'overall_recall': 0.8244871377401498}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7267683772538142,
         'number': 985,
         'precision': 0.66723259762309,
         'recall': 0.7979695431472081},
 'MISC': {'f1': 0.866387849438697,
          'number': 2235,
          'precision': 0.8526863084922011,
          'recall': 0.8805369127516779},
 'ORG': {'f1': 0.7272199651770168,
         'number': 1700,
         'precision': 0.7176403207331042,
         'recall': 0.7370588235294118},
 'PER': {'f1': 0.8871096877502003,
         'number': 1222,
         'precision': 0.8683385579937304,
         'recall': 0.9067103109656302},
 'loss': 46.419448073705034,
 'macro_f1': 0.8018714699049321,
 'micro_f1': 0.808695652173913,
 'overall_accuracy': 0.9829168920746552,
 'overall_f1': 0.808695652173913,
 'overall_precision': 0.7859557467732022,
 'overall_recall': 0.8327906219472484}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7591312931885489,
         'number': 985,
         'precision': 0.7387127761767531,
         'recall': 0.7807106598984772},
 'MISC': {'f1': 0.8796992481203006,
          'number': 2235,
          'precision': 0.8696982947092261,
          'recall': 0.8899328859060402},
 'ORG': {'f1': 0.7537772803581421,
         'number': 1700,
         'precision': 0.7187833511205977,
         'recall': 0.7923529411764706},
 'PER': {'f1': 0.8954619124797407,
         'number': 1222,
         'precision': 0.8868378812199037,
         'recall': 0.9042553191489362},
 'loss': 42.63932158152262,
 'macro_f1': 0.8220174335366831,
 'micro_f1': 0.8276409849086576,
 'overall_accuracy': 0.984590546388964,
 'overall_f1': 0.8276409849086576,
 'overall_precision': 0.8080024813895782,
 'overall_recall': 0.8482578964506675}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7629482071713147,
         'number': 985,
         'precision': 0.7487781036168133,
         'recall': 0.7776649746192893},
 'MISC': {'f1': 0.884888888888889,
          'number': 2235,
          'precision': 0.879028697571744,
          'recall': 0.8908277404921701},
 'ORG': {'f1': 0.7471010491441192,
         'number': 1700,
         'precision': 0.7039542143600416,
         'recall': 0.7958823529411765},
 'PER': {'f1': 0.8858375833660259,
         'number': 1222,
         'precision': 0.8507912584777694,
         'recall': 0.9238952536824877},
 'loss': 43.09769067267577,
 'macro_f1': 0.8201939321425873,
 'micro_f1': 0.8264058679706602,
 'overall_accuracy': 0.9845820935893969,
 'overall_f1': 0.8264058679706602,
 'overall_precision': 0.8014379684870736,
 'overall_recall': 0.852979485509606}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7779941577409931,
         'number': 985,
         'precision': 0.7474275023386342,
         'recall': 0.8111675126903554},
 'MISC': {'f1': 0.8808107512667989,
          'number': 2235,
          'precision': 0.8676215277777778,
          'recall': 0.8944071588366891},
 'ORG': {'f1': 0.7270687237026648,
         'number': 1700,
         'precision': 0.6949061662198391,
         'recall': 0.7623529411764706},
 'PER': {'f1': 0.8907429479539133,
         'number': 1222,
         'precision': 0.8656370656370657,
         'recall': 0.9173486088379705},
 'loss': 42.03074960708618,
 'macro_f1': 0.8191541451660926,
 'micro_f1': 0.822879684418146,
 'overall_accuracy': 0.9843707736002164,
 'overall_f1': 0.822879684418146,
 'overall_precision': 0.7982550130108679,
 'overall_recall': 0.8490719635297949}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7777777777777777,
         'number': 985,
         'precision': 0.7604267701260912,
         'recall': 0.7959390862944162},
 'MISC': {'f1': 0.8935419880978619,
          'number': 2235,
          'precision': 0.8805386620330148,
          'recall': 0.9069351230425056},
 'ORG': {'f1': 0.7547062179121504,
         'number': 1700,
         'precision': 0.7325581395348837,
         'recall': 0.778235294117647},
 'PER': {'f1': 0.9001203369434417,
         'number': 1222,
         'precision': 0.8827694728560189,
         'recall': 0.9181669394435352},
 'loss': 40.63099521795909,
 'macro_f1': 0.831536580182808,
 'micro_f1': 0.8374760994263862,
 'overall_accuracy': 0.9857147687314038,
 'overall_f1': 0.8374760994263862,
 'overall_precision': 0.8199687987519501,
 'overall_recall': 0.8557473135786389}


  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

{'LOC': {'f1': 0.7595281306715063,
         'number': 985,
         'precision': 0.6866283839212469,
         'recall': 0.849746192893401},
 'MISC': {'f1': 0.8917672886937431,
          'number': 2235,
          'precision': 0.8754310344827586,
          'recall': 0.9087248322147651},
 'ORG': {'f1': 0.7295742232451092,
         'number': 1700,
         'precision': 0.713963963963964,
         'recall': 0.7458823529411764},
 'PER': {'f1': 0.8835239305379077,
         'number': 1222,
         'precision': 0.9157155399473222,
         'recall': 0.853518821603928},
 'loss': 44.66590526898702,
 'macro_f1': 0.8160983932870665,
 'micro_f1': 0.822324547475389,
 'overall_accuracy': 0.9842355288071409,
 'overall_f1': 0.822324547475389,
 'overall_precision': 0.8024480942051441,
 'overall_recall': 0.8432106805600782}


In [33]:
validate_step(model, test_dataloader)

  0%|          | 0/95 [00:00<?, ?it/s]

{'LOC': {'precision': 0.7421276595744681,
  'recall': 0.8044280442804428,
  'f1': 0.7720230190349713,
  'number': 1084},
 'MISC': {'precision': 0.8631465517241379,
  'recall': 0.9117814456459875,
  'f1': 0.886797675062275,
  'number': 1757},
 'ORG': {'precision': 0.7464503042596349,
  'recall': 0.7885714285714286,
  'f1': 0.7669329628343176,
  'number': 1400},
 'PER': {'precision': 0.9354395604395604,
  'recall': 0.926530612244898,
  'f1': 0.9309637730690362,
  'number': 735},
 'overall_precision': 0.8130966017563955,
 'overall_recall': 0.8559083601286174,
 'overall_f1': 0.8339533972978266,
 'overall_accuracy': 0.9862378121750731,
 'loss': 38.83540423041896,
 'micro_f1': 0.8339533972978266,
 'macro_f1': 0.8391793575001499}

In [34]:
from spacy.lang.es import Spanish
import spacy
nlp = Spanish()
tokenizer = nlp.tokenizer

def simplify_entities(entities):
    """
    Simplifies the identified entities by combining consecutive elements of a phrase.

    Parameters:
        entities (list): A list of dictionaries representing the identified entities.
                         Each dictionary should contain two keys: 'entidad' (entity type) and 'texto' (entity text).
    Returns:
        list: A list of dictionaries representing the simplified entities.
              Each dictionary contains two keys: 'entidad' (entity type) and 'texto' (simplified entity text).
    Example:
        entities = [{'entidad': 'B-PER', 'texto': 'Juan'},
                    {'entidad': 'I-PER', 'texto': 'Manuel'},
                    {'entidad': 'I-PER', 'texto': 'Pérez'},
                    {'entidad': 'B-ORG', 'texto': 'Universidad'},
                    {'entidad': 'I-ORG', 'texto': 'de'},
                    {'entidad': 'I-ORG', 'texto': 'San'},
                    {'entidad': 'I-ORG', 'texto': 'Andrés'}]

        simplified_entities = simplify_entities(entities)
        print(simplified_entities)
        Output: [{'entidad': 'PER', 'texto': 'Juan Manuel Pérez'},
                 {'entidad': 'ORG', 'texto': 'Universidad de San Andrés'}]
    """
    simplified_entities = []
    current_entity = None
    current_text = ""

    for entity in entities:
        if entity["entidad"].startswith("B-"):
            if current_entity is not None:
                simplified_entities.append({"entidad": current_entity[2:], "texto": current_text})
            current_entity = entity["entidad"]
            current_text = str(entity["texto"])
        elif entity["entidad"].startswith("I-"):
            if current_entity is not None:
                current_text += " " + str(entity["texto"])

    if current_entity is not None:
        simplified_entities.append({"entidad": current_entity[2:], "texto": current_text})

    return simplified_entities

def identificar_entidades(model, tokenizer, text):
    """
    Identifies named entities in the given text using a trained model.

    Parameters:
        model (torch.nn.Module): The trained model for named entity recognition.
        tokenizer: The tokenizer object used to tokenize the input text.
        text (str): The input text from which entities are to be identified.

    Returns:
        list: A list of dictionaries representing the identified entities.
              Each dictionary contains two keys: 'entidad' (entity type) and 'texto' (entity text).
    """
    # Tokenize text and obtain tokens and token IDs
    tokens = tokenizer(text)
    token_ids = [stoi.get(token.text, stoi["<unk>"]) for token in tokens]
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(next(model.parameters()).device)
    output = model(input_ids)
    predicted_labels = output.argmax(dim=-1)[0]
    labels = [id2label[label_id.item()] for label_id in predicted_labels]
    entities = [{"entidad": label, "texto": token} for token, label in zip(tokens,labels) if label != 'O']
    return simplify_entities(entities)

In [35]:
text = 'Las denominaciones adoptadas sucesivamente desde 1810 hasta el presente, a saber: Provincias Unidas del Río de la \
Plata, República Argentina, Confederación Argentina, serán en adelante nombres oficiales indistintamente para la \
designación del Gobierno y territorio de las provincias, empleándose las palabras Nación Argentina en la formación \
y sanción de las leyes.'
identificar_entidades(model, tokenizer, text)

[{'entidad': 'LOC', 'texto': 'Río de la Plata'},
 {'entidad': 'LOC', 'texto': 'República Argentina'},
 {'entidad': 'ORG', 'texto': 'Confederación Argentina'},
 {'entidad': 'ORG', 'texto': 'Gobierno'},
 {'entidad': 'ORG', 'texto': 'Argentina'}]

In [36]:
identificar_entidades(model, tokenizer, "Juan Manuel Pérez es el profesor de NLP de la Universidad de San Andrés")

[{'entidad': 'PER', 'texto': 'Juan Manuel Pérez'},
 {'entidad': 'MISC', 'texto': 'Universidad de San Andrés'}]

In [37]:
text = "El sitio Aires de los Lagos es un lugar mágico, fui con Juan Pérez"
identificar_entidades(model, tokenizer, text)

[{'entidad': 'LOC', 'texto': 'Aires de los Lagos'},
 {'entidad': 'PER', 'texto': 'Juan Pérez'}]

In [38]:
text = "Laura Romano es una nutricionista muy famosa que vive en Buenos Aires"
identificar_entidades(model, tokenizer, text)

[{'entidad': 'PER', 'texto': 'Laura Romano'},
 {'entidad': 'LOC', 'texto': 'Buenos Aires'}]

In [39]:
text = "Lionel, el mejor futbolista de todos los tiempos, firmó su nuevo\
 contrato con el Inter de Miami"
identificar_entidades(model, tokenizer, text)

[{'entidad': 'ORG', 'texto': 'Inter de Miami'}]