# Model inference
Author: Jan Rodríguez Miret

This notebook runs inference on the test split of the given dataset and writes the predictions as brat `.ann` files.
Per-document CoNLL files with the predicted labels are generated as an intermediate step.

In [1]:
import os
import pandas as pd
import torch
from datasets import load_dataset
import csv

In [30]:
# General
# NOTE(review): every path below is an absolute path on the author's machine; adjust before running elsewhere.
HF_DATASET = "/home/jan/bsc/cataccc-ner"  # Passed to load_dataset()
MODEL_PATH = "/home/jan/bsc/best-hlu3ln61"  # Passed to from_pretrained() for both the tokenizer and the model
OUTPUT_DIR = "/home/jan/prediction_conlls"  # NOTE(review): not referenced by any visible cell; confirm whether it is still needed
MERGED_CONLL = "/home/jan/bsc/cataccc-ner/test.conll" # CoNLL file with all true labels for the split
ORIGINAL_CONLLS_DIR = "/home/jan/bsc/cataccc" # Directory containing the CoNLL files (with true labels) of the split
ORIGINAL_TXTS_DIR = ORIGINAL_CONLLS_DIR  # The .txt files are read from the same directory as the .conll files
OUTPUT_ANNS_DIR = "/home/jan/bsc/predictions_anns"  # Destination of the generated .ann files
OUTPUT_CONLL_DIR = "/home/jan/bsc/predictions_conlls" # Do not include a trailing '/'

# Environment variables
os.environ["PYTORCH_HIP_ALLOC_CONF"] = "garbage_collection_threshold:0.9,max_split_size_mb:4096" # otherwise we get HIP Error for memory fragmentation
# NOTE(review): the commented-out line below references PROJECT_NAME, which is not defined in this notebook.
#os.environ["WANDB_NOTEBOOK_NAME"] = f"{PROJECT_NAME.replace('-','_')}.ipynb"

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [4]:
# Load the dataset found at HF_DATASET; the bare expression displays the resulting DatasetDict.
dataset = load_dataset(HF_DATASET)
dataset

Found cached dataset cataccc-ner (/home/jan/.cache/huggingface/datasets/cataccc-ner/Distemist/1.0.0/ebd6a01ae600b54dad2e7ea34943eff472795cbdbda1058cb11cd29205845690)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3990
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3990
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3990
    })
})

## Data and model preparation

### Prepare data and label mappings

In [5]:
# Label inventory of the NER tags, plus lookups in both directions (id -> tag, tag -> id).
classes = dataset["train"].features["ner_tags"].feature
id2label = dict(enumerate(classes.names))
label2id = {tag: idx for idx, tag in id2label.items()}

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer from the same path as the model checkpoint (MODEL_PATH).
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer

RobertaTokenizerFast(name_or_path='/home/jan/bsc/best-hlu3ln61', vocab_size=50262, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [7]:
def tokenize_and_align_labels(samples):
    """Tokenize pre-split words and build sub-token labels aligned with them.

    Args:
        samples: Batch mapping with "tokens" (one list of words per sample) and
            "ner_tags" (one list of word-level label ids per sample).

    Returns:
        The tokenizer output with an added "labels" entry. For each sample it
        holds the word's label on the first sub-token of every word and -100 on
        every other position (special tokens and continuation sub-tokens).
    """
    encoded = tokenizer(samples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for sample_idx, word_labels in enumerate(samples["ner_tags"]):
        sample_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=sample_idx):
            # Only the first sub-token of a real word carries the word's label.
            # Special tokens (word id None) and the remaining sub-tokens of a word
            # get -100, which this notebook treats as "ignore this position".
            starts_word = word is not None and word != last_word
            sample_labels.append(word_labels[word] if starts_word else -100)
            last_word = word
        aligned.append(sample_labels)

    encoded["labels"] = aligned
    return encoded

In [8]:
# Tokenize every split in batches; this adds input_ids, attention_mask and the aligned labels.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset

Loading cached processed dataset at /home/jan/.cache/huggingface/datasets/cataccc-ner/Distemist/1.0.0/ebd6a01ae600b54dad2e7ea34943eff472795cbdbda1058cb11cd29205845690/cache-5e451af6b9535147.arrow


Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/jan/.cache/huggingface/datasets/cataccc-ner/Distemist/1.0.0/ebd6a01ae600b54dad2e7ea34943eff472795cbdbda1058cb11cd29205845690/cache-b5f8c001e4a28e27.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3990
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3990
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3990
    })
})

### Load model and data collator

In [9]:
from transformers import RobertaForTokenClassification

# Load the fine-tuned checkpoint and place it on the device selected earlier. The forward
# pass moves its input tensors to `device`, so a model left on the CPU would fail with a
# device mismatch as soon as a GPU is available. eval() returns the model itself and makes
# the inference mode (no dropout) explicit.
model = RobertaForTokenClassification.from_pretrained(MODEL_PATH).to(device).eval()

In [10]:
from transformers import DataCollatorForTokenClassification

# Collator used by the forward pass to pad a list of samples (inputs and labels) into PyTorch tensors.
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='pt')

## Evaluate model

In [11]:
def forward_pass_with_label(batch):
    """Run the model on one batch and return per-token loss, predictions and logits.

    Args:
        batch: Column-oriented batch (column name -> list of per-sample values) that
            contains at least "input_ids", "attention_mask" and "labels".

    Returns:
        Dict with "loss" (per-token cross-entropy, one row per sample),
        "predicted_label" (arg-max class id per token) and "logits" (raw model logits).
    """
    # Turn the column-oriented batch into a list of per-sample dicts so the data collator
    # can process it.
    samples = [dict(zip(batch, row)) for row in zip(*batch.values())]
    # The collator pads inputs and labels to a common length.
    padded = data_collator(samples)
    input_ids = padded["input_ids"].to(device)
    attention_mask = padded["attention_mask"].to(device)
    labels = padded["labels"].to(device)

    with torch.no_grad():
        # Forward pass; logits have size [batch_size, sequence_length, classes].
        output = model(input_ids, attention_mask)
        # The predicted class is the one with the highest logit.
        predicted_label = torch.argmax(output.logits, axis=-1).cpu().numpy()

    # Per-token loss: cross-entropy without reduction over the flattened batch.
    flat_logits = output.logits.view(-1, classes.num_classes)
    flat_loss = torch.nn.CrossEntropyLoss(reduction="none")(flat_logits, labels.view(-1))
    # Unflatten back to one row per sample for the output format.
    loss = flat_loss.view(len(input_ids), -1).cpu().numpy()

    return {"loss": loss, "predicted_label": predicted_label, 'logits': output.logits}

In [20]:
# Make inference
# Keep only input_ids, attention_mask and labels (the columns the forward pass reads),
# then run the model batch by batch. remove_columns() replaces the previous map() call
# that had no function and was only used to drop columns.
test_subset = (
    tokenized_dataset["test"]
    .remove_columns(["id", "ner_tags", "tokens"])
    .map(forward_pass_with_label, batched=True, batch_size=32)
)
test_df = test_subset.to_pandas()

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [22]:
# Work on a copy so the inference results held in test_df are not modified by later cells.
df = test_df.copy()

In [14]:
# Explode the per-sample sequences into one row per token: every cell holding a sequence is
# expanded with pd.Series and stacked, which yields a two-level (sample, token position) index.
df_tokens = df.apply(lambda x: x.apply(pd.Series).stack())
# NaN comes from padding-added tokens. For ignored tokens (special characters and not-first subtokens of a word), label is -100
df_tokens = df_tokens.dropna()
df_tokens

Unnamed: 0,Unnamed: 1,input_ids,attention_mask,labels,loss,predicted_label,logits
0,0,0.0,1.0,-100.0,0.000000,0.0,"[6.7168273926, -3.9612832069, -3.9891347885]"
0,1,49278.0,1.0,0.0,0.000045,0.0,"[6.9053611755, -3.3825907707000002, -4.4961137..."
0,2,262.0,1.0,0.0,0.000027,0.0,"[7.0929918289, -4.2796778679, -3.9607875347]"
0,3,1236.0,1.0,0.0,0.000029,0.0,"[7.0992631912, -3.9553177357, -4.1476831436]"
0,4,32599.0,1.0,0.0,0.000026,0.0,"[7.1441493034, -4.061006546, -4.1392183304]"
...,...,...,...,...,...,...,...
3989,141,614.0,1.0,0.0,0.000024,0.0,"[7.1774206161, -4.1089110374, -4.1932125092]"
3989,142,2023.0,1.0,0.0,0.000024,0.0,"[7.1765441895, -4.1786317825, -4.1047554016]"
3989,143,426.0,1.0,-100.0,0.000000,0.0,"[7.1845502853, -4.1745696068, -4.123939991]"
3989,144,22671.0,1.0,0.0,0.000024,0.0,"[7.1943531036, -4.0900592804, -4.1988039017]"


In [15]:
# Translate numeric label ids into tag names; ids that are not in id2label (such as -100) become 'IGN'
df_tokens['labels_str'] = df_tokens['labels'].apply(lambda label_id: id2label.get(label_id, 'IGN'))
df_tokens['predicted_label_str'] = df_tokens['predicted_label'].apply(lambda label_id: id2label.get(label_id, 'IGN'))

In [16]:
# Keep only the positions that carry a real label (-100 marks the positions to ignore)
df_filtered = df_tokens.loc[df_tokens['labels'].ne(-100.)]

In [17]:
# Load the reference CoNLL (whole split)
# keep_default_na=False keeps tokens such as "NA", "null" or "nan" as literal text; with the
# pandas defaults they are parsed as missing values and would be written back as empty fields.
df_conll = pd.read_csv(
    MERGED_CONLL,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    header=None,
    keep_default_na=False,
)
df_conll.columns = ['label', 'start', 'end', 'text']
df_conll

Unnamed: 0,label,start,end,text
0,O,1,11,Adolescent
1,O,12,14,de
2,O,15,17,15
3,O,18,22,anys
4,O,22,23,","
...,...,...,...,...
78100,O,3405,3407,la
78101,O,3408,3418,restitutio
78102,O,3419,3421,ad
78103,O,3422,3430,integrum


## Correct missing tokens due to truncation

In [18]:
# Length of each sample's label sequence; samples reaching 512 positions (max input
# RobertaModel) are the ones the tokenizer may have truncated.
df['token_length'] = df['labels'].map(len)
too_long_level_0 = df.index[df['token_length'] >= 512]
df.loc[df['token_length'] >= 512]  # max input RobertaModel

Unnamed: 0,input_ids,attention_mask,labels,loss,predicted_label,logits,token_length


In [19]:
# Flatten the (sample, position) index into columns level_0 / level_1, then count how many
# labelled rows each too-long sample kept.
df_flat = df_filtered.reset_index()
too_long_level_1 = (
    df_flat.loc[df_flat['level_0'].isin(too_long_level_0)]
    .groupby('level_0')['level_1']
    .count()
    .to_numpy()
)
too_long_level_1

array([], dtype=int64)

In [20]:
# Flat label sequences (reference and predicted), one entry per labelled row of df_filtered
labels_list = df_filtered['labels_str'].tolist()
predicted_labels_list = df_filtered['predicted_label_str'].tolist()

In [21]:
# Re-insert an 'O' label (reference and prediction) for every word that was cut off by
# truncation, so the flat label lists line up again with the reference CoNLL.
# Samples are handled from last to first: inserting into the flat lists shifts every later
# position, so starting at the end keeps the df_flat positions of the remaining samples valid.
for level_0, level_1 in reversed(list(zip(too_long_level_0, too_long_level_1))):
    # level_1 is the number of labelled words this sample kept. The tokenizer truncates on
    # the right, so the missing labels belong right after the sample's last surviving row.
    sample_rows = df_flat.index[df_flat['level_0'] == level_0]
    too_long_idx_flat = sample_rows[-1] + 1
    # Look the sample up by its own index (the previous hard-coded 4349 is outside the split).
    num_words = len(dataset['test'][int(level_0)]['ner_tags'])
    print(f"{num_words - level_1 = }")
    filler = ['O'] * (num_words - level_1)
    labels_list[too_long_idx_flat:too_long_idx_flat] = filler
    predicted_labels_list[too_long_idx_flat:too_long_idx_flat] = filler

In [22]:
# The three lengths must agree before labels can be swapped into the reference CoNLL
print(
    f"{len(labels_list) = }",
    f"{len(df_conll) = }",
    f"{len(predicted_labels_list) = }",
    sep="\n",
)

len(labels_list) = 78105
len(df_conll) = 78105
len(predicted_labels_list) = 78105


In [23]:
# Sanity check: the true labels recovered from the dataset must match the reference CoNLL row by row
assert df_conll['label'].to_list() == labels_list

In [24]:
# Replace true labels with predicted labels
# NOTE(review): this overwrites df_conll in place, so re-running the label-equality assert
# after this cell compares against the predictions instead of the reference labels.
df_conll['label'] = predicted_labels_list

In [25]:
# Get the filenames of CoNLLs (important that they are sorted)
# The sorted position of each name is what later pairs it with a document of the merged CoNLL.
original_conlls = sorted(
    name for name in os.listdir(ORIGINAL_CONLLS_DIR) if name.endswith('.conll')
)

In [None]:
# Create the output directory for the predicted CoNLL files; exist_ok lets the cell be
# re-run without raising FileExistsError once the directory is there.
os.makedirs(OUTPUT_CONLL_DIR, exist_ok=True)

In [50]:
# Generate the .conll files by using offset
# df_conll holds the tokens of every document back to back. A token whose start offset is
# smaller than the previous token's end offset is taken as the first token of the next
# document (presumably because offsets restart in each document; confirm against the data).
# NOTE(review): documents are paired with filenames purely by position in the sorted
# original_conlls list; this assumes the merged CoNLL was built in that same order.
current_offset = 0
file_idx = 0 # Position of file within all retrieved with listdir
start_token_idx = 0 # Index within the dataframe that marks the start of a file
for idx, line in df_conll.iterrows():
    # If we reach the end of a file
    if line['start'] < current_offset:
        # .loc slicing is label-based and inclusive, so idx-1 stops at the previous document's last token
        df_conll.loc[start_token_idx:idx-1].to_csv(os.path.join(OUTPUT_CONLL_DIR, original_conlls[file_idx]), sep='\t', quoting=csv.QUOTE_NONE, header=None, index=False)
        file_idx += 1
        # This reset is immediately superseded by the assignment after the if-block
        current_offset = 0
        start_token_idx = idx
    current_offset = line['end']
# Add last document
df_conll.loc[start_token_idx:idx].to_csv(os.path.join(OUTPUT_CONLL_DIR, original_conlls[file_idx]), sep='\t', quoting=csv.QUOTE_NONE, header=None, index=False)

In [27]:
from brat.tools import BIOtoStandoff

In [32]:
# Create the output directory for the .ann files; exist_ok lets the cell be re-run without
# raising FileExistsError once the directory is there.
os.makedirs(OUTPUT_ANNS_DIR, exist_ok=True)

In [37]:
# List the generated .conll files.
# NOTE(review): the next cell rebuilds this same list before using it, so this cell looks redundant.
conll_files = [file for file in os.listdir(OUTPUT_CONLL_DIR) if file.endswith(".conll")]

In [41]:
# Write an .ann file for each .conll file by calling BIOtoStandoff.py
conll_files = [file for file in os.listdir(OUTPUT_CONLL_DIR) if file.endswith(".conll")]

for conll_file in conll_files:
    # Swap only the extension; str.replace would also rewrite a '.conll' inside the stem.
    stem = os.path.splitext(conll_file)[0]
    txt_path = os.path.join(ORIGINAL_TXTS_DIR, stem + '.txt')
    conll_path = os.path.join(OUTPUT_CONLL_DIR, conll_file)
    # Same argument list the script would receive on the command line.
    # NOTE(review): "-1" and "0" are passed through unchanged; presumably they select the
    # label and token columns of the CoNLL file. Confirm against BIOtoStandoff's usage text.
    argv = ["brat/tools/BIOtoStandoff.py", txt_path, conll_path, "-1", "0"]
    res = BIOtoStandoff.main(argv)
    # Join with the directory: plain string concatenation has no path separator and wrote
    # the files next to OUTPUT_ANNS_DIR (".../predictions_anns<name>.ann"), not inside it.
    ann_path = os.path.join(OUTPUT_ANNS_DIR, stem + '.ann')
    with open(ann_path, 'w') as file:
        file.writelines(str(line) + '\n' for line in res)

Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
Note: rewriting "I" -> "B" after "O"
N