Generate the .conll files with the predictions from a model.

This script uses the predictions (.json) obtained after executing the following code in the training notebook for the corresponding split (training, validation, or test):

```python
test_subset = tokenized_dataset["test"].map(batched=True, batch_size=32, remove_columns=["id","ner_tags","tokens"])
test_subset = test_subset.map(forward_pass_with_label, batched=True, batch_size=32)
test_df = test_subset.to_pandas()
test_df.to_json("test_results.json")
```

The idea is to align the predictions JSON (`test_results.json`) and the complete CoNLL file for the split (i.e. `test.conll`), so that we can use the same char offsets and tokens of the CoNLL and just substitute the ground truth label with the predicted one. 

Also, because the character offsets restart in every document, we can split the joint predicted CoNLL back into the individual files (e.g. `caso-clinico-1.conll`): a new file starts whenever a token's start offset is lower than the end offset of the previous token.

In [None]:
# Notebook configuration: every input and output location used by the cells below.
PREDICTIONS_JSON = "../bsc-bio-ehr-es-meddoprof/best-2rvi973b/test_results.json" # Path to the JSON file with the predictions exported by the training notebook
HF_DATASET = "../meddoprof-no-act-ner" # Path to the Hugging Face dataset (can be local or remote)
MERGED_CONLL = "../meddoprof-no-act-ner/test.conll" # Single CoNLL file with the true labels of the whole split
ORIGINAL_CONLLS_DIR = "../meddoprof-no-act/test" # Directory containing the per-document CoNLL files (with true labels) of the split
OUTPUT_DIR = "../bsc-bio-ehr-es-meddoprof/best-2rvi973b/test_prediction_conlls" # Directory where the predicted CoNLL files are written. Do not include a trailing '/'

In [None]:
import pandas as pd
import csv
import os

In [None]:
# Load the model outputs exported by the training notebook, then show the
# column holding the predicted label ids.
df = pd.read_json(PREDICTIONS_JSON)
df.loc[:, 'predicted_label']

In [None]:
# Display the whole predictions DataFrame for visual inspection
df

In [None]:
from huggingface_hub import notebook_login
#notebook_login()

In [None]:
from datasets import load_dataset
# Load the dataset located at HF_DATASET; its "train" and "test" splits are used in later cells
dataset = load_dataset(HF_DATASET)

In [None]:
# Build both lookup tables (integer id <-> tag name) from the tag names
# declared by the `ner_tags` feature of the training split.
classes = dataset["train"].features["ner_tags"].feature
id2label = dict(enumerate(classes.names))
label2id = {name: position for position, name in id2label.items()}

In [None]:
def _explode_lists(column):
    """Expand a column of per-example lists into one entry per (example, position)."""
    return column.apply(pd.Series).stack()


# NaN comes from padding-added tokens, so those rows are dropped.
# For ignored tokens (special characters and not-first subtokens of a word), label is -100.
df_tokens = df.apply(_explode_lists).dropna()
df_tokens

In [None]:
from transformers import AutoTokenizer

# Tokenizer used below to re-tokenize examples when inspecting truncated sequences.
# NOTE(review): this path names a "drugtemist" run, whereas PREDICTIONS_JSON names a
# "meddoprof" run — confirm that both runs share the same tokenizer.
BASE_MODEL = "../bsc-bio-ehr-es-drugtemist-es/best-5umrjpdk"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer

In [None]:
def _tag_name(label_id):
    """Return the tag name for `label_id`, or 'IGN' when the id is not a known tag."""
    return id2label.get(label_id, 'IGN')


# Add the string form of both the true and the predicted label ids
for source_column in ('labels', 'predicted_label'):
    df_tokens[source_column + '_str'] = df_tokens[source_column].apply(_tag_name)

In [None]:
# Keep only the positions whose true label is not the "ignore" marker (-100)
has_real_label = df_tokens['labels'].ne(-100.)
df_filtered = df_tokens[has_real_label]

In [None]:
# Load the reference CoNLL (whole split): tab-separated, no header row,
# and no quote handling so that quote characters in tokens are kept verbatim.
conll_column_names = ['label', 'start', 'end', 'text']
df_conll = pd.read_csv(
    MERGED_CONLL,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    header=None,
)
df_conll.columns = conll_column_names
df_conll

In [None]:
# Correct missing tokens due to truncation

In [None]:
# Flag the examples whose label sequence reaches 512 positions
df['token_length'] = df['labels'].map(len)
reaches_max_length = df['token_length'] >= 512  # max input RobertaModel
too_long_level_0 = df[reaches_max_length].index
df[reaches_max_length]

In [None]:
# Re-tokenize the (hard-coded) test example 4349 without truncation and show
# how many word-level tags it has.
long_example = dataset['test'][4349]
tokenized_too_long = tokenizer(
    long_example['tokens'],
    is_split_into_words=True,
    return_length=True,
)
len(long_example['ner_tags'])

In [None]:
# Show the words of the hard-coded test example 4349
dataset['test'][4349]['tokens']

In [None]:
# Turn the (example, position) index into the plain columns `level_0` and
# `level_1`, then count how many labelled rows each too-long example kept.
df_flat = df_filtered.reset_index()
rows_of_too_long = df_flat[df_flat['level_0'].isin(too_long_level_0)]
kept_per_example = rows_of_too_long.groupby('level_0')['level_1'].count()
too_long_level_1 = kept_per_example.values
too_long_level_1

In [None]:
# Plain Python lists of the true and predicted tag names, in df_filtered row order
labels_list = df_filtered['labels_str'].tolist()
predicted_labels_list = df_filtered['predicted_label_str'].tolist()

In [None]:
# Pad the label lists for the words lost to truncation.
#
# For every too-long example, `level_1` is the number of words that kept a
# label and `num_words` is the number of words in the dataset. The difference
# is the number of words cut off at the END of the sequence, so the 'O'
# padding goes right after the last kept row of that example.
pending_insertions = []  # (flat position, number of 'O' labels to insert)
for level_0, level_1 in zip(too_long_level_0, too_long_level_1):
    example_rows = df_flat.index[df_flat['level_0'] == level_0]
    num_words = len(dataset['test'][level_0]['ner_tags'])
    print(f"{num_words - level_1 = }")
    num_missing = num_words - level_1
    if num_missing > 0:
        # df_flat has a fresh 0..n-1 index (reset_index), so its index values
        # are positions in labels_list / predicted_labels_list.
        pending_insertions.append((int(example_rows[-1]) + 1, int(num_missing)))

# Insert from the highest position down, so that one insertion never shifts
# the position of another one that is still pending.
for insert_at, num_missing in sorted(pending_insertions, reverse=True):
    padding = ['O'] * num_missing
    labels_list[insert_at:insert_at] = padding
    predicted_labels_list[insert_at:insert_at] = padding

In [None]:
# Number of true labels after padding; the equality check against df_conll['label'] further below requires it to equal len(df_conll)
len(labels_list)

In [None]:
# Number of token rows in the reference CoNLL
len(df_conll)

In [None]:
# Number of predicted labels; they are later assigned as the `label` column of df_conll, so this must equal len(df_conll)
len(predicted_labels_list)

In [None]:
# Make sure that both true labels from the dataset and CoNLL are the same.
# Raised explicitly (instead of a bare `assert`) so that the check is never
# stripped and a failure reports where the two sequences diverge.
conll_labels = df_conll['label'].to_list()
if labels_list != conll_labels:
    first_mismatch = next(
        (
            position
            for position, (dataset_label, conll_label) in enumerate(zip(labels_list, conll_labels))
            if dataset_label != conll_label
        ),
        # No differing pair: one list is a strict prefix of the other
        min(len(labels_list), len(conll_labels)),
    )
    raise AssertionError(
        f"Dataset labels and CoNLL labels differ: {len(labels_list)} dataset labels vs "
        f"{len(conll_labels)} CoNLL labels, first mismatch at position {first_mismatch}"
    )

In [None]:
# Replace true labels with predicted labels.
# The list is assigned positionally, so it must contain exactly one label per row of df_conll.
df_conll['label'] = predicted_labels_list

In [None]:
# Get the filenames of CoNLLs, in alphabetical order
original_conlls = sorted(
    entry
    for entry in os.listdir(ORIGINAL_CONLLS_DIR)
    if entry.endswith('.conll')
)

In [None]:
# Create the output directory (and any missing parents).
# exist_ok=True lets the notebook be re-run without failing when the directory
# already exists; files left there by a previous run are not removed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Generate the .conll files by using offset.
#
# Offsets restart in every document, so a token whose start offset is lower
# than the end offset of the previous token marks the first token of a new
# document. The first token is compared against 0 and never starts a new one.
previous_end = df_conll['end'].shift(fill_value=0)
starts_new_file = df_conll['start'] < previous_end
# Running count of boundaries = position of each token's file within original_conlls
file_idx_per_token = starts_new_file.cumsum()

num_detected_files = int(file_idx_per_token.iloc[-1]) + 1 if len(df_conll) else 0
if num_detected_files != len(original_conlls):
    # Writing anyway would attach predictions to the wrong filenames
    raise ValueError(
        f"Detected {num_detected_files} documents in the merged CoNLL but found "
        f"{len(original_conlls)} .conll files in {ORIGINAL_CONLLS_DIR}"
    )

for file_idx, file_tokens in df_conll.groupby(file_idx_per_token, sort=False):
    file_tokens.to_csv(
        os.path.join(OUTPUT_DIR, original_conlls[file_idx]),
        sep='\t',
        quoting=csv.QUOTE_NONE,
        header=None,
        index=False,
    )