Generate the .conll files with the predictions from a model.

This script uses the predictions (.json) obtained after executing the following code in the training notebook for the corresponding split (training, validation, or test):

```python
test_subset = tokenized_dataset["test"].map(batched=True, batch_size=32, remove_columns=["id","ner_tags","tokens"])
test_subset = test_subset.map(forward_pass_with_label, batched=True, batch_size=32)
test_df = test_subset.to_pandas()
test_df.to_json("test_results.json")
```

The idea is to align the predictions JSON (`test_results.json`) and the complete CoNLL file for the split (i.e. `test.conll`), so that we can use the same char offsets and tokens of the CoNLL and just substitute the ground truth label with the predicted one. 

Also, because we keep the offsets, we can split the predicted joint CoNLL back into the individual files (e.g. `caso-clinico-1.conll`): character offsets restart in every file, so a new file begins whenever a token's start offset is lower than the end offset of the previous token.

In [58]:
# Notebook configuration: input and output locations used by the cells below.
PREDICTIONS_JSON = "/home/jan/bsc/meddoplace_jan/meddoplace_noun/test_results_noun_10_epochs.json" # Path to the JSON file with the predictions exported by the training notebook
HF_DATASET = "janrodriguez/meddoplace-noun-ner" # Hugging Face dataset name or path (can be local or remote)
MERGED_CONLL = "/home/jan/bsc/meddoplace_noun/test.conll" # CoNLL file with all true labels for the split
ORIGINAL_CONLLS_DIR = "/home/jan/bsc/meddoplace_noun/test" # Directory containing the individual CoNLL files (with true labels) of the split
OUTPUT_DIR = "/home/jan/bsc/meddoplace_jan/meddoplace_noun/noun_prediction_conlls" # Directory for the predicted CoNLL files; do not include a trailing '/' (output paths are built as OUTPUT_DIR + '/' + filename)

In [59]:
import pandas as pd
import csv
import os

In [60]:
# Read the predictions exported by the training notebook (one row per example)
df = pd.read_json(path_or_buf=PREDICTIONS_JSON)
# Preview the predicted label ids of each example
df.loc[:, 'predicted_label']

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 11, 0, 1, 2, 0, 0, 0, 0, 0, 0,...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, ...
4       [0, 0, 0, 0, 0, 9, 10, 10, 10, 0, 0, 0, 0, 0, ...
                              ...                        
9804    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9805    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9806    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9807    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9808    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: predicted_label, Length: 9809, dtype: object

In [61]:
# Log in to the Hugging Face Hub from within the notebook.
# NOTE(review): presumably only needed when HF_DATASET is private or gated — confirm.
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/jan/.cache/huggingface/token
Login successful


In [62]:
# Load the Hugging Face dataset; its "ner_tags" feature is what provides the
# tag names for the integer label ids.
from datasets import load_dataset
dataset = load_dataset(HF_DATASET)

Found cached dataset meddoplace-noun-ner (/home/jan/.cache/huggingface/datasets/janrodriguez___meddoplace-noun-ner/Meddoplace/1.0.0/3e1d88b1040fd27a258b349014199818b1336f38bc4e1ce11b506637965baeff)


  0%|          | 0/3 [00:00<?, ?it/s]

In [63]:
# Build both directions of the tag-name <-> integer-id mapping from the
# "ner_tags" feature of the training split
classes = dataset["train"].features["ner_tags"].feature
id2label = dict(enumerate(classes.names))
label2id = {tag: idx for idx, tag in id2label.items()}

In [64]:
def _one_row_per_position(column):
    """Expand a column of per-example lists into one entry per (example, position)."""
    expanded = column.apply(pd.Series)
    return expanded.stack()

# Explode every column, then discard positions that are missing in any column
df_tokens = df.apply(_one_row_per_position).dropna()
df_tokens

Unnamed: 0,Unnamed: 1,input_ids,attention_mask,labels,loss,predicted_label,logits
0,0,0.0,1.0,-100.0,0.000000,0.0,"[9.5171518326, -2.222471714, -2.9363076687, 2...."
0,1,458.0,1.0,0.0,0.000010,0.0,"[11.3209171295, -1.9549036026, -1.6792972088, ..."
0,2,2705.0,1.0,0.0,0.000010,0.0,"[11.2885913849, -1.9743897915000002, -1.712980..."
0,3,262.0,1.0,0.0,0.000009,0.0,"[11.3430776596, -2.0153839588, -1.6989270449, ..."
0,4,3946.0,1.0,0.0,0.000010,0.0,"[11.2928352356, -1.9858816862, -1.7037079334, ..."
...,...,...,...,...,...,...,...
9808,10,837.0,1.0,0.0,0.000007,0.0,"[11.4828119278, -1.9847073554999999, -1.946771..."
9808,11,1066.0,1.0,0.0,0.000007,0.0,"[11.4844226837, -1.9791500568, -1.9528244734, ..."
9808,12,3777.0,1.0,0.0,0.000007,0.0,"[11.4821424484, -2.0000920296, -1.9531577826, ..."
9808,13,22671.0,1.0,0.0,0.000007,0.0,"[11.4722986221, -1.965950489, -1.9108389616, -..."


In [65]:
# Translate the numeric label ids into tag names; ids with no known tag
# (such as the -100 placeholder) are rendered as 'IGN'
def _id_to_tag(label_id):
    return id2label.get(label_id, 'IGN')

df_tokens['labels_str'] = df_tokens['labels'].apply(_id_to_tag)
df_tokens['predicted_label_str'] = df_tokens['predicted_label'].apply(_id_to_tag)

In [66]:
# Keep only the positions whose true label is not the -100 "ignore" placeholder
is_labelled = df_tokens['labels'].ne(-100)
df_filtered = df_tokens.loc[is_labelled]

In [67]:
# Load the reference CoNLL (whole split).
# - QUOTE_NONE keeps quote characters as literal token text.
# - keep_default_na=False stops pandas from converting tokens such as "NA",
#   "null" or "nan" into NaN, which would later be written out as empty text.
# - The label and text columns are read as str so tokens are kept exactly as
#   they appear in the file (e.g. numeric-looking tokens are not re-parsed).
df_conll = pd.read_csv(
    MERGED_CONLL,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    header=None,
    keep_default_na=False,
    dtype={0: str, 3: str},
)
df_conll.columns = ['label', 'start', 'end', 'text']
df_conll

Unnamed: 0,label,start,end,text
0,O,0,2,El
1,O,3,5,21
2,O,6,8,de
3,O,9,14,enero
4,O,15,17,de
...,...,...,...,...
209720,O,1632,1634,es
209721,O,1635,1636,9
209722,O,1636,1637,%
209723,O,1638,1653,aproximadamente


In [68]:
# Sanity check: the true labels recovered from the predictions must match the
# labels of the reference CoNLL, position by position
dataset_labels = df_filtered['labels_str'].to_list()
conll_labels = df_conll['label'].to_list()
assert dataset_labels == conll_labels

In [69]:
# Overwrite the ground-truth label column with the model predictions
predicted_labels = df_filtered['predicted_label_str'].tolist()
df_conll['label'] = predicted_labels

In [70]:
# Collect the names of the .conll files of the split, in sorted order
original_conlls = [name for name in os.listdir(ORIGINAL_CONLLS_DIR) if name.endswith('.conll')]
original_conlls.sort()

In [72]:
# Create the output directory; exist_ok makes this cell safe to re-run
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [73]:
# Generate the .conll files by using the char offsets: offsets restart in every
# file, so a token whose start offset is lower than the previous token's end
# offset is the first token of the next file.
def _write_conll_file(first_idx, last_idx, filename):
    """Write rows first_idx..last_idx (inclusive) of df_conll to OUTPUT_DIR/filename."""
    df_conll.loc[first_idx:last_idx].to_csv(
        os.path.join(OUTPUT_DIR, filename),
        sep='\t', quoting=csv.QUOTE_NONE, header=False, index=False,
    )

current_offset = 0 # End offset of the previously visited token
file_idx = 0 # Position of file within all retrieved with listdir
start_token_idx = 0 # Index within the dataframe that marks the start of a file
for idx, token_start, token_end in zip(df_conll.index, df_conll['start'], df_conll['end']):
    # If we reach the end of a file, write it and start tracking the next one
    if token_start < current_offset:
        _write_conll_file(start_token_idx, idx - 1, original_conlls[file_idx])
        file_idx += 1
        start_token_idx = idx
    current_offset = token_end

# A file is only written when the first token of the *next* file is seen, so
# the last file of the split has to be written explicitly after the loop.
if len(df_conll) > 0:
    _write_conll_file(start_token_idx, df_conll.index[-1], original_conlls[file_idx])
    n_written = file_idx + 1
else:
    n_written = 0

# If the counts differ, the offset-based boundaries do not line up with the
# file list and the written files carry the wrong names.
assert n_written == len(original_conlls), (
    f"Wrote {n_written} files but found {len(original_conlls)} .conll files in ORIGINAL_CONLLS_DIR"
)