Generate an .ann file for each .conll file in a directory.

In [None]:
import os
from brat.tools import BIOtoStandoff

In [None]:
# Input/output locations for the CoNLL -> brat standoff (.ann) conversion.
# Each path ends with a trailing '/'.
CONLL_DIR = "/home/jan/bsc/meddoplace_jan/meddoplace_noun/noun_prediction_conlls/" # Input: directory containing the .conll files
TXT_DIR = "/home/jan/bsc/meddoplace_completo/test_set_reference/txt/" # Input: directory containing the matching .txt files
ANN_DIR = "/home/jan/bsc/meddoplace_jan/meddoplace_noun/noun_prediction_anns/" # Output: directory where the .ann files are written

In [None]:
# Create the output directory (and any missing parents). exist_ok=True lets
# this cell be re-run without raising FileExistsError once the directory exists.
os.makedirs(ANN_DIR, exist_ok=True)

In [None]:
# Write an .ann file for each .conll file by calling BIOtoStandoff.py.
# Paths are built with os.path.join, so the directory constants work with or
# without a trailing '/'.
conll_files = [name for name in os.listdir(CONLL_DIR) if name.endswith(".conll")]

for conll_file in conll_files:
    # Swap only the extension: str.replace would also rewrite a '.conll'
    # occurring in the middle of a file name.
    stem = os.path.splitext(conll_file)[0]
    # argv mimics a command-line call of BIOtoStandoff.py: script name, text
    # file, CoNLL file, then two index arguments.
    # NOTE(review): "-1" and "0" are presumably the token and tag column
    # indices -- confirm against the BIOtoStandoff usage message.
    argv = [
        "brat/tools/BIOtoStandoff.py",
        os.path.join(TXT_DIR, stem + ".txt"),
        os.path.join(CONLL_DIR, conll_file),
        "-1",
        "0",
    ]
    # The result is iterated below, one output line per item.
    # NOTE(review): assumes this BIOtoStandoff.main returns the standoff
    # annotations as an iterable -- confirm against the local brat copy.
    res = BIOtoStandoff.main(argv)
    # Explicit UTF-8 so the output does not depend on the machine's locale.
    with open(os.path.join(ANN_DIR, stem + ".ann"), "w", encoding="utf-8") as ann_out:
        ann_out.writelines(str(line) + "\n" for line in res)

## Merge annotation files (.ann) into one file and include filename

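First, merge all resulting .ann files into a single file. Run the following from inside the .ann output directory, so that every line is prefixed with `./<filename>.ann:` and the merged file is written to the parent directory: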
```bash
find . -name '*.ann' -type f -exec grep "" {} + > ../all_merged.ann
```

We then reshape the columns of the merged .ann file into the format required by the subtask's evaluation.
This was needed for the MEDDOPLACE evaluation.

In [None]:
# Path to the single merged annotation file read by the cells below.
MERGED_ANN = "/home/jan/bsc/meddoplace_jan/meddoplace_noun/all_merged.ann"

In [None]:
import pandas as pd
import csv

In [None]:
# Load the merged annotations: tab-separated, no header row, first three
# columns only, with quote characters treated as ordinary text.
# dtype=str together with keep_default_na=False keeps every field verbatim;
# otherwise pandas would turn entity texts such as "NA" or "null" into NaN
# and they would be written out as empty fields later on.
df = pd.read_csv(
    MERGED_ANN,
    sep="\t",
    header=None,
    names=['id', 'label', 'text'],
    usecols=[0, 1, 2],
    quoting=csv.QUOTE_NONE,
    dtype=str,
    keep_default_na=False,
)

In [None]:
def _ann_path_to_doc_name(path):
    """Turn a path such as './doc1.ann' into the document name 'doc1'.

    Only a leading './' and a trailing '.ann' are removed, so document names
    that themselves contain dots are kept intact.
    """
    if path.startswith('./'):
        path = path[2:]
    if path.endswith('.ann'):
        path = path[:-len('.ann')]
    return path


# The 'label' column holds three whitespace-separated fields: the label itself,
# the start offset and the end offset. Split each row once and fan the fields
# out; a row with fewer than three fields raises IndexError.
label_fields = [label.split() for label in df['label']]
df['start_span'] = [fields[1] for fields in label_fields]
df['end_span'] = [fields[2] for fields in label_fields]
df['label'] = [fields[0] for fields in label_fields]

# The 'id' column holds '<path>:<annotation id>'. Split on the LAST colon so a
# colon inside the path cannot shift the fields.
id_fields = [row_id.rsplit(':', 1) for row_id in df['id']]
df['filename'] = [_ann_path_to_doc_name(path) for path, _ in id_fields]
df['ann_id'] = [ann_id for _, ann_id in id_fields]
df = df.drop(columns='id')

# Reorder columns
df = df[['filename', 'ann_id', 'label', 'start_span', 'end_span', 'text']]
df

In [None]:
# Save the reshaped predictions as a tab-separated file with a header row and
# without the DataFrame index; csv.QUOTE_NONE writes every field unquoted.
df.to_csv(
    "/home/jan/bsc/meddoplace_jan/meddoplace_noun/meddoplace_noun.tsv",
    sep="\t",
    header=True,
    index=False,
    quoting=csv.QUOTE_NONE,
)

### Fixing reference noun_type.tsv for the MEDDOPLACE scoring script

In [None]:
# Read the tab-separated reference annotations from the scoring script's
# input directory.
df_ref = pd.read_csv(
    "/home/jan/bsc/meddoplace_scoring_script_noun/input/ref/noun_type.tsv",
    sep="\t",
)

In [None]:
# Split the 'span' column ("<start>, <end>") into separate offset columns.
# The guard makes the cell safe to re-run: this notebook writes the result back
# to the same path it was read from, so a second run loads a file that no
# longer has a 'span' column but already has start_span/end_span.
if 'span' in df_ref.columns:
    span_fields = [span.split(', ') for span in df_ref['span']]
    df_ref['start_span'] = [fields[0] for fields in span_fields]
    df_ref['end_span'] = [fields[1] for fields in span_fields]

# Keep only the columns needed downstream. This selection is what removes
# 'span' and any other extra columns; the earlier df_ref.drop(...) call threw
# its result away and therefore had no effect.
df_ref = df_ref[['filename', 'label', 'start_span', 'end_span', 'text']]

In [None]:
# Write the reshaped reference table as a tab-separated file, without the
# DataFrame index.
df_ref.to_csv(
    "/home/jan/bsc/meddoplace_scoring_script_noun/input/ref/noun_type.tsv",
    index=False,
    sep="\t",
)