## BRAT -> JSONL
This notebook helps you convert BRAT-formatted data (.txt and .ann files) into the JSONL submission format of the BioNNE task.

Each document corresponds to one line in the .jsonl file; every line is a JSON object with the following keys:

**"entities"** is a list of lists containing the starting position of the entity, the ending position of the entity and the type of the entity;

**"id"** is the id (name) of the corresponding .txt and .ann files;

**"text"** is the medical abstract from the corresponding .txt file.

Example:

```
{"entities":[[27,39,"CHEM"],[86,92,"DISO"],[109,131,"DISO"],[184,196,"CHEM"],[435,453,"PHYS"],...],
"id":"25591652_en",
"text": "[Clinical effectiveness of pioglitazone in the combination treatment of patients with asthma concurrent with coronary heart disease].  AIM To investigate the clinical effectiveness of pioglitazone in the combination treatment of patients with asthma concurrent with coronary heart disease (CHD).  SUBJECTS AND METHODS Fifty patients aged 40-75 years with asthma concurrent with CHD were examined.  External respiratory function (ERF), electrocardiograms, blood pressure (BP), and anthropometric measurements were assessed in all the patients..."}
```

In [None]:
import os 
import re
import pandas as pd

In [None]:
# Text-bound (entity) line: "<entity_id>\t<type> <positions>\t<text>".
#   entity_id - "T" followed by digits, underscores or lowercase letters
#   type      - letters, underscores or hyphens
#   positions - digits, spaces and ";" (";" separates the fragments of a
#               discontinuous entity, e.g. "10 15;16 25")
#   text      - the rest of the line
BRAT_FORMAT = r'(?P<entity_id>^T[0-9\_a-z]+)\t(?P<type>[a-zA-Z\_\-]+) (?P<positions>[0-9; ]+)\t(?P<text>.*)'
# Normalization line: "<id>\tReference <entity_id> <ontology_name>:<concept_id>\t<concept_name>",
# where id is "N" followed by digits. Not referenced by the functions visible in this notebook.
ANNOTATION = r'(?P<id>^N[0-9]+)\tReference (?P<entity_id>T[0-9]+) (?P<ontology_name>[a-zA-Z\_]+)\:(?P<concept_id>[a-zA-Z\_0-9]+)\t(?P<concept_name>.*)'

In [None]:
def read_file(fpath):
    """Return the whole content of the UTF-8 encoded text file at ``fpath``.

    Args:
        fpath: Path of the file to read.

    Returns:
        The decoded file content as a single string.
    """
    with open(fpath, encoding='utf-8') as handle:
        return handle.read()


def parse_annotation(annotation_line):
    """Convert one BRAT text-bound annotation line into ``[start, end, type]``.

    The line must match ``BRAT_FORMAT``, i.e. look like
    ``"T1\\tDISO 86 92\\tasthma"``. A discontinuous entity lists several
    fragments separated by ``";"`` (``"T2\\tDISO 10 15;16 25\\t..."``); it is
    collapsed into a single span running from the start of the first
    fragment to the end of the last one, so the result always has exactly
    three elements.

    Args:
        annotation_line: A single line of an .ann file.

    Returns:
        A list ``[start, end, type]`` with integer offsets and the entity
        type as a string.

    Raises:
        ValueError: If the line is not a text-bound annotation.
    """
    match = re.search(BRAT_FORMAT, annotation_line)
    if match is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError(
            "Not a BRAT text-bound annotation line: {!r}".format(annotation_line)
        )

    # One "start end" pair per fragment; a contiguous entity has one fragment.
    fragments = match.group('positions').split(";")
    start = int(fragments[0].split()[0])
    # Use the END of the last fragment so the span covers the whole entity
    # (taking the start of the second fragment would cut the entity short).
    end = int(fragments[-1].split()[-1])

    return [start, end, match.group('type')]
    

def extract_entities_from_brat(annotations_raw: str):
    """Collect the entities described in the raw content of a BRAT .ann file.

    Only text-bound annotation lines (IDs starting with ``"T"``) describe
    entities. Every other line - empty lines as well as normalization
    (``N...``), relation, attribute or note lines - is skipped instead of
    being handed to the entity parser, which cannot parse them.

    Args:
        annotations_raw: Full text of an .ann file.

    Returns:
        A list with one ``parse_annotation`` result per text-bound line, in
        file order. Empty if the file contains no such line.
    """
    return [
        parse_annotation(annotation_line)
        for annotation_line in annotations_raw.split('\n')
        if annotation_line.startswith("T")
    ]


def convert_brat_to_dataframe(path_to_brat_folder):
    entities_dicts = []
    texts_dicts = []

    for path, dirs, files in os.walk(path_to_brat_folder):
        for file in files:
            file_id = file.split(".")[0]
            filepath = os.path.join(path, file)
            if file.endswith(".ann"):

                annotations_raw = read_file(filepath)

                entities = extract_entities_from_brat(annotations_raw)

                entities_dict = { 
                                    "entities": entities,
                                    "id": file_id 
                                }

                entities_dicts.append(entities_dict)


            if file.endswith(".txt"):

                text = read_file(filepath)
                texts_dict = {
                                "text": text,
                                "id": file_id
                             }
                texts_dicts.append(texts_dict)

    df_entities = pd.DataFrame(entities_dicts)
    df_texts = pd.DataFrame(texts_dicts)
    entities_df = df_entities.merge(df_texts)
        
    return entities_df

In [None]:
#You can check the corresponding dataframe
# NOTE(review): `path_to_brat_folder` is not defined in the visible part of this
# notebook - set it to the folder holding your .ann/.txt files before running this cell.
entities_df = convert_brat_to_dataframe(path_to_brat_folder)
# A bare expression as the last statement of a cell makes the notebook display it.
entities_df

In [None]:
#And save it as .jsonl file
# orient='records' with lines=True writes one JSON object per dataframe row per line;
# force_ascii=False keeps non-ASCII characters unescaped in the output.
entities_df.to_json("predictions.jsonl", orient='records', lines=True, force_ascii=False)