# **DisplaCy visualiser tool for the metadata annotations**

In [None]:
# Imports
import json
from spacy import displacy

Currently, the annotation structure is as follows:


```python
start_index = int # The index of the first character of the entity. 
end_index = int # The index just past the last character of the entity (exclusive end).
label = str # Label attached to the entity. 

{
    "classes": ["TEMP", "SOFT", "STIME", "MOL", "FFM"],
    "annotations": [
        [
            "text_to_fill_in",
            {
                "entities": [
                    [start_index, end_index, label],
                ]
            }
        ]
    ]
}
```


However, the spaCy displaCy format for the `ent` style is as follows:

```python
start_index = int # The index of the first character of the entity. 
end_index = int # The index just past the last character of the entity (exclusive end).
label = str # Label attached to the entity. 

{
  "text": "text_to_fill_in",
  "ents": [{ "start": start_index, "end": end_index, "label": label }]
}

# Example ("Google" spans characters 4 to 10):
{
  "text": "But Google is starting from behind.",
  "ents": [{ "start": 4, "end": 10, "label": "ORG" }]
}
```



We therefore need to convert the annotations from the custom format into the displaCy format.

In [None]:
def convert_annotations(file_path):
    """
    Convert annotations from the custom format to spaCy's displaCy format.

    The file must hold a JSON object whose "annotations" key is a list of
    ``[text, {"entities": [[start, end, label], ...]}]`` pairs. Each pair
    becomes one ``{"text": text, "ents": [...]}`` dictionary, where every
    entity is rewritten as ``{"start": start, "end": end, "label": label}``.

    Parameters:
        file_path (str): Path to the JSON file containing "classes" and "annotations" keys.

    Returns:
        List[dict]: A list of dictionaries in the spaCy format, in the same
        order as the input annotations.

    Raises:
        KeyError: If the JSON object has no "annotations" key.
        ValueError: If an annotation item or an entity does not have the
            expected number of elements (raised by unpacking).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default:
    # a wrong decoding of non-ASCII text would change the string length and
    # make the character offsets point at the wrong characters.
    with open(file_path, 'r', encoding="utf-8") as file:
        data = json.load(file)

    converted = []
    # Each item is structured as [text, {"entities": [[start, end, label], ...]}]
    for text, annotation_info in data["annotations"]:
        # A missing "entities" key and an explicit null both mean "no entities".
        entities = annotation_info.get("entities") or []
        # Convert each entity to the spaCy dictionary format
        ents = [
            {"start": start, "end": end, "label": label}
            for start, end, label in entities
        ]
        converted.append({"text": text, "ents": ents})
    return converted

# Highlight colour used by displaCy for each entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFT": "#ffffba",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

# Annotation file to visualise.
data = "../annotations/figshare_1381865.json"

# Rewrite the custom annotations as displaCy dictionaries and show the raw result.
converted_data = convert_annotations(data)
print(converted_data)

# converted_data is a list with one dictionary per annotated text; manual=True
# tells displaCy that it receives ready-made dictionaries rather than Doc objects.
displacy.render(
    converted_data,
    style="ent",
    manual=True,
    options=options,
)

In [None]:
# Highlight colour used by displaCy for each entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFT": "#ffffba",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

# Annotation file to visualise.
data = "../annotations/figshare_1381865.json"

# Rewrite the custom annotations as displaCy dictionaries and show the raw result.
converted_data = convert_annotations(data)
print(converted_data)

# converted_data is a list with one dictionary per annotated text; manual=True
# tells displaCy that it receives ready-made dictionaries rather than Doc objects.
displacy.render(
    converted_data,
    style="ent",
    manual=True,
    options=options,
)

[{'text': 'Molecular Dynamics Simulations of Sheet-like Replicators\nThis dataset contains 3D structures, MD trajectories and powder X-ray diffractograms (simulated and experimental) of tetrameric carboxynaphthalene disulfide. The MD simulations were performed in water with periodic boundary conditions. Either AMBER or GROMACS packages were used. For detailed description of the simulations see the linked paper (the link will appear upon acceptance). PDB files are snapshots from the MD simulations; they can be viewed using any 3D structure viewer (e.g. Avogadro). The trajectories (*.xtc - GROMACS, *.mdcrd - AMBER/NetCDF) can be visualized using e.g. VMD. Partial charges for the naphthalene building blocks are provided within *.mol2 files (N-0 for the neutral and N-1 for the deprotonated subunit). The parameter/topology files have .top (GROMACS) and .prmtop (AMBER) extensions. Files labeled "sheet1" contain results of simulations of 16 repeats of tetramer IV showed in Figure 4 and S13 of