# **DisplaCy visualiser tool for the metadata annotations**

In [1]:
# Imports
import json
from spacy import displacy
import re

Currently, the annotation structure is as follows:


```python
start_index = int # The index of the first character of the entity. 
end_index = int # The index of the last character of the entity. (not inclusive)
label = str # Label attached to the entity. 

{
    "classes": ["TEMP", "SOFT", "STIME", "MOL", "FFM"],
    "annotations": [
        [
            "text_to_fill_in",
            {
                "entities": [
                    [start_index, end_index, label],
                ]
            }
        ]
    ]
}
```


However, the input format that spaCy's displaCy expects for the `ent` style is as follows:

```python
start_index = int # The index of the first character of the entity. 
end_index = int # The index of the last character of the entity. (not inclusive)
label = str # Label attached to the entity. 

{
  "text": "text_to_fill_in",
  "ents": [{ "start": start_index, "end": end_index, "label": label }]
}

# Example:
{
  "text": "But Google is starting from behind.",
  "ents": [{ "start": 4, "end": 10, "label": "ORG" }]
}
```



We therefore need to convert the format.

In [2]:
def convert_annotations(file_path):
    """
    Convert annotations from the custom format to spaCy's displaCy format.

    The input file is a JSON document whose "annotations" entry is a list of
    ``[text, {"entities": [[start, end, label], ...]}]`` pairs.

    Parameters:
        file_path (str): Path to the JSON file containing "classes" and "annotations" keys.

    Returns:
        List[dict]: One ``{"text": ..., "ents": [...]}`` dictionary per annotated
        text, where each entity is ``{"start": ..., "end": ..., "label": ...}``.

    Raises:
        KeyError: If the JSON document has no "annotations" key.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, so that non-ASCII characters (e.g. "µs") are read identically
    # on every operating system.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    converted = []
    # Each item is structured as [text, {"entities": [[start, end, label], ...]}]
    for text, annotation_info in data["annotations"]:
        # A text without an "entities" key is kept, with an empty entity list.
        entities = annotation_info.get("entities", [])
        # Convert each entity to the spaCy dictionary format
        ents = [{"start": start, "end": end, "label": label} for start, end, label in entities]
        converted.append({"text": text, "ents": ents})
    return converted

In [3]:
# Path to the hand-made reference annotation that will be displayed.
data = "../annotations/figshare_1381865.json"

# Load the file and reshape it into the dictionaries displaCy accepts.
converted_data = convert_annotations(data)
print(converted_data)

# One background colour per entity label.
colors = dict(
    TEMP="#ffb3ba",
    SOFT="#ffffba",
    STIME="#baffc9",
    MOL="#bae1ff",
    FFM="#cdb4db",
)
options = dict(colors=colors)

# `converted_data` is a list of annotation dictionaries; manual=True tells
# displaCy to render these dictionaries directly instead of spaCy Doc objects.
displacy.render(converted_data, style="ent", manual=True, options=options)

[{'text': 'Molecular Dynamics Simulations of Sheet-like Replicators\nThis dataset contains 3D structures, MD trajectories and powder X-ray diffractograms (simulated and experimental) of tetrameric carboxynaphthalene disulfide. The MD simulations were performed in water with periodic boundary conditions. Either AMBER or GROMACS packages were used. For detailed description of the simulations see the linked paper (the link will appear upon acceptance). PDB files are snapshots from the MD simulations; they can be viewed using any 3D structure viewer (e.g. Avogadro). The trajectories (*.xtc - GROMACS, *.mdcrd - AMBER/NetCDF) can be visualized using e.g. VMD. Partial charges for the naphthalene building blocks are provided within *.mol2 files (N-0 for the neutral and N-1 for the deprotonated subunit). The parameter/topology files have .top (GROMACS) and .prmtop (AMBER) extensions. Files labeled "sheet1" contain results of simulations of 16 repeats of tetramer IV showed in Figure 4 and S13 of

### **We are going to be doing some prompt engineering**
---

These are just functions to convert the AI outputs to a structure compatible with spaCy.

In [4]:
def convert_annotated_text_to_spacy_format(annotated_text):
    """
    Convert an inline-tagged text file to spaCy's displaCy format.

    Entities in the file are expected to be wrapped as ``<LABEL>entity<LABEL>``
    (the same tag opens and closes the entity), where LABEL is one of
    MOL, STIME, TEMP, SOFTNAME, SOFTVERS or FFM. The tags are removed and the
    character offsets of every entity in the tag-free text are recorded.
    Anything that does not match this pattern is left in the text untouched.

    Parameters:
        annotated_text (str): Path to the text file containing the tagged text.

    Returns:
        List[dict]: A single-element list ``[{"text": ..., "ents": [...]}]``
        where each entity is ``{"start": ..., "end": ..., "label": ...}`` and
        "end" is exclusive.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, so that non-ASCII characters (e.g. "µs") are read identically
    # on every operating system.
    with open(annotated_text, 'r', encoding='utf-8') as file:
        tagged_text = file.read()

    # \1 forces the closing tag to carry the same label as the opening tag.
    pattern = re.compile(r'<(MOL|STIME|TEMP|SOFTNAME|SOFTVERS|FFM)>(.+?)<\1>')
    ents = []
    pieces = []       # Fragments of the tag-free text, joined once at the end
    clean_length = 0  # Length of the tag-free text assembled so far
    last_idx = 0      # Position in the tagged text

    for match in pattern.finditer(tagged_text):
        tag = match.group(1)
        entity_text = match.group(2)

        # Text between the previous entity and the current one
        pre_entity_text = tagged_text[last_idx:match.start()]
        pieces.append(pre_entity_text)
        clean_length += len(pre_entity_text)

        # Offsets are measured in the tag-free text, not in the tagged one
        entity_start = clean_length
        pieces.append(entity_text)
        clean_length += len(entity_text)

        ents.append({
            "start": entity_start,
            "end": clean_length,
            "label": tag
        })

        last_idx = match.end()

    # Add any remaining text after the last tag
    pieces.append(tagged_text[last_idx:])

    return [{
        "text": "".join(pieces),
        "ents": ents
    }]

In [5]:
def visualize_annotated_texts(data: list) -> None:
    """
    Visualize a reference annotation and an AI annotation using displaCy.

    Parameters:
        data (list): Two paths, in this order:
            data[0] - JSON file with the real (reference) annotation,
            data[1] - text file with the AI output tagged inline.
    """
    # Convert the annotations.
    converted_real_data = convert_annotations(data[0])  # Real annotation
    converted_ai_data = convert_annotated_text_to_spacy_format(data[1])  # AI annotated

    colors = {
        "TEMP": "#ffb3ba",
        # "SOFT" is the label used by the reference-annotation colour map
        # earlier in this notebook; it shares the colour of "SOFTNAME" so
        # software entities look the same in both renderings.
        "SOFT": "#ffffba",
        "SOFTNAME": "#ffffba",
        # Orange in hex notation ("#orange" is not a valid CSS colour).
        "SOFTVERS": "#ffa500",
        "STIME": "#baffc9",
        "MOL": "#bae1ff",
        "FFM": "#cdb4db",
    }
    options = {"colors": colors}

    print("\nREAL ANNOTATION:")
    displacy.render(converted_real_data, style="ent", manual=True, options=options)
    print("\nAI ANNOTATION:")
    displacy.render(converted_ai_data, style="ent", manual=True, options=options)


### **Let's use multiple different texts with ChatGPT4o - Detailed few-shot prompting**
---

```txt
The aim is to extract entities from an abstract or description of a molecular dynamics dataset. The following entities will be ones to look out for:

- software name (SOFTNAME): Name of any software identified
- software version (SOFTVERS): Version associated with the software
- molecule (MOL): Gene, protein, molecule-derived entity present in the simulation
- simulation time (STIME): Simulation time
- temperature (TEMP): Temperature used in the simulation
- force field (FFM): Force field used for the simulation

Here are some examples of the entities:

- software name (SOFTNAME): gromacs, mdtraj, charmm-gui, namd, mdanalysis, glide
- software version (SOFTVERS): 3.0.0, 24.1,  14.0
- molecule (MOL): POPC, cholesterol, ubiquitin, TSPO, kinase, phenytoin, water, Hp
- simulation time (STIME): 2 ns, 5 µs, 100 ns
- temperature (TEMP): 300 K, 310.15 K, 280 K, 298 K
- force field (FFM): charmm36m, martini, amber

Here are a couple of examples of abstracts, followed by their proper, correct annotation.

Example 1
Abstract: 
"POPC Ulmschneider OPLS NaCl 1Mol\nMD simulation trajectory and related files for fully hydrated POPC bilayer with 1M NaCl. The Ulmschneider force field for POPC was used with Gromacs 5.0.3 [1,2]. Ions were described by OPLS-AA force field (Gromacs 5.0.3). Conditions: T 298.15, 128 POPC molecules, 5120 tip3p waters, 77 Na, 77 Cl. 200ns trajectory (preceded by 5ns NPT equillibration). This data is ran for the nmrlipids.blospot.fi project. More details from nmrlipids.blospot.fi and https://removed [1] J.P. Ulmschneider & M.B. Ulmschneider, United Atom Lipid Parameters for Combination with the Optimized Potentials for Liquid Simulations All-Atom Force Field, JCTC 2009, 5 (7), 1803 1813 [2] https://removed"

Annotation: 
{"classes": ["TEMP", "SOFTNAME", “SOFTVERS”, "STIME", "MOL", "FFM"], "annotations": [["POPC Ulmschneider OPLS NaCl 1Mol\nMD simulation trajectory and related files for fully hydrated POPC bilayer with 1M NaCl. The Ulmschneider force field for POPC was used with Gromacs 5.0.3 [1,2]. Ions were described by OPLS-AA force field (Gromacs 5.0.3). Conditions: T 298.15, 128 POPC molecules, 5120 tip3p waters, 77 Na, 77 Cl. 200ns trajectory (preceded by 5ns NPT equillibration). This data is ran for the nmrlipids.blospot.fi project. More details from nmrlipids.blospot.fi and https://removed [1] J.P. Ulmschneider & M.B. Ulmschneider, United Atom Lipid Parameters for Combination with the Optimized Potentials for Liquid Simulations All-Atom Force Field, JCTC 2009, 5 (7), 1803 1813 [2] https://removed", {"entities": [[0, 4, "MOL"], [5, 17, "FFM"], [18, 22, "FFM"], [23, 27, "MOL"], [95, 99, "MOL"], [116, 120, "MOL"], [126, 138, "FFM"], [155, 159, "MOL"], [174, 181, "SOFTNAME"], [182, 187, "SOFTVERS"], [195, 199, "MOL"], [218, 225, "FFM"], [239, 246, "SOFTNAME"], [247, 252, "SOFTVERS"], [269, 275, "TEMP"], [281, 285, "MOL"], [302, 307, "FFM"], [308, 314, "MOL"], [319, 321, "MOL"], [326, 328, "MOL"], [330, 335, "STIME"]]}]]}

Example 2
Abstract: 
"6 molecular dynamics simulations of coronavirus 2019-nCoV protease model in complex with different conformations of lopinavir.\nMolecular dynamics simulations of comparative model of novel coronavirus 2019-nCoV protease Mpro in complex with 6 different conformations based on the Catalophore point-cloud alignment and (re)- docking of lopinavir into the 2019ncov virus protease model. The docking experiment produced 8 clusters of possible conformations, we chose 6 out of 8 conformers and ran an all-atom 300 ps MD at 310 K ( 36.85 C ).-The two images in the main folder refers to the docked structures before MD simulation.-The file all centroids.pse contains the frames representing the centroid of the subsequent MD simulation for each docking cluster.-Each archive contains the centroid in PDB format, the starting frame of the simulation in GRO format and the compressed trajectory in XTC format. In the directory \"other files\" there are other data generated during the simulation, i.e. heatmap representing the contact frequency between the ligand atoms and the ones belonging to the homology model."

Annotation: 
{"classes": ["TEMP", "SOFTNAME", “SOFTVERS”, "STIME", "MOL", "FFM"], "annotations": [["6 molecular dynamics simulations of coronavirus 2019-nCoV protease model in complex with different conformations of lopinavir.\nMolecular dynamics simulations of comparative model of novel coronavirus 2019-nCoV protease Mpro in complex with 6 different conformations based on the Catalophore point-cloud alignment and (re)- docking of lopinavir into the 2019ncov virus protease model. The docking experiment produced 8 clusters of possible conformations, we chose 6 out of 8 conformers and ran an all-atom 300 ps MD at 310 K ( 36.85 C ).-The two images in the main folder refers to the docked structures before MD simulation.-The file all centroids.pse contains the frames representing the centroid of the subsequent MD simulation for each docking cluster.-Each archive contains the centroid in PDB format, the starting frame of the simulation in GRO format and the compressed trajectory in XTC format. In the directory \"other files\" there are other data generated during the simulation, i.e. heatmap representing the contact frequency between the ligand atoms and the ones belonging to the homology model.", {"entities": [[36, 66, "MOL"], [116, 125, "MOL"], [188, 218, "MOL"], [219, 223, "MOL"], [334, 343, "MOL"], [353, 376, "MOL"], [518, 523, "TEMP"], [526, 533, "TEMP"]]}]]}

I will want you to annotate with the following structure making sure to keep using \n for line breaks. Don’t manually go to the line. Make sure to only answer with the output:
“text <ENTITY LABEL>entity_text<ENTITY LABEL> text”

Example:
“<MOL>POPC<MOL> <FFM>Ulmschneider<FFM> <FFM>OPLS<FFM> <MOL>NaCl<MOL> 1Mol”

Here is an abstract, and I need you to properly annotate it with the entities I explained before:
[INSERT ABSTRACT]
```

In [None]:
# Reference annotation and ChatGPT output to compare.
real_data, ai_data = (
    "../annotations/figshare_1381865.json",
    "../ai_annotations/chatgpt_annotated.txt",
)

# Order matters: reference annotation first, AI annotation second.
data = [real_data, ai_data]

visualize_annotated_texts(data)


REAL ANNOTATION:



AI ANNOTATION:


In [None]:
# Reference annotation and ChatGPT output to compare.
real_data, ai_data = (
    "../annotations/zenodo_7672195.json",
    "../ai_annotations/chatgpt_zenodo_7672195.txt",
)

# Order matters: reference annotation first, AI annotation second.
data = [real_data, ai_data]

visualize_annotated_texts(data)


REAL ANNOTATION:



AI ANNOTATION:


In [None]:
# Reference annotation and ChatGPT output to compare.
real_data, ai_data = (
    "../annotations/zenodo_7440399.json",
    "../ai_annotations/chatgpt_zenodo_7440399.txt",
)

# Order matters: reference annotation first, AI annotation second.
data = [real_data, ai_data]

visualize_annotated_texts(data)


REAL ANNOTATION:



AI ANNOTATION:


In [None]:
# Reference annotation and ChatGPT output to compare.
real_data, ai_data = (
    "../annotations/zenodo_30904.json",
    "../ai_annotations/chatgpt_zenodo_30904.txt",
)

# Order matters: reference annotation first, AI annotation second.
data = [real_data, ai_data]

visualize_annotated_texts(data)


REAL ANNOTATION:



AI ANNOTATION:


In [None]:
# Reference annotation and ChatGPT output to compare.
real_data, ai_data = (
    "../annotations/zenodo_3950029.json",
    "../ai_annotations/chatgpt_zenodo_3950029.txt",
)

# Order matters: reference annotation first, AI annotation second.
data = [real_data, ai_data]

visualize_annotated_texts(data)


REAL ANNOTATION:



AI ANNOTATION:


### **Let's try with a different prompt: few-shot prompting**
---

```txt
Entities:
<SOFTNAME>: Software used in the simulation or analysis (e.g., Gromacs, AMBER, VMD, CHARMM-GUI)
<SOFTVERS>: Version number associated with the software (e.g., v. 2016.4, 5.0.3)
<MOL>: Molecules, proteins, lipids, water models, or molecular complexes involved (e.g., DPPC, water, GTP, KRas4B)
<STIME>: Duration of the simulation (e.g., 50 ns, 200ns, 5 µs)
<TEMP>: Temperature used in the simulation (e.g., 300 K, 288K, 358K)
<FFM>: Force fields used in the simulation (e.g., Charmm36, AMBER, MARTINI, TIP3P)

Annotate the text:
POPC Ulmschneider OPLS Verlet Group\nMD simulation trajectory and related files for fully hydrated POPC bilayer run with Verlet and Group schemes. The Ulmschneider force field for POPC was used with Gromacs 5.0.3 [1,2]. Conditions: T 298.15, 128 POPC molecules, 5120 tip3p waters. 200ns trajectory (preceded by 5ns NPT equillibration). Starting structure was obtained from CHARMM-GUI [3]. This data is ran for the nmrlipids.blospot.fi project. More details from nmrlipids.blospot.fi and https://removed [1] J.P. Ulmschneider & M.B. Ulmschneider, United Atom Lipid Parameters for Combination with the Optimized Potentials for Liquid Simulations All-Atom Force Field, JCTC 2009, 5 (7), 1803 1813 [2] https://removed [3] https://removed

Annotated text:
<MOL>POPC<MOL> <FFM>Ulmschneider<FFM> <FFM>OPLS<FFM> Verlet Group\nMD simulation trajectory and related files for fully hydrated <MOL>POPC<MOL> bilayer run with Verlet and Group schemes. The <FFM>Ulmschneider<FFM> force field for <MOL>POPC<MOL> was used with <SOFTNAME>Gromacs<SOFTNAME> <SOFTVERS>5.0.3<SOFTVERS> [1,2]. Conditions: T <TEMP>298.15<TEMP>, 128 <MOL>POPC<MOL> molecules, 5120 <MOL>tip3p<MOL> <MOL>waters<MOL>. <STIME>200ns<STIME> trajectory (preceded by <STIME>5ns<STIME> NPT equillibration). Starting structure was obtained from <SOFTNAME>CHARMM-GUI<SOFTNAME> [3]. This data is ran for the nmrlipids.blospot.fi project. More details from nmrlipids.blospot.fi and https://removed [1] J.P. Ulmschneider & M.B. Ulmschneider, United Atom Lipid Parameters for Combination with the Optimized Potentials for Liquid Simulations All-Atom Force Field, JCTC 2009, 5 (7), 1803 1813 [2] https://removed [3] https://removed


Annotate the text:
Assessment of mutation probabilities of KRAS G12 missense mutants and their long-time scale dynamics by atomistic molecular simulations and Markov state modeling: Datasets.\nDatasets related to the publication [1]. Including: KRAS G12X mutations derived from COSMIC v.79 [http://cancer.sanger.ac.uk/cosmic/] (KRAS G12X mut COSMICv79..xlsx) RMSFs (300-2000ns) of GDP-systems (300 2000rmsf GDP systems RAW AVG SE.xlsx) RMSFs (300-2000ns) of GTP-systems (300 2000RMSF GTP systems RAW AVG SE.xlsx) PyInteraph analysis data for salt-bridges and hydrophobic clusters (.dat files for each system in the PyInteraph data.zip-file) Backbone trajectories for each system (residues 4-164; frames for every 1ns). Last number (e.g. 1) refers to the replica of the simulated system. backbone 4-164.gro/.pdb/.tpr -files (resid 4-164) [1] Pantsar T et al. Assessment of mutation probabilities of KRAS G12 missense mutants and their long-time scale dynamics by atomistic molecular simulations and Markov state modeling. PLoS Comput Biol Submitted (2018)

Annotated text:
Assessment of mutation probabilities of <MOL>KRAS G12<MOL> missense mutants and their long-time scale dynamics by atomistic molecular simulations and Markov state modeling: Datasets.\nDatasets 1  related to the publication [1]. Including: <MOL>KRAS G12X<MOL> mutations derived from COSMIC v.79 [http://cancer.sanger.ac.uk/cosmic/] (<MOL>KRAS G12X<MOL> mut COSMICv79..xlsx) RMSFs (<STIME>300-2000ns<STIME>) of <MOL>GDP<MOL>-systems (<STIME>300-2000ns<STIME> rmsf <MOL>GDP<MOL> systems RAW AVG SE.xlsx) RMSFs (<STIME>300-2000ns<STIME>) of <MOL>GTP<MOL>-systems (<STIME>300-2000ns<STIME> RMSF <MOL>GTP<MOL> systems RAW AVG SE.xlsx) <SOFTNAME>PyInteraph<SOFTNAME> analysis data for salt-bridges and hydrophobic clusters (.dat files for each system in the <SOFTNAME>PyInteraph<SOFTNAME> data.zip-file) Backbone trajectories for each system (residues 4-164; frames for every <STIME>1ns<STIME>). Instead of using <SOFTNAME>GROMACS<SOFTNAME>, we used <SOFTNAME>NAMD<SOFTNAME> also


Annotate the text:
212 DPPC Molecules bilayer in pure Water, simulated at temperatures ranging from 288K to 358K\nPublication: A machine learning study of the two states model for lipid bilayer phase transitions Published on: 12 August 2020 Journal: Phys. Chem. Chem. Phys., 2020, DOI: 10.1039/D0CP02058C Description: Simulation files used to train our machine learning algorithm to identify the thermodynamic phase of individual lipid molecules in a bilayer, as well as the simulation files analysed by the machine learning models. Code source for the ML algorithm can be found on Github. The training files are named gel.gro and fluid.gro. They respectively correspond to the final frame of the systems simulated at 288K and 358K. All other files are the files analysed by the machine learning models. System composition: DPPC molecules: 212 with 130 atoms each Water molecules: 29,826 with 3 atoms each Simulation box dimensions (approx.): 8 x 8 x 20 nm Simulation details: Software: Gromacs (v. 2016.4) Forcefield: Charmm36 (v. June 2015) - Water: TIP3P Thermostat: Nose-hoover (0.4ps, 2 groups) Barostat: Parrinello-Rahman semi-isotropic (2.0ps, 1.0 bar on each axis, 4.5e-5 bar-1) Duration: 50 ns

Annotated text:


```

In [None]:
# Tagged outputs of each model for the same abstract (zenodo_3950029),
# listed in the same order as `model_names`.
ai_data = [
    "../ai_annotations/chatgpt4o_zenodo_3950029.txt",
    "../ai_annotations/gemini_2_0_flash_zenodo_3950029.txt",
    "../ai_annotations/claude_3_7_sonnet_zenodo_3950029.txt",
    "../ai_annotations/mistral_saba_24b_zenodo_3950029.txt",
]
model_names = ["chatgpt4o", "gemini_2_0_flash", "claude_3_7_sonnet", "mistral_saba_24b"]

# The colour map is the same for every model, so it is built once,
# outside the loop.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

for model_name, ai_path in zip(model_names, ai_data):
    # Strip the inline tags and collect the entity offsets for this model.
    converted_ai_data = convert_annotated_text_to_spacy_format(ai_path)

    print(f"\nAI ANNOTATION - {model_name}:")
    displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION - chatgpt4o:



AI ANNOTATION - gemini_2_0_flash:



AI ANNOTATION - claude_3_7_sonnet:



AI ANNOTATION - mistral_saba_24b:


### **Time to test out one-shot prompting with different models**
---

```txt
You are an expert at NER, you are meticulous and precise with extensive molecular dynamics and biology knowledge. You are given a scientific abstract or dataset description related to molecular dynamics simulations. Your task is to identify and annotate specific named entities relevant to simulation setup and analysis.

Entity Labels to Extract:
<SOFTNAME>: Software used in the simulation or analysis (e.g., Gromacs, AMBER, VMD, CHARMM-GUI)
<SOFTVERS>: Version number associated with the software (e.g., v. 2016.4, 5.0.3, June 2015)
<MOL>: Molecules, proteins, lipids, water models, or molecular complexes involved (e.g., DPPC, water, TIP3P, GTP, KRas4B, tetramer IV)
<STIME>: Duration of the simulation (e.g., 50 ns, 200ns, 5 µs)
<TEMP>: Temperature used in the simulation (e.g., 300 K, 288K, 358K)
<FFM>: Force fields used in the simulation (e.g., Charmm36, AMBER, MARTINI)

Expected Output Format:
Wrap each identified entity with its label using this format:
<LABEL>entity<LABEL>

Example:
Simulations were performed using <SOFTNAME>Gromacs<SOFTNAME> with the <FFM>Charmm36<FFM> force field at <TEMP>300 K<TEMP> for <STIME>50 ns<STIME>.

Input:
"Assessment of mutation probabilities of KRAS G12 missense mutants and their long-time scale dynamics by atomistic molecular simulations and Markov state modeling: Datasets.\nDatasets related to the publication [1]. Including: KRAS G12X mutations derived from COSMIC v.79 [http://cancer.sanger.ac.uk/cosmic/] (KRAS G12X mut COSMICv79..xlsx) RMSFs (300-2000ns) of GDP-systems (300 2000rmsf GDP systems RAW AVG SE.xlsx) RMSFs (300-2000ns) of GTP-systems (300 2000RMSF GTP systems RAW AVG SE.xlsx) PyInteraph analysis data for salt-bridges and hydrophobic clusters (.dat files for each system in the PyInteraph data.zip-file) Backbone trajectories for each system (residues 4-164; frames for every 1ns). Instead of using GROMACS, we used NAMD also.”

Output:
Return the input text with the appropriate entities labeled inline as shown above.
```

### **REAL ANNOTATION**

In [11]:
real_data = "../annotations/zenodo_1346073.json"

# Convert the annotations.
converted_data = convert_annotations(real_data)

# Background colour per entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

# `converted_data` is a list of annotation dictionaries; manual=True tells
# displaCy to render these dictionaries directly instead of spaCy Doc objects.
displacy.render(converted_data, style="ent", manual=True, options=options)

### **A.I. ANNOTATIONS**

In [None]:
ai_data = "../ai_annotations/llama_3_1_b_annotation.txt"

# Strip the inline tags and collect the entity offsets.
converted_ai_data = convert_annotated_text_to_spacy_format(ai_data)

# Background colour per entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

print("\nAI ANNOTATION:")
displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION:


In [None]:
ai_data = "../ai_annotations/deepseek_qwen.txt"

# Strip the inline tags and collect the entity offsets.
converted_ai_data = convert_annotated_text_to_spacy_format(ai_data)

# Background colour per entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

print("\nAI ANNOTATION:")
displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION:


In [None]:
ai_data = "../ai_annotations/gemma2.txt"

# Strip the inline tags and collect the entity offsets.
converted_ai_data = convert_annotated_text_to_spacy_format(ai_data)

# Background colour per entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

print("\nAI ANNOTATION:")
displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION:


I tried telling the model to remove the backslashes from the labels but it didn't change much. It would keep adding them in the label.

In [None]:
ai_data = "../ai_annotations/chatgpt4o.txt"

# Strip the inline tags and collect the entity offsets.
converted_ai_data = convert_annotated_text_to_spacy_format(ai_data)

# Background colour per entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

print("\nAI ANNOTATION:")
displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION:


In [None]:
ai_data = "../ai_annotations/gemini_2_0_flash.txt"

# Strip the inline tags and collect the entity offsets.
converted_ai_data = convert_annotated_text_to_spacy_format(ai_data)

# Background colour per entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

print("\nAI ANNOTATION:")
displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION:


In [None]:
ai_data = "../ai_annotations/gemini_2_0_flash_thinking.txt"

# Strip the inline tags and collect the entity offsets.
converted_ai_data = convert_annotated_text_to_spacy_format(ai_data)

# Background colour per entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

print("\nAI ANNOTATION:")
displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION:


Here I had to explicitly tell the model not to use backslashes in the labels; otherwise it produced HTML-style tags.

In [None]:
ai_data = "../ai_annotations/claude_3_7_sonnet.txt"

# Strip the inline tags and collect the entity offsets.
converted_ai_data = convert_annotated_text_to_spacy_format(ai_data)

# Background colour per entity label.
colors = {
    "TEMP": "#ffb3ba",
    "SOFTNAME": "#ffffba",
    # Orange in hex notation ("#orange" is not a valid CSS colour).
    "SOFTVERS": "#ffa500",
    "STIME": "#baffc9",
    "MOL": "#bae1ff",
    "FFM": "#cdb4db",
}
options = {"colors": colors}

print("\nAI ANNOTATION:")
displacy.render(converted_ai_data, style="ent", manual=True, options=options)


AI ANNOTATION:
