## Generate dataset

In [122]:
#This is a script that loads a pandas dataframe from csv, converts the string formatted as a list into a list of lists. The list of lists is then flattened into a single list.

import ast
import json
import random

import pandas as pd
# Load the clustering results and derive, per entity, a shuffled flat input
# list plus the clustered outputs, each stored as a JSON string keyed by entity.
df = pd.read_csv('./gpt_final_outs/o3mini_maxclust_80_subset_random_1000_nocutoff.csv', index_col=None)

# Rename "Group Items" to "Output" ("Entity Name" keeps its name).
df = df.rename(columns={"Group Items": "Output"})

# Parse the stringified list of lists. ast.literal_eval only accepts Python
# literals, so a malformed CSV cell cannot execute arbitrary code the way
# eval() would.
df['Output'] = df['Output'].apply(ast.literal_eval)

# Flatten the clusters into one list and drop the "*" markers that wrap the
# cluster representatives.
df['Input'] = df['Output'].apply(lambda x: [item.strip("*") for sublist in x for item in sublist])

# Shuffle each flat list.
# NOTE: no seed is set in this cell, so the order differs between runs.
df['Input'] = df['Input'].apply(lambda x: random.sample(x, len(x)))
#df = df[df['Input'].apply(len) < 30]

# Keep only the clusters that contain more than one member.
df['Output>1'] = df['Output'].apply(lambda x: [item for item in x if len(item) > 1])
# one_item is True when no multi-member cluster is left, i.e. every cluster
# of that entity is a singleton.
df['one_item'] = df['Output>1'].apply(lambda x: len(x) == 0)

# Wrap each value in a one-entry dict keyed by the row's "Entity Name".
df["Output"] = df.apply(lambda row: {row["Entity Name"]: row['Output']}, axis=1)
df["Input"] = df.apply(lambda row: {row["Entity Name"]: row['Input']}, axis=1)
df['Output>1'] = df.apply(lambda row: {row["Entity Name"]: row['Output>1']}, axis=1)

# Serialise the dict columns to JSON strings (non-dict values pass through).
df["Output"] = df["Output"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
df["Input"] = df["Input"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
df["Output>1"] = df["Output>1"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)

# Column order: entity name, input, output, output>1, one_item flag.
df = df[['Entity Name', 'Input', 'Output', 'Output>1', 'one_item']]

#filter by Output>1 length of the list > 1
#use column Output, but use only the ones where it clusteres multiple items together

In [123]:
# Display the prepared dataset.
df
# Note: the "Output>1" column is not really used in this workflow.

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item
0,58963,"{""58963"": [""Gardea-Torresdey et al. [researche...","{""58963"": [[""Gardea-Torresdey et al. [research...","{""58963"": []}",True
1,22775,"{""22775"": [""physiological transport of macro-m...","{""22775"": [[""substance that could be transloca...","{""22775"": [[""**transportation [phenotype]**"", ...",False
2,22996,"{""22996"": [""stomatal conductance analyses [phe...","{""22996"": [[""**transpiration and stomatal cond...","{""22996"": [[""**transpiration and stomatal cond...",False
3,100116,"{""100116"": [""modern clade [gene]"", ""modern cla...","{""100116"": [[""Clade M [clade]""], [""Clade SL [c...","{""100116"": []}",True
4,133255,"{""133255"": [""Plastidic LPPs [gene]"", ""PLSP2B [...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...",False
...,...,...,...,...,...
995,31615,"{""31615"": [""auxin-regulated endocytosis [proce...","{""31615"": [[""auxin-mediated modulation of the ...","{""31615"": [[""auxin-dependent inhibition of end...",False
996,157227,"{""157227"": [""SlSAUR1-like [gene]"", ""ZmSAUR1 [g...","{""157227"": [[""GmSAUR [gene]""], [""StSAUR [gene]...","{""157227"": [[""ZmSAUR1 (Zea mays SAURs) [gene]""...",False
997,131559,"{""131559"": [""BtuC gene(s) [gene]"", ""BtuN gene(...","{""131559"": [[""BtuB gene(s) [gene]""], [""BtuC ge...","{""131559"": []}",True
998,99091,"{""99091"": [""O. fragrans 'Liuyejingui' (OFL) [o...","{""99091"": [[""Genome sequencing of O. fragrans ...","{""99091"": [[""**O. fragrans 'Liuyejingui' (OFL)...",False


## Prompts for making fine-tuning dataset (LLM as a Judge)


In [124]:
# System prompt for the "LLM as a judge" validation run: it is sent as the
# system message of every Batch API request written further down. The text is
# part of the request payload, so any edit changes what the model receives.
constant_system_message = """
You are senior scientist in plant biology. You NEED to meticulously VALIDATE given clustering output, and provide a corrected clustering if needed. Explain which groupings are wrong and provide final output. You MUST write response like a senior human plant scientist.

The following are the entity clustering guidelines:
	1.	Exact Phrase Matching Matters: Always consider the full phrase, including key biological terms, bracketed text (ignoring minor differences such as spacing, punctuation, correct abbreviations, plurality).
	2.	Strict (100%) Key Term Separation: Entities with different biological terms MUST be placed in separate clusters.
3. Sub-identifier separation: Separate Entities with numeric differences, sub-identifiers, or qualifiers into different groups.
	4.	Avoid False Similarity: Do NOT cluster two items together in same group just because they share a common word or term.
	5.	Strict Synonym/Near-Synonym Grouping: Only group entities that refer to the same biological structure, process, meaning or concept.
	6.	Maintain 100% Precision: When in even small doubt, MUST place entities in separate clusters.
	7.	Preserve Original Data: No new items should be introduced, no duplicates should be introduced, and no entities should be omitted.
8. YOU MUST pickup most appropriate and easy-to-understand cluster representative and enclose it with '**', if there is more than one entity in that particular cluster. For example, pick the full term instead of an abbreviation.
	9.	Output Format: Always return results in valid JSON format. MUST USE GIVEN KEY.

Your main task is to read the clustered entities and output the validation in the below mentioned format.

The goal:
These results came from a smaller LLM, and i want you to take a look at it.
With your expertise in plant biology, generate prompts for fine-tuning the smaller LLM and mimick your patterns for this specific task.
I want you to validate the clustering of this smaller model, and provide a corrected clustering if needed.
Then you should imagine that you are a human plant biology senior scientist who explains to the smaller LLM what it is doing wrong, to give it the final output. You should never mention the smaller LLM, but instead just pretend you are a human user. 
An example of this would be:
'I think you are wrong, cluster X is actually different from cluster Y because of so and so reasons. Cluster X and Y should be clustered seperately'.
If the smaller model is correct however, you should write something like:
'Great job, that looks correct because of so and so reasons. Now output the correct format, but only output clusters with more than one member.'
The expected input format:

{
  "submission_id_1": {
    "clusters": [
      [
        "**cluster_item1**",
        "cluster_item2"
      ],
      [
        "cluster_item3"
      ]
    ]
  }
}

Return format:

If the clustering is correct:
{
  "submission_id_1": {
    "is_correct": "True",
    "user_prompt": "This looks correct to me, output the clusters that have more than 1 member in the correct json format.",
    "clusters": [
      [
        "cluster_item1",
        "cluster_item2"
      ]
    ]
  }
}

If the clustering is incorrect:

{
  "submission_id_1": {
    "is_correct": "False",
    "user_prompt": "This looks incorrect to me, <insert reason here>. Output the clusters that have more than 1 member in the correct json format.",
    "clusters": [
      [
        "cluster_item2",
        "cluster_item3"
      ]
    ]
  }
}

Warnings:

Be completely sure to not forget any nodes from the input list.
Remember not to base corrections on higher domain level knowledge, that the smaller model might not have, but rather the semantic meaning of the terms.
is_correct should be returned as a string "True" or "False", and only one of these two.
If there is any doubt if two groups should cluster together, cluster them separately.
Remember to select group representative by enclosing entity in "**"

"""

In [125]:
# Few-shot examples for the judge: sent as the first user message of every
# Batch API request, immediately before the JSON of the chunk to validate.
constant_user_message = """  
Here are 2 examples of validation behavior:

Example 1, correct behavior:

input:
{120406: ['difficulty in detecting mutations [phenotype]', 'hereditary breast cancers [phenotype]', 'mutational status of parental tumor cells [phenotype]', 'certain mutation types [phenotype]', 'Well-characterized mutations [phenotype]', 'alternative classifications [phenotype]', 'unique molecular signature [phenotype]', 'oncological diseases [phenotype]', 'independent mutation data [phenotype]', 'inherited disease mutation databases [database]', 'mutational signatures [mutation]', 'mutation types [phenotype]', 'More than half of human cancers [phenotype]', 'somatic and germline mutations [phenotype]', 'Identified mutations [phenotype]', 'lung cancer patients [phenotype]', 'germline transmissible [phenotype]', 'mutations in the germline [phenotype]', 'sporadic breast cancers [phenotype]', 'segregating mutations [phenotype]', 'breast/ovarian cancers [phenotype]', 'non-small-cell lung carcinoma [phenotype]', 'inherited breast cancer cases [phenotype]', 'tumour-specific aberrations [phenotype]', 'list of likely candidate mutations [phenotype]', 'germline-transmitted mutations [phenotype]', 'systematic asymmetries in heritable mutations [genetic feature]', 'targeted heritable mutations [phenotype]', 'fixed mutations in ancestral line [phenotype]', 'various types of mutation [phenotype]', 'several types of mutation [phenotype]', 'screening of these exons [phenotype]', 'many human cancers [phenotype]', 'difficulties to differentiate mutations [phenotype]', 'base substitutions, deletions, insertions, and translocations [phenotype]', 'specific mutational classes [phenotype]', 'germline mutations [phenotype]', 'other types of mutation [phenotype]', 'unmasking of deleterious mutations [phenotype]']}

Explanation of correct clustering:

["**germline mutations [phenotype]**", "germline-transmitted mutations [phenotype]", "mutations in the germline [phenotype]", "germline transmissible [phenotype]"] all describe inheritable genetic alterations passed from one generation to the next.
["More than half of human cancers [phenotype]", "**oncological diseases [phenotype]**", "many human cancers [phenotype]"] reference broad groups of malignant diseases occurring in humans.
["**mutation types [phenotype]**", "base substitutions, deletions, insertions, and translocations [phenotype]", "various types of mutation [phenotype]", "several types of mutation [phenotype]", "other types of mutation [phenotype]", "certain mutation types [phenotype]", "specific mutational classes [phenotype]"] collectively describe or list categories of genetic changes.
["difficulties to differentiate mutations [phenotype]", "**difficulty in detecting mutations [phenotype]**"] denote challenges in identifying or distinguishing specific genetic variants.
["inherited breast cancer cases [phenotype]", "**hereditary breast cancers [phenotype]**"] explicitly point to breast cancer instances where genetic risk is inherited.
Notes on Separation Certain single-element entries might share partial similarity, such as "somatic and germline mutations [phenotype]" or "targeted heritable mutations [phenotype]," but remain separate because of additional qualifiers or exact term differences

Example 2, incorrect clustering:
input:
{"165162": [["**internalized cargo [metabolite]**", "endocytosed material [metabolite]", "Internalized cargoes [metabolite]"], ["**Internalization of endocytic tracer FM4-64 [metabolite]**", "internalisation of an endocytic tracer dye [metabolite]"], ["endocytosis of transferrin [metabolite]"], ["cellular uptake of 14 chemicals [metabolite]"]]}

Explanation of incorrect behavior:

["**internalized cargo [metabolite]**", "endocytosed material [metabolite]", "Internalized cargoes [metabolite]"] all describe general endocytic cargo without specifying particular molecules or numeric differences.
["**Internalization of endocytic tracer FM4-64 [metabolite]**", "internalisation of an endocytic tracer dye [metabolite]"] are clustered wrong, one specifically mentions FM4-64, while the other refers generally to any tracer dye, and they should therefore be separated following strict separation rules.

Now read the cluster, validate and output the desired format:
"""

## Prompts for the fine-tuning dataset (v5)


In [None]:
# System prompt for the v5 fine-tuning dataset: asks for a reasoning statement
# followed by the clusters, returned as JSON under the given key.
# NOTE(review): not referenced by any other cell in the visible part of the
# notebook - confirm where it is consumed.
system_message = """You are senior scientist in plant biology. You NEED to meticulously do reasoning and then cluster given input, and provide the reasoning followed by clusters based on the reasoning performed.

The following are the entity clustering guidelines:
	1.	Exact Phrase Matching Matters: Always consider the full phrase, including key biological terms, bracketed text (ignoring minor differences such as spacing, punctuation, correct abbreviations, plurality).
	2.	Strict (100%) Key Term Separation: Entities with different biological terms MUST be placed in separate clusters.
3. Sub-identifier separation: Separate Entities with numeric differences, sub-identifiers, or qualifiers into different groups.
	4.	Avoid False Similarity: Do NOT cluster two items together in same group just because they share a common word or term.
	5.	Strict Synonym/Near-Synonym Grouping: Only group entities that refer to the same biological structure, process, meaning or concept.
	6.	Maintain 100% Precision: When in even small doubt, MUST place entities in separate clusters.
	7.	Preserve Original Data: No new items should be introduced, no duplicates should be introduced, and no entities should be omitted.
8. YOU MUST pickup most appropriate and easy-to-understand cluster representative and enclose it with '**', if there is more than one entity in that particular cluster. For example, pick the full term instead of an abbreviation. MUST mention this in reasoning statement.
	9.	Output Format: Always return results in valid JSON format. MUST USE GIVEN KEY.
10. Discard Single-element Clusters: Remove any cluster that ends up with only one entity.

Your main task is to read the input, do reasoning and output the reasoning and the correct clusters in a valid JSON object format. Input is a json object with given key and the value is the list of biological entities. The output is a json object with given key and the values are two json objects: reasoning and corresponding clustered entities (list of lists where each inner list is one cluster).

"""


In [None]:
# Worked few-shot examples (input, clustered entities, reasoning) for the v5
# fine-tuning prompts.
# Fix: Input-2 used to wrap its list in single quotes ('[ ... ]'), which made
# the example invalid JSON and turned the value into a string instead of the
# list of entities that Input-1 shows; the quotes are removed.
# NOTE: the same name is assigned again in a later cell of this notebook.
detailed_description = """
Input-1

{
  "0": [
    "meiotic arrest [phenotype]",
    "delayed/arrested meiosis [phenotype]",
    "absence of meiotic arrest [phenotype]",
    "meiotic prophase arrest [phenotype]",
    "males arresting in the middle of prophase I [phenotype]",
    "early arrest [phenotype]",
    "leptotene arrest [phenotype]",
    "pachytene arrest [phenotype]",
    "arrest in late prophase I [phenotype]",
    "meiosis I arrest [phenotype]",
    "meiotic arrest at telophase I [phenotype]",
    "male meiosis until the end of the first division [phenotype]",
    "termination of meiosis after anaphase I [phenotype]",
    "meiotic arrest in anaphase II [phenotype]",
    "arrested zygotic divisions [phenotype]",
    "arrested endosperm nuclear divisions [phenotype]",
    "arrest of the first mitotic division in gametogenesis [phenotype]",
    "arresting prior to the first mitotic division [phenotype]",
    "mitotic arrest during female gametogenesis [phenotype]"
  ]
}

Clustered entities given input-1:

{
  "0": [
    [
      "Meiotic block [phenotype]",
      "**meiotic arrest [phenotype]**",
      "meiotic arrest phenotype [phenotype]",
      "meiotic division stop [phenotype]"
    ],
    [
      "block in meiosis prophase I [phenotype]",
      "**meiotic prophase arrest [phenotype]**"
    ],
    [
      "**pachytene arrest [phenotype]**",
      "meiotic arrest at pachytene [phenotype]"
    ],
    [
      "arrest at the end of meiosis I [phenotype]",
      "arrest after meiosis I [phenotype]",
      "**meiotic arrest at telophase I [phenotype]**"
    ],
    [
      "**termination of meiosis after anaphase I [phenotype]**",
      "premature termination of meiosis after anaphase I [phenotype]"
    ],
    [
      "arrest of meiotic progression in anaphase II [phenotype]",
      "**meiotic arrest in anaphase II [phenotype]**"
    ],
    [
      "arresting the first mitosis during gametogenesis [phenotype]",
      "**arrest of the first mitotic division in gametogenesis [phenotype]**",
      "FNM half-stop [phenotype]"
    ]
  ]
}

Output-1:
{
  "0": {
    "reasoning": "All synonyms describing a complete, generic meiotic halt have to be clustered under **'meiotic arrest [phenotype]'** because they share the same meaning for an unspecified block. Broader prophase I arrests must merge into **'meiotic prophase arrest [phenotype]'** to group all general prophase I blocks without substage detail. Any pachytene-specific terms must fall under **'pachytene arrest [phenotype]'** to capture the unique arrest at the fully synapsed stage. Arrests at the end of meiosis I must be collected under **'meiotic arrest at telophase I [phenotype]'** so that the precise transition at telophase I is clearly indicated. Post–anaphase I failures must be grouped as **'termination of meiosis after anaphase I [phenotype]'** to highlight that the process ends immediately following homolog separation. Anaphase II stalls have to be assigned to **'meiotic arrest in anaphase II [phenotype]'** in order to specify a distinct block during sister chromatid separation. Finally, blocks in the first post–meiotic mitosis must form the cluster **'arrest of the first mitotic division in gametogenesis [phenotype]'** because they occur right after meiosis is completed. Any single-term categories remain excluded from the final output, as each must contain at least two synonymous labels to be shown."
  }
}


Input-2
{
  "2": [
    "Salt-stress severity [treatment]",
    "high NaCl stress [treatment]",
    "potassium deprivation stress [treatment]",
    "salt-stress response [treatment]",
    "salt stress tolerance [treatment]",
    "heat and salt stress conditions [treatment]",
    "prolonged levels of salt stress [treatment]",
    "recovery from salt stress [treatment]",
    "salt stress assay [treatment]",
    "salt stress signaling pathways [treatment]",
    "gradual salt stress treatments [treatment]",
    "salt and low temperature stresses [treatment]",
    "salt and silicon stresses [treatment]"
  ]
}

clustered entities for given input-2
{
  "2": [
    [
      "Salt-stress severity [treatment]",
      "**high NaCl stress [treatment]**",
      "salt-stress response [treatment]",
      "salt stress tolerance [treatment]",
      "prolonged levels of salt stress [treatment]",
      "recovery from salt stress [treatment]",
      "salt stress assay [treatment]",
      "salt stress signaling pathways [treatment]",
      "gradual salt stress treatments [treatment]"
    ]
  ]
}

Output-2:
{
  "2": {
    "reasoning": "All treatments that involve only salt (NaCl) must be clustered under **'high NaCl stress [treatment]'** because they focus on the same abiotic stress factor (NaCl) without additional conditions. Each label—such as 'Salt-stress severity,' 'salt-stress response,' or 'salt stress tolerance'—addresses NaCl-induced stress exclusively, which justifies grouping them together. Single-term items mixing salt with other factors (e.g., heat, low temperature, or silicon) or an entirely different stress (e.g., potassium deprivation) must remain separate because their conditions differ fundamentally from pure NaCl stress. Consequently, any single-term clusters are omitted from the final output unless they contain multiple synonymous labels."
  }
}
"""

In [70]:
#sample 25 rows from the dataframe where one_item is False and also sample another 25 rows where one_item is True
# Balanced random sample: 25 rows flagged one_item (no multi-member cluster)
# and 25 rows that are not flagged.
# NOTE: sampling is unseeded here, so the selected rows change between runs.
df_one_item = df.loc[df['one_item'].eq(True)].sample(n=25)
df_not_one_item = df.loc[df['one_item'].eq(False)].sample(n=25)

# Stack both halves, shuffle the row order and renumber the index from 0.
df_final = pd.concat([df_one_item, df_not_one_item])
df_final = df_final.sample(frac=1).reset_index(drop=True)

In [71]:
# Display the balanced 50-row sample.
df_final

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item
0,109139,"{""109139"": [""satC core promoter for (-)-strand...","{""109139"": [[""119 bp minimal promoter region [...","{""109139"": [[""pTRY-B fragment [genomic region]...",False
1,87406,"{""87406"": [""little to no training [treatment]""...","{""87406"": [[""favorite textbook [education]""], ...","{""87406"": []}",True
2,100596,"{""100596"": [""woody flowering plant(s) [organis...","{""100596"": [[""**woody flowering plant(s) [orga...","{""100596"": [[""**woody flowering plant(s) [orga...",False
3,78873,"{""78873"": [""chaperonin 60 subunit beta 2 [prot...","{""78873"": [[""chaperonin 60 subunit beta [prote...","{""78873"": []}",True
4,29577,"{""29577"": [""pre-ribosome export [process]"", ""p...","{""29577"": [[""ribosome export [process]"", ""**pr...","{""29577"": [[""ribosome export [process]"", ""**pr...",False
5,122504,"{""122504"": [""same role [phenotype]"", ""similar ...","{""122504"": [[""same role [phenotype]"", ""similar...","{""122504"": [[""same role [phenotype]"", ""similar...",False
6,101327,"{""101327"": [""volatiles emitted by A. thaliana ...","{""101327"": [[""**endophyte-inoculated A. thalia...","{""101327"": [[""**endophyte-inoculated A. thalia...",False
7,63504,"{""63504"": [""PsaB-PsaI-PsaH side of PSI [protei...","{""63504"": [[""anti-PsaA antibodies [protein]""],...","{""63504"": [[""PsaL protein(s) levels [protein]""...",False
8,63849,"{""63849"": [""manganese-binding polypeptides [pr...","{""63849"": [[""DCH model [phenotype]""], [""**mang...","{""63849"": [[""**manganese cluster [protein comp...",False
9,664,"{""664"": [""last stages of mite development [phe...","{""664"": [[""mature females that emerged [phenot...","{""664"": [[""mature females that emerged [phenot...",False


In [72]:
# Sanity check: the sample should hold 25 rows for each value of one_item.
df_final['one_item'].value_counts()

one_item
False    25
True     25
Name: count, dtype: int64

In [128]:
# Display the full prepared dataset again before building the request chunks.
df

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item
0,58963,"{""58963"": [""Gardea-Torresdey et al. [researche...","{""58963"": [[""Gardea-Torresdey et al. [research...","{""58963"": []}",True
1,22775,"{""22775"": [""physiological transport of macro-m...","{""22775"": [[""substance that could be transloca...","{""22775"": [[""**transportation [phenotype]**"", ...",False
2,22996,"{""22996"": [""stomatal conductance analyses [phe...","{""22996"": [[""**transpiration and stomatal cond...","{""22996"": [[""**transpiration and stomatal cond...",False
3,100116,"{""100116"": [""modern clade [gene]"", ""modern cla...","{""100116"": [[""Clade M [clade]""], [""Clade SL [c...","{""100116"": []}",True
4,133255,"{""133255"": [""Plastidic LPPs [gene]"", ""PLSP2B [...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...",False
...,...,...,...,...,...
995,31615,"{""31615"": [""auxin-regulated endocytosis [proce...","{""31615"": [[""auxin-mediated modulation of the ...","{""31615"": [[""auxin-dependent inhibition of end...",False
996,157227,"{""157227"": [""SlSAUR1-like [gene]"", ""ZmSAUR1 [g...","{""157227"": [[""GmSAUR [gene]""], [""StSAUR [gene]...","{""157227"": [[""ZmSAUR1 (Zea mays SAURs) [gene]""...",False
997,131559,"{""131559"": [""BtuC gene(s) [gene]"", ""BtuN gene(...","{""131559"": [[""BtuB gene(s) [gene]""], [""BtuC ge...","{""131559"": []}",True
998,99091,"{""99091"": [""O. fragrans 'Liuyejingui' (OFL) [o...","{""99091"": [[""Genome sequencing of O. fragrans ...","{""99091"": [[""**O. fragrans 'Liuyejingui' (OFL)...",False


In [None]:
# Merge every row's JSON-encoded {entity: clusters} mapping into one dict.
# Rows whose "Output" string is not valid JSON are reported and skipped.
entity_dict = {}
for serialized_output in df["Output"]:
    try:
        entity_dict.update(json.loads(serialized_output))
    except json.JSONDecodeError:
        print(f"Skipping invalid JSON: {serialized_output}")

In [130]:
import ast

def split_dict(d, chunk_size=100):
    """
    Split a dict into a list of smaller dicts, preserving key order.

    Each returned dict holds at most ``chunk_size`` consecutive keys of ``d``;
    the last one may be shorter. An empty ``d`` yields an empty list.
    """
    ordered_keys = list(d.keys())
    return [
        {key: d[key] for key in ordered_keys[start:start + chunk_size]}
        for start in range(0, len(ordered_keys), chunk_size)
    ]

def flatten_bracketed_strings(value_list):
    """
    Expand stringified lists found inside ``value_list``.

    For each item:
      - If it is a string that, ignoring surrounding whitespace, looks like
        '[...]' and parses to a Python list literal, its elements are spliced
        into the result in place of the string.
      - Otherwise (not a string, not bracketed, not a valid literal, or a
        literal that is not a list) the item is kept unchanged.

    Returns a new list; ``value_list`` itself is not modified.
    """
    new_list = []
    for val in value_list:
        stripped = val.strip() if isinstance(val, str) else None
        if stripped and stripped.startswith("[") and stripped.endswith("]"):
            try:
                # Parse the stripped text: the bracket test above is done on
                # the stripped string, and leading whitespace is rejected by
                # literal_eval on older Python versions.
                parsed = ast.literal_eval(stripped)
            except (SyntaxError, ValueError, TypeError):
                # TypeError covers literals such as a dict with an unhashable
                # key; like any other unparsable text, keep the original.
                new_list.append(val)
                continue
            if isinstance(parsed, list):
                new_list.extend(parsed)  # flatten
            else:
                new_list.append(val)
        else:
            new_list.append(val)
    return new_list



# One entity per chunk; the request-writing cell below emits one Batch API
# request per chunk.
chunks = split_dict(entity_dict, 1) # Use batch size of 1
print('Number of chunks:',len(chunks))

Number of chunks: 1000


In [131]:
# Print every chunk, each followed by a separator line.
for chunk in chunks:
    print(chunk, '-------------------', sep='\n')
    

{'58963': [['Gardea-Torresdey et al. [researcher]'], ['Barrena et al. [researcher]']]}
-------------------
{'22775': [['substance that could be translocated [phenotype]'], ['transport of metabolites [phenotype]'], ['**transportation [phenotype]**', 'transportion [phenotype]'], ['transport of intermediates [phenotype]'], ['dye transport [phenotype]'], ['transport issues [phenotype]'], ['As transport [phenotype]'], ['transport and distribution throughout the plant(s) [phenotype]'], ['traffic [phenotype]'], ['transport within the plant(s) [phenotype]', 'transport through the plant(s) [phenotype]', '**transport in planta [phenotype]**'], ['secondary metabolites transportation [phenotype]', '**secondary metabolite transport [phenotype]**'], ['systemic movement via the phloem [phenotype]'], ['transport of compounds [phenotype]'], ['changes in transport rates [phenotype]'], ['chemical transport [phenotype]'], ['glucosinolates transport [phenotype]'], ['**photoassimilate transport [phenotype]*

## Write the JSONL request file and submit it to the Batch API in the playground

In [132]:
import jsonlines
import json
# Write one Batch API chat-completion request per chunk to a JSONL file.
# NOTE: the destination is an absolute, machine-specific path.
output_path = '/'
out_file = 'Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/o3mini_val_ins/o3minihigh_1k_outs.jsonl'
with jsonlines.open(output_path + out_file, mode='w') as file:
    for i, chunk in enumerate(chunks):
        # System prompt, few-shot examples, then the compact JSON of the chunk.
        messages = [
            {"role": "system", "content": constant_system_message},
            {"role": "user", "content": constant_user_message},
            {"role": "user", "content": json.dumps(chunk, separators=(',', ':'))},
        ]
        body = {
            "model": "o3-mini",
            # Enable these for a non-reasoning model:
            #"temperature": 0,
            #"top_p": 0,
            #"frequency_penalty": 0,
            #"presence_penalty": 0,
            #"max_tokens": 16384,
            "response_format": {"type": "json_object"},
            "messages": messages,
            # Options: "low", "medium", "high" (only for reasoning models)
            "reasoning_effort": "high",
        }
        # custom_id is the position of the chunk in `chunks`, as a string.
        line = {
            "custom_id": str(i),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": body,
        }
        file.write(line)

print('done writing to jsonl file')

done writing to jsonl file


## Read the Batch API output file and convert it to CSV

In [None]:
import json
import jsonlines
import csv
# output CSV file from .json file
# Flatten the Batch API output (.jsonl) into a CSV with one row per cluster
# key found in each response's JSON content.
skip_lines = 0  # responses that could not be parsed
input_file = "./o3mini_val_outs/batch_67c1822ff1e081908a02fc175e749f0b_output.jsonl" # input batch jsonl output file from GPT
output_csv = "./o3mini_val_outs/o3mini_1k.csv"
with jsonlines.open(input_file) as reader, open(output_csv, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)

    writer.writerow(["cluster_id","is_correct", "user_prompt", "clusters"])
    for line in reader:
        # Stays None when the response envelope itself is malformed, so the
        # error handler never has to index into `line` again.
        content_str = None
        try:
            # Navigate into the response body
            body = line["response"]["body"]
            # Get the JSON string from the assistant's "content"
            content_str = body["choices"][0]["message"]["content"]
            data = json.loads(content_str)
            if not isinstance(data, dict):
                raise ValueError("top-level JSON value is not an object")
        except (KeyError, IndexError, TypeError, ValueError):
            # TypeError: a null response or null content.
            # ValueError: invalid JSON (JSONDecodeError) or a non-object payload.
            print("Skipping line:", line)
            skip_lines += 1
            if content_str is not None:
                print(content_str)
            continue

        for cluster_id, cluster_data in data.items():
            # Ensure cluster_data is a dictionary before accessing keys
            if not isinstance(cluster_data, dict):
                print(f"Skipping: cluster_data for {cluster_id} is not a dictionary ->", cluster_data)
                continue

            # Missing fields default to "N/A" (or an empty cluster list).
            is_correct = cluster_data.get("is_correct", "N/A")
            user_prompt = cluster_data.get("user_prompt", "N/A")
            clusters = json.dumps(cluster_data.get("clusters", []), ensure_ascii=False)

            writer.writerow([cluster_id, is_correct, user_prompt, clusters])
print(f"Skipped {skip_lines} lines")
              

Skipped 0 lines


## Creating the JSONL file for o3-mini validation (v5)

In [None]:
#This is a script that loads a pandas dataframe from csv, converts the string formatted as a list into a list of lists. The list of lists is then flattened into a single list.

import pandas as pd
import random
import json
# Rebuild the prepared dataset for the v5 workflow. This repeats the
# dataset-generation cell at the top of the notebook on the same CSV.
df = pd.read_csv('./gpt_final_outs/o3mini_maxclust_80_subset_random_1000_nocutoff.csv', index_col=None)

# Rename "Group Items" to "Output" ("Entity Name" keeps its name).
df = df.rename(columns={"Group Items": "Output"})

# Parse the stringified list of lists. ast.literal_eval only accepts Python
# literals, so a malformed CSV cell cannot execute arbitrary code the way
# eval() would.
df['Output'] = df['Output'].apply(ast.literal_eval)

# Flatten the clusters into one list and drop the "*" markers that wrap the
# cluster representatives.
df['Input'] = df['Output'].apply(lambda x: [item.strip("*") for sublist in x for item in sublist])

# Shuffle each flat list.
# NOTE: no seed is set in this cell, so the order differs between runs.
df['Input'] = df['Input'].apply(lambda x: random.sample(x, len(x)))
#df = df[df['Input'].apply(len) < 30]

# Keep only the clusters that contain more than one member.
df['Output>1'] = df['Output'].apply(lambda x: [item for item in x if len(item) > 1])
# one_item is True when no multi-member cluster is left, i.e. every cluster
# of that entity is a singleton.
df['one_item'] = df['Output>1'].apply(lambda x: len(x) == 0)

# Wrap each value in a one-entry dict keyed by the row's "Entity Name".
df["Output"] = df.apply(lambda row: {row["Entity Name"]: row['Output']}, axis=1)
df["Input"] = df.apply(lambda row: {row["Entity Name"]: row['Input']}, axis=1)
df['Output>1'] = df.apply(lambda row: {row["Entity Name"]: row['Output>1']}, axis=1)

# Serialise the dict columns to JSON strings (non-dict values pass through).
df["Output"] = df["Output"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
df["Input"] = df["Input"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
df["Output>1"] = df["Output>1"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)

# Column order: entity name, input, output, output>1, one_item flag.
df = df[['Entity Name', 'Input', 'Output', 'Output>1', 'one_item']]

#filter by Output>1 length of the list > 1
#use column Output, but use only the ones where it clusteres multiple items together

In [None]:
# Load the o3-mini validation results into a DataFrame; the next cell reads its
# 'cluster_id' and 'clusters' columns.
o3mini_results_csv_path = "./o3mini_val_outs/o3mini_1k.csv"
output_csv = pd.read_csv(o3mini_results_csv_path)

In [None]:
# Lookup table cluster_id -> clusters built from the results frame.
# When a cluster_id appears more than once, the last row wins.
o3_correct_cluster_dict = dict(zip(output_csv['cluster_id'], output_csv['clusters']))

# Add the corrected clusters to df by looking up each 'Entity Name' in the table;
# entities with no matching cluster_id end up as NaN.
# NOTE(review): this relies on 'Entity Name' and 'cluster_id' sharing the same dtype - confirm.
corrected_clusters = df['Entity Name'].map(o3_correct_cluster_dict)
df['o3_corrected_cluster'] = corrected_clusters

In [None]:
# Entity Name -> corrected cluster, extracted from df as a plain dict.
corrected_cluster_by_entity = df.set_index('Entity Name')['o3_corrected_cluster']
variable_o3mini_corrected_cluster_enitity_dict = corrected_cluster_by_entity.to_dict()

# split_dict is defined elsewhere in the notebook; the second argument is
# presumably the number of entries per chunk - confirm against its definition.
chunks_variable_o3mini_corrected_cluster = split_dict(
    variable_o3mini_corrected_cluster_enitity_dict,
    1,
)

In [None]:
# Fold every chunk back into one dict; if a key repeats, the later chunk overwrites it.
merged_corrected_clusters = {}
for chunk in chunks_variable_o3mini_corrected_cluster:
    merged_corrected_clusters.update(chunk)
variable_o3mini_corrected_cluster_dict = merged_corrected_clusters

In [None]:
# Few-shot prompt text; the JSONL-writing cell sends it verbatim as the first user message
# of every batch request. It holds two worked examples, each made of an input entity list,
# the clustered entities for that input (with some items wrapped in **...**), and the
# expected {"<id>": {"reasoning": ...}} output.
# NOTE(review): in example 2 the input list is wrapped in single quotes ('[ ... ]') and
# Output-2 is written with single-quoted keys, so unlike example 1 neither is strict JSON.
# Confirm whether that is intentional before editing - any change alters the prompt.
detailed_description = """
Input-1

{
  "0": [
    "meiotic arrest [phenotype]",
    "delayed/arrested meiosis [phenotype]",
    "absence of meiotic arrest [phenotype]",
    "meiotic prophase arrest [phenotype]",
    "males arresting in the middle of prophase I [phenotype]",
    "early arrest [phenotype]",
    "leptotene arrest [phenotype]",
    "pachytene arrest [phenotype]",
    "arrest in late prophase I [phenotype]",
    "meiosis I arrest [phenotype]",
    "meiotic arrest at telophase I [phenotype]",
    "male meiosis until the end of the first division [phenotype]",
    "termination of meiosis after anaphase I [phenotype]",
    "meiotic arrest in anaphase II [phenotype]",
    "arrested zygotic divisions [phenotype]",
    "arrested endosperm nuclear divisions [phenotype]",
    "arrest of the first mitotic division in gametogenesis [phenotype]",
    "arresting prior to the first mitotic division [phenotype]",
    "mitotic arrest during female gametogenesis [phenotype]"
  ]
}

Clustered entities given input-1:

{
  "0": [
    [
      "Meiotic block [phenotype]",
      "**meiotic arrest [phenotype]**",
      "meiotic arrest phenotype [phenotype]",
      "meiotic division stop [phenotype]"
    ],
    [
      "block in meiosis prophase I [phenotype]",
      "**meiotic prophase arrest [phenotype]**"
    ],
    [
      "**pachytene arrest [phenotype]**",
      "meiotic arrest at pachytene [phenotype]"
    ],
    [
      "arrest at the end of meiosis I [phenotype]",
      "arrest after meiosis I [phenotype]",
      "**meiotic arrest at telophase I [phenotype]**"
    ],
    [
      "**termination of meiosis after anaphase I [phenotype]**",
      "premature termination of meiosis after anaphase I [phenotype]"
    ],
    [
      "arrest of meiotic progression in anaphase II [phenotype]",
      "**meiotic arrest in anaphase II [phenotype]**"
    ],
    [
      "arresting the first mitosis during gametogenesis [phenotype]",
      "**arrest of the first mitotic division in gametogenesis [phenotype]**",
      "FNM half-stop [phenotype]"
    ]
  ]
}

Output-1:
{
  "0": {
    "reasoning": "All synonyms describing a complete, generic meiotic halt were clustered under **'meiotic arrest [phenotype]'**, chosen as the representative because it is concise and universally recognized. Broader prophase I arrests merged into **'meiotic prophase arrest [phenotype]'**, selected for its succinct coverage of any block in prophase I. Pachytene-specific arrests were grouped under **'pachytene arrest [phenotype]'**, chosen for clarity and direct reference to the fully synapsed stage. Arrests at the end of meiosis I appear under **'meiotic arrest at telophase I [phenotype]'**, precisely indicating telophase I. Post-anaphase I failures were combined into **'termination of meiosis after anaphase I [phenotype]'**, spotlighting the exact point of cessation. Anaphase II stalls were labeled **'meiotic arrest in anaphase II [phenotype]'**, pinpointing the second division stage. Finally, blocks in the first post-meiotic mitosis formed the cluster **'arrest of the first mitotic division in gametogenesis [phenotype]'**, chosen for its explicit naming of that specific cell-cycle event. Single-term categories (e.g., 'delayed/arrested meiosis [phenotype]', 'absence of meiotic arrest [phenotype]') remain excluded from the final output."
  }
}


Input-2
{
  "2": '[
    "Salt-stress severity [treatment]",
    "high NaCl stress [treatment]",
    "potassium deprivation stress [treatment]",
    "salt-stress response [treatment]",
    "salt stress tolerance [treatment]",
    "heat and salt stress conditions [treatment]",
    "prolonged levels of salt stress [treatment]",
    "recovery from salt stress [treatment]",
    "salt stress assay [treatment]",
    "salt stress signaling pathways [treatment]",
    "gradual salt stress treatments [treatment]",
    "salt and low temperature stresses [treatment]",
    "salt and silicon stresses [treatment]"
  ]'
}

clustered entities for given input-2
{
  "2": [
    [
      "Salt-stress severity [treatment]",
      "**high NaCl stress [treatment]**",
      "salt-stress response [treatment]",
      "salt stress tolerance [treatment]",
      "prolonged levels of salt stress [treatment]",
      "recovery from salt stress [treatment]",
      "salt stress assay [treatment]",
      "salt stress signaling pathways [treatment]",
      "gradual salt stress treatments [treatment]"
    ]
  ]
}

Output-2:
{'2':{'reasoning':'All treatments that involve only salt (NaCl)––such as “Salt-stress severity,” “salt-stress response,” “salt stress tolerance,” “prolonged levels of salt stress,” “recovery from salt stress,” “salt stress assay,” “salt stress signaling pathways,” “gradual salt stress treatments,” and “high NaCl stress”––cluster together, because each focuses on the same abiotic stress factor (NaCl). They appear in the final output under the representative “**high NaCl stress [treatment]**”. In contrast, single-term items that mix salt with another factor (e.g., heat, low temperature, or silicon) or that involve a completely different stress (e.g., potassium deprivation) remain separate clusters of length one and thus are omitted from the output.'}}

"""

In [None]:
import jsonlines
import json

# NOTE(review): absolute, machine-specific path - the other cells use paths relative to the notebook.
output_path = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/o3mini_val_ins/o3minihigh_v2_1k_outs.jsonl'


def _batch_request(request_index, entity_chunk):
    """Build one batch-request record for a single chunk of entities.

    The record targets /v1/chat/completions with the o3-mini model and carries three
    messages: the system message, the few-shot description, and the chunk as indented JSON.
    `system_message` and `detailed_description` are read from the notebook's global scope.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": detailed_description},
        {"role": "user", "content": json.dumps(entity_chunk, indent=2)},
    ]
    # temperature, top_p, frequency_penalty, presence_penalty (all 0) and
    # max_tokens (16384) are deliberately left out; the author's note is to
    # enable them only for a non-reasoning model.
    body = {
        "model": "o3-mini",
        "response_format": {"type": "json_object"},
        "messages": messages,
        # Options: "low", "medium", "high" (only for reasoning models)
        "reasoning_effort": "high",
    }
    return {
        "custom_id": str(request_index),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body,
    }


# One JSONL line per chunk; custom_id is the chunk's position in the list.
# NOTE(review): this iterates chunks_variable_input_entity (defined elsewhere in the
# notebook), not the chunks_variable_o3mini_corrected_cluster built earlier in this
# section - confirm that is the intended input.
with jsonlines.open(output_path, mode='w') as jsonl_writer:
    for request_index, entity_chunk in enumerate(chunks_variable_input_entity):
        jsonl_writer.write(_batch_request(request_index, entity_chunk))

print(f'done writing to {output_path} file')

## Read the output JSONL file and convert it to a CSV file

In [None]:
import json
import jsonlines
import csv

# Convert the batch output JSONL into a two-column CSV (cluster_id, reasoning).
# Every input line is expected to carry the assistant reply at
# line["response"]["body"]["choices"][0]["message"]["content"], as a JSON object
# of the form {cluster_id: {"reasoning": ...}}.
skip_lines = 0
input_file = "./o3mini_val_outs/batch_67c58e5895ec81909dfeaea04841b738_output.jsonl" # input batch jsonl output file from GPT
# NOTE(review): output_csv is a path string here, but an earlier cell binds the same
# name to a DataFrame - re-running cells out of order will mix the two up.
output_csv = "./o3mini_val_outs/o3mini_1k_v2_again.csv"
with jsonlines.open(input_file) as reader, open(output_csv, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)

    writer.writerow(["cluster_id","reasoning"])
    for line in reader:
        content_str = None
        try:
            # Get the JSON string from the assistant's "content"
            content_str = line["response"]["body"]["choices"][0]["message"]["content"]
            data = json.loads(content_str)
            # Collect all rows first so a malformed entry skips the whole line
            # instead of leaving it half-written.
            rows = [
                [cluster_id, cluster_data.get("reasoning")]
                for cluster_id, cluster_data in data.items()
            ]
        except (KeyError, IndexError, TypeError, AttributeError, json.JSONDecodeError):
            # Skip lines that don't match the expected structure. TypeError covers a
            # null response/content, AttributeError a payload that is not a dict of dicts.
            print("Skipping line:", line)
            skip_lines += 1
            # Only echo the raw content if it was actually extracted; re-indexing
            # `line` here would raise again for the very lines being skipped.
            if content_str is not None:
                print(content_str)
            continue
        writer.writerows(rows)
print(f"Skipped {skip_lines} lines")
              