## Generate dataset

In [20]:
#This is a script that loads a pandas dataframe from csv, converts the string formatted as a list into a list of lists. The list of lists is then flattened into a single list.

import pandas as pd
import random
import json
df = pd.read_csv('./gpt_final_outs/o3mini_maxclust_80_subset_random_1000_nocutoff.csv', index_col=None)

#rename the column "Entity Name" to "Entity" and "Group Items" to "output"
df = df.rename(columns={"Group Items": "Output"})

# Convert the string formatted as a list into a list of lists
df['Output'] = df['Output'].apply(lambda x: eval(x))

# Flatten the list of lists into a single list
df['Input'] = df['Output'].apply(lambda x: [item.strip("*") for sublist in x for item in sublist])

#randomize the order of the single list
df['Input'] = df['Input'].apply(lambda x: random.sample(x, len(x)))
#df = df[df['Input'].apply(len) < 30]
# make a new column, where internal lists of length 1 are removed 
df['Output>1'] = df['Output'].apply(lambda x: [item for item in x if len(item) > 1])
#create a flag column by name one_item. For each row in Output column, If length of the list in output column is less than 2, put True otherwise False.
df['one_item'] = df['Output>1'].apply(lambda x: True if len(x) == 0 else False)

#zip the Group_Items_no_singles with the Entity Name 
df["Output"] = df.apply(lambda row: {row["Entity Name"]: row['Output']}, axis=1)
df["Input"] = df.apply(lambda row: {row["Entity Name"]: row['Input']}, axis=1)
df['Output>1'] = df.apply(lambda row: {row["Entity Name"]: row['Output>1']}, axis=1)





#convert the columns to json strings
df["Output"] = df["Output"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
df["Input"] = df["Input"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
df["Output>1"] = df["Output>1"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)

#Change order of columns to have input, then output, then output>1, then entity name
df = df[['Entity Name', 'Input', 'Output', 'Output>1', 'one_item']]

#filter by Output>1 length of the list > 1
#use column Output, but use only the ones where it clusteres multiple items together

In [21]:
df
#note i don't really use Outputs>1 in this workflow.

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item
0,58963,"{""58963"": [""Barrena et al. [researcher]"", ""Gar...","{""58963"": [[""Gardea-Torresdey et al. [research...","{""58963"": []}",True
1,22775,"{""22775"": [""physiological transport of macro-m...","{""22775"": [[""substance that could be transloca...","{""22775"": [[""**transportation [phenotype]**"", ...",False
2,22996,"{""22996"": [""number of open stomata and amount ...","{""22996"": [[""**transpiration and stomatal cond...","{""22996"": [[""**transpiration and stomatal cond...",False
3,100116,"{""100116"": [""Clade SH [clade]"", ""Clade M [clad...","{""100116"": [[""Clade M [clade]""], [""Clade SL [c...","{""100116"": []}",True
4,133255,"{""133255"": [""PSPTO3648 [gene]"", ""LPP\u10b2 [ge...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...",False
...,...,...,...,...,...
995,31615,"{""31615"": [""auxin-mediated modulation of the m...","{""31615"": [[""auxin-mediated modulation of the ...","{""31615"": [[""auxin-dependent inhibition of end...",False
996,157227,"{""157227"": [""GmSAUR [gene]"", ""ZmSAUR1 (Zea may...","{""157227"": [[""GmSAUR [gene]""], [""StSAUR [gene]...","{""157227"": [[""ZmSAUR1 (Zea mays SAURs) [gene]""...",False
997,131559,"{""131559"": [""BtuC gene(s) [gene]"", ""cytochrome...","{""131559"": [[""BtuB gene(s) [gene]""], [""BtuC ge...","{""131559"": []}",True
998,99091,"{""99091"": [""Genome sequencing of O. fragrans '...","{""99091"": [[""Genome sequencing of O. fragrans ...","{""99091"": [[""**O. fragrans 'Liuyejingui' (OFL)...",False


## System message prompts for making fine-tuning dataset (v1,v2,v3,v4)




In [23]:
system_message = """
You are a data scientist specializing in grouping plant biological entities. Your task is to cluster similar entities while strictly adhering to the following guidelines:
	1.	Exact Phrase Matching Matters: 
1.1 Consider the Entire Phrase: Treat each entity as a single, whole phrase. This includes all key biological terms and any bracketed text
1.2 Ignore Minor Surface Differences: Minor variations such as letter casing (uppercase vs. lowercase), spacing, punctuation, standard abbreviations, or singular vs. plural forms do not create new or separate entities.
	2.	Strict (100%) Key Term Separation: If an entity has a different key biological term, it MUST GO into a separate cluster.
3. Sub-identifier separation: If an entity differs by any numeric value, sub-identifier, or qualifier, they MUST BE placed in separate clusters.
	4.	Avoid False Similarity: DO NOT cluster two entities together simply because they share a common word or term if their overall key term or concept is different.
5. Extra Descriptor Differentiation: If one entity has an extra descriptor that changes its meaning, do not group them together.
	6.	Strict Synonym/Near-Synonym Grouping: Only group entities together if they refer to the exact same biological structure, process, or concept.
	7.	Maintain 100% Precision: If there is any doubt about whether two entities are the same, MUST place them in separate clusters.
	8.	Preserve Original Data: DO NOT introduce new items, create duplicates, or omit any entity from your final output.
	9.	Output Format: Always return results in valid JSON format. You MUST USE GIVEN KEY.
10. Discard Single-element Clusters: Remove any cluster that ends up with only one entity.
11. Choose cluster representative: YOU MUST pickup most appropriate and easy-to-understand cluster representative and enclose it with '**', if there is more than one entity in that particular cluster. For example, pick the full term instead of an abbreviation.

Read the input list, and return a clustered output list.
"""

## System message prompts for finetuning (v5))

In [24]:
system_message = """You are senior scientist in plant biology. You NEED to meticulously do reasoning and then cluster given input, and provide the reasoning followed by clusters based on the reasoning performed.

The following are the entity clustering guidelines:
	1.	Exact Phrase Matching Matters: Always consider the full phrase, including key biological terms, bracketed text (ignoring minor differences such as spacing, punctuation, correct abbreviations, plurality).
	2.	Strict (100%) Key Term Separation: Entities with different biological terms MUST be placed in separate clusters.
3. Sub-identifier separation: Separate Entities with numeric differences, sub-identifiers, or qualifiers into different groups.
	4.	Avoid False Similarity: Do NOT cluster two items together in same group just because they share a common word or term.
	5.	Strict Synonym/Near-Synonym Grouping: Only group entities that refer to the same biological structure, process, meaning or concept.
	6.	Maintain 100% Precision: When in even small doubt, MUST place entities in separate clusters.
	7.	Preserve Original Data: No new items should be introduced, no duplicates should be introduced, and no entities should be omitted.
8. YOU MUST pickup most appropriate and easy-to-understand cluster representative and enclose it with '**', if there is more than one entity in that particular cluster. For example, pick the full term instead of an abbreviation. MUST mention this in reasoning statement.
	9.	Output Format: Always return results in valid JSON format. MUST USE GIVEN KEY.
10. Discard Single-element Clusters: Remove any cluster that ends up with only one entity.

Your main task is to read the input, do reasoning and output the reasoning and the correct clusters in a valid JSON object format. Input is a json object with given key and the value is the list of biological entities. The output is a json object with given key and the values are two json objects: reasoning and corresponding clustered entities (list of lists where each inner list is one cluster).


"""


In [25]:
#read output csv file from the o3-mini validation
output_csv = pd.read_csv("./o3mini_val_outs/o3mini_1k.csv")

In [26]:
output_csv

Unnamed: 0,cluster_id,is_correct,user_prompt,clusters
0,58963,True,"Great job, that looks correct because each res...",[]
1,22775,True,"Great job, the clusters appear to be correctly...","[[""**transportation [phenotype]**"", ""transport..."
2,22996,False,I think you are wrong. In the current clusteri...,"[[""**transpiration and stomatal conductance va..."
3,100116,True,"This looks correct to me, as the grouping foll...",[]
4,133255,True,"Great job, the clustering is correct. The gene...","[[""**PLSP2A [gene]**"", ""PLSP2A gene(s) [gene]""..."
...,...,...,...,...
995,31615,False,I think you are wrong. The grouping of the end...,"[[""**auxin-regulated endocytosis [process]**"",..."
996,157227,True,This looks correct to me. Each gene entity is ...,"[[""ZmSAUR1 (Zea mays SAURs) [gene]"", ""ZmSAUR1 ..."
997,131559,True,"Great job, that looks correct to me. Each gene...",[]
998,99091,True,"Great job, the clustering is correct. You have...","[[""**O. fragrans 'Liuyejingui' (OFL) [organism..."


In [8]:
# RUN THIS CELL ONLY for V5 
#filter rows in df such that entity name is not in the empty keys (JUST REMOVING THE rows with EMPTY CORRECTED CLUSTERS)
empty_keys = [58963, 100116, 2276, 82106, 169694, 89650, 46224, 74528, 59649, 88515, 42251, 138189, 121395, 66209, 109497, 51350, 125145, 83273, 31266, 20901, 155284, 69128, 149825, 54598, 155550, 76415, 10829, 29912, 147673, 13676, 43068, 154151, 158579, 154079, 121796, 64226, 148026, 30540, 13722, 38783, 141611, 152629, 158331, 164114, 61159, 30892, 11432, 118549, 170116, 8815, 23115, 135141, 130704, 13447, 82891, 79708, 49031, 137362, 108939, 30902, 1979, 161173, 10932, 49361, 6893, 47841, 6353, 53420, 134561, 141969, 163157, 170003, 5317, 45644, 100049, 116121, 96226, 80941, 104768, 54324, 153755, 63041, 49218, 83032, 74072, 95491, 136879, 128564, 3984, 149304, 108934, 111625, 40897, 110509, 150263, 20671, 54278, 150596, 107203, 154934, 159575, 151766, 151906, 126710, 96658, 61283, 68647, 140937, 52882, 58544, 69542, 51757, 95962, 45446, 55035, 25174, 146950, 151274, 149112, 159628, 55001, 141304, 77432, 112493, 90683, 101253, 154023, 19303, 141728, 62513, 117070, 130555, 103177, 88707, 24195, 150152, 22385, 115627, 152158, 146433, 120792, 153959, 49211, 115818, 80040, 60288, 76699, 156169, 123227, 24942, 50885, 100895, 167231, 51356, 42848, 55951, 123371, 121683, 93067, 91142, 76685, 141591, 55077, 112506, 396, 114365, 130281, 86746, 147191, 9338, 22536, 121491, 47170, 165937, 150418, 105086, 81729, 75904, 33899, 66190, 51024, 151910, 42267, 71717, 42070, 104675, 63463, 42619, 106019, 93865, 143138, 6596, 153123, 68999, 91023, 123962, 133452, 90992, 59529, 148751, 153190, 124484, 16960, 153122, 124186, 99387, 151388, 61033, 129608, 52510, 172622, 63985, 125670, 79230, 72583, 81435, 70356, 62742, 135336, 150457, 66974, 166178, 157848, 87401, 14243, 121095, 2505, 126507, 147170, 64516, 141136, 78873, 160119, 41639, 90709, 103854, 158312, 148245, 154282, 75046, 49728, 8072, 421, 136198, 103253, 82563, 85713, 155406, 87406, 58568, 131897, 79792, 112158, 148520, 162083, 40475, 130199, 68083, 23032, 77116, 106386, 35567, 115768, 60811, 72695, 68689, 170670, 29340, 160703, 10963, 38965, 4789, 99646, 59223, 93167, 108297, 42453, 87710, 55327, 88040, 81258, 91177, 45197, 57719, 12113, 136929, 16321, 76485, 160180, 102136, 46329, 138085, 153343, 31361, 20951, 159175, 59286, 31879, 116062, 134504, 36974, 130268, 154497, 103242, 115840, 45677, 60775, 47189, 43411, 20497, 152378, 126195, 76278, 20565, 174379, 151225, 152992, 60554, 125654, 15562, 65518, 106938, 31479, 8680, 84113, 55344, 115283, 37690, 165081, 107511, 135491, 102013, 153636, 50605, 32062, 151681, 50986, 55214, 2112, 56183, 75941, 160016, 30687, 61853, 24749, 161408, 150525, 163081, 86850, 167068, 88886, 65805, 131559]

df = df[~df['Entity Name'].isin(empty_keys)]

In [27]:
o3_user_prompt_dict = {}
#create a dictionary with key as cluster_id and value as the user_prompt 
for index, row in output_csv.iterrows():
    o3_user_prompt_dict[row['cluster_id']] = row['user_prompt']

#create a new column named user_prompt in df
df['user_prompt'] = df['Entity Name'].map(o3_user_prompt_dict)

In [28]:
#create a dictionary with key as cluster_id and value as the clusters
o3_correct_cluster_dict = {}
for index, row in output_csv.iterrows():
    o3_correct_cluster_dict[row['cluster_id']] = row['clusters']

#create a new column named o3_corrected_cluster in df
df['o3_corrected_cluster'] = df['Entity Name'].map(o3_correct_cluster_dict)

In [29]:
# create a dictionary with key as cluster_id and value as the is_correct
o3_is_correct_dict = {}
for index, row in output_csv.iterrows():
    o3_is_correct_dict[row['cluster_id']] = row['is_correct']

#create a new column named is_correct in df
df['is_correct'] = df['Entity Name'].map(o3_is_correct_dict)

In [30]:
variable_o3_user_prompt = df.iloc[1]['user_prompt']
print(variable_o3_user_prompt)

Great job, the clusters appear to be correctly separated based on the guidelines. Each cluster groups only those entities that are 100% synonyms or near-synonyms without mixing distinct biological terms. For example, the cluster grouping '**transportation [phenotype]**' with 'transportion [phenotype]' is correctly grouped because they differ only in minor typographical aspects, and the clusters for 'transport in planta', 'secondary metabolite transport', 'photoassimilate transport', and 'enzyme transport' are all appropriately formed. Now output only the clusters that have more than 1 member in the correct JSON format.


In [31]:
variable_o3mini_corrected_cluster = df.iloc[1]['o3_corrected_cluster']
print(variable_o3mini_corrected_cluster)

[["**transportation [phenotype]**", "transportion [phenotype]"], ["transport within the plant(s) [phenotype]", "transport through the plant(s) [phenotype]", "**transport in planta [phenotype]**"], ["secondary metabolites transportation [phenotype]", "**secondary metabolite transport [phenotype]**"], ["**photoassimilate transport [phenotype]**", "transport of photo-assimilates [phenotype]"], ["**enzyme transport [phenotype]**", "transport of enzymes [phenotype]"]]


In [32]:
df

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item,user_prompt,o3_corrected_cluster,is_correct
0,58963,"{""58963"": [""Barrena et al. [researcher]"", ""Gar...","{""58963"": [[""Gardea-Torresdey et al. [research...","{""58963"": []}",True,"Great job, that looks correct because each res...",[],True
1,22775,"{""22775"": [""physiological transport of macro-m...","{""22775"": [[""substance that could be transloca...","{""22775"": [[""**transportation [phenotype]**"", ...",False,"Great job, the clusters appear to be correctly...","[[""**transportation [phenotype]**"", ""transport...",True
2,22996,"{""22996"": [""number of open stomata and amount ...","{""22996"": [[""**transpiration and stomatal cond...","{""22996"": [[""**transpiration and stomatal cond...",False,I think you are wrong. In the current clusteri...,"[[""**transpiration and stomatal conductance va...",False
3,100116,"{""100116"": [""Clade SH [clade]"", ""Clade M [clad...","{""100116"": [[""Clade M [clade]""], [""Clade SL [c...","{""100116"": []}",True,"This looks correct to me, as the grouping foll...",[],True
4,133255,"{""133255"": [""PSPTO3648 [gene]"", ""LPP\u10b2 [ge...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...",False,"Great job, the clustering is correct. The gene...","[[""**PLSP2A [gene]**"", ""PLSP2A gene(s) [gene]""...",True
...,...,...,...,...,...,...,...,...
995,31615,"{""31615"": [""auxin-mediated modulation of the m...","{""31615"": [[""auxin-mediated modulation of the ...","{""31615"": [[""auxin-dependent inhibition of end...",False,I think you are wrong. The grouping of the end...,"[[""**auxin-regulated endocytosis [process]**"",...",False
996,157227,"{""157227"": [""GmSAUR [gene]"", ""ZmSAUR1 (Zea may...","{""157227"": [[""GmSAUR [gene]""], [""StSAUR [gene]...","{""157227"": [[""ZmSAUR1 (Zea mays SAURs) [gene]""...",False,This looks correct to me. Each gene entity is ...,"[[""ZmSAUR1 (Zea mays SAURs) [gene]"", ""ZmSAUR1 ...",True
997,131559,"{""131559"": [""BtuC gene(s) [gene]"", ""cytochrome...","{""131559"": [[""BtuB gene(s) [gene]""], [""BtuC ge...","{""131559"": []}",True,"Great job, that looks correct to me. Each gene...",[],True
998,99091,"{""99091"": [""Genome sequencing of O. fragrans '...","{""99091"": [[""Genome sequencing of O. fragrans ...","{""99091"": [[""**O. fragrans 'Liuyejingui' (OFL)...",False,"Great job, the clustering is correct. You have...","[[""**O. fragrans 'Liuyejingui' (OFL) [organism...",True


### Doing the analysis

#### Non empty: 
1. initial and corrected clusters - checking if lengths are equal
2. initial and corrected clusters - checking if corrected cluster are shorter than initials
3. initial and corrected clusters - checking if corrected cluster are longer than initials
#### Empty:
1. Check if both initial and corrected clusters are empty.

In [33]:


import ast

def get_list_length(output):
    # If the output is a string representation of a dictionary, convert it on the fly.
    if isinstance(output, str):
        output = ast.literal_eval(output)
    # Get the length of the list in the first value of the dictionary.
    return len(list(output.values())[0])

df_nonempty_init_correct_same_length = df[df.apply(lambda row: get_list_length(row['Output']) == len(ast.literal_eval(row['o3_corrected_cluster'])), axis=1)]

In [34]:
len(df_nonempty_init_correct_same_length)

18

In [35]:
#get the rows where the Input is non empty list and o3 corrected cluster length is fewer than Input length
df_nonempty_init_correct_fewer_length = df[df.apply(lambda row: get_list_length(row['Output']) > len(ast.literal_eval(row['o3_corrected_cluster'])), axis=1)]
len(df_nonempty_init_correct_fewer_length)

982

In [36]:
#get the rows where the Input is non empty list and o3 corrected cluster length is greater than Input length
df_nonempty_init_correct_more_length = df[df.apply(lambda row: get_list_length(row['Output']) < len(ast.literal_eval(row['o3_corrected_cluster'])), axis=1)]
len(df_nonempty_init_correct_more_length)

0

In [37]:
#get the rows where the Input is empty list and o3 corrected cluster length is also empty
df_empty_init_correct_also_empty = df[df.apply(lambda row: get_list_length(row['Output']) == 0 and len(ast.literal_eval(row['o3_corrected_cluster'])) == 0, axis=1)]
len(df_empty_init_correct_also_empty)

0

In [38]:
df

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item,user_prompt,o3_corrected_cluster,is_correct
0,58963,"{""58963"": [""Barrena et al. [researcher]"", ""Gar...","{""58963"": [[""Gardea-Torresdey et al. [research...","{""58963"": []}",True,"Great job, that looks correct because each res...",[],True
1,22775,"{""22775"": [""physiological transport of macro-m...","{""22775"": [[""substance that could be transloca...","{""22775"": [[""**transportation [phenotype]**"", ...",False,"Great job, the clusters appear to be correctly...","[[""**transportation [phenotype]**"", ""transport...",True
2,22996,"{""22996"": [""number of open stomata and amount ...","{""22996"": [[""**transpiration and stomatal cond...","{""22996"": [[""**transpiration and stomatal cond...",False,I think you are wrong. In the current clusteri...,"[[""**transpiration and stomatal conductance va...",False
3,100116,"{""100116"": [""Clade SH [clade]"", ""Clade M [clad...","{""100116"": [[""Clade M [clade]""], [""Clade SL [c...","{""100116"": []}",True,"This looks correct to me, as the grouping foll...",[],True
4,133255,"{""133255"": [""PSPTO3648 [gene]"", ""LPP\u10b2 [ge...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...",False,"Great job, the clustering is correct. The gene...","[[""**PLSP2A [gene]**"", ""PLSP2A gene(s) [gene]""...",True
...,...,...,...,...,...,...,...,...
995,31615,"{""31615"": [""auxin-mediated modulation of the m...","{""31615"": [[""auxin-mediated modulation of the ...","{""31615"": [[""auxin-dependent inhibition of end...",False,I think you are wrong. The grouping of the end...,"[[""**auxin-regulated endocytosis [process]**"",...",False
996,157227,"{""157227"": [""GmSAUR [gene]"", ""ZmSAUR1 (Zea may...","{""157227"": [[""GmSAUR [gene]""], [""StSAUR [gene]...","{""157227"": [[""ZmSAUR1 (Zea mays SAURs) [gene]""...",False,This looks correct to me. Each gene entity is ...,"[[""ZmSAUR1 (Zea mays SAURs) [gene]"", ""ZmSAUR1 ...",True
997,131559,"{""131559"": [""BtuC gene(s) [gene]"", ""cytochrome...","{""131559"": [[""BtuB gene(s) [gene]""], [""BtuC ge...","{""131559"": []}",True,"Great job, that looks correct to me. Each gene...",[],True
998,99091,"{""99091"": [""Genome sequencing of O. fragrans '...","{""99091"": [[""Genome sequencing of O. fragrans ...","{""99091"": [[""**O. fragrans 'Liuyejingui' (OFL)...",False,"Great job, the clustering is correct. You have...","[[""**O. fragrans 'Liuyejingui' (OFL) [organism...",True


In [39]:
output_csv['is_correct'].value_counts()

is_correct
True     809
False    191
Name: count, dtype: int64

In [40]:
# remove the rows where the Entity Name is 73713, 97119, 159677,664. I found some issues with the correctness of the clusters for these rows.

df = df[~df['Entity Name'].isin([73713, 97119, 159677,664])]
df

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item,user_prompt,o3_corrected_cluster,is_correct
0,58963,"{""58963"": [""Barrena et al. [researcher]"", ""Gar...","{""58963"": [[""Gardea-Torresdey et al. [research...","{""58963"": []}",True,"Great job, that looks correct because each res...",[],True
1,22775,"{""22775"": [""physiological transport of macro-m...","{""22775"": [[""substance that could be transloca...","{""22775"": [[""**transportation [phenotype]**"", ...",False,"Great job, the clusters appear to be correctly...","[[""**transportation [phenotype]**"", ""transport...",True
2,22996,"{""22996"": [""number of open stomata and amount ...","{""22996"": [[""**transpiration and stomatal cond...","{""22996"": [[""**transpiration and stomatal cond...",False,I think you are wrong. In the current clusteri...,"[[""**transpiration and stomatal conductance va...",False
3,100116,"{""100116"": [""Clade SH [clade]"", ""Clade M [clad...","{""100116"": [[""Clade M [clade]""], [""Clade SL [c...","{""100116"": []}",True,"This looks correct to me, as the grouping foll...",[],True
4,133255,"{""133255"": [""PSPTO3648 [gene]"", ""LPP\u10b2 [ge...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...",False,"Great job, the clustering is correct. The gene...","[[""**PLSP2A [gene]**"", ""PLSP2A gene(s) [gene]""...",True
...,...,...,...,...,...,...,...,...
995,31615,"{""31615"": [""auxin-mediated modulation of the m...","{""31615"": [[""auxin-mediated modulation of the ...","{""31615"": [[""auxin-dependent inhibition of end...",False,I think you are wrong. The grouping of the end...,"[[""**auxin-regulated endocytosis [process]**"",...",False
996,157227,"{""157227"": [""GmSAUR [gene]"", ""ZmSAUR1 (Zea may...","{""157227"": [[""GmSAUR [gene]""], [""StSAUR [gene]...","{""157227"": [[""ZmSAUR1 (Zea mays SAURs) [gene]""...",False,This looks correct to me. Each gene entity is ...,"[[""ZmSAUR1 (Zea mays SAURs) [gene]"", ""ZmSAUR1 ...",True
997,131559,"{""131559"": [""BtuC gene(s) [gene]"", ""cytochrome...","{""131559"": [[""BtuB gene(s) [gene]""], [""BtuC ge...","{""131559"": []}",True,"Great job, that looks correct to me. Each gene...",[],True
998,99091,"{""99091"": [""Genome sequencing of O. fragrans '...","{""99091"": [[""Genome sequencing of O. fragrans ...","{""99091"": [[""**O. fragrans 'Liuyejingui' (OFL)...",False,"Great job, the clustering is correct. You have...","[[""**O. fragrans 'Liuyejingui' (OFL) [organism...",True


In [42]:
output_csv

Unnamed: 0,cluster_id,is_correct,user_prompt,clusters
0,58963,True,"Great job, that looks correct because each res...",[]
1,22775,True,"Great job, the clusters appear to be correctly...","[[""**transportation [phenotype]**"", ""transport..."
2,22996,False,I think you are wrong. In the current clusteri...,"[[""**transpiration and stomatal conductance va..."
3,100116,True,"This looks correct to me, as the grouping foll...",[]
4,133255,True,"Great job, the clustering is correct. The gene...","[[""**PLSP2A [gene]**"", ""PLSP2A gene(s) [gene]""..."
...,...,...,...,...
995,31615,False,I think you are wrong. The grouping of the end...,"[[""**auxin-regulated endocytosis [process]**"",..."
996,157227,True,This looks correct to me. Each gene entity is ...,"[[""ZmSAUR1 (Zea mays SAURs) [gene]"", ""ZmSAUR1 ..."
997,131559,True,"Great job, that looks correct to me. Each gene...",[]
998,99091,True,"Great job, the clustering is correct. You have...","[[""**O. fragrans 'Liuyejingui' (OFL) [organism..."


In [46]:
# RUN THIS CELL for V5 only

#mapping cluster_id from output_csv_v2 with Entity Name add the reasoning column to df
output_csv_v2 = pd.read_csv("./o3mini_val_outs/o3mini_1k_v2_again.csv")
#create a dictionary with key as cluster_id and value as the reasoning
o3_reasoning_dict = {}
for index, row in output_csv_v2.iterrows():
    o3_reasoning_dict[row['cluster_id']] = row['reasoning']

#create a new column named reasoning in df
df['reasoning'] = df['Entity Name'].map(o3_reasoning_dict)
df = df.reset_index(drop=True)
df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reasoning'] = df['Entity Name'].map(o3_reasoning_dict)


Unnamed: 0,Entity Name,Input,Output,Output>1,one_item,user_prompt,o3_corrected_cluster,is_correct,reasoning
0,58963,"{""58963"": [""Barrena et al. [researcher]"", ""Gar...","{""58963"": [[""Gardea-Torresdey et al. [research...","{""58963"": []}",True,"Great job, that looks correct because each res...",[],True,
1,22775,"{""22775"": [""physiological transport of macro-m...","{""22775"": [[""substance that could be transloca...","{""22775"": [[""**transportation [phenotype]**"", ...",False,"Great job, the clusters appear to be correctly...","[[""**transportation [phenotype]**"", ""transport...",True,We formed clusters only when two or more entit...
2,22996,"{""22996"": [""number of open stomata and amount ...","{""22996"": [[""**transpiration and stomatal cond...","{""22996"": [[""**transpiration and stomatal cond...",False,I think you are wrong. In the current clusteri...,"[[""**transpiration and stomatal conductance va...",False,Our analysis of the stomatal conductance–relat...
3,100116,"{""100116"": [""Clade SH [clade]"", ""Clade M [clad...","{""100116"": [[""Clade M [clade]""], [""Clade SL [c...","{""100116"": []}",True,"This looks correct to me, as the grouping foll...",[],True,
4,133255,"{""133255"": [""PSPTO3648 [gene]"", ""LPP\u10b2 [ge...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...",False,"Great job, the clustering is correct. The gene...","[[""**PLSP2A [gene]**"", ""PLSP2A gene(s) [gene]""...",True,"In this gene list, only one multi‐entity clust..."
...,...,...,...,...,...,...,...,...,...
991,31615,"{""31615"": [""auxin-mediated modulation of the m...","{""31615"": [[""auxin-mediated modulation of the ...","{""31615"": [[""auxin-dependent inhibition of end...",False,I think you are wrong. The grouping of the end...,"[[""**auxin-regulated endocytosis [process]**"",...",False,The three process-typed items 'auxin-dependent...
992,157227,"{""157227"": [""GmSAUR [gene]"", ""ZmSAUR1 (Zea may...","{""157227"": [[""GmSAUR [gene]""], [""StSAUR [gene]...","{""157227"": [[""ZmSAUR1 (Zea mays SAURs) [gene]""...",False,This looks correct to me. Each gene entity is ...,"[[""ZmSAUR1 (Zea mays SAURs) [gene]"", ""ZmSAUR1 ...",True,The clustering was performed by strictly adher...
993,131559,"{""131559"": [""BtuC gene(s) [gene]"", ""cytochrome...","{""131559"": [[""BtuB gene(s) [gene]""], [""BtuC ge...","{""131559"": []}",True,"Great job, that looks correct to me. Each gene...",[],True,
994,99091,"{""99091"": [""Genome sequencing of O. fragrans '...","{""99091"": [[""Genome sequencing of O. fragrans ...","{""99091"": [[""**O. fragrans 'Liuyejingui' (OFL)...",False,"Great job, the clustering is correct. You have...","[[""**O. fragrans 'Liuyejingui' (OFL) [organism...",True,We first examined the full phrases and noted t...


In [47]:
must_present = df_nonempty_init_correct_same_length['Entity Name'].to_list()

In [48]:
len(must_present)

18

In [None]:
#randomly sample rows from the dataframe such that number of rows where is_correct column is True is equal in number to the rows where is_correct column is False (V1, V2, V3)
#also make sure the Entity Name must contain the must_present list
df_correct = df[(df['is_correct'] == True)]
df_must_present = df[df['Entity Name'].isin(must_present)]
df_incorrect = df[df['is_correct'] == False]


df_correct_sample = df_correct.sample(n=181, random_state=42) #change n to 50 for V1, V2
df_incorrect_sample = df_incorrect.sample(n=180, random_state=42) #change n to 50 for V1, V2


df_sample = pd.concat([df_correct_sample, df_incorrect_sample, df_must_present])
df_sample = df_sample.sample(frac=1).reset_index(drop=True)

df_sample




Unnamed: 0,Entity Name,Input,Output,Output>1,one_item,user_prompt,o3_corrected_cluster,is_correct
0,31465,"{""31465"": [""stomatal number establishment [pro...","{""31465"": [[""**stomatal patterning [process]**...","{""31465"": [[""**stomatal patterning [process]**...",False,I believe the clustering is not entirely corre...,"[[""**stomatal patterning [process]**"", ""Stomat...",False
1,10291,"{""10291"": [""phase of post-mitotic expansion [p...","{""10291"": [[""Extensive cell expansion [phenoty...","{""10291"": [[""**enhanced post-mitotic cell expa...",False,I think there is an issue with the grouping of...,"[[""**post-mitotic cell expansion [phenotype]**...",False
2,134031,"{""134031"": [""CARDIOLIPIN SYNTHASE (CLS) [gene]...","{""134031"": [[""CLS [gene]"", ""**CARDIOLIPIN SYNT...","{""134031"": [[""CLS [gene]"", ""**CARDIOLIPIN SYNT...",False,This looks incorrect to me. The entry 'CL synt...,"[[""**CARDIOLIPIN SYNTHASE (CLS) [gene]**"", ""CL...",False
3,13588,"{""13588"": [""shoot apical meristem size [phenot...","{""13588"": [[""**IFM size [phenotype]**"", ""size ...","{""13588"": [[""**IFM size [phenotype]**"", ""size ...",False,"Great job, that looks correct to me. Each clus...","[[""**IFM size [phenotype]**"", ""size of the inf...",True
4,152916,"{""152916"": [""Haspin candidate gene(s) [gene]"",...","{""152916"": [[""GID1 homologs [gene]""], [""Omp85 ...","{""152916"": [[""**HEIP1 homologs [gene]**"", ""hom...",False,"Great job, that looks correct because each gen...","[[""**HEIP1 homologs [gene]**"", ""homologs of HE...",True
...,...,...,...,...,...,...,...,...
374,124777,"{""124777"": [""additional lncRNAs [metabolite]"",...","{""124777"": [[""**long noncoding RNAs (lncRNAs) ...","{""124777"": [[""**long noncoding RNAs (lncRNAs) ...",False,"Great job, that looks correct to me. The gener...","[[""**long noncoding RNAs (lncRNAs) [metabolite...",True
375,8926,"{""8926"": [""differences between the three genot...","{""8926"": [[""**morphological differences [pheno...","{""8926"": [[""**morphological differences [pheno...",False,I think you are wrong because the grouping of ...,"[[""**morphological differences [phenotype]**"",...",False
376,74789,"{""74789"": [""Non-Race-specific Disease Resistan...","{""74789"": [[""**non-race specific disease resis...","{""74789"": [[""**non-race specific disease resis...",False,This looks incorrect to me because the groupin...,"[[""**non-race specific disease resistance (NDR...",False
377,49218,"{""49218"": [""deletions on chromosome 6 [genetic...","{""49218"": [[""Mutants defective in chromosomal ...","{""49218"": []}",True,"Great job, that looks correct because each ent...",[],True


In [22]:
df

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item,user_prompt,o3_corrected_cluster,is_correct
0,58963,"{""58963"": [""Gardea-Torresdey et al. [researche...","{""58963"": [[""Gardea-Torresdey et al. [research...","{""58963"": []}",True,"Great job, that looks correct because each res...",[],True
1,22775,"{""22775"": [""transport in planta [phenotype]"", ...","{""22775"": [[""substance that could be transloca...","{""22775"": [[""**transportation [phenotype]**"", ...",False,"Great job, the clusters appear to be correctly...","[[""**transportation [phenotype]**"", ""transport...",True
2,22996,"{""22996"": [""stomatal conductance, transpiratio...","{""22996"": [[""**transpiration and stomatal cond...","{""22996"": [[""**transpiration and stomatal cond...",False,I think you are wrong. In the current clusteri...,"[[""**transpiration and stomatal conductance va...",False
3,100116,"{""100116"": [""Clade M [clade]"", ""Clade U [clade...","{""100116"": [[""Clade M [clade]""], [""Clade SL [c...","{""100116"": []}",True,"This looks correct to me, as the grouping foll...",[],True
4,133255,"{""133255"": [""LPE1 [gene]"", ""SUPPRESSOR OF LLP1...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...","{""133255"": [[""**PLSP2A [gene]**"", ""PLSP2A gene...",False,"Great job, the clustering is correct. The gene...","[[""**PLSP2A [gene]**"", ""PLSP2A gene(s) [gene]""...",True
...,...,...,...,...,...,...,...,...
995,31615,"{""31615"": [""auxin-regulated endocytosis [proce...","{""31615"": [[""auxin-mediated modulation of the ...","{""31615"": [[""auxin-dependent inhibition of end...",False,I think you are wrong. The grouping of the end...,"[[""**auxin-regulated endocytosis [process]**"",...",False
996,157227,"{""157227"": [""GmSAUR [gene]"", ""SlSAUR1-like [ge...","{""157227"": [[""GmSAUR [gene]""], [""StSAUR [gene]...","{""157227"": [[""ZmSAUR1 (Zea mays SAURs) [gene]""...",False,This looks correct to me. Each gene entity is ...,"[[""ZmSAUR1 (Zea mays SAURs) [gene]"", ""ZmSAUR1 ...",True
997,131559,"{""131559"": [""BtuC gene(s) [gene]"", ""cytochrome...","{""131559"": [[""BtuB gene(s) [gene]""], [""BtuC ge...","{""131559"": []}",True,"Great job, that looks correct to me. Each gene...",[],True
998,99091,"{""99091"": [""OFL [organism]"", ""D. oleifera geno...","{""99091"": [[""Genome sequencing of O. fragrans ...","{""99091"": [[""**O. fragrans 'Liuyejingui' (OFL)...",False,"Great job, the clustering is correct. You have...","[[""**O. fragrans 'Liuyejingui' (OFL) [organism...",True


In [None]:
#Sample 100 for training and 25 for testing with random state 42 (V3)
df_train = df_sample.sample(n=350, random_state=42)
df_test = df_sample[~df_sample.index.isin(df_train.index)]

In [49]:
df = df.reset_index(drop=True)
#take 960 rows randomly from df (For V4)
df_train = df.sample(n=960, random_state=42)
df_test = df[~df.index.isin(df_train.index)]

In [50]:
len(df_train), len(df_test)

(960, 36)

In [51]:
len(df)

996

In [52]:
df_train['is_correct'].value_counts()

is_correct
True     779
False    181
Name: count, dtype: int64

In [53]:
df_test['is_correct'].value_counts()

is_correct
True     29
False     7
Name: count, dtype: int64

In [None]:
# For V1,V2,V3,V4

detailed_description = """Here are two example inputs, reasoning for clustering, and the corresponding outputs to help you understand the task better:
Input-1
{
  "0": '['Meiotic block [phenotype]', 'early arrest [phenotype]', 'arrest of meiotic progression in anaphase II [phenotype]', 'arrested zygotic divisions [phenotype]', 'meiotic arrest [phenotype]', 'delayed/arrested meiosis [phenotype]', 'pachytene arrest [phenotype]', 'block in meiosis prophase I [phenotype]', 'male meiosis until the end of the first division [phenotype]', 'meiotic prophase arrest [phenotype]', 'arrest in late prophase I [phenotype]', 'arrest at the end of meiosis I [phenotype]', 'leptotene arrest [phenotype]', 'meiosis I arrest [phenotype]', 'arresting the first mitosis during gametogenesis [phenotype]', 'absence of meiotic arrest [phenotype]', 'meiotic arrest phenotype [phenotype]', 'meiotic arrest at telophase I [phenotype]', 'termination of meiosis after anaphase I [phenotype]', 'premature termination of meiosis after anaphase I [phenotype]', 'mitotic arrest during female gametogenesis [phenotype]', 'arrest after meiosis I [phenotype]', 'meiotic arrest at pachytene [phenotype]', 'arrested endosperm nuclear divisions [phenotype]', 'meiotic arrest in anaphase II [phenotype]', 'meiotic division stop [phenotype]', 'arrest of the first mitotic division in gametogenesis [phenotype]', 'FNM half-stop [phenotype]', 'males arresting in the middle of prophase I [phenotype]', 'arresting prior to the first mitotic division [phenotype]']'
}
Input-1 → Output-1 [REASONING]

1) Generic Meiotic Arrest

Cluster:
[ "Meiotic block [phenotype]", "**meiotic arrest [phenotype]**", "meiotic arrest phenotype [phenotype]", "meiotic division stop [phenotype]" ]
• Incorrect: Grouping these separately as if they refer to different stages would obscure their identical meaning.
• Correct: Recognize they all broadly indicate a total halt in meiosis at an unspecified stage.
• Important: These terms are functionally synonymous, capturing any generic failure of meiotic progression.
• Show in the output?: Yes, because the length of cluster is greater than 1.
• Cluster representative: I selected 'meiotic arrest [phenotype]' as the representative because it is the most concise and commonly used term to describe a complete halt in the meiotic process. Unlike the other phrases which include additional descriptors or less formal wording, 'meiotic arrest' clearly and unambiguously captures the essential biological event, making it the best exemplar for the cluster.


2) Delayed / Arrested Meiosis

Cluster:
[ "delayed/arrested meiosis [phenotype]" ]
• Incorrect: Merging it with generic meiotic arrest overlooks the “delayed” aspect, which implies partial progression.
• Correct: Keep it separate because it specifies a slow or partial block before a final stall.
• Important: “Delayed” suggests some chromosomes or cells proceed further than in an outright immediate block.
• Show in the output?: No, because the length of cluster is equal to 1.

3) Absence of Meiotic Arrest

Cluster:
[ "absence of meiotic arrest [phenotype]" ]
• Incorrect: Combining with any arrest group would contradict its meaning.
• Correct: Maintain it as the negative counterpart, meaning no block occurs.
• Important: This phenotype is crucial for comparisons, showing normal meiotic completion instead of a stoppage.
• Show in the output?: No, because the length of cluster is equal to 1.

4) Prophase I Arrest (Broad)

Cluster:
[ "block in meiosis prophase I [phenotype]", "meiotic prophase arrest [phenotype]" ]
• Incorrect: Splitting them into sub-sub-stages if the label doesn’t specify.
• Correct: They both emphasize an arrest somewhere in prophase I, without detailing the exact sub-stage (leptotene, pachytene, etc.).
• Important: This captures a prophase I blockade in general, distinct from specific sub-stage arrests.
• Show in the output?: Yes, because the length of cluster is greater than 1.
• Cluster representative: I selected 'meiotic prophase arrest [phenotype]' as the representative because it succinctly captures the core biological event of an arrest occurring during prophase, without the additional wording found in the alternative. This clarity makes it the most direct and precise exemplar for the cluster.


5) Male Arrest in Mid–Prophase I

Cluster:
[ "males arresting in the middle of prophase I [phenotype]" ]
• Incorrect: Mixing with general prophase I arrest loses the male-specific nature and midpoint detail.
• Correct: Keep it unique because it adds a sex specification (male) and timing (mid-prophase I).
• Important: This addresses sex-specific contexts where the XY body or other male meiotic events fail around zygotene/pachytene.
• Show in the output?: No, because the length of cluster is equal to 1.

6) Early Arrest (Undefined Sub-Stage)

Cluster:
[ "early arrest [phenotype]" ]
• Incorrect: Equating “early” to a named stage like leptotene or zygotene.
• Correct: It must stand alone since it lacks a formal sub-stage but implies an initial block.
• Important: The label indicates an arrest that happens before mid- or late-stage phenomena but is otherwise unspecified.
• Show in the output?: No, because the length of cluster is equal to 1.

7) Leptotene Arrest

Cluster:
[ "leptotene arrest [phenotype]" ]
• Incorrect: Merging it with “early arrest” would lose the sub-stage clarity.
• Correct: “Leptotene” is a well-defined earliest sub-stage of prophase I, deserving its own node.
• Important: This precisely pinpoints where chromosomes start to condense yet fail to progress.
• Show in the output?: No, because the length of cluster is equal to 1.

8) Pachytene Arrest (Synonyms)

Cluster:
[ "**pachytene arrest [phenotype]**", "meiotic arrest at pachytene [phenotype]" ]
• Incorrect: Splitting these two would ignore that they describe the exact same block.
• Correct: They both name the pachytene sub-stage, so they cluster together.
• Important: Pachytene is when homologs are fully synapsed, so arrest here is distinct from earlier or later prophase I phases.
• Show in the output?: Yes, because the length of cluster greater than 1.
• Cluster representative: I selected 'pachytene arrest [phenotype]' as the representative because it is more concise and directly highlights the specific stage of meiotic arrest without additional qualifiers. This succinct phrasing clearly captures the biological event at the pachytene stage, making it the best exemplar for the cluster.

9) Late Prophase I Arrest

Cluster:
[ "arrest in late prophase I [phenotype]" ]
• Incorrect: Grouping with general prophase I arrests would lose the “late” distinction.
• Correct: Keep it separate because it suggests diplotene or diakinesis sub-stages.
• Important: Identifies that the block occurs after chromosome synapsis (pachytene) but before metaphase I.
• Show in the output?: No, because the length of cluster is equal to 1.

10) Meiosis I Arrest (Broad)

Cluster:
[ "meiosis I arrest [phenotype]" ]
• Incorrect: Conflating with prophase I or end-of-meiosis-I arrests.
• Correct: This label is intentionally broad for a block anywhere in the entire first meiotic division.
• Important: Distinct from narrower arrests at anaphase I or telophase I.
• Show in the output?: No, because the length of cluster is equal to 1.

11) End-of-Meiosis-I Arrest

Cluster:
[ "arrest at the end of meiosis I [phenotype]", "arrest after meiosis I [phenotype]", "**meiotic arrest at telophase I [phenotype]**" ]
• Incorrect: Mixing with generic “meiosis I arrest” might obscure that these specifically reach telophase I or just beyond.
• Correct: All describe an arrest that specifically coincides with or follows telophase I.
• Important: They finish prophase–anaphase I but fail to transition into or complete meiosis II.
• Show in the output?: Yes, because the length of cluster is greater than 1.
• Cluster representative: I selected 'meiotic arrest at telophase I [phenotype]' as the representative because it explicitly specifies the stage of arrest, leaving no ambiguity about the timing within meiosis. By clearly indicating telophase I, it provides a precise and biologically accurate descriptor compared to the more ambiguous alternatives present in the cluster.

12) Male Meiosis Until End of First Division

Cluster:
[ "male meiosis until the end of the first division [phenotype]" ]
• Incorrect: Merging with “arrest at the end of meiosis I” would ignore the explicit mention of male gametogenesis.
• Correct: It parallels an end-of-meiosis-I block but is sex-specific.
• Important: Reflects male-specific phenotypes where meiosis I completes in a partial sense but doesn’t proceed to meiosis II.
• Show in the output?: No, because the length of cluster is equal to 1.

13) Post–Anaphase I Termination

Cluster:
[ "**termination of meiosis after anaphase I [phenotype]**", "premature termination of meiosis after anaphase I [phenotype]" ]
• Incorrect: Combining with end-of-meiosis-I arrests (telophase I) might overlook the specific time point (right after anaphase I).
• Correct: These highlight that meiosis halts immediately following homolog separation in anaphase I.
• Important: “Premature termination” still implies the same staging (post-anaphase I), so they cluster together.
• Show in the output?: Yes, because the length of cluster is greater than 1.
• Cluster representative: I selected 'termination of meiosis after anaphase I [phenotype]' as the representative because it provides a clear and concise description of the cessation of meiosis immediately following anaphase I. The absence of the qualifier 'premature' avoids additional nuance regarding timing, making it a more universally applicable and straightforward term to represent the cluster.

14) Anaphase II Arrest (Synonyms)

Cluster:
[ "arrest of meiotic progression in anaphase II [phenotype]", "**meiotic arrest in anaphase II [phenotype]**" ]
• Incorrect: Grouping with anaphase I or telophase I arrests would misrepresent the division stage.
• Correct: Both pinpoint the second meiotic anaphase, so they are true synonyms.
• Important: This arrest means meiosis I completed successfully, but the cell fails during separation of sister chromatids.
• Show in the output?: Yes, because the length of cluster is greater than 1.
• Cluster representative: I selected 'meiotic arrest in anaphase II [phenotype]' as the representative because it succinctly and directly identifies the specific phase (anaphase II) at which the arrest occurs. Its concise phrasing avoids unnecessary complexity, making it the clearest descriptor of the biological event in this cluster.

15) Arrested Zygotic Divisions

Cluster:
[ "arrested zygotic divisions [phenotype]" ]
• Incorrect: Folding into meiotic blocks misses that zygotic divisions are post-fertilization mitoses.
• Correct: Keep separate, as this block is in the earliest embryo after fertilization.
• Important: Distinguishing embryonic arrests from gametogenic or meiotic ones is crucial in developmental contexts.
• Show in the output?: No, because the length of cluster is equal to 1.

16) Arrested Endosperm Nuclear Divisions

Cluster:
[ "arrested endosperm nuclear divisions [phenotype]" ]
• Incorrect: Combining with zygotic divisions lumps distinct post-fertilization tissues (embryo vs. endosperm).
• Correct: Endosperm is a separate tissue formed post-fertilization (often triploid), so it merits its own category.
• Important: In many plants, endosperm divides separately, so an arrest here is unique from zygotic embryonic arrest.
• Show in the output: No, because the length of cluster is equal to 1.

17) First Mitotic Division in Gametogenesis (Synonyms)

Cluster:
[ "arresting the first mitosis during gametogenesis [phenotype]", "**arrest of the first mitotic division in gametogenesis [phenotype]**", "FNM half-stop [phenotype]" ]
• Incorrect: Splitting these fails to see that all reference halting the very first post-meiotic mitosis.
• Correct: They describe the same stage (first mitosis in gametogenesis), so they are synonyms.
• Important: “FNM half-stop” is shorthand for the same phenomenon, not a different event.
• Show in the output: Yes, because the length of cluster is greater than 1.
• Cluster representative: I selected 'arrest of the first mitotic division in gametogenesis [phenotype]' as the representative because it is the most precise and descriptive term. It clearly specifies the process (mitotic division) and context (gametogenesis), avoiding the informal shorthand of 'FNM half-stop' and the less formal phrasing of 'arresting the first mitosis during gametogenesis.' This precision makes it the best exemplar for the cluster.

18) Arrest Prior to First Mitotic Division

Cluster:
[ "arresting prior to the first mitotic division [phenotype]" ]
• Incorrect: Assuming it is the same as “arresting the first mitosis” would confuse the actual onset of that mitosis.
• Correct: This indicates cells never even enter mitosis.
• Important: Distinguishing “before it starts” from “during the division” can be crucial for understanding gametogenesis defects.
• Show in the output: No, because the length of cluster is equal to 1.

19) Mitotic Arrest During Female Gametogenesis

Cluster:
[ "mitotic arrest during female gametogenesis [phenotype]" ]
• Incorrect: Merging with the “first mitosis” cluster might ignore that multiple mitotic divisions can occur in female lines.
• Correct: A female-specific block in some mitotic division (not necessarily the first).
• Important: Sex specificity and indefinite mitotic stage set it apart from a clearly labeled “first mitosis” arrest.
• Show in the output: No, because the length of cluster is equal to 1.
Output-1
{
"0": [
[
"Meiotic block [phenotype]",
"**meiotic arrest [phenotype]**",
"meiotic arrest phenotype [phenotype]",
"meiotic division stop [phenotype]"
],
[
"block in meiosis prophase I [phenotype]",
"**meiotic prophase arrest [phenotype]**"
],
[
"**pachytene arrest [phenotype]**",
"meiotic arrest at pachytene [phenotype]"
],
[
"arrest at the end of meiosis I [phenotype]",
"arrest after meiosis I [phenotype]",
"**meiotic arrest at telophase I [phenotype]**"
],
[
"**termination of meiosis after anaphase I [phenotype]**",
"premature termination of meiosis after anaphase I [phenotype]"
],
[
"arrest of meiotic progression in anaphase II [phenotype]",
"**meiotic arrest in anaphase II [phenotype]**"
],
[
"arresting the first mitosis during gametogenesis [phenotype]",
"**arrest of the first mitotic division in gametogenesis [phenotype]**",
"FNM half-stop [phenotype]"
]
]
}
Input-2
{
  "1": '[
    "Salt-stress severity [treatment]",
    "high NaCl stress [treatment]",
    "potassium deprivation stress [treatment]",
    "salt-stress response [treatment]",
    "salt stress tolerance [treatment]",
    "heat and salt stress conditions [treatment]",
    "prolonged levels of salt stress [treatment]",
    "recovery from salt stress [treatment]",
    "salt stress assay [treatment]",
    "salt stress signaling pathways [treatment]",
    "gradual salt stress treatments [treatment]",
    "salt and low temperature stresses [treatment]",
    "salt and silicon stresses [treatment]"
  ]'
}
Input-2 → Output-2 [REASONING]

1) Salt-Stress Core

Clustered Terms
• “Salt-stress severity [treatment]”
• “**high NaCl stress [treatment]**”
• “salt-stress response [treatment]”
• “salt stress tolerance [treatment]”
• “prolonged levels of salt stress [treatment]”
• “recovery from salt stress [treatment]”
• “salt stress assay [treatment]”
• “salt stress signaling pathways [treatment]”
• “gradual salt stress treatments [treatment]”

Incorrect: Splitting “NaCl” from “salt” would be biologically misleading since NaCl is the chemical basis of most salt stress.
Correct: Recognize all are purely salt-based conditions; “NaCl” is the explicit form, but it is still “salt.”
Important: These labels measure or manipulate salt-stress conditions alone (no other stress factor).
• Show in the output?: Yes, because the length of cluster is greater than 1.
Cluster representative: I selected 'high NaCl stress [treatment]' as the representative because it directly encapsulates the core concept of salt stress by explicitly naming the chemical agent (NaCl) responsible for inducing the stress condition. This term is both succinct and unambiguous, avoiding additional qualifiers (like severity, response, or tolerance) that could shift the focus away from the primary salt stress condition.

2) Unique Stress: Potassium Deprivation

Clustered Term
• “potassium deprivation stress [treatment]”

Incorrect: Combining this with salt-based treatments implies overlapping ionic stress without specificity.
Correct: Keep it separate because it focuses on K+ deficiency rather than NaCl excess.
Important: Potassium starvation is a distinct abiotic stress requiring separate interpretation and management from salt stress.
• Show in the output?: No, because the length of cluster is equal to 1.

3) Heat + Salt Stress

Clustered Term
• “heat and salt stress conditions [treatment]”

Incorrect: Merging with the salt core group would lose the additional heat component.
Correct: Keep it in its own cluster because it involves two distinct stressors (heat + salt).
Important: Many experiments examine combined stresses differently than single-stress treatments.
• Show in the output?: No, because the length of cluster is equal to 1.

4) Salt + Low Temperature

Clustered Term
• “salt and low temperature stresses [treatment]”

Incorrect: Folding it into a single “salt” cluster disregards the cold factor.
Correct: Identify that it specifically tests tolerance or response to dual stress: salt and cold.
Important: Understanding multi-stress interactions is crucial for breeding or experimental design.
• Show in the output?: No, because the length of cluster is equal to 1.

5) Salt + Silicon

Clustered Term
• “salt and silicon stresses [treatment]”

Incorrect: Grouping with plain salt stress lumps unique “silicon” involvement into generic salt.
Correct: Keep separate because it’s salt + another factor (silicon) that could mitigate or alter salt stress.
Important: Silicon is sometimes used to ameliorate salt stress, so it forms a distinct combined treatment.
• Show in the output: No, because the length of cluster is equal to 1.

output-2
{
  "1": [
    [
      "Salt-stress severity [treatment]",
      "**high NaCl stress [treatment]**",
      "salt-stress response [treatment]",
      "salt stress tolerance [treatment]",
      "prolonged levels of salt stress [treatment]",
      "recovery from salt stress [treatment]",
      "salt stress assay [treatment]",
      "salt stress signaling pathways [treatment]",
      "gradual salt stress treatments [treatment]"]
  ]
}
"""

In [58]:
import ast

def split_dict(d, chunk_size=100):
    keys = list(d.keys())
    chunks = []
    for i in range(0, len(keys), chunk_size):
        chunk_keys = keys[i:i+chunk_size]
        chunk_dict = {k: d[k] for k in chunk_keys}
        chunks.append(chunk_dict)
    return chunks

def flatten_bracketed_strings(value_list):
    """
    Takes a list. For each item:
      - If item is a string that looks like '[...]', parse it and extend the list.
      - Otherwise, keep as is.
    Returns a new flattened list.
    """
    new_list = []
    for val in value_list:
        if (
            isinstance(val, str) 
            and val.strip().startswith("[") 
            and val.strip().endswith("]")
        ):
            # Attempt to parse the bracketed string
            try:
                parsed = ast.literal_eval(val)  # convert string -> Python list
                if isinstance(parsed, list):
                    new_list.extend(parsed)  # flatten
                else:
                    # If it's not a list, just append as-is
                    new_list.append(val)
            except (SyntaxError, ValueError):
                # If parsing fails, keep original
                new_list.append(val)
        else:
            new_list.append(val)
    return new_list





In [31]:
#For training 
df = df_train.copy()
#For testing
#df = df_test.copy()

In [32]:
df.columns  

Index(['Entity Name', 'Input', 'Output', 'Output>1', 'one_item', 'user_prompt',
       'o3_corrected_cluster', 'is_correct'],
      dtype='object')

In [211]:
entity_dict = {}
for item in df["Output"]:
    try:
        parsed_dict = json.loads(item)
        entity_dict.update(parsed_dict)
    except json.JSONDecodeError:
        print(f"Skipping invalid JSON: {item}")

In [41]:
df

Unnamed: 0,Entity Name,Input,Output,Output>1,one_item,user_prompt,o3_corrected_cluster,is_correct,reasoning
630,23162,"{""23162"": [""Cross talk [phenotype]"", ""node for...","{""23162"": [[""signal transduction cross-talk [p...","{""23162"": [[""**Cross talk [phenotype]**"", ""cro...",False,This looks correct to me because each cluster ...,"[[""**Cross talk [phenotype]**"", ""cross-talks [...",True,We first identified a large group of entities ...
367,135617,"{""135617"": [""gene(s) related to respiratory el...","{""135617"": [[""coordinately regulated gene(s) f...","{""135617"": [[""coordinately regulated gene(s) f...",False,This looks incorrect to me. Two issues stand o...,"[[""**gene(s) involved in the alternative respi...",False,After careful evaluation following the cluster...
485,122763,"{""122763"": [""overlapping gene(s) in apple chro...","{""122763"": [[""**orthologous regions [phenotype...","{""122763"": [[""**orthologous regions [phenotype...",False,"This looks incorrect to me, because some clust...","[[""**Homologous regions [phenotype]**"", ""Homol...",False,The clustering was performed by first consider...
290,151814,"{""151814"": [""locus tms5 [gene]"", ""tms5 locus [...","{""151814"": [[""orf79 [gene]""], [""SLG1indica [ge...","{""151814"": [[""**asp-lsl [gene]**"", ""ASP-LSL lo...",False,"Great job, the clusters are correctly formed. ...","[[""**asp-lsl [gene]**"", ""ASP-LSL locus [gene]""...",True,The clustering was performed by strictly group...
72,131773,"{""131773"": [""VvNPF3.2 [gene]"", ""VviNPF2.1 [gen...","{""131773"": [[""VviNPF2.1 [gene]""], [""**VvNPF3.2...","{""131773"": [[""**VvNPF3.2 [gene]**"", ""grapevine...",False,"Great job, that looks correct because the enti...","[[""**VvNPF3.2 [gene]**"", ""grapevine NFP3.2 [ge...",True,The gene entities were clustered based on exac...
...,...,...,...,...,...,...,...,...,...
345,130937,"{""130937"": [""AtEDEM1 homolog [gene]"", ""AtEDEM1...","{""130937"": [[""AtEDEM1/AtEDEM2 [gene]"", ""two ED...","{""130937"": [[""AtEDEM1/AtEDEM2 [gene]"", ""two ED...",False,"Great job, the grouping is spot‐on. The cluste...","[[""AtEDEM1/AtEDEM2 [gene]"", ""two EDEM homologs...",True,The gene list was first divided into two major...
241,63849,"{""63849"": [""catalytic manganese cluster [prote...","{""63849"": [[""DCH model [phenotype]""], [""**mang...","{""63849"": [[""**manganese cluster [protein comp...",False,"Great job, that looks correct to me. You have ...","[[""**manganese cluster [protein complex]**"", ""...",True,The clustering was executed by strictly applyi...
13,81737,"{""81737"": [""wt MP [protein]"", ""TMEM16A wild-ty...","{""81737"": [[""native ScHal3 [protein]""], [""nati...","{""81737"": [[""**wt HcPro [protein]**"", ""Wild-ty...",False,"Great job, the clustering looks correct to me....","[[""**wt HcPro [protein]**"", ""Wild-type (wt) Hc...",True,"In this dataset, each entry refers to a specif..."
315,123050,"{""123050"": [""poly(A)-ending species [rna]"", ""t...","{""123050"": [[""polyadenylated RNA 3' ends [geno...","{""123050"": [[""polyadenylated RNA 3' ends [geno...",False,"Great job, the clustering appears to be correc...","[[""polyadenylated RNA 3' ends [genomic region]...",True,We examined all 26 entities using their full p...


In [56]:
variable_input_enitity_dict = {}
for item in df["Input"]:
    try:
        parsed_dict = json.loads(item)
        variable_input_enitity_dict.update(parsed_dict)
    except json.JSONDecodeError:
        print(f"Skipping invalid JSON: {item}")

variable_output_enitity_dict = {}
for item in df["Output"]:
    try:
        parsed_dict = json.loads(item)
        variable_output_enitity_dict.update(parsed_dict)
    except json.JSONDecodeError:
        print(f"Skipping invalid JSON: {item}")

variable_o3_user_prompt_enitity_dict =  df.set_index('Entity Name')['user_prompt'].to_dict()

variable_o3mini_corrected_cluster_enitity_dict =  df.set_index('Entity Name')['o3_corrected_cluster'].to_dict()

#variable_reasoning_enitity_dict =  df.set_index('Entity Name')['reasoning'].to_dict() # run this for V5 only


In [59]:
#create chunks for each variable dictonary

chunks_variable_input_entity = split_dict(variable_input_enitity_dict,1)
chunks_variable_output_entity = split_dict(variable_output_enitity_dict,1)
chunks_variable_o3_user_prompt = split_dict(variable_o3_user_prompt_enitity_dict,1)
chunks_variable_o3mini_corrected_cluster = split_dict(variable_o3mini_corrected_cluster_enitity_dict,1)
#chunks_variable_o3mini_reasoning = split_dict(variable_reasoning_enitity_dict,1) # run this for V5 only
#print length of each chunk
print('length of chunks_variable_input_entity:', len(chunks_variable_input_entity))
print('length of chunks_variable_output_entity:', len(chunks_variable_output_entity))
print('length of chunks_variable_o3_user_prompt:', len(chunks_variable_o3_user_prompt))
print('length of chunks_variable_o3mini_corrected_cluster:', len(chunks_variable_o3mini_corrected_cluster))
#print('length of chunks_variable_o3mini_reasoning:', len(chunks_variable_o3mini_reasoning)) # run this for V5 only

length of chunks_variable_input_entity: 996
length of chunks_variable_output_entity: 996
length of chunks_variable_o3_user_prompt: 996
length of chunks_variable_o3mini_corrected_cluster: 996


In [60]:
variable_o3mini_corrected_cluster_enitity_dict =  df.set_index('Entity Name')['o3_corrected_cluster'].to_dict()
chunks_variable_o3mini_corrected_cluster = split_dict(variable_o3mini_corrected_cluster_enitity_dict,1)

### Creating jsonl file for V1

In [172]:
import jsonlines
output_path = '/'
out_file = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_finetuning/input_data/4omini_testset_25.jsonl'
count_examples = 0
with jsonlines.open(out_file, mode='w') as writer:
    # Loop through each different chhunk and create a line in jsonl file
    for i in range(len(chunks_variable_input_entity)):
        line = {
                "messages": [

                        {
                            "role": "system",
                            "content": system_message
                        },

                        {
                            "role": "user",
                            "content": json.dumps(chunks_variable_input_entity[i], indent=2)
                       
                        },

                        {
                            "role": "assistant",
                            "content": json.dumps(chunks_variable_output_entity[i], indent=2)
                        },

                        {
                            "role": "user",
                            "content": json.dumps(chunks_variable_o3_user_prompt[i], indent=2)
                        },

                        {
                            "role": "assistant",
                            "content": json.dumps(chunks_variable_o3mini_corrected_cluster[i], indent=2)
                        }
                ]
                }
        # draw a float between 0 and 1
        include_detailed_description = random.random() < 0.2
       #insert detailed description at index 1 if include_detailed_description is true
        if include_detailed_description:
            line["messages"].insert(1, {"role": "user", "content": detailed_description})
        writer.write(line)
        count_examples += 1
print('Done writing {} examples to {}'.format(count_examples, out_file))



Done writing 25 examples to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_finetuning/input_data/4omini_testset_25.jsonl


### Jsonl file creation for V2, V3, V4

In [218]:
import jsonlines
output_path = '/'
out_file = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_finetuning/input_data/4omini_testset_v4_36.jsonl'
count_examples = 0
with jsonlines.open(out_file, mode='w') as writer:
    # Loop through each different chhunk and create a line in jsonl file
    for i in range(len(chunks_variable_input_entity)):
        line = {
                "messages": [

                        {
                            "role": "system",
                            "content": system_message
                        },

                        {
                            "role": "user",
                            "content": detailed_description
                        },

                        {
                            "role": "user",
                            "content": json.dumps(chunks_variable_input_entity[i], indent=2)
                       
                        }, 

                        {
                            "role": "assistant",
                            "content": json.dumps(chunks_variable_o3mini_corrected_cluster[i], indent=2)
                        }
                ]
                }

        writer.write(line)
        count_examples += 1
print('Done writing {} examples to {}'.format(count_examples, out_file))



Done writing 36 examples to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_finetuning/input_data/4omini_testset_v4_36.jsonl


### jsonl file creation for V5 (this way we can avoid Backslahses)

In [115]:
# convert list of dictionaries, chunks_variable_input_entity into a single dictionary
variable_input_entity_dict = {}
for d in chunks_variable_input_entity:
    variable_input_entity_dict.update(d)

# same for reasoning 
variable_o3mini_reasoning_dict = {}
for d in chunks_variable_o3mini_reasoning:
    variable_o3mini_reasoning_dict.update(d)

# same for corrected cluster
variable_o3mini_corrected_cluster_dict = {}
for d in chunks_variable_o3mini_corrected_cluster:
    variable_o3mini_corrected_cluster_dict.update(d)


In [122]:
import jsonlines
import json
output_path = '/'
out_file = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_finetuning/input_data/4omini_trainset_v5_600.jsonl'
count_examples = 0
with jsonlines.open(out_file, mode='w') as writer:
    # Loop through each different chhunk and create a line in jsonl file
    for i in range(len(chunks_variable_input_entity)):
        cluster_id = list(chunks_variable_input_entity[i].keys())[0]
 
        line = {
                "messages": [

                        {
                            "role": "system",
                            "content": system_message
                        },

                        {
                        "role": "user",
                        "content": f"{cluster_id}: {variable_input_entity_dict[cluster_id]}"
                        }, 

                       {
                        "role": "assistant",
                        "content": (
                            f"{cluster_id}:\n"
                            f"reasoning: {variable_o3mini_reasoning_dict[int(cluster_id)]},\n"
                            f"clusters: {eval(variable_o3mini_corrected_cluster_dict[int(cluster_id)])}"
                        )
                    }
                ]
                }

        writer.write(line)
        count_examples += 1
print('Done writing {} examples to {}'.format(count_examples, out_file))



Done writing 600 examples to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_finetuning/input_data/4omini_trainset_v5_600.jsonl
