In [None]:
import pandas as pd

# Load the k-means cluster table from its parquet file (pyarrow engine)
kmeans_clusters_parquet = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/multisect_kmeans_all_clusters_max_clust_30_type_disamb_missing.parquet'
df = pd.read_parquet(kmeans_clusters_parquet, engine='pyarrow')


In [11]:
# Keep only clusters with more than one member, then renumber the rows from 0
has_multiple_members = df['cluster_size'] > 1
df = df[has_multiple_members].reset_index(drop=True)
len(df)

893

In [12]:
def _to_plain_list(cell):
    """Return ``cell.tolist()`` when the value offers that method, otherwise the value unchanged."""
    if hasattr(cell, "tolist"):
        return cell.tolist()
    return cell


# Presumably the parquet reader yields array-like cells here; convert them to plain lists
df['node_ids_in_cluster'] = df['node_ids_in_cluster'].apply(_to_plain_list)

In [6]:
# Shuffle every row with a fixed seed so the order is reproducible, then renumber from 0
seed = 42
shuffled_rows = df.sample(frac=1, random_state=seed)
df_randomized_rows = shuffled_rows.reset_index(drop=True)
df_randomized_rows

Unnamed: 0,cluster_id,cluster_size,node_ids_in_cluster
0,174666,6,"[ancestral stress treatment [environment], anc..."
1,4934,3,"[substitution of Val by Met [mutation], Tyr to..."
2,329097,27,"[pairwise alignments [phenotype], Comparative ..."
3,148225,6,"[known orthologs of A. thaliana [organism], co..."
4,281542,23,"[Erianthus arundinaceus [organism], E. sativa ..."
...,...,...,...
393887,318199,6,"[PIF6-beta [rna], loss of EIN4 [gene], Loss of..."
393888,449343,10,"[MADS-box TFs [gene], TFs from the MADS-box pr..."
393889,160525,3,[the motor domain with ATPase activity [protei...
393890,179048,2,"[wider repla [phenotype], larger sizes of reco..."


In [7]:
# Cut the shuffled frame into consecutive 50k-row samples.
# A stop of None leaves the last slice open-ended, so it takes all remaining rows.
_sample_bounds = [
    (0, 50000),        # 0-50k
    (50000, 100000),   # 50k-100k
    (100000, 150000),  # 100k-150k
    (150000, 200000),  # 150k-200k
    (200000, 250000),  # 200k-250k
    (250000, 300000),  # 250k-300k
    (300000, 350000),  # 300k-350k
    (350000, None),    # 350k-393k
]
(
    df_sample_0_50k,
    df_sample_50k_100k,
    df_sample_100k_150k,
    df_sample_150k_200k,
    df_sample_200k_250k,
    df_sample_250k_300k,
    df_sample_300k_350k,
    df_sample_350k_393k,
) = [df_randomized_rows.iloc[start:stop] for start, stop in _sample_bounds]

In [9]:
# Row count of every sample, in slice order
tuple(
    len(sample)
    for sample in (
        df_sample_0_50k,
        df_sample_50k_100k,
        df_sample_100k_150k,
        df_sample_150k_200k,
        df_sample_200k_250k,
        df_sample_250k_300k,
        df_sample_300k_350k,
        df_sample_350k_393k,
    )
)

(50000, 50000, 50000, 50000, 50000, 50000, 50000, 43892)

In [10]:
# Build one {cluster_id: node_ids_in_cluster} dict per sample

def _cluster_members_by_id(sample):
    """Map every cluster_id in ``sample`` to that row's node_ids_in_cluster value."""
    members = sample.set_index('cluster_id')['node_ids_in_cluster']
    return members.to_dict()


entity_list_0_50k = _cluster_members_by_id(df_sample_0_50k)
entity_list_50k_100k = _cluster_members_by_id(df_sample_50k_100k)
entity_list_100k_150k = _cluster_members_by_id(df_sample_100k_150k)
entity_list_150k_200k = _cluster_members_by_id(df_sample_150k_200k)
entity_list_200k_250k = _cluster_members_by_id(df_sample_200k_250k)
entity_list_250k_300k = _cluster_members_by_id(df_sample_250k_300k)
entity_list_300k_350k = _cluster_members_by_id(df_sample_300k_350k)
entity_list_350k_393k = _cluster_members_by_id(df_sample_350k_393k)

In [11]:
# Number of cluster ids in every entity dict, in slice order
tuple(
    len(entities)
    for entities in (
        entity_list_0_50k,
        entity_list_50k_100k,
        entity_list_100k_150k,
        entity_list_150k_200k,
        entity_list_200k_250k,
        entity_list_250k_300k,
        entity_list_300k_350k,
        entity_list_350k_393k,
    )
)

(50000, 50000, 50000, 50000, 50000, 50000, 50000, 43892)

## 4omini after finetuning V9 prompt

In [21]:

# System prompt (V9) used as the "system" message of every batch request built in this notebook.
# This is a regular (non-raw) triple-quoted string, so the \n and \t sequences below are escape
# sequences that become real newline / tab characters in the text that is sent.
# The prompt asks for JSON output keyed by the given key, with the chosen representative of a
# multi-entity cluster wrapped in '**'.
system_message = """\n
You are a data scientist specializing in grouping plant biological entities. Your task is to cluster similar entities while strictly adhering to the following guidelines:\n\t1.\tExact Phrase Matching Matters: \n1.1 Consider the Entire Phrase: Treat each entity as a single, whole phrase. This includes all key biological terms and any bracketed text\n1.2 Ignore Minor Surface Differences: Minor variations such as letter casing (uppercase vs. lowercase), spacing, punctuation, standard abbreviations, or singular vs. plural forms do not create new or separate entities.\n\t2.\tStrict (100%) Key Term Separation: If an entity has a different key biological term, it MUST GO into a separate cluster.\n3. Sub-identifier separation: If an entity differs by any numeric value, sub-identifier, or qualifier, they MUST BE placed in separate clusters.\n\t4.\tAvoid False Similarity: DO NOT cluster two entities together simply because they share a common word or term if their overall key term or concept is different.\n5. Extra Descriptor Differentiation: If one entity has an extra descriptor that changes its meaning, do not group them together.\n\t6.\tStrict Synonym/Near-Synonym Grouping: Only group entities together if they refer to the exact same biological structure, process, or concept.\n\t7.\tMaintain 100% Precision: If there is any doubt about whether two entities are the same, MUST place them in separate clusters.\n\t8.\tPreserve Original Data: DO NOT introduce new items, create duplicates, or omit any entity from your final output.\n\t9.\tOutput Format: Always return results in valid JSON format. You MUST USE GIVEN KEY.\n10. Choose cluster representative: YOU MUST pickup most appropriate and easy-to-understand cluster representative and enclose it with '**', if there is more than one entity in that particular cluster. 
For example, pick the full term instead of an abbreviation.\n\nRead the input list, and return clustered entities, STRICTLY following the given guidelines above.\n"""

In [18]:
import ast

def split_dict(d, chunk_size=100):
    """
    Split ``d`` into a list of smaller dicts of at most ``chunk_size`` items each.

    Keys keep the iteration order of ``d``; the final dict holds whatever is
    left over. An empty ``d`` gives an empty list.
    """
    ordered_keys = list(d.keys())
    return [
        {key: d[key] for key in ordered_keys[start:start + chunk_size]}
        for start in range(0, len(ordered_keys), chunk_size)
    ]

def flatten_bracketed_strings(value_list):
    """
    Flatten list-literal strings found inside a list.

    For each item of ``value_list``:
      - If the item is a string that (ignoring surrounding whitespace) starts
        with '[' and ends with ']', try to parse it as a Python literal. When
        the result is a list, its elements are spliced into the output.
      - Otherwise, or when parsing fails or yields something other than a
        list, the original item is kept unchanged.

    Returns a new list; ``value_list`` itself is not modified.
    """
    # ast.literal_eval is documented to raise any of these on malformed input.
    # Entity names such as "[Ca2+] influx [phenotype]" also start and end with
    # brackets, so a failed parse is expected and must never abort the run.
    parse_errors = (SyntaxError, ValueError, TypeError, MemoryError, RecursionError)

    new_list = []
    for val in value_list:
        if isinstance(val, str):
            text = val.strip()
            if text.startswith("[") and text.endswith("]"):
                try:
                    # Parse the same stripped text that passed the bracket check
                    parsed = ast.literal_eval(text)
                except parse_errors:
                    parsed = None
                if isinstance(parsed, list):
                    new_list.extend(parsed)  # flatten
                    continue
        # Not a list literal (or not parseable): keep the original item as-is
        new_list.append(val)
    return new_list

def prepend_key_to_values(d, prepend_key=False):
    """
    Normalise every value of ``d`` to a flat list, in place.

    For each key k in d:
      1) Wrap the value in a list if it is not already one.
      2) Flatten list-literal strings via ``flatten_bracketed_strings``.
      3) Store the result back under k, with k itself as the first element
         when ``prepend_key`` is true.

    Previously the normalised list was computed and then thrown away, so the
    function returned ``d`` untouched. Prepending stays off by default because
    that step had been commented out.

    Returns the same dict object ``d``.
    """
    # Only values are reassigned, never keys, so iterating over d here is safe
    for k in d:
        val = d[k]
        # 1) If not a list, make it one
        if not isinstance(val, list):
            val = [val]

        # 2) Flatten bracketed strings
        val = flatten_bracketed_strings(val)

        # 3) Store the normalised list (optionally with the key in front)
        d[k] = [k] + val if prepend_key else val
    return d


# Example usage:
#chunks = split_dict(entity_list, 1)
#print(len(chunks))  # With chunk_size=1 this equals the number of keys (e.g. 953 for 953 keys)

In [14]:
# Split every entity dict into single-entry chunks (chunk_size=1: one cluster per chunk)
(
    chunks_0_50k,
    chunks_50k_100k,
    chunks_100k_150k,
    chunks_150k_200k,
    chunks_200k_250k,
    chunks_250k_300k,
    chunks_300k_350k,
    chunks_350k_393k,
) = (
    split_dict(entities, 1)
    for entities in (
        entity_list_0_50k,
        entity_list_50k_100k,
        entity_list_100k_150k,
        entity_list_150k_200k,
        entity_list_200k_250k,
        entity_list_250k_300k,
        entity_list_300k_350k,
        entity_list_350k_393k,
    )
)

In [15]:
# Number of chunks per group, in slice order
tuple(
    len(group)
    for group in (
        chunks_0_50k,
        chunks_50k_100k,
        chunks_100k_150k,
        chunks_150k_200k,
        chunks_200k_250k,
        chunks_250k_300k,
        chunks_300k_350k,
        chunks_350k_393k,
    )
)

(50000, 50000, 50000, 50000, 50000, 50000, 50000, 43892)

In [16]:
import jsonlines
import json

output_path = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/'


def _batch_request_line(request_index, chunk):
    """Assemble one request line: the system prompt plus ``chunk`` serialised as compact JSON.

    ``request_index`` becomes the request's ``custom_id`` (as a string).
    """
    request_body = {
        "model": "ft:gpt-4o-mini-2024-07-18:mutwil-lab:4omini-v9-train-1306-test-78:B9u5DUsf",
        "temperature": 0,
        "top_p": 0,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": json.dumps(chunk, separators=(',', ':'))},
        ],
        "max_tokens": 16384,
    }
    return {
        "custom_id": str(request_index),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }


# Output filename suffix -> list of chunks written to that file
chunk_groups = {
    '0_50k': chunks_0_50k,
    '50k_100k': chunks_50k_100k,
    '100k_150k': chunks_100k_150k,
    '150k_200k': chunks_150k_200k,
    '200k_250k': chunks_200k_250k,
    '250k_300k': chunks_250k_300k,
    '300k_350k': chunks_300k_350k,
    '350k_393k': chunks_350k_393k,
}

# One .jsonl file per group, one request line per chunk
for suffix, chunk_list in chunk_groups.items():
    out_file = f'v9_4omini_maxclust30_{suffix}.jsonl'
    with jsonlines.open(output_path + out_file, mode='w') as request_writer:
        for request_index, chunk in enumerate(chunk_list):
            request_writer.write(_batch_request_line(request_index, chunk))
    print(f'Done writing to {output_path + out_file}')

Done writing to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_0_50k.jsonl
Done writing to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_50k_100k.jsonl
Done writing to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_100k_150k.jsonl
Done writing to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_150k_200k.jsonl
Done writing to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_200k_250k.jsonl
Done writing to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inp

In [23]:
#FOR MISSING ONES


import jsonlines
import json

output_path = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/'

# Output filename suffix -> list of chunks written to that file.
# Kept under its own name: rebinding `chunk_groups` here would make the later
# `files_created` cell list (and the upload cell submit) only the "missing" file
# when the notebook is run top to bottom.
# NOTE(review): `chunks` is not defined in any cell visible in this notebook; it must be
# built before this cell runs (presumably with split_dict on the missing clusters) -- confirm.
missing_chunk_groups = {
    'missing': chunks
}

for suffix, chunk_list in missing_chunk_groups.items():
    out_file = f'v9_4omini_maxclust30_{suffix}.jsonl'
    with jsonlines.open(output_path + out_file, mode='w') as file:
        # custom_id is the position of the chunk inside this file
        for request_index, chunk in enumerate(chunk_list):
            line = {
                "custom_id": str(request_index),
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "ft:gpt-4o-mini-2024-07-18:mutwil-lab:4omini-v9-train-1306-test-78:B9u5DUsf",
                    "temperature": 0,
                    "top_p": 0,
                    "frequency_penalty": 0,
                    "presence_penalty": 0,
                    "response_format": {"type": "json_object"},
                    "messages": [
                        {
                            "role": "system",
                            "content": system_message
                        },
                        {
                            "role": "user",
                            # Compact separators keep the prompt short
                            "content": json.dumps(chunk, separators=(',', ':'))
                        }
                    ],
                    "max_tokens": 16384
                }
            }
            file.write(line)
    print(f'Done writing to {output_path + out_file}')

Done writing to /Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_missing.jsonl


In [17]:
# Rebuild the full path of every request file from the chunk_groups suffixes
files_created = list(
    map(lambda suffix: output_path + f'v9_4omini_maxclust30_{suffix}.jsonl', chunk_groups)
)
files_created

['/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_0_50k.jsonl',
 '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_50k_100k.jsonl',
 '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_100k_150k.jsonl',
 '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_150k_200k.jsonl',
 '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_200k_250k.jsonl',
 '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_250k_300k.jsonl',
 '/Users/manojitharajula/Document

## upload to batchapi server

In [18]:

# Upload every request file and create one Batch API job per file
import os

from openai import OpenAI

# Initialize OpenAI Client.
# The key is read from the environment so it never lands in the notebook or its history.
# SECURITY: a literal key used to be hardcoded on this line; treat that key as leaked and revoke it.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

for fname in files_created:
    # Upload the file first; the context manager closes the handle once the upload returns
    with open(fname, "rb") as request_file:
        file_upload = client.files.create(
            file=request_file,
            purpose="batch"
        )

    print(f"Uploaded file id: {file_upload.id}")

    # Create the batch job referencing the uploaded file ID
    batch_response = client.batches.create(
        input_file_id=file_upload.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    print(f"Batch job created: {batch_response.id}")

Uploaded file id: file-SCAn12pCgCCQtPBJoDbFWm
Batch job created: batch_67dad4f7587881908da646b1343bf90c
Uploaded file id: file-4NKhRAWsTyWsiJJknNMr3h
Batch job created: batch_67dad50ad3dc819094ad389c08b222b7
Uploaded file id: file-Vf2pq1H5ctGkJ8nEjMFH1T
Batch job created: batch_67dad51e19388190b329427c3e872424
Uploaded file id: file-YG2dBrcXnqHsXfZq4WyoR4
Batch job created: batch_67dad52f57108190bca049af78361d9d
Uploaded file id: file-KN9kGhFUaLm9Gadi89dmk2
Batch job created: batch_67dad542102081908506fa5a55f4621b
Uploaded file id: file-WWb2ru2fzRyDd6Dh7whvnG
Batch job created: batch_67dad5530bd88190a7e231c56398e20f
Uploaded file id: file-U5GjQprAT4C7cz8HWcBvne
Batch job created: batch_67dad563f2548190845a4ca8e6ef1484
Uploaded file id: file-JRYYDujDTsnFcfYsWaKzFG
Batch job created: batch_67dad5734b0881909d08699386b2662b


In [None]:
# SUBMIT MISSING batch


#Create batchapi upload
import os

from openai import OpenAI

# Initialize OpenAI Client.
# The key comes from the OPENAI_API_KEY environment variable rather than a literal typed
# into this cell, so no secret is ever saved with the notebook.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
fname = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_inps/v9_4omini_maxclust30_missing.jsonl'

# Upload the file first; the context manager closes the handle once the upload returns
with open(fname, "rb") as request_file:
    file_upload = client.files.create(
        file=request_file,
        purpose="batch"
    )

print(f"Uploaded file id: {file_upload.id}")

# Create the batch job referencing the uploaded file ID
batch_response = client.batches.create(
    input_file_id=file_upload.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

print(f"Batch job created: {batch_response.id}")

Uploaded file id: file-GuSebdYkaft1rojvUCZMVo
Batch job created: batch_67dc3affb65481909cc1ca0afa430525


### Convert jsonl output files to Csv format

In [None]:
# Use os.listdir to collect all Batch API output files from batchapi_final_outs

import os
import jsonlines
import json

# Folder that holds the downloaded Batch API output files
batch_output_path = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_outs/'

# Keep only the .jsonl entries of that folder, ordered by filename
batch_output_files = sorted(
    entry for entry in os.listdir(batch_output_path) if entry.endswith('.jsonl')
)



['batch_67dad4f7587881908da646b1343bf90c_output.jsonl',
 'batch_67dad50ad3dc819094ad389c08b222b7_output.jsonl',
 'batch_67dad51e19388190b329427c3e872424_output.jsonl',
 'batch_67dad52f57108190bca049af78361d9d_output.jsonl',
 'batch_67dad542102081908506fa5a55f4621b_output.jsonl',
 'batch_67dad5530bd88190a7e231c56398e20f_output.jsonl',
 'batch_67dad563f2548190845a4ca8e6ef1484_output.jsonl',
 'batch_67dad5734b0881909d08699386b2662b_output.jsonl']

In [9]:
import json
import jsonlines
import csv
#now read each jsonl file and convert to csv
# Define the path to the directory where the CSV files will be saved
output_csv_path = '/Users/manojitharajula/Documents/PhD/Connectome/Entity_disambiguation/kmeans/4omini_after_finetuning/batchapi_final_outs/'
csv_file_names = ['4omini_v9_0_50k.csv', '4omini_v9_50k_100k.csv', '4omini_v9_100k_150k.csv', '4omini_v9_150k_200k.csv', '4omini_v9_200k_250k.csv', '4omini_v9_250k_300k.csv', '4omini_v9_300k_350k.csv', '4omini_v9_350k_393k.csv']

def _rows_from_batch_line(line):
    """Turn one batch output record into CSV rows.

    Returns a list of ``[key, JSON-encoded value]`` rows taken from the JSON
    object in the assistant message, or ``None`` when the record does not have
    the expected structure (missing/null fields, invalid JSON, or JSON that is
    not an object).
    """
    try:
        body = line["response"]["body"]
        # Get the JSON string from the assistant's "content"
        content_str = body["choices"][0]["message"]["content"]
        data = json.loads(content_str)
    except (KeyError, IndexError, TypeError, json.JSONDecodeError):
        # TypeError covers null "response"/"content" values, which are not subscriptable/parsable
        return None
    if not isinstance(data, dict):
        return None
    return [[key, json.dumps(value, ensure_ascii=False)] for key, value in data.items()]


# NOTE(review): output files are paired with csv_file_names purely by sorted position;
# this assumes the batch ids sort in the same order the input files were submitted -- confirm.
for file_index, batch_file in enumerate(batch_output_files):
    if file_index < len(csv_file_names):
        csv_file = csv_file_names[file_index]
    else:
        # More .jsonl files than predefined names (e.g. an extra batch output in the folder):
        # derive the CSV name from the batch file instead of failing with IndexError.
        csv_file = batch_file.rsplit('.', 1)[0] + '.csv'

    skip_lines = 0
    # Open the JSONL file
    with jsonlines.open(batch_output_path + batch_file) as reader:
        # newline='' is what the csv module requires of its file object; utf-8 is explicit
        # because values are written with ensure_ascii=False.
        with open(output_csv_path + csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            # Write the header row
            writer.writerow(["Cluster Id", "Group Items"])
            for line in reader:
                rows = _rows_from_batch_line(line)
                if rows is None:
                    # Count lines that don't match the expected structure and move on
                    skip_lines += 1
                    continue
                writer.writerows(rows)
    print(f"Skipped {skip_lines} lines in {batch_file}")

Skipped 99 lines in batch_67dad4f7587881908da646b1343bf90c_output.jsonl
Skipped 99 lines in batch_67dad50ad3dc819094ad389c08b222b7_output.jsonl
Skipped 114 lines in batch_67dad51e19388190b329427c3e872424_output.jsonl
Skipped 82 lines in batch_67dad52f57108190bca049af78361d9d_output.jsonl
Skipped 94 lines in batch_67dad542102081908506fa5a55f4621b_output.jsonl
Skipped 87 lines in batch_67dad5530bd88190a7e231c56398e20f_output.jsonl
Skipped 91 lines in batch_67dad563f2548190845a4ca8e6ef1484_output.jsonl
Skipped 88 lines in batch_67dad5734b0881909d08699386b2662b_output.jsonl
