In [1]:
def merge_csvs_to_df(folder):
    import os
    import csv
    import pandas as pd

    rows = []

    for root, _, files in os.walk(folder):
        for file in files:
            if file.endswith('.csv'):
                path = os.path.join(root, file)
                with open(path, newline='', encoding='utf-8') as f:
                    reader = csv.DictReader(f)
                    for row in reader:
                        rows.append(row)

    df = pd.DataFrame(rows)
    # Ensure correct types for sorting
    df['doc_id'] = df['doc_id'].astype(str)
    df['entity1_offset'] = pd.to_numeric(df['entity1_offset'])
    df['entity2_offset'] = pd.to_numeric(df['entity2_offset'])
    # Sort by doc_id, then offsets
    df = df.sort_values(by=['doc_id', 'entity1_offset', 'entity2_offset']).reset_index(drop=True)
    # Remove duplicate rows
    df = df.drop_duplicates().reset_index(drop=True)

    return df

folder = "Sorted data/relation_specific/relation_specific_test"
merged_df_test = merge_csvs_to_df(folder)
print(merged_df_test.shape)  # Print shape of the merged DataFrame
print(merged_df_test.head()) # Print first few rows 
folder_train = "Sorted data/relation_specific/relation_specific_train"
merged_df_train = merge_csvs_to_df(folder_train)
print(merged_df_train.shape)  # Print shape of the merged DataFrame



(2116, 20)
          relation_type filename count percentage    doc_id  \
0           Association      NaN   NaN        NaN  15000256   
1           Association      NaN   NaN        NaN  15000256   
2           Association      NaN   NaN        NaN  15000256   
3  Positive_Correlation      NaN   NaN        NaN  15000256   
4  Positive_Correlation      NaN   NaN        NaN  15000256   

                                        passage_text   entity1_id  \
0  Single-strand conformation polymorphism analys...         2332   
1  Single-strand conformation polymorphism analys...         2332   
2  Single-strand conformation polymorphism analys...         2332   
3  Single-strand conformation polymorphism analys...  rs782013865   
4  Single-strand conformation polymorphism analys...  rs782013865   

                                entity1_text       entity1_type  \
0                                   ['FMR1']  GeneOrGeneProduct   
1                                   ['FMR1']  GeneOrGeneProdu

In [2]:
unique_relations = merged_df_test['relation_type'].unique()
print("Unique relation types:", unique_relations)
print("Number of unique relation types:", len(unique_relations))
relation_counts = merged_df_test['relation_type'].value_counts()
relation_counts2 = merged_df_train['relation_type'].value_counts()
print("Relation type counts test:")
print(relation_counts)
print("Relation type counts train:")
print(relation_counts2)
num_duplicates = merged_df_test.duplicated().sum()
print("Number of duplicate rows in the DataFrame:", num_duplicates)
print(len(merged_df_test))
print(len(merged_df_train))

Unique relation types: ['Association' 'Positive_Correlation' 'Negative_Correlation' 'No_Relation'
 'Cotreatment' 'Drug_Interaction' 'Bind' 'Comparison' 'Conversion']
Number of unique relation types: 9
Relation type counts test:
relation_type
No_Relation             945
Association             636
Positive_Correlation    326
Negative_Correlation    172
Cotreatment              15
Bind                     10
Comparison                7
Drug_Interaction          3
Conversion                2
Name: count, dtype: int64
Relation type counts train:
relation_type
No_Relation             3394
Association             2193
Positive_Correlation    1090
Negative_Correlation     764
Bind                      62
Cotreatment               32
Comparison                29
Drug_Interaction          12
Conversion                 4
Name: count, dtype: int64
Number of duplicate rows in the DataFrame: 0
2116
7580


### to test a case where we want to see how less classes effect the performance we now construct a binary case of relation no relation

In [25]:
import pandas as pd

# Assuming your DataFrame is named 'df' and the column with labels is 'label'
merged_df_test['relation_type'] = merged_df_test['relation_type'].apply(lambda x: 'Relation' if x != 'No_Relation' else x)
relation_counts = merged_df_test['relation_type'].value_counts()
print("Relation type counts:")
print(relation_counts)
merged_df_train['relation_type'] = merged_df_train['relation_type'].apply(lambda x: 'Relation' if x != 'No_Relation' else x)


Relation type counts:
relation_type
Relation       1171
No_Relation     945
Name: count, dtype: int64


In [3]:
def generate_multiple_prompt_examples_single(df, n_prompts=3, seed=42):
    import random
    import numpy as np

    relation_types = df['relation_type'].unique()
    prompts = []

    for prompt_num in range(n_prompts):
        used_indices = set()
        examples = []
        example_num = 1
        random.seed(seed + prompt_num)
        np.random.seed(seed + prompt_num)

        for rel in relation_types:
            rel_rows = df[df['relation_type'] == rel]
            if rel_rows.empty:
                continue
            available_indices = list(set(rel_rows.index) - used_indices)
            if available_indices:
                idx = random.choice(available_indices)
            else:
                idx = random.choice(rel_rows.index)
            used_indices.add(idx)
            row = df.loc[idx]
            entities = f"Entity1: {row['entity1_text']} ({row['entity1_type']}), Entity2: {row['entity2_text']} ({row['entity2_type']})"
            example = (
                f"Example {example_num}\n"
                f"## Input:\n"
                f"Text: {row['passage_text']}\n"
                f"{entities}\n"
                f"## Output:\n"
                f"{entities}, Relation: {row['relation_type']}\n"
            )
            examples.append(example)
            example_num += 1
            if len(examples) >= 9:
                break

        prompt_text = "\n".join(examples)
        prompts.append(prompt_text)

    return prompts



In [4]:
def generate_multiple_prompt_examples_multi(df, n_prompts=1, seed=42, number_relations=True, relations_entitys_separately=False):
    import random
    import numpy as np

    relation_types = df['relation_type'].unique()
    prompts = []

    for prompt_num in range(n_prompts):
        used_docids = set()
        examples = []
        example_num = 1
        random.seed(seed + prompt_num)
        np.random.seed(seed + prompt_num)

        for rel in relation_types:
            rel_rows = df[df['relation_type'] == rel]
            if rel_rows.empty:
                continue
            available_indices = list(set(rel_rows.index) - used_docids)
            if available_indices:
                idx = random.choice(available_indices)
            else:
                idx = random.choice(rel_rows.index)
            doc_id = df.loc[idx, 'doc_id']
            if doc_id in used_docids:
                continue
            used_docids.add(doc_id)
            doc_rows = df[df['doc_id'] == doc_id]

            passage_text = doc_rows.iloc[0]['passage_text']
            # List all entity pairs
            if number_relations:
                entity_lines = [
                    f"{i+1}. Entity1: {row['entity1_text']} ({row['entity1_type']}), Entity2: {row['entity2_text']} ({row['entity2_type']})"
                    for i, (_, row) in enumerate(doc_rows.iterrows())
                ]
            else:
                entity_lines = [
                    f"Entity1: {row['entity1_text']} ({row['entity1_type']}), Entity2: {row['entity2_text']} ({row['entity2_type']})"
                    for _, row in doc_rows.iterrows()
                ]
            if number_relations:
                if relations_entitys_separately:
                    relation_lines = [
                        f"{i+1}. Relation: {row['relation_type']}"
                        for i, (_, row) in enumerate(doc_rows.iterrows())
                    ]
                else:
                    relation_lines = [
                        f"{i+1}. Entity1: {row['entity1_text']}, Entity2: {row['entity2_text']}, Relation: {row['relation_type']}"
                        for i, (_, row) in enumerate(doc_rows.iterrows())
                    ]
            else:
                relation_lines = [
                    f"Entity1: {row['entity1_text']}, Entity2: {row['entity2_text']}, Relation: {row['relation_type']}"
                    for _, row in doc_rows.iterrows()
                ]
            example = (
                f"Example {example_num}\n"
                #f"DocID: {doc_id}\n"
                f"## Input:\n"
                f"Text: {passage_text}\n"
                f"Entities:\n" + "\n".join(entity_lines) + "\n"
                f"## Output:\n" + "\n".join(relation_lines) + "\n"
            )
            examples.append(example)
            example_num += 1
            if len(examples) >= 9:
                break

        prompt_text = "\n".join(examples)
        prompts.append(prompt_text)

    return prompts


In [5]:
def fill_prompt(template_path, text, e1="", entity_type1="", e2="", entity_type2="", examples="", entity_list="", prompt_id=None):
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()
    filled = template.format(
        text=text,
        e1=e1,
        entity_type1=entity_type1,
        e2=e2,
        entity_type2=entity_type2,
        examples=examples,
        entity_list=entity_list,
        prompt_id=prompt_id
    )
    return filled


def create_prompt_relation_df_single(df,df_2, template_path, examples=False,same_examples=False, seed=42, n_prompts=1, prompt_id=False):
    import pandas as pd
    prompts = []
    relations = []
    doc_ids = []
    runs = []
    prompt_ids = []

    prompt_id_counter = 1  # Start numbering from 1

    for run_idx in range(1, n_prompts + 1):
        if examples:
            if same_examples == True:
                example_prompts = generate_multiple_prompt_examples_single(df_2, n_prompts=1, seed=seed)
                examples_str = example_prompts[0] if example_prompts else ""
            else:
            # Generate new examples for this run with updated seed
                example_prompts = generate_multiple_prompt_examples_single(df_2, n_prompts=1, seed=seed + run_idx - 1)
            # If you want to use these as context/examples, you can pass them to fill_prompt as needed
            # Here, we assume you want to add them as an 'examples' variable in the template
                examples_str = example_prompts[0] if example_prompts else ""
        else:
            examples_str = ""

        for _, row in df.iterrows():
            if prompt_id:
                prompt = fill_prompt(
                    template_path,
                    text=row['passage_text'],
                    e1=row['entity1_text'],
                    entity_type1=row['entity1_type'],
                    e2=row['entity2_text'],
                    entity_type2=row['entity2_type'],
                    examples=examples_str,
                    prompt_id=prompt_id_counter
                )
                prompts.append(prompt)
                relations.append(row['relation_type'])
                doc_ids.append(row['doc_id'])
                runs.append(run_idx)
                prompt_ids.append(prompt_id_counter)
                prompt_id_counter += 1
            else:
                prompt = fill_prompt(
                    template_path,
                    text=row['passage_text'],
                    e1=row['entity1_text'],
                    entity_type1=row['entity1_type'],
                    e2=row['entity2_text'],
                    entity_type2=row['entity2_type'],
                    examples=examples_str,
                    prompt_id=""
                )
                prompts.append(prompt)
                relations.append(row['relation_type'])
                doc_ids.append(row['doc_id'])
                runs.append(run_idx)
                prompt_ids.append(prompt_id_counter)
                prompt_id_counter += 1

    result_df = pd.DataFrame({
        'Prompt': prompts,
        'Relation': relations,
        'DocID': doc_ids,
        'run': runs,
        'prompt_id': prompt_ids
    })
    return result_df




def prompts_df_to_json_from_runs(df, n_key="n_1"):
    """
    Converts a DataFrame with 'Prompt', 'Relation', 'DocID', and 'run' columns to the required JSON structure.
    Each run contains lists for prompts, relations, and doc_ids.
    """
    runs = {}
    for run_num in sorted(df['run'].unique()):
        run_df = df[df['run'] == run_num]
        prompts_list = run_df['Prompt'].tolist()
        relations_list = run_df['Relation'].tolist()
        doc_ids_list = run_df['DocID'].tolist()
        runs[f"run_{run_num}"] = {
            "prompts1": prompts_list,
            "relations": relations_list,
            "doc_ids": doc_ids_list
        }

    result = {n_key: runs}
    return result

def prompts_df_to_json_from_runs_multi(df, n_key="n_1"):
    """
    Converts a DataFrame with 'Prompt', 'Relation'/'Multi_Relations', 'DocID'/'Multi_DocID', and 'run' columns
    to the required JSON structure. Each run contains lists for prompts, relations, and doc_ids.
    The JSON keys will match the DataFrame column names (e.g., 'Multi_Relations' -> 'multi_relations').
    """
    runs = {}
    # Find the correct column names for relations and doc_ids
    rel_col = None
    docid_col = None
    for c in df.columns:
        if c.lower().startswith("relation"):
            rel_col = c
        if c.lower().startswith("multi_relation"):
            rel_col = c
        if c.lower() == "relations":
            rel_col = c
        if c.lower() == "multi_relations":
            rel_col = c
        if c.lower().startswith("docid"):
            docid_col = c
        if c.lower().startswith("multi_docid"):
            docid_col = c
        if c.lower() == "doc_ids":
            docid_col = c
        if c.lower() == "multi_docid":
            docid_col = c

    # Fallbacks if not found
    if rel_col is None:
        rel_col = "Relation"
    if docid_col is None:
        docid_col = "DocID"

    # Use lower-case JSON keys for consistency
    rel_json_key = rel_col.lower()
    docid_json_key = docid_col.lower()

    for run_num in sorted(df['run'].unique()):
        run_df = df[df['run'] == run_num]
        prompts_list = run_df['Prompt'].tolist()
        relations_list = run_df[rel_col].tolist()
        doc_ids_list = run_df[docid_col].tolist()
        runs[f"run_{run_num}"] = {
            "prompts1": prompts_list,
            rel_json_key: relations_list,
            docid_json_key: doc_ids_list
        }

    result = {n_key: runs}
    return result



import json
#prompt_json = prompts_df_to_json_from_runs(prompt_relation_df)
#with open("prompts_for_api.json", "w") as f:
#    json.dump(prompt_json, f, indent=2)

In [6]:
def create_prompt_relation_df_multi_by_docid(
    df, df_2, template_path, examples=False, same_examples=False,
    number_relations=False, relations_entitys_separately=False,
    seed=42, n_prompts=1, 
):
    import pandas as pd
    prompts = []
    relations = []
    doc_ids = []
    runs = []

    # Group by doc_id
    grouped = df.groupby('doc_id')

    for run_idx in range(1, n_prompts + 1):
        if examples:
            if same_examples:
                example_prompts = generate_multiple_prompt_examples_multi(
                    df_2, n_prompts=1, seed=seed,
                    number_relations=number_relations,
                    relations_entitys_separately=relations_entitys_separately
                )
                examples_str = example_prompts[0] if example_prompts else ""
            else:
                example_prompts = generate_multiple_prompt_examples_multi(
                    df_2, n_prompts=1, seed=seed + run_idx - 1,
                    number_relations=number_relations,
                    relations_entitys_separately=relations_entitys_separately
                )
                examples_str = example_prompts[0] if example_prompts else ""
        else:
            examples_str = ""

        for doc_id, doc_rows in grouped:
            # Build entity_lines for this doc_id
            if number_relations:
                entity_lines = [
                    f"{i+1}. Entity1: {row['entity1_text']} ({row['entity1_type']}), Entity2: {row['entity2_text']} ({row['entity2_type']})"
                    for i, (_, row) in enumerate(doc_rows.iterrows())
                ]
            else:
                entity_lines = [
                    f"Entity1: {row['entity1_text']} ({row['entity1_type']}), Entity2: {row['entity2_text']} ({row['entity2_type']})"
                    for _, row in doc_rows.iterrows()
                ]
            # Collect relations for this doc_id
            relation_list = doc_rows['relation_type'].tolist()

            # Use the first passage_text for the prompt
            passage_text = doc_rows.iloc[0]['passage_text']

            prompt = fill_prompt(
                template_path,
                text=passage_text,
                entity_list="\n".join(entity_lines),
                examples=examples_str
            )
            prompts.append(prompt)
            relations.append(relation_list)
            doc_ids.append(doc_id)
            runs.append(run_idx)

    result_df = pd.DataFrame({
        'Prompt': prompts,
        'Multi_Relations': relations,
        'Multi_DocID': doc_ids,
        'run': runs
    })
    return result_df

In [7]:
template_path_multi = "LLM_benchmarks/prompt_template_multi.txt"
template_path_single = "LLM_benchmarks/prompt_template_single.txt"

zero_shot_single = create_prompt_relation_df_single(merged_df_train,merged_df_test, template_path_single, examples=False,same_examples=False, seed=42, n_prompts=5)
print(zero_shot_single.head())
import json
prompt_json = prompts_df_to_json_from_runs(zero_shot_single)
with open("LLM_benchmarks/Prompts/zero_shot_single_for_api.json", "w") as f:
    json.dump(prompt_json, f, indent=2)

few_shot_single = create_prompt_relation_df_single(merged_df_train,merged_df_test, template_path_single, examples=True,same_examples=False, seed=42, n_prompts=5)
print(few_shot_single.head())
prompt_json = prompts_df_to_json_from_runs(few_shot_single)
with open("LLM_benchmarks/Prompts/few_shot_single_for_api.json", "w") as f:
    json.dump(prompt_json, f, indent=2)


# Run garbage collection to free up memory
import gc
gc.collect()

zero_shot_multi = create_prompt_relation_df_multi_by_docid(merged_df_train,merged_df_test, template_path_multi, examples=False,same_examples=False, seed=42, n_prompts=5, number_relations=True, relations_entitys_separately=False)
prompt_json = prompts_df_to_json_from_runs_multi(zero_shot_multi)
with open("LLM_benchmarks/Prompts/zero_shot_multi_for_api.json", "w") as f:
    json.dump(prompt_json, f, indent=2)

few_shot_multi = create_prompt_relation_df_multi_by_docid(merged_df_train,merged_df_test, template_path_multi, examples=True,same_examples=False, seed=42, n_prompts=5, number_relations=True, relations_entitys_separately=False)
prompt_json = prompts_df_to_json_from_runs_multi(few_shot_multi)
with open("LLM_benchmarks/Prompts/few_shot_multi_for_api.json", "w") as f:
    json.dump(prompt_json, f, indent=2) 
print(few_shot_multi.head())

import json



                                              Prompt              Relation  \
0  You are a biomedical relation extraction exper...           Association   
1  You are a biomedical relation extraction exper...           No_Relation   
2  You are a biomedical relation extraction exper...           No_Relation   
3  You are a biomedical relation extraction exper...           Association   
4  You are a biomedical relation extraction exper...  Positive_Correlation   

      DocID  run  prompt_id  
0  10491763    1          1  
1  10491763    1          2  
2  10491763    1          3  
3  10491763    1          4  
4  10491763    1          5  
                                              Prompt              Relation  \
0  You are a biomedical relation extraction exper...           Association   
1  You are a biomedical relation extraction exper...           No_Relation   
2  You are a biomedical relation extraction exper...           No_Relation   
3  You are a biomedical relation extrac

## Make 1 big json for the prompts

In [8]:
def consolidate_prompt_jsons(json_paths, prompt_set_names, output_path):
    """
    Merge multiple prompt JSONs into one, each as a different prompt set name.
    Also adds Multi_Relations and Multi_DocID fields if present and not already in the base.
    """
    import json

    # Load all JSONs
    jsons = [json.load(open(p, "r", encoding="utf-8")) for p in json_paths]
    base = jsons[0]
    n_key = list(base.keys())[0]
    runs = base[n_key].keys()

    # For each run, add all prompt sets and Multi fields if present
    for run in runs:
        for i, js in enumerate(jsons):
            prompt_set_name = prompt_set_names[i]
            # Find the first prompt set key in this json (e.g. "prompts1")
            prompt_set_key = [k for k in js[n_key][run].keys() if k.startswith("prompts")][0]
            base[n_key][run][prompt_set_name] = js[n_key][run][prompt_set_key]
            # Add Multi_Relations if present and not already in base
            if "multi_relations" in js[n_key][run] and "multi_relations" not in base[n_key][run]:
                base[n_key][run]["multi_relations"] = js[n_key][run]["multi_relations"]
            # Add Multi_DocID if present and not already in base
            if "multi_docid" in js[n_key][run] and "multi_docid" not in base[n_key][run]:
                base[n_key][run]["multi_docid"] = js[n_key][run]["multi_docid"]

    # Save merged JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(base, f, indent=2)



# Example usage:
consolidate_prompt_jsons(
    [
        "LLM_benchmarks/Prompts/zero_shot_single_for_api.json",
        "LLM_benchmarks/Prompts/few_shot_single_for_api.json",
        "LLM_benchmarks/Prompts/zero_shot_multi_for_api.json",
        "LLM_benchmarks/Prompts/few_shot_multi_for_api.json"
    ],
    ["single_zero", "single_examples", "multi_zero", "multi_examples"],
    "LLM_benchmarks/Prompts/all_prompts_consolidated_2.json"
)

consolidated_test = json.load(open("LLM_benchmarks/Prompts/all_prompts_consolidated.json", "r", encoding="utf-8"))

print(consolidated_test.keys())  # Should show the main keys like 'n_1'

dict_keys(['n_1'])


In [14]:
print(consolidated_test["n_1"]["run_1"]["multi_examples"][0])
print("##########################################################################################")
print(consolidated_test["n_1"]["run_2"]["multi_examples"][0])

You are a biomedical relation extraction expert following BioRED annotation guidelines.

Task: Identify the relation type between two specified entities.

Valid relation types (respond with EXACTLY ONE of these words):
- Association: A general relationship exists between two entities without specifying direction or strength.
- Positive_Correlation: As one entity increases or becomes more present, the other also increases or becomes more present.
- Bind: Two molecules physically interact or attach to each other.
- Negative_Correlation: As one entity increases or becomes more present, the other decreases or becomes less present.
- Comparison: Two entities are directly compared in terms of effects, outcomes, or characteristics.
- Conversion: One biochemical entity is transformed into another.
- Cotreatment: Two treatments are administered together in the same experimental or clinical context.
- Drug_Interaction: Two drugs interact to produce an altered physiological or adverse effect.
- N

In [17]:
import numpy as np

def consolidate_prompt_jsons(json_paths, prompt_set_names, output_path):
    """
    Merge multiple prompt JSONs into one, each as a different prompt set name.
    Also adds Multi_Relations and Multi_DocID fields if present and not already in the base.
    """
    import json

    # Load all JSONs
    jsons = [json.load(open(p, "r", encoding="utf-8")) for p in json_paths]
    base = jsons[0]
    n_key = list(base.keys())[0]
    runs = base[n_key].keys()

    # For each run, add all prompt sets and Multi fields if present
    for run in runs:
        for i, js in enumerate(jsons):
            prompt_set_name = prompt_set_names[i]
            # Find the first prompt set key in this json (e.g. "prompts1")
            prompt_set_key = [k for k in js[n_key][run].keys() if k.startswith("prompts")][0]
            base[n_key][run][prompt_set_name] = js[n_key][run][prompt_set_key]
            # Add Multi_Relations if present and not already in base
            if "multi_relations" in js[n_key][run] and "multi_relations" not in base[n_key][run]:
                base[n_key][run]["multi_relations"] = js[n_key][run]["multi_relations"]
            # Add Multi_DocID if present and not already in base
            if "multi_docid" in js[n_key][run] and "multi_docid" not in base[n_key][run]:
                base[n_key][run]["multi_docid"] = js[n_key][run]["multi_docid"]

    # Save merged JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(base, f, indent=2)

# Set your seed for reproducibility
seed = 42
np.random.seed(seed)

# Get unique doc_ids
unique_docids = merged_df_train['doc_id'].unique()

# Pick 20 random doc_ids
sampled_docids = np.random.choice(unique_docids, size=20, replace=False)

# Subset the DataFrame to only rows with those doc_ids
subset_df = merged_df_train[merged_df_train['doc_id'].isin(sampled_docids)].copy()

print(subset_df.shape)
print(subset_df['doc_id'].unique())

template_path_multi = "LLM_benchmarks/prompt_template_multi.txt"
template_path_single = "LLM_benchmarks/prompt_template_single.txt"
template_path_single_wo_reason = "LLM_benchmarks/prompt_template_single_wo_reason.txt"

test_single = create_prompt_relation_df_single(subset_df,merged_df_test, template_path_single, examples=False,same_examples=False, seed=42, n_prompts=5, prompt_id=False)
print(test_single.head())
import json
prompt_json = prompts_df_to_json_from_runs(test_single)
with open("LLM_benchmarks/Prompts/test_single_for_api.json", "w") as f:
    json.dump(prompt_json, f, indent=2)

test_single_w_examples = create_prompt_relation_df_single(subset_df,merged_df_test, template_path_single, examples=True,same_examples=True, seed=42, n_prompts=5, prompt_id=False)
print(test_single_w_examples.head())
import json
prompt_json = prompts_df_to_json_from_runs(test_single_w_examples)
with open("LLM_benchmarks/Prompts/test_single_for_api_examples.json", "w") as f:
    json.dump(prompt_json, f, indent=2)

test_multi = create_prompt_relation_df_multi_by_docid(subset_df,merged_df_test, template_path_multi, examples=False,same_examples=False, seed=42, n_prompts=5, number_relations=True, relations_entitys_separately=False)
print(test_multi.head())
import json
prompt_json = prompts_df_to_json_from_runs_multi(test_multi)
with open("LLM_benchmarks/Prompts/test_multi_for_api.json", "w") as f:
    json.dump(prompt_json, f, indent=2) 

test_multi_w_examples = create_prompt_relation_df_multi_by_docid(subset_df,merged_df_test, template_path_multi, examples=True,same_examples=True, seed=42, n_prompts=5, number_relations=True, relations_entitys_separately=False)
print(test_multi_w_examples.head())
import json
prompt_json = prompts_df_to_json_from_runs_multi(test_multi_w_examples)
with open("LLM_benchmarks/Prompts/test_multi_for_api_examples.json", "w") as f:
    json.dump(prompt_json, f, indent=2)

consolidate_prompt_jsons(
    [
        "LLM_benchmarks/Prompts/test_single_for_api.json",
        "LLM_benchmarks/Prompts/test_single_for_api_examples.json",
        "LLM_benchmarks/Prompts/test_multi_for_api.json",
        "LLM_benchmarks/Prompts/test_multi_for_api_examples.json"
    ],
    ["single_zero", "single_examples", "multi_zero", "multi_examples"],
    "LLM_benchmarks/Prompts/Test_callibration.json"
)
calibration_test = json.load(open("LLM_benchmarks/Prompts/Test_callibration.json", "r", encoding="utf-8"))

(384, 20)
['15033202' '15623763' '15814629' '16158428' '16541406' '16755009'
 '17000021' '17003357' '17634480' '17951029' '19463742' '19696792'
 '19996135' '20105280' '21615796' '27930654' '28260056' '28428256'
 '28487437' '28650467']
                                              Prompt              Relation  \
0  You are a biomedical relation extraction exper...  Positive_Correlation   
1  You are a biomedical relation extraction exper...           Association   
2  You are a biomedical relation extraction exper...           No_Relation   
3  You are a biomedical relation extraction exper...           No_Relation   
4  You are a biomedical relation extraction exper...           No_Relation   

      DocID  run  prompt_id  
0  15033202    1          1  
1  15033202    1          2  
2  15033202    1          3  
3  15033202    1          4  
4  15033202    1          5  
                                              Prompt              Relation  \
0  You are a biomedical relation extra

In [11]:
print(calibration_test["n_1"]["run_1"]["multi_examples"][0])
print(calibration_test["n_1"]["run_1"]["single_examples"][0])

You are a biomedical relation extraction expert following BioRED annotation guidelines.

Task: Identify the relation type between two specified entities.

Valid relation types (respond with EXACTLY ONE of these words):
- Relation: A meaningful connection or interaction between two entities, which may include associations, correlations (positive or negative), physical binding, comparisons, conversions, co-treatments, or drug interactions.
- No_Relation: Two enteties that are not directly related according to the text.

CRITICAL INSTRUCTIONS:
1. Only annotate EXPLICIT relations stated in the text
2. If entities co-occur WITHOUT a clear relationship → respond with "No_Relation"
3. Do NOT infer or assume relationships not explicitly stated
4. When uncertain → respond with "No_Relation"
Example 1
## Input:
Text: Effect of alpha-tocopherol and deferoxamine on methamphetamine-induced neurotoxicity. Methamphetamine (MA)-induced dopaminergic neurotoxicity is believed to be associated with the i

In [21]:
import json
import pandas as pd
import random

def get_relations_from_json(json_data, run_key="run_1"):
    """Extract all relation types from a JSON dataset for a specific run."""
    relations = []
    
    # Check if multi_relations exists (for multi prompts)
    if "multi_relations" in json_data["n_1"][run_key]:
        for relation_list in json_data["n_1"][run_key]["multi_relations"]:
            relations.extend(relation_list)
    
    # Check if relations exists (for single prompts)
    if "relations" in json_data["n_1"][run_key]:
        relations.extend(json_data["n_1"][run_key]["relations"])
    
    return set(relations)


def find_missing_classes(json_path, all_classes):
    """Identify which relation classes are missing from the test dataset."""
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # Check first run to see what's present
    present_classes = get_relations_from_json(data, "run_1")
    missing_classes = set(all_classes) - present_classes
    
    return list(missing_classes)


def find_abstracts_with_classes(df, missing_classes, seed=42):
    """Find one abstract (doc_id) for each missing class from the training data."""
    random.seed(seed)
    abstracts_to_add = {}
    
    for rel_class in missing_classes:
        # Find all doc_ids that contain this relation type
        matching_rows = df[df['relation_type'] == rel_class]
        
        if matching_rows.empty:
            print(f"Warning: No examples found for class '{rel_class}' in training data")
            continue
        
        # Get unique doc_ids
        doc_ids = matching_rows['doc_id'].unique()
        
        # Randomly select one doc_id
        selected_doc_id = random.choice(doc_ids)
        abstracts_to_add[rel_class] = selected_doc_id
    
    return abstracts_to_add


def append_missing_classes_to_json(
    json_path,
    train_df,
    test_df,
    template_path_single,
    template_path_multi,
    missing_classes,
    seed=42
):
    """
    Append prompts for missing classes to existing JSON file.
    
    Args:
        json_path: Path to the consolidated JSON file
        train_df: Training dataframe
        test_df: Test dataframe (for examples)
        template_path_single: Path to single prompt template
        template_path_multi: Path to multi prompt template
        missing_classes: List of missing relation classes
        seed: Random seed for reproducibility
    """
    # Load existing JSON
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # Find abstracts containing missing classes
    abstracts_to_add = find_abstracts_with_classes(train_df, missing_classes, seed)
    
    if not abstracts_to_add:
        print("No abstracts to add - all missing classes not found in training data")
        return
    
    print(f"Adding {len(abstracts_to_add)} abstracts for missing classes: {list(abstracts_to_add.keys())}")
    
    # Import necessary functions from your notebook
    #from create_prompt_relation_df_single import create_prompt_relation_df_single
    #from create_prompt_relation_df_multi_by_docid import create_prompt_relation_df_multi_by_docid
    #from prompts_df_to_json_from_runs import prompts_df_to_json_from_runs
    #from prompts_df_to_json_from_runs_multi import prompts_df_to_json_from_runs_multi
    
    # For each run in the JSON
    for run_key in data["n_1"].keys():
        run_num = int(run_key.split("_")[1])
        
        for rel_class, doc_id in abstracts_to_add.items():
            # Get all rows for this doc_id
            doc_rows = train_df[train_df['doc_id'] == doc_id].copy()
            
            print(f"  Adding abstract {doc_id} (class: {rel_class}) to {run_key}")
            
            # Create single prompts (zero-shot and few-shot)
            single_zero = create_prompt_relation_df_single(
                doc_rows, test_df, template_path_single, 
                examples=False, same_examples=False, 
                seed=seed, n_prompts=1, prompt_id=False
            )
            
            single_few = create_prompt_relation_df_single(
                doc_rows, test_df, template_path_single,
                examples=True, same_examples=True,
                seed=seed, n_prompts=1, prompt_id=False
            )
            
            # Create multi prompts (zero-shot and few-shot)
            multi_zero = create_prompt_relation_df_multi_by_docid(
                doc_rows, test_df, template_path_multi,
                examples=False, same_examples=False,
                seed=seed, n_prompts=1,
                number_relations=True, relations_entitys_separately=False
            )
            
            multi_few = create_prompt_relation_df_multi_by_docid(
                doc_rows, test_df, template_path_multi,
                examples=True, same_examples=True,
                seed=seed, n_prompts=1,
                number_relations=True, relations_entitys_separately=False
            )
            
            # Append to existing data
            # Single zero
            data["n_1"][run_key]["single_zero"].extend(single_zero['Prompt'].tolist())
            
            # Single examples
            data["n_1"][run_key]["single_examples"].extend(single_few['Prompt'].tolist())
            
            # Multi zero
            data["n_1"][run_key]["multi_zero"].extend(multi_zero['Prompt'].tolist())
            data["n_1"][run_key]["multi_relations"].extend(multi_zero['Multi_Relations'].tolist())
            data["n_1"][run_key]["multi_docid"].extend(multi_zero['Multi_DocID'].tolist())
            
            # Multi examples
            data["n_1"][run_key]["multi_examples"].extend(multi_few['Prompt'].tolist())
            
            # Append relations and doc_ids (these are shared across prompt types)
            if "relations" in data["n_1"][run_key]:
                # For single prompts, extend with individual relations
                data["n_1"][run_key]["relations"].extend(single_zero['Relation'].tolist())
            
            if "doc_ids" in data["n_1"][run_key]:
                data["n_1"][run_key]["doc_ids"].extend(single_zero['DocID'].tolist())
    
    # Save updated JSON
    output_path = json_path.replace(".json", "_complete.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    
    print(f"\nSaved complete dataset to: {output_path}")
    return output_path

"""
# Main execution
if __name__ == "__main__":
    # Define all possible classes
    ALL_CLASSES = [
        #'Association',
        #'Positive_Correlation',
        #'Negative_Correlation',
        #'Bind',
        #'Cotreatment',
        #'Comparison',
        'Drug_Interaction',
        'Conversion',
        'No_Relation'
    ]
    
    # Paths
    json_path = "LLM_benchmarks/Prompts/Test_callibration.json"
    template_single = "LLM_benchmarks/prompt_template_single.txt"
    template_multi = "LLM_benchmarks/prompt_template_multi.txt"
    
    # Load dataframes (you'll need to load these from your notebook)
    # merged_df_train = ...
    # merged_df_test = ...
    
    # Find missing classes
    missing = find_missing_classes(json_path, ALL_CLASSES)
    
    if missing:
        print(f"Missing classes found: {missing}")
        print(f"Will add {len(missing)} abstracts to the dataset\n")
        
        #Append missing classes
        
        append_missing_classes_to_json(
            json_path,
             merged_df_train,
             merged_df_test,
             template_single,
             template_multi,
             missing,
             seed=42
         )
    else:
        print("All classes are represented in the dataset!")
"""

'\n# Main execution\nif __name__ == "__main__":\n    # Define all possible classes\n    ALL_CLASSES = [\n        #\'Association\',\n        #\'Positive_Correlation\',\n        #\'Negative_Correlation\',\n        #\'Bind\',\n        #\'Cotreatment\',\n        #\'Comparison\',\n        \'Drug_Interaction\',\n        \'Conversion\',\n        \'No_Relation\'\n    ]\n\n    # Paths\n    json_path = "LLM_benchmarks/Prompts/Test_callibration.json"\n    template_single = "LLM_benchmarks/prompt_template_single.txt"\n    template_multi = "LLM_benchmarks/prompt_template_multi.txt"\n\n    # Load dataframes (you\'ll need to load these from your notebook)\n    # merged_df_train = ...\n    # merged_df_test = ...\n\n    # Find missing classes\n    missing = find_missing_classes(json_path, ALL_CLASSES)\n\n    if missing:\n        print(f"Missing classes found: {missing}")\n        print(f"Will add {len(missing)} abstracts to the dataset\n")\n\n        #Append missing classes\n\n        append_missing_cl

In [27]:
import json
import pandas as pd

def get_doc_ids_from_json(json_path, run_key="run_1"):
    """Extract all doc IDs from a JSON dataset for a specific run."""
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    doc_ids = data["n_1"][run_key]["doc_ids"]
    return set(doc_ids)


def find_missing_doc_ids(json_path_complete, json_path_incomplete):
    """
    Find doc IDs that are in the complete JSON but missing from the incomplete one.
    
    Args:
        json_path_complete: Path to the completed/reference JSON
        json_path_incomplete: Path to the JSON that needs completion
    
    Returns:
        list: Doc IDs that need to be added
    """
    complete_doc_ids = get_doc_ids_from_json(json_path_complete)
    incomplete_doc_ids = get_doc_ids_from_json(json_path_incomplete)
    
    missing_doc_ids = complete_doc_ids - incomplete_doc_ids
    
    return list(missing_doc_ids)


def append_missing_doc_ids_to_json(
    json_path,
    train_df,
    test_df,
    template_path_single,
    template_path_multi,
    missing_doc_ids,
    seed=42
):
    """
    Append prompts for missing doc IDs to existing JSON file.
    
    Args:
        json_path: Path to the JSON file to complete
        train_df: Training dataframe
        test_df: Test dataframe (for examples)
        template_path_single: Path to single prompt template
        template_path_multi: Path to multi prompt template
        missing_doc_ids: List of doc IDs to add
        seed: Random seed for reproducibility
    """
    # Load existing JSON
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    print(f"Adding {len(missing_doc_ids)} abstracts with doc IDs: {missing_doc_ids}\n")
    
    # For each run in the JSON
    for run_key in data["n_1"].keys():
        run_num = int(run_key.split("_")[1])
        
        for doc_id in missing_doc_ids:
            # Get all rows for this doc_id from training data
            doc_rows = train_df[train_df['doc_id'] == doc_id].copy()
            
            if doc_rows.empty:
                print(f"  Warning: Doc ID {doc_id} not found in training data")
                continue
            
            # Get the relation types for this doc_id (for logging)
            relations_in_doc = doc_rows['relation_type'].unique()
            print(f"  Adding abstract {doc_id} (relations: {list(relations_in_doc)}) to {run_key}")
            
            # Create single prompts (zero-shot and few-shot)
            single_zero = create_prompt_relation_df_single(
                doc_rows, test_df, template_path_single, 
                examples=False, same_examples=False, 
                seed=seed, n_prompts=1, prompt_id=False
            )
            
            single_few = create_prompt_relation_df_single(
                doc_rows, test_df, template_path_single,
                examples=True, same_examples=True,
                seed=seed, n_prompts=1, prompt_id=False
            )
            
            # Create multi prompts (zero-shot and few-shot)
            multi_zero = create_prompt_relation_df_multi_by_docid(
                doc_rows, test_df, template_path_multi,
                examples=False, same_examples=False,
                seed=seed, n_prompts=1,
                number_relations=True, relations_entitys_separately=False
            )
            
            multi_few = create_prompt_relation_df_multi_by_docid(
                doc_rows, test_df, template_path_multi,
                examples=True, same_examples=True,
                seed=seed, n_prompts=1,
                number_relations=True, relations_entitys_separately=False
            )
            
            # Append to existing data
            data["n_1"][run_key]["single_zero"].extend(single_zero['Prompt'].tolist())
            data["n_1"][run_key]["single_examples"].extend(single_few['Prompt'].tolist())
            data["n_1"][run_key]["multi_zero"].extend(multi_zero['Prompt'].tolist())
            data["n_1"][run_key]["multi_relations"].extend(multi_zero['Multi_Relations'].tolist())
            data["n_1"][run_key]["multi_docid"].extend(multi_zero['Multi_DocID'].tolist())
            data["n_1"][run_key]["multi_examples"].extend(multi_few['Prompt'].tolist())
            
            if "relations" in data["n_1"][run_key]:
                data["n_1"][run_key]["relations"].extend(single_zero['Relation'].tolist())
            
            if "doc_ids" in data["n_1"][run_key]:
                data["n_1"][run_key]["doc_ids"].extend(single_zero['DocID'].tolist())
    
    # Save updated JSON
    output_path = json_path.replace(".json", "_complete.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    
    print(f"\nSaved complete dataset to: {output_path}")
    return output_path


# Main execution
if __name__ == "__main__":
    # Paths
    complete_json = "LLM_benchmarks/Prompts/backup/Test_callibration.json"
    incomplete_json = "LLM_benchmarks/Prompts/Test_callibration_binary.json"
    template_single = "LLM_benchmarks/prompt_template_single_binary.txt"
    template_multi = "LLM_benchmarks/prompt_template_multi _binary.txt"
    
    # Find missing doc IDs
    missing_doc_ids = find_missing_doc_ids(complete_json, incomplete_json)
    
    if missing_doc_ids:
        print(f"Missing doc IDs found: {missing_doc_ids}")
        print(f"Will add {len(missing_doc_ids)} abstracts to the dataset\n")
        
        # Load dataframes (you'll need to load these from your notebook)
        # merged_df_train = ...
        # merged_df_test = ...
        
        # Append missing doc IDs
        append_missing_doc_ids_to_json(
            incomplete_json,
            merged_df_train,
            merged_df_test,
            template_single,
            template_multi,
            missing_doc_ids,
            seed=42
        )
    else:
        print("All doc IDs from the complete dataset are already present!")

Missing doc IDs found: ['21070631', '16720068', '15266215']
Will add 3 abstracts to the dataset

Adding 3 abstracts with doc IDs: ['21070631', '16720068', '15266215']

  Adding abstract 21070631 (relations: ['No_Relation', 'Relation']) to run_1
  Adding abstract 16720068 (relations: ['Relation', 'No_Relation']) to run_1
  Adding abstract 15266215 (relations: ['Relation', 'No_Relation']) to run_1
  Adding abstract 21070631 (relations: ['No_Relation', 'Relation']) to run_2
  Adding abstract 16720068 (relations: ['Relation', 'No_Relation']) to run_2
  Adding abstract 15266215 (relations: ['Relation', 'No_Relation']) to run_2
  Adding abstract 21070631 (relations: ['No_Relation', 'Relation']) to run_3
  Adding abstract 16720068 (relations: ['Relation', 'No_Relation']) to run_3
  Adding abstract 15266215 (relations: ['Relation', 'No_Relation']) to run_3
  Adding abstract 21070631 (relations: ['No_Relation', 'Relation']) to run_4
  Adding abstract 16720068 (relations: ['Relation', 'No_Relati