In [13]:
import json
from tqdm import tqdm
import os
# Input/output configuration for this notebook.
# JSONL file holding the gold prompts (one JSON object per line).
gold = "../data/exomiser-gold.jsonl"
# Location of the model responses.
# NOTE(review): despite the name, this currently points at a single CSV file,
# not a directory -- confirm which response layout is intended.
target_dir = "../data/gpt_4o_response.csv"
# Model name; used to build the output file name under ../data/responses/.
model = "gpt-4o"

# Load the Exomiser gold prompt file.
# Then iterate over the response files in the directory, use each file's name to
# find its matching prompt, and build a JSONL file from the combined records.



def read_folder_files(path: str) -> list:
    """
    List the ``.txt`` files directly inside a folder.

    Args:
        path (str): Path to the folder containing the files.

    Returns:
        List[str]: Paths (``path`` joined with each file name) of the entries
        whose name ends in ".txt", sorted by file name.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        NotADirectoryError: If ``path`` is not a directory.
    """
    # os.listdir() returns entries in arbitrary order; sort so the result is
    # reproducible across runs and machines.
    return [
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.endswith(".txt")
    ]

def read_result_json(path: str) -> list:
    """
    Read a JSON Lines file: one JSON document per line.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising a decode error.

    Args:
        path (str): Path to the JSONL file.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    responses = []
    # Read as UTF-8 explicitly rather than relying on the platform's locale
    # default encoding.
    with open(path, "r", encoding="utf-8") as raw_result:
        for line in raw_result:
            line = line.strip()
            if not line:
                # Nothing to decode on an empty line.
                continue
            responses.append(json.loads(line))
    return responses

def read_response_csv(path: str):
    """
    Load the responses CSV into a pandas DataFrame.

    Args:
        path (str): Location of the CSV file.

    Returns:
        pandas.DataFrame: The parsed file, exactly as produced by
        ``pandas.read_csv`` with its default options.
    """
    # pandas is imported inside the function, so it is only needed when this
    # reader is actually called.
    import pandas
    return pandas.read_csv(path)

In [11]:
# Parse the gold JSONL file and index its records by their "id" field so that
# responses can be attached by key. If an id repeats, the last record wins.
gold_prompts = read_result_json(gold)
gold_prompts_dict = {record["id"]: record for record in gold_prompts}

In [None]:
# Attach responses stored as one .txt file per prompt.
# `target_dir` may point at a single file instead of a folder, in which case
# os.listdir() would raise NotADirectoryError; only scan real directories so the
# notebook still runs top to bottom.
if os.path.isdir(target_dir):
    files = read_folder_files(target_dir)
else:
    files = []
    print(f"{target_dir} is not a directory; skipping per-file responses.")

for file in tqdm(files, total=len(files)):
    # Read the whole response, then release the file handle before the lookup.
    with open(file, "r") as f:
        text = f.read().strip()
    # The bare file name (extension included) is the key looked up in the gold prompts.
    filename = os.path.basename(file)
    if filename in gold_prompts_dict:
        gold_prompts_dict[filename]["response"] = text
    else:
        print(f"File {filename} not found in gold prompts.")

In [12]:
# Attach responses stored in the CSV: the "metadata" column carries the key
# looked up in the gold prompts and "service_answer" carries the response.
df_files = read_response_csv(target_dir)

for row_idx, record in df_files.iterrows():
    prompt_key = record["metadata"]
    if prompt_key not in gold_prompts_dict:
        # Report rows that have no matching gold prompt and move on.
        print(f"File {prompt_key} not found in gold prompts.")
        continue
    gold_prompts_dict[prompt_key]["response"] = record["service_answer"]

In [14]:
# Write every gold prompt record (with any "response" attached by earlier cells)
# as one JSON object per line.
output_path = f"../data/responses/{model}.jsonl"
# Create the output folder on first use; otherwise open(..., "w") raises
# FileNotFoundError when ../data/responses does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    for line in tqdm(gold_prompts_dict.values(), total=len(gold_prompts_dict)):
        f.write(json.dumps(line) + "\n")

100%|██████████| 5212/5212 [00:00<00:00, 66143.96it/s]
