In [None]:
import pandas as pd
import os

In [None]:
# load model completions on multiq
#
# Builds df_dict: one DataFrame per CSV found in the completions directory,
# keyed by the file name without its extension.

completions_dir = "../../data/model_completions/"

df_dict = {}

for model in sorted(os.listdir(completions_dir)):

    # os.listdir returns every entry in the directory, so skip anything that is
    # not a CSV (e.g. .ipynb_checkpoints, .DS_Store, .gitkeep). Slicing off the
    # last four characters blindly would mangle the key for such entries and
    # read_csv would fail on (or misparse) them.
    model_name, extension = os.path.splitext(model)
    if extension.lower() != ".csv":
        continue

    df_dict[model_name] = pd.read_csv(os.path.join(completions_dir, model))

    print(f"Loaded {model} ({len(df_dict[model_name])} rows)")

In [None]:
# Read the MultiQ prompt table, keep only the rows whose language is "en",
# and expose the prompt text under the column name "prompt_en".
multiq_en = (
    pd.read_csv("../../data/MultiQ.csv")
    .loc[lambda prompts: prompts["language"] == "en"]
    .rename(columns={"prompt": "prompt_en"})
)
multiq_en

In [None]:
# merge English prompts with model completions
#
# Adds a "prompt_en" column to every completions frame by joining on "id".
for model in df_dict:
    rows_before = len(df_dict[model])

    df_dict[model] = (
        df_dict[model]
        # Keep the cell idempotent: if prompt_en is already present (e.g. the
        # cell was run before), a second merge would produce prompt_en_x /
        # prompt_en_y instead of prompt_en, so drop the stale column first.
        .drop(columns=["prompt_en"], errors="ignore")
        .merge(multiq_en[["id", "prompt_en"]], on="id")
    )

    # The merge is an inner join: ids without an English prompt are dropped and
    # ids that occur more than once in multiq_en are duplicated. Surface that
    # instead of letting the row count change silently.
    rows_after = len(df_dict[model])
    if rows_after != rows_before:
        print(f"Warning: {model} has {rows_after} rows after merging prompts (was {rows_before})")

# spot-check the last merged model
df_dict[model].head()

In [None]:
# create eval prompts by combining prompt_en and completion

def create_eval_prompt(row):
    """Build the yes/no grading prompt for a single model completion.

    Parameters
    ----------
    row : object exposing ``prompt_en`` and ``model_completion`` attributes
        Typically a row Series handed over by ``DataFrame.apply(..., axis=1)``.
        The row is only read; it is never modified.

    Returns
    -------
    str
        A prompt that shows the English question and the cleaned completion
        and asks the grader to answer 'yes' or 'no'.
    """
    # Work on a local variable rather than assigning back to the row, so the
    # caller's data is left untouched and immutable rows are accepted too.
    completion = row.model_completion

    # if not string, make string (a NaN value therefore becomes the text "nan")
    if not isinstance(completion, str):
        completion = str(completion)

    # Replace the chat-template marker with a space, then collapse every run of
    # whitespace (including linebreaks) into a single space and trim the ends.
    completion = " ".join(completion.replace("<|assistant|>", " ").split())

    prompt = (
        "For the following question, is this a correct answer?\n\n"
        f"Question: {row.prompt_en}\n"
        f"Answer: {completion}\n\n"
        "The question is in English. The answer is either in English or in another language. "
        "If the answer is correct, type 'yes'. If the answer is incorrect, type 'no'."
    )

    return prompt

# Attach an "eval_prompt" column to every model's frame, one prompt per row.
for model, completions in df_dict.items():
    print(model)
    completions["eval_prompt"] = completions.apply(create_eval_prompt, axis=1)

In [None]:
# export to csv
#
# Writes one CSV per model (without the index column) to the prompts folder.

# NOTE(review): "model_anser_accuracy" looks like a misspelling of "answer".
# The path is kept unchanged because the directory may really be named this
# way on disk -- confirm before renaming.
output_dir = "../../data/model_anser_accuracy/prompts"

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

for model in df_dict:
    output_path = f"{output_dir}/{model}.csv"
    df_dict[model].to_csv(output_path, index=False)
    # Report the path that was actually written.
    print(f"Exported {output_path} ({len(df_dict[model])} rows)")