In [10]:
import pandas as pd
from itertools import combinations
# DIR = "src/genai/parenting_chatbot/prodigy_eval/data/"
# Directory (relative to the working directory) holding every input and output file.
DIR = "data/"
ANSWER_FILE = DIR + "answers_{}.jsonl"

# Read the questions from DIR as well, so that changing DIR relocates the
# questions together with the answers and the output file.
questions = pd.read_json(path_or_buf=DIR + "questions.jsonl", lines=True).question.to_list()

# One answers_<type>.jsonl file per answer type; each file is indexed by a
# column named after the type.
answer_types = ["human", "rag", "gpt4"]
answers = [
    pd.read_json(path_or_buf=ANSWER_FILE.format(answer_type), lines=True)[answer_type]
    for answer_type in answer_types
]

# Every unordered pair of answer types, e.g. ("human", "rag").
answer_type_pairs = list(combinations(answer_types, 2))

# HTML wrapper placed around each question text.
question_prefix = "Which one is a better answer to this question:\n<span style='font-weight: bold; font-size:30px'>"
question_suffix = "</span>"

pd.set_option("max_colwidth", 1000)

# Wide frame: one "question" column plus one column per answer type.
# The answer columns are derived from answer_types so the list above is the
# single place where the set of answer types is declared.
df = pd.DataFrame({"question": questions, **dict(zip(answer_types, answers))})
# Long format: one row per (question, answer type); melt names the type
# column "variable" and the text column "value".
df = df.melt(id_vars=["question"], value_vars=answer_types)
df = df.rename(columns={"value": "answer"})
df["question"] = question_prefix + df["question"] + question_suffix
# Wrap each answer as {"<answer-type>": <answer-text>} so the pair can be merged later.
df["answer"] = df.apply(lambda x: {x["variable"]: x["answer"]}, axis=1)

# For every pair of answer types, collect the two matching rows of each
# question into lists (one aggregated row per question and pair).
dfs = []
for answer_type_pair in answer_type_pairs:
    pair_rows = df[df.variable.isin(answer_type_pair)]
    dfs.append(
        pair_rows.groupby("question").agg(lambda x: x.tolist()).reset_index()
    )
dfs = pd.concat(dfs, ignore_index=True)

# Merge the single-entry dictionaries of each pair into one dictionary.
dfs["answers"] = dfs.apply(lambda x: {k: v for d in x["answer"] for k, v in d.items()}, axis=1)
dfs = dfs.drop(columns=["variable", "answer"]).sort_values(by="question").reset_index(drop=True)
dfs.to_json(path_or_buf=DIR + "answers.jsonl", orient="records", lines=True)


In [94]:
import pandas as pd
from itertools import combinations

# Constants
DATA_DIR = "data/"
QUESTION_FILE = DATA_DIR + "questions.jsonl"
ANSWER_FILE = DATA_DIR + "answers_{}.jsonl"
OUTPUT_FILE = DATA_DIR + "answers.jsonl"
# Answer types; each one is loaded from its own answers_<type>.jsonl file,
# using the column named after the type
ANSWER_TYPES = ["human", "rag", "gpt4"]
# HTML formatting prefix and suffix for questions
QUESTION_PREFIX = "Which one is a better answer to this question:<span style='font-weight: bold; font-size:30px'>"
QUESTION_SUFFIX = "</span>"

# Set column width option for pandas DataFrame display
pd.set_option("max_colwidth", 1000)

# Load questions
questions = pd.read_json(QUESTION_FILE, lines=True)["question"].to_list()
# Load answers. Iterate over ANSWER_TYPES (the constant defined in this cell)
# rather than a lowercase `answer_types` left over from another cell, so the
# cell also runs on a fresh kernel.
answers = [
    pd.read_json(path_or_buf=ANSWER_FILE.format(answer_type), lines=True)[answer_type]
    for answer_type in ANSWER_TYPES
]

# Construct a dataframe with columns: question, followed by one column per answer type
answers_df = (
    pd.DataFrame({"question": questions, **dict(zip(ANSWER_TYPES, answers))})
    # Melt to one row per (question, answer type); melt names the type column
    # "variable" and the text column "value", which is renamed to "answer"
    .melt(id_vars=["question"], value_vars=ANSWER_TYPES)
    .rename(columns={"value": "answer"})
    # Wrap the question text in the HTML prefix/suffix
    .assign(question=lambda df: QUESTION_PREFIX + df["question"] + QUESTION_SUFFIX)
    # Wrap each answer as {"<answer-type>": <answer-text>} so pairs can be merged later
    .assign(answer=lambda df: df.apply(lambda x: {x["variable"]: x["answer"]}, axis=1))
)

# Generate pairwise combinations of answer types
answer_type_pairs = list(combinations(ANSWER_TYPES, 2))

# Aggregate answers based on the pairwise combinations of answer types:
# one aggregated row per question and pair, with the two answers collected in a list
dataframes = []
for answer_type_pair in answer_type_pairs:
    subset_df = answers_df[answers_df["variable"].isin(answer_type_pair)]
    aggregated_df = subset_df.groupby("question").agg(lambda x: x.tolist()).reset_index()
    dataframes.append(aggregated_df)

# Combine the results, merge each pair's dictionaries into one and save the output
(
    pd.concat(dataframes, ignore_index=True)
    .assign(answers=lambda df: df.apply(lambda x: {k: v for d in x["answer"] for k, v in d.items()}, axis=1))
    .drop(columns=["variable", "answer"])
    .sort_values(by="question")
    .reset_index(drop=True)
    .to_json(path_or_buf=OUTPUT_FILE, orient="records", lines=True)
)


In [93]:
# Inspect the intermediate long-format frame (one row per question and answer
# type) before the question is HTML-wrapped and the answer is turned into a dict.
answers_df = (
    pd.DataFrame({"question": questions, "human": answers[0], "rag": answers[1], "gpt4": answers[2]})
    # Melt the dataframe for pairwise combinations and rename the resulting column
    .melt(id_vars=["question"], value_vars=ANSWER_TYPES)
    # No inplace=True here: with it, rename returns None, which would end the
    # chain and bind answers_df to None instead of the renamed frame.
    .rename(columns={"value": "answer"})
    # .assign(question=lambda df: QUESTION_PREFIX + df["question"] + QUESTION_SUFFIX)
    # .assign(answer=lambda df: df.apply(lambda x: {x["variable"]: x["answer"]}, axis=1))
)
answers_df