In [1]:
import pandas as pd

# Log in first (e.g. `huggingface-cli login`) to access this dataset.
base = "hf://datasets/notdiamond/repliqa_gpt4o_gpt4omini_evals/"
splits = {
    "train": "data/train-00000-of-00001.parquet",
    "test": "data/test-00000-of-00001.parquet",
    "val": "data/val-00000-of-00001.parquet",
}

# Read each split in full: train, then val, then test.
train_df_full = pd.read_parquet(f"{base}{splits['train']}")
val_df_full = pd.read_parquet(f"{base}{splits['val']}")
test_df_full = pd.read_parquet(f"{base}{splits['test']}")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Names of the columns that hold each model's score.
mini = "gpt-4o-mini-2024-07-18/score"
full = "gpt-4o-2024-08-06/score"


def outcome(row):
    """Label one row by comparing the two models' scores.

    Reads ``row[mini]`` and ``row[full]`` and returns:

    * "Tie"       when both scores equal 1,
    * "Mini wins" when the mini score equals 1 and the 4o score equals 0,
    * "4o wins"   when the mini score equals 0 and the 4o score equals 1,
    * "Neither"   for every other combination.
    """
    mini_score = row[mini]

    if mini_score == 1:
        # Mini was scored 1; the label now depends only on the 4o score.
        full_score = row[full]
        if full_score == 1:
            return "Tie"
        if full_score == 0:
            return "Mini wins"
        return "Neither"

    if mini_score == 0 and row[full] == 1:
        return "4o wins"

    return "Neither"


# Tag every row of each full split with its head-to-head outcome label.
for split_frame in (train_df_full, val_df_full, test_df_full):
    split_frame["outcome"] = split_frame.apply(outcome, axis=1)

# Show how many rows fall under each outcome, split by split.
print("\n\ntrain_df_full:\n", train_df_full["outcome"].value_counts())
print("\nval_df_full:\n", val_df_full["outcome"].value_counts())
print("\ntest_df_full:\n", test_df_full["outcome"].value_counts())



train_df_full:
 outcome
Tie          7168
Neither      1159
Mini wins    1138
4o wins       535
Name: count, dtype: int64

val_df_full:
 outcome
Tie          705
Mini wins    123
Neither      116
4o wins       56
Name: count, dtype: int64

test_df_full:
 outcome
Tie          725
Neither      118
Mini wins     99
4o wins       58
Name: count, dtype: int64


More than 70% of the matches are ties. 
There are cases where both models got a score of 0; I couldn't find what that means in the HF dataset info. 

To make training more efficient, I'll sample train_df (and likewise the val and test splits) so that there are no cases where both models got a score of 0, and so that tied cases make up about 33% of the rows. 


In [3]:
def clean_and_sample_data(df, size, tie_ratio=0.5, random_state=42):
    """Drop "Neither" rows, down-sample ties, and return ``size`` random rows.

    Args:
        df: DataFrame with an "outcome" column. Rows labelled "Neither" are
            discarded, rows labelled "Tie" are down-sampled, and all other
            rows are kept as the non-tie pool. ``df`` itself is not modified.
        size: Number of rows to draw from the filtered pool.
        tie_ratio: Number of tie rows to keep per non-tie row. The default of
            0.5 keeps one tie for every two non-ties, so ties make up about
            one third of the pool.
        random_state: Seed used for both the tie down-sampling and the final
            draw, so repeated calls give the same rows.

    Returns:
        A new DataFrame with ``size`` rows and no "outcome" column. The index
        holds each row's position in the intermediate pool (it is not reset
        after the final shuffle).

    Raises:
        ValueError: If ``tie_ratio`` is negative, or if ``size`` is larger than
            the number of rows left after filtering and tie down-sampling.
    """
    if tie_ratio < 0:
        raise ValueError(f"tie_ratio must be non-negative, got {tie_ratio}")

    kept = df[df["outcome"] != "Neither"]
    is_tie = kept["outcome"] == "Tie"
    tie_df = kept[is_tie]
    non_tie_df = kept[~is_tie]

    # Never ask for more ties than exist.
    target_tie_count = min(int(len(non_tie_df) * tie_ratio), len(tie_df))
    tie_sampled = tie_df.sample(n=target_tie_count, random_state=random_state)

    pool = pd.concat([non_tie_df, tie_sampled], ignore_index=True)
    pool = pool.drop(columns=["outcome"])

    # Fail with an actionable message instead of pandas' generic sampling error.
    if size > len(pool):
        raise ValueError(
            f"Requested {size} rows but only {len(pool)} are available after "
            "dropping 'Neither' rows and down-sampling ties."
        )

    return pool.sample(n=size, random_state=random_state)

In [4]:
# Draw fixed-size samples: 1000 train rows, 100 val rows, 100 test rows.
df_train = clean_and_sample_data(train_df_full, 1000)
df_val = clean_and_sample_data(val_df_full, 100)
df_test = clean_and_sample_data(test_df_full, 100)

# Persist each sample as a parquet file under ../data.
for sampled_frame, parquet_path in (
    (df_train, "../data/train_1000.parquet"),
    (df_val, "../data/val_100.parquet"),
    (df_test, "../data/test_100.parquet"),
):
    sampled_frame.to_parquet(parquet_path)

In [5]:
# Embed all prompts
from openai import OpenAI
import pandas as pd
import dotenv

dotenv.load_dotenv()

def embed_prompts(df, filename, batch_size=500):
    """Embed the "prompt" column with OpenAI and save the result as parquet.

    Prompts are sent to the ``text-embedding-3-large`` model in consecutive
    batches of ``batch_size`` rows. The embeddings are stored in a new
    "prompt_embedding" column on ``df`` (the frame is modified in place) and
    the frame is written to ``../data/{filename}.parquet``.

    Args:
        df: DataFrame with a "prompt" column.
        filename: Output file name without the ``.parquet`` extension.
        batch_size: Number of prompts sent per embeddings request.

    Returns:
        The same ``df`` object, now carrying the "prompt_embedding" column, so
        that ``df = embed_prompts(df, ...)`` keeps a usable frame.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    print(f"Embedding {len(df)} prompts...")
    client = OpenAI()
    embeddings = []

    # Process in batches
    for start in range(0, len(df), batch_size):
        print(f"Embedding batch {start} of {len(df)}")
        end = start + batch_size
        batch = df["prompt"].iloc[start:end].tolist()

        response = client.embeddings.create(input=batch, model="text-embedding-3-large")
        embeddings.extend(d.embedding for d in response.data)

    df["prompt_embedding"] = embeddings
    df.to_parquet(f"../data/{filename}.parquet")

    # Return the frame: callers rebind their variable to this call's result.
    return df


# Embed each sampled split. embed_prompts also writes the embedded frame to
# ../data/<filename>.parquet; a later cell reads those files back.
df_val = embed_prompts(df_val, "val_100_with_embeddings")
df_test = embed_prompts(df_test, "test_100_with_embeddings")
df_train = embed_prompts(df_train, "train_1000_with_embeddings")

Embedding 100 prompts...
Embedding 100 prompts...
Embedding 1000 prompts...


In [12]:
# load parquet files
import pandas as pd

# Read back the embedded samples stored under ../data.
df_train = pd.read_parquet("../data/train_1000_with_embeddings.parquet")
df_val = pd.read_parquet("../data/val_100_with_embeddings.parquet")
df_test = pd.read_parquet("../data/test_100_with_embeddings.parquet")

# Report (rows, columns) for train, val and test, in that order.
for reloaded_frame in (df_train, df_val, df_test):
    print(reloaded_frame.shape)

(1000, 9)
(100, 9)
(100, 9)


In [13]:
# Preview the first rows of the training sample.
df_train.head()

Unnamed: 0,prompt_id,prompt,question,gold_answer,gpt-4o-mini-2024-07-18/response,gpt-4o-mini-2024-07-18/score,gpt-4o-2024-08-06/response,gpt-4o-2024-08-06/score,prompt_embedding
1940,jgfiiluw-q2,I'd like for you to answer questions about a c...,How does Dr. Huan Li describe AI in the contex...,AI is more than just a feature; it's a transfo...,Dr. Huan Li describes AI as more than just a f...,1,AI is described by Dr. Huan Li as a tool that ...,1,"[-0.013268875889480114, -0.02285110577940941, ..."
282,mkweenlq-q4,I'd like for you to answer questions about a c...,How has the healthcare system adapted to the T...,There has been a reorientation towards respira...,The healthcare system adapted by reorienting t...,1,Local health systems adapted by prioritizing r...,0,"[-0.018244991078972816, -0.012419315986335278,..."
1976,ishzfwxh-q5,I'd like for you to answer questions about a c...,What steps ensure legal compliance in a multi-...,The answer is not found in the document.,UNANSWERABLE,1,UNANSWERABLE,1,"[-0.0009681060910224915, -0.06115332618355751,..."
93,xkzoegnk-q2,I'd like for you to answer questions about a c...,How do the Chronic Disease Crusade workshops d...,"They are interactive, practical, and grounded ...",The Chronic Disease Crusade workshops differ f...,0,"They offer a personalized, community-focused a...",1,"[0.022008977830410004, -0.006557592656463385, ..."
246,zqgnphez-q2,I'd like for you to answer questions about a c...,What weekly event does Sergeant Emma Thompson ...,Weekly coffee meetings at 'Joe's Diner'.,Sergeant Emma Thompson hosts weekly coffee mee...,0,"Weekly coffee meetings at ""Joe's Diner"" on Bro...",1,"[0.03630077838897705, -0.050125014036893845, -..."


In [11]:
# Label each sampled split with its outcome so the class balance can be checked.
for sampled_split in (df_train, df_val, df_test):
    sampled_split["outcome"] = sampled_split.apply(outcome, axis=1)

# Outcome counts per sampled split.
print("\n\ntrain_df:\n", df_train["outcome"].value_counts())
print("\nval_df:\n", df_val["outcome"].value_counts())
print("\ntest_df:\n", df_test["outcome"].value_counts())




train_df:
 outcome
Mini wins    445
Tie          338
4o wins      217
Name: count, dtype: int64

val_df:
 outcome
Mini wins    43
Tie          34
4o wins      23
Name: count, dtype: int64

test_df:
 outcome
Mini wins    42
Tie          31
4o wins      27
Name: count, dtype: int64
