In [1]:
import os

import pandas as pd

# Accessing this dataset requires a Hugging Face login (e.g. `huggingface-cli login`)
base = "hf://datasets/notdiamond/repliqa_gpt4o_gpt4omini_evals/"
splits = {
    "train": "data/train-00000-of-00001.parquet",
    "test": "data/test-00000-of-00001.parquet",
    "val": "data/val-00000-of-00001.parquet",
}

# Read every split in full, in the order train -> val -> test
train_df_full, val_df_full, test_df_full = (
    pd.read_parquet(base + splits[split_name])
    for split_name in ("train", "val", "test")
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Column names holding each model's score
mini = "gpt-4o-mini-2024-07-18/score"
full = "gpt-4o-2024-08-06/score"


def outcome(row):
    """Label one row by comparing the two models' scores.

    `row` is indexed with the column names stored in `mini` and `full`.

    Returns:
        "Tie"       when both scores equal 1,
        "Mini wins" when the mini score is 1 and the 4o score is 0,
        "4o wins"   when the mini score is 0 and the 4o score is 1,
        "Neither"   for every other combination.
    """
    mini_score = row[mini]
    full_score = row[full]

    if mini_score == 1:
        if full_score == 1:
            return "Tie"
        if full_score == 0:
            return "Mini wins"
    elif mini_score == 0 and full_score == 1:
        return "4o wins"

    # Anything that did not match one of the three cases above
    return "Neither"


# Classify every training row; axis=1 passes the rows to `outcome` one at a time
outcome_labels = train_df_full.apply(outcome, axis=1)
train_df_full["outcome"] = outcome_labels

# Count how often each label occurs and show the tally
stats = train_df_full["outcome"].value_counts()
print(stats)


outcome
Tie          7168
Neither      1159
Mini wins    1138
4o wins       535
Name: count, dtype: int64


More than 70% of the matches are ties. 
There are also cases where both models got a score of 0; I couldn't find an explanation of what that means in the HF dataset info. 

To make training more efficient, I'll subsample the training data so that there are no cases where both models scored 0 and so that tied cases make up about 33% of the rows. 


In [3]:
# Drop every row labelled "Neither"
train_df_no_neither = train_df_full.loc[train_df_full["outcome"].ne("Neither")]

# Split the remainder into tied rows and rows with a clear winner
is_tie = train_df_no_neither["outcome"].eq("Tie")
tie_df = train_df_no_neither.loc[is_tie]
non_tie_df = train_df_no_neither.loc[~is_tie]

# Keeping half as many ties as non-ties makes ties about one third of the
# final set: (n / 2) / (n / 2 + n) = 1/3
target_tie_count = len(non_tie_df) // 2

# Draw the ties with a fixed seed, never requesting more rows than exist
n_ties_to_keep = min(target_tie_count, len(tie_df))
tie_sampled = tie_df.sample(n=n_ties_to_keep, random_state=42)

# Stack the non-tie rows and the sampled ties into the final training set
train_df = pd.concat([non_tie_df, tie_sampled], ignore_index=True)

# Show the label proportions after rebalancing
print(train_df["outcome"].value_counts(normalize=True))

# The helper column has served its purpose; remove it
train_df = train_df.drop(columns=["outcome"])


outcome
Mini wins    0.453567
Tie          0.333200
4o wins      0.213232
Name: proportion, dtype: float64


In [4]:

# Draw fixed-size subsets (1000 train / 100 val / 100 test); the shared seed
# makes each draw deterministic
df_train, df_val, df_test = (
    frame.sample(n=n_rows, random_state=42)
    for frame, n_rows in (
        (train_df, 1000),
        (val_df_full, 100),
        (test_df_full, 100),
    )
)

In [5]:
import pandas as pd

# Re-display the outcome counts for the full training split.
# `mini`, `full`, `outcome` and the "outcome" column of train_df_full were
# already created by the earlier labelling cell and train_df_full has not been
# modified since, so they are reused here instead of being redefined and
# recomputed row by row.
stats = train_df_full["outcome"].value_counts()

print(stats)


outcome
Tie          7168
Neither      1159
Mini wins    1138
4o wins       535
Name: count, dtype: int64


In [6]:
import pandas as pd

# Check the outcome mix of the sampled training set.
# This reuses the `outcome` helper defined in the earlier labelling cell rather
# than redefining it. The labels are kept in a standalone Series instead of being
# assigned to df_train: a column added here would be written to
# data/train_1000.parquet when df_train is saved, even though the "outcome"
# helper column was deliberately dropped from train_df and the val/test frames
# carry no such column.
stats = df_train.apply(outcome, axis=1).rename("outcome").value_counts()

print(stats)


outcome
Mini wins    445
Tie          338
4o wins      217
Name: count, dtype: int64


In [7]:
# Set up the OpenAI client used to embed all prompts
from openai import OpenAI
import dotenv

# Load variables from a .env file into the environment before the client is
# created; presumably this is where OpenAI() finds its API key (TODO confirm).
dotenv.load_dotenv()

client = OpenAI()


In [8]:
# Make sure the output directory exists before issuing any embedding requests:
# `to_parquet` does not create missing parent folders, and failing at the save
# step would throw away every embedding already fetched.
os.makedirs("data", exist_ok=True)

# generate embeddings for prompts: one API request per row, storing the
# `.data[0].embedding` value of each response in a new "prompt_embedding" column
df_train["prompt_embedding"] = df_train["prompt"].apply(
    lambda x: client.embeddings.create(input=x, model="text-embedding-3-large")
    .data[0]
    .embedding
)


# save dataset with embeddings
df_train.to_parquet("data/train_1000.parquet")


In [None]:
# Make sure the output directory exists before issuing any embedding requests:
# `to_parquet` does not create missing parent folders, and failing at the save
# step would throw away every embedding already fetched.
os.makedirs("data", exist_ok=True)

# generate embeddings for prompts: one API request per row, storing the
# `.data[0].embedding` value of each response in a new "prompt_embedding" column
df_test["prompt_embedding"] = df_test["prompt"].apply(
    lambda x: client.embeddings.create(input=x, model="text-embedding-3-large")
    .data[0]
    .embedding
)

# save dataset with embeddings
df_test.to_parquet("data/test_100.parquet")

In [None]:
# Make sure the output directory exists before issuing any embedding requests:
# `to_parquet` does not create missing parent folders, and failing at the save
# step would throw away every embedding already fetched.
os.makedirs("data", exist_ok=True)

# generate embeddings for prompts: one API request per row, storing the
# `.data[0].embedding` value of each response in a new "prompt_embedding" column
df_val["prompt_embedding"] = df_val["prompt"].apply(
    lambda x: client.embeddings.create(input=x, model="text-embedding-3-large")
    .data[0]
    .embedding
)


# save dataset with embeddings
df_val.to_parquet("data/val_100.parquet")
