In [1]:
# !huggingface-cli login --token your_huggingface_token

In [30]:
from datasets import load_dataset

# Load the "openai/frontierscience" dataset and convert its "test" split
# to a pandas DataFrame for the analysis below.
ds = load_dataset("openai/frontierscience")
df = ds['test'].to_pandas()

In [31]:
from pathlib import Path

import pandas as pd

# Vectorized categorization - much faster than apply().
# An answer whose text (ignoring surrounding whitespace) starts with "Points"
# is labelled "research"; every other answer is labelled "olympiad".
df["category"] = df["answer"].str.strip().str.startswith("Points").map({
    True: "research",
    False: "olympiad"
})

# Save both formats
output_base = "../data/frontierscience_full"
# to_parquet()/to_csv() do not create missing parent directories, so make sure
# the target folder exists; otherwise this cell fails on a fresh checkout.
Path(output_base).parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(f"{output_base}.parquet", index=False)
df.to_csv(f"{output_base}.csv", index=False)

print(f"Saved:\n{output_base}.parquet\n{output_base}.csv")

Saved:
../data/frontierscience_full.parquet
../data/frontierscience_full.csv


In [32]:
# Tally the rows per category, then report the tally and the overall row count.
category_counts = df["category"].value_counts()
print("\nCategory counts:", category_counts, f"\nTotal: {len(df)}", sep="\n")


Category counts:
category
olympiad    100
research     60
Name: count, dtype: int64

Total: 160


In [33]:
# Isolate the rows labelled "research". The explicit .copy() gives df_research
# its own data, so columns added to it later do not touch df and do not
# trigger pandas' SettingWithCopyWarning.
df_research = df.loc[df["category"].eq("research")].copy()

In [34]:
import pandas as pd
import re

def sum_points(answer: str) -> float:
    """Sum every point value that follows a "Points:" marker in ``answer``.

    Args:
        answer: Text to scan. Missing values (anything ``pd.isna`` reports
            as NA, such as ``None`` or NaN) are accepted.

    Returns:
        The total of all matched values as a float. ``0.0`` is returned when
        ``answer`` is missing or contains no "Points:" entries.
    """
    if pd.isna(answer):
        return 0.0
    # Each match is "Points:", optional whitespace, then an unsigned decimal
    # number such as "2", "1.5" or ".5".
    points = re.findall(r"Points:\s*([0-9]*\.?[0-9]+)", str(answer))
    # Start from 0.0 so the result is a float even when nothing matched
    # (sum() over an empty sequence would otherwise return the int 0).
    return sum((float(p) for p in points), 0.0)

# Total the point values of every research answer and flag the rows whose
# total is not within 0.01 of 10.0.
df_research["points_sum"] = df_research["answer"].apply(sum_points)
df_research["points_ok"] = (df_research["points_sum"] - 10.0).abs() < 0.01

# Keep only the rows that failed the check, limited to the columns worth inspecting
incorrect_points = df_research.loc[~df_research["points_ok"], ["points_sum", "answer"]]

if incorrect_points.empty:
    print("✓ All answers have correct point totals (10.0)")
else:
    print(f"Found {len(incorrect_points)} answers with incorrect point totals:")
    display(incorrect_points)

✓ All answers have correct point totals (10.0)
