In [None]:
!pip install -qqq pandas numpy matplotlib seaborn

In [None]:
!ls ../volume

In [None]:
from pathlib import Path

In [None]:
!ls ../volume/dev-llm

In [None]:
# All run directories under ../volume/dev-llm, in sorted path order.
run_dirs = sorted(Path("../volume", "dev-llm").glob("run-*"))

In [None]:
run_dirs

In [None]:
# Walk the runs from last to first and stop at the first directory holding at
# least 327 entries; `run_dir` stays bound to that directory for later cells.
for run_dir in run_dirs[::-1]:
    entry_count = sum(1 for _ in run_dir.iterdir())
    if entry_count >= 327:
        print(f"found complete run {run_dir}")
        break
    print(f"skipping incomplete run {run_dir}")
else:
    # The loop finished without hitting `break`: nothing qualified.
    raise ValueError("no complete runs found")

In [None]:
# Result files of the selected run; the trailing expression shows how many.
all_result_paths = [*run_dir.glob("*.jsonl_results.jsonl")]
len(all_result_paths)

In [None]:
import json

import pandas as pd

# Read every results file of the selected run into one flat list of records
# (one JSON object per non-blank line).
data = []
for path in all_result_paths:
    # Iterate the file handle instead of `read_text().splitlines()`:
    # str.splitlines also breaks on characters such as U+2028 or form feed,
    # which may appear unescaped inside a JSON string and would cut a record
    # in half. Blank lines (e.g. a trailing newline) are skipped.
    with path.open(encoding="utf-8") as results_file:
        data += [json.loads(line) for line in results_file if line.strip()]

# Drop the "completion" field; none of the cells visible in this notebook read
# it. pop() tolerates records that do not carry the key.
for element in data:
    element.pop("completion", None)

df = pd.DataFrame.from_records(data)

df.sample(10)

In [None]:
# Group the records by task and show summary statistics per group.
gb = df.groupby(by="task_id")
gb.describe()

In [None]:
# Per-task sum of the `passed` column (the number of passing samples if
# `passed` is boolean/0-1 — TODO confirm against the results schema).
passes = gb["passed"].sum()

In [None]:
import itertools
from typing import List, Union

import numpy as np

def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int
) -> np.ndarray:
    """
    Estimate pass@k for each problem and return the estimates as an array.

    For a problem with ``n`` samples of which ``c`` are correct, the estimate
    is ``1 - comb(n - c, k) / comb(n, k)``.

    Args:
        num_samples: samples drawn per problem; either a single integer applied
            to every problem (Python ``int`` or NumPy integer scalar) or a
            sequence with one entry per problem.
        num_correct: number of correct samples for each problem.
        k: the sample budget to estimate pass@k for.

    Returns:
        Array with one estimate per entry of ``num_correct``.

    Raises:
        AssertionError: if ``num_samples`` is a sequence whose length differs
            from that of ``num_correct``.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        # Fewer than k incorrect samples: any k samples include a correct one.
        if n - c < k:
            return 1.0
        # Product form of comb(n - c, k) / comb(n, k); avoids evaluating the
        # binomial coefficients themselves.
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    # NumPy integer scalars are not instances of `int`; without accepting them
    # here they would fall through to the sequence branch and fail on len().
    if isinstance(num_samples, (int, np.integer)):
        num_samples_it = itertools.repeat(int(num_samples), len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])

In [None]:
# Per-task pass@k estimates for each sample budget, assuming 1000 samples
# per task.
pass_at_ks = dict()

for k in (1, 10, 100, 1000):
    per_task_estimates = estimate_pass_at_k(1000, passes, k)
    pass_at_ks[k] = per_task_estimates

In [None]:
# Average the per-task estimates to get one pass@k value per budget.
pass_at_k = {
    budget: np.mean(task_estimates)
    for budget, task_estimates in pass_at_ks.items()
}
pass_at_k

In [None]:
# Long-form frame for plotting: one row per k with pass@k and its complement.
plot_df = pd.DataFrame(
    {
        "k": list(pass_at_k.keys()),
        "pass@k": list(pass_at_k.values()),
    }
)
plot_df["fail@k"] = 1 - plot_df["pass@k"]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling: seaborn's "dark" theme first, then matplotlib's
# "dark_background" style sheet applied afterwards.
sns.set_theme(style="dark")
plt.style.use("dark_background")

# Sans-serif font fallback list, in order of preference.
plt.rcParams["font.sans-serif"] = [
    "Inter",
    "Arial",
    "DejaVu Sans",
    "Liberation Sans",
    "Bitstream Vera Sans",
    "sans-serif",
]

# NOTE(review): no axes have been created in the cells visible above, so this
# call presumably has nothing to act on — confirm whether it belongs after the
# plotting calls instead.
sns.despine()

sns.set_context("talk", rc={"lines.linewidth": 2.5})

In [None]:
# Reference score drawn as the "GPT-4o pass@1" line in the plots below.
gpt4o_benchmark = 0.902

In [None]:
# pass@k curve for the evaluated model with the GPT-4o pass@1 reference line.
fg = sns.lineplot(
    data=plot_df,
    x="k",
    y="pass@k",
    label="Ministral 8B pass@k",
    color="#7FEE64",
    linewidth=6,
    alpha=0.9,
)

# Capture the autoscaled x-range so the reference line spans exactly that
# range, then re-apply the same limits after the line has been added.
initial_lim = fg.axes.get_xlim()
fg.axes.hlines(
    gpt4o_benchmark,
    *initial_lim,
    label="GPT-4o pass@1",
    linestyle="--",
    alpha=0.6,
    zorder=-1,
)
fg.axes.set_xlim(*initial_lim)
fg.axes.set_ylabel("")
fg.axes.set_ylim(0, 1)
plt.legend();

In [None]:
# fail@k (1 - pass@k) on log-log axes with the GPT-4o fail@1 reference line.
fg = sns.lineplot(
    x="k",
    y="fail@k",
    data=plot_df,
    color="#7FEE64",
    linewidth=6,
    alpha=0.9,
    label="Ministral 8B fail@k",
)

# Use axhline rather than hlines over the initial x-limits: the x-axis is
# switched to log scale and set to (0.5, 2000) below, so a segment drawn
# between the earlier linear limits would not span the final plot. axhline is
# positioned in axes coordinates and always covers the full width.
fg.axes.axhline(
    1 - gpt4o_benchmark,
    linestyle="--",
    alpha=0.6,
    zorder=-1,
    label="GPT-4o fail@1",
)
fg.axes.set_ylabel("")
fg.axes.set_yscale("log")
fg.axes.set_xscale("log")
fg.axes.set_xlim(0.5, 2000)
fg.axes.set_ylim(1e-2, 1e0)
plt.legend();