# Naive Paired Verifier Data Collection
One simple way of getting negatives is to shuffle the dataset, i.e., pair each input with an answer taken from a different input.
We are also trying to gather multiple positives and realistic negatives by sampling from SOTA systems (one prong is in [this repo](https://github.com/matt-seb-ho/reprover_hf)).
However, given my compute woes, here is a simple baseline for the time being.

In [15]:
import json
import random
from time import perf_counter
from tqdm import tqdm
# preprocessing script lives in src directory
import sys
sys.path.append("../src")

In [2]:
# lean_dojo (needed by the preprocessing script) requires a newer git than the
# one found first on PATH, so put the directory holding the updated binary in front.
import os
path_with_updated_git_binary = "/home/msho/.personalbin/bin"
os.environ["PATH"] = os.pathsep.join(
    [path_with_updated_git_binary, os.environ["PATH"]]
)

In [5]:
from prep_sft_data import preprocess_data
from utils import prepend_repo_root, _pp

In [6]:
# Training splits of LeanDojo Benchmark 4 ("random" and "novel_premises"),
# presumably resolved relative to the repository root by prepend_repo_root.
RANDOM_TRAIN_DATA_PATH = prepend_repo_root("data/leandojo_benchmark_4/random/train.json")
# NOTE(review): NOVELP_TRAIN_DATA_PATH is not used by any cell shown here.
NOVELP_TRAIN_DATA_PATH = prepend_repo_root("data/leandojo_benchmark_4/novel_premises/train.json")
# NOTE(review): the positional True/False flags are defined by preprocess_data
# in prep_sft_data — confirm their meaning there. The entries returned are later
# read through their "state" and "tactic" keys.
thm_data = preprocess_data(RANDOM_TRAIN_DATA_PATH, True, False)

100%|██████████| 98514/98514 [00:02<00:00, 44263.96it/s]
[32m2024-03-04 04:54:58.870[0m | [1mINFO    [0m | [36mprep_sft_data[0m:[36mpreprocess_data[0m:[36m52[0m - [1m204205 examples loaded[0m


In [16]:
def add_random_negatives(data, output_file=None, max_iters=1000, seed=42):
    """Pair every example's tactic with a randomly drawn, different tactic.

    For each entry in ``data`` the entry's own ``"tactic"`` is the positive.
    The negative is the ``"tactic"`` of an entry sampled uniformly from
    ``data``, resampled until it differs from the positive.

    Args:
        data: Sequence of dicts, each providing ``"state"`` and ``"tactic"``.
        output_file: Optional path. When truthy, the result is written there
            as JSON in the form ``{"seed": seed, "pairs": [...]}``.
        max_iters: Maximum number of draws per entry. An entry for which no
            differing tactic is found within this budget is silently left out.
        seed: Seed passed to ``random.seed``. Note that this reseeds the
            module-level RNG, so later ``random`` calls are affected too.

    Returns:
        A list of ``{"state", "positive", "negative"}`` dicts in input order
        (minus any skipped entries).
    """
    random.seed(seed)
    paired_examples = []
    for example in tqdm(data, total=len(data)):
        gold_tactic = example["tactic"]
        for _attempt in range(max_iters):
            sampled_tactic = random.choice(data)["tactic"]
            if sampled_tactic == gold_tactic:
                # Drew the same tactic as the positive; try again.
                continue
            paired_examples.append({
                "state": example["state"],
                "positive": gold_tactic,
                "negative": sampled_tactic,
            })
            break
        # Falling out of the loop without a break means no negative was found;
        # the entry is simply not emitted.

    if output_file:
        with open(output_file, 'w') as out_handle:
            # The seed is stored next to the pairs so the file records how it was made.
            json.dump({"seed": seed, "pairs": paired_examples}, out_handle)
            print(f"Wrote paired data to: {output_file}")

    return paired_examples

In [17]:
# Build {"state", "positive", "negative"} pairs for the loaded training data
# (default seed and max_iters) and also dump them to data/paired_random_train.json.
pairs = add_random_negatives(
    thm_data,
    output_file=prepend_repo_root("data/paired_random_train.json")
)

  0%|          | 0/204205 [00:00<?, ?it/s]

100%|██████████| 204205/204205 [00:00<00:00, 437648.07it/s]


Wrote paired data to: /mnt/hdd/msho/gfn_ntp/data/paired_random_train.json


In [21]:
# Spot-check one pair. This draws from the module-level RNG, so which pair is
# picked depends on the random calls made by the cells run before this one.
dummy = random.choice(pairs)

In [22]:
# The "state" field copied from the source entry of the sampled pair.
print(dummy["state"])

case refl
l : List Char
c : Char
r : List Char
h : ValidFor l (c :: r) { s := { data := List.reverse l ++ c :: r }, i := { byteIdx := utf8Len (List.reverse l) } }
⊢ ValidFor (c :: l) r { s := { data := List.reverseAux l (c :: r) }, i := { byteIdx := utf8Len l + csize c } }


In [25]:
# The tactic recorded for this state in the dataset.
print(dummy["positive"])

constructor


In [26]:
# The randomly drawn tactic (guaranteed to differ from the positive) used as the negative.
print(dummy["negative"])

have h2_meas : MeasureTheory.AEStronglyMeasurable (fun y : G => ∫ x : G, ‖L (f y) (g (x - y))‖ ∂μ) ν := h_meas.prod_swap.norm.integral_prod_right'
