In [1]:
import json
from collections import defaultdict
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# Both paths are left blank here and must be filled in before running:
#   swebench_data_path - JSON-lines file, one record per line, each with an "instance_id".
#   log_root           - directory that the log-parsing cell joins with the run names.
swebench_data_path = ""
log_root = ""

# instance_id -> {"<run>-pass" / "<run>-fail": set of test names}.
# Nested defaultdicts mean an instance without a given run yields an empty set.
validation_logs = defaultdict(lambda: defaultdict(set))

# By default `read_json` infers dtypes and parses date-like columns (labels
# ending in "_at"). That would turn `created_at` into timestamps and coerce
# numeric-looking strings such as `version` ("0.20" -> 0.2), which no longer
# match the string schema used when the dataset is built. Keep raw JSON values.
swebench_data = pd.read_json(
    swebench_data_path,
    lines=True,
    orient="records",
    dtype=False,
    convert_dates=False,
)

# Index the records by instance id for O(1) lookup during validation.
swebench_data_dict = {
    record["instance_id"]: record
    for record in swebench_data.to_dict(orient="records")
}

In [None]:
# Parse test results: for each run name, read every
# <log_root>/<name>/<log_dir>/report.json and record which tests passed / failed.
# NOTE(review): "gold" / "empty" presumably mean "with the reference patch" and
# "with no patch applied" - confirm against the evaluation harness.
for name in ["gold", "empty"]:
    log_path = os.path.join(log_root, name)
    log_dirs = os.listdir(log_path)

    total_logs = len(log_dirs)
    missing_logs = 0
    print(f"Processing [{name}] logs")

    # The context manager closes the bar even when a report fails to parse,
    # so no half-finished bar lingers in the notebook output.
    with tqdm(total=total_logs) as pbar:
        for log_dir in log_dirs:
            log_file = os.path.join(log_path, log_dir, "report.json")
            pbar.update(1)
            if not os.path.exists(log_file):
                missing_logs += 1
                pbar.set_postfix({"missing": missing_logs, "total": total_logs})
                continue

            # JSON is UTF-8 by specification; do not depend on the platform locale
            # (test names may contain non-ASCII characters).
            with open(log_file, "r", encoding="utf-8") as f:
                log = json.load(f)

            # An empty report carries no instance; count it as missing instead of
            # crashing on the first-key lookup below.
            if not log:
                missing_logs += 1
                pbar.set_postfix({"missing": missing_logs, "total": total_logs})
                continue

            # The report is keyed by instance id; only the first key is read.
            instance_id = next(iter(log))
            if "tests_status" in log[instance_id]:
                tests_status = log[instance_id]["tests_status"]
                validation_logs[instance_id][f"{name}-pass"] = set(tests_status["PASS"])
                validation_logs[instance_id][f"{name}-fail"] = set(tests_status["FAIL"])

In [None]:
# From SWE-bench: an instance is kept only if at least one test changes from
# fail (in the "empty" run) to pass (in the "gold" run).
validated_instances = []
n_total = 0
n_validated = 0

# Context manager so the progress bar is always closed.
with tqdm(total=len(validation_logs)) as pbar:
    for k, log in validation_logs.items():
        # Missing run keys fall back to empty sets (inner defaultdict(set)).
        fail_to_pass = log["gold-pass"] & log["empty-fail"]
        pass_to_pass = log["gold-pass"] & log["empty-pass"]
        n_total += 1
        if fail_to_pass:
            n_validated += 1
            # Build a new record rather than mutating the entry held in
            # swebench_data_dict, so re-running this cell starts from clean data.
            # Sets are sorted because their iteration order is not stable
            # across interpreter runs; sorting keeps the dataset reproducible.
            validated_instances.append({
                **swebench_data_dict[k],
                'FAIL_TO_PASS': sorted(fail_to_pass),
                'PASS_TO_PASS': sorted(pass_to_pass),
            })
        pbar.update(1)
        pbar.set_postfix({"validated": n_validated, "total": n_total})

In [None]:
from datasets import Dataset, Value, Sequence, Features
# Columns exported to the dataset, in output order.
KEYS = [
    'repo', 'pull_number', 'instance_id', 'issue_numbers',
    'base_commit', 'patch', 'test_patch',
    'problem_statement', 'hints_text', 'created_at', 'version',
    'PASS_TO_PASS', 'FAIL_TO_PASS',
]
# Explicit schema, so the resulting dataset is consistent with the Hugging Face
# dataset on the hub. Arguments left at their defaults (id=None, length=-1)
# are omitted.
FEATURES = Features({
    'repo': Value(dtype='string'),
    'pull_number': Value(dtype='int64'),
    'instance_id': Value(dtype='string'),
    'issue_numbers': Sequence(feature=Value(dtype='string')),
    'base_commit': Value(dtype='string'),
    'patch': Value(dtype='string'),
    'test_patch': Value(dtype='string'),
    'problem_statement': Value(dtype='string'),
    'hints_text': Value(dtype='string'),
    'created_at': Value(dtype='string'),
    'version': Value(dtype='string'),
    'PASS_TO_PASS': Sequence(feature=Value(dtype='string')),
    'FAIL_TO_PASS': Sequence(feature=Value(dtype='string')),
})
def to_hf_dataset(data_list):
    """Build a `Dataset` with the `FEATURES` schema from a list of record dicts.

    One column is assembled per entry of `KEYS`, in record order, so every
    record in `data_list` must provide all of those keys.
    """
    columns = {}
    for key in KEYS:
        columns[key] = [record[key] for record in data_list]
    return Dataset.from_dict(columns, features=FEATURES)


# Convert the validated instances into a Hugging Face dataset.
validated_dataset = to_hf_dataset(validated_instances)

In [None]:
# The repo id and split name are left blank here; fill them in before running.
# The upload is created as a private repository.
validated_dataset.push_to_hub(
    repo_id="",
    split="",
    private=True,
)