# MultiPL-E Synthetic Solutions

This notebook builds a dataset of solutions to HumanEval and MBPP in all the
MultiPL-E languages and uploads it to the Hugging Face Hub.

In [1]:
from pathlib import Path
import gzip
import json
from typing import Optional, List, Dict
import datasets
import pandas as pd

In [2]:
def gunzip_json(path: Path) -> Optional[Dict]:
    """
    Reads a .json.gz file, and produces None if any error occurred.

    The file is decoded as UTF-8 rather than with the platform's locale
    encoding, so the parsed result does not depend on the machine the
    notebook runs on.

    Errors are deliberately swallowed: a missing, corrupt, or malformed file
    is reported as None instead of raising.
    """
    try:
        with gzip.open(path, "rt", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        # Best-effort by design: any failure means "no usable data here".
        return None


def find_working_solution(results_path):
    """
    Locate the first completion whose result has status "OK".

    results_path ends .results.json.gz. The matching completions file is
    looked up in the same directory, under the same name with the
    ".results.json.gz" ending replaced by ".json.gz".

    Produces a dict with the keys "name", "language", "prompt" and
    "solution", or None when either file cannot be read or no result has
    status "OK".
    """
    results = gunzip_json(results_path)
    if results is None:
        return None

    # Position of the first passing result; None when every attempt failed.
    ok_index = next(
        (
            idx
            for idx, entry in enumerate(results["results"])
            if entry["status"] == "OK"
        ),
        None,
    )
    if ok_index is None:
        return None

    results_suffix = ".results.json.gz"
    stem = results_path.name[:-len(results_suffix)]
    completions = gunzip_json(results_path.parent / (stem + ".json.gz"))
    if completions is None:
        return None

    # The result index is reused to pick the completion, so both files are
    # assumed to list the attempts in the same order.
    return {
        "name": completions["name"],
        "language": completions["language"],
        "prompt": completions["prompt"],
        "solution": completions["completions"][ok_index]
    }

def gather_benchmarks(root: Path, solutions_glob = "*-*-davinci-0.8-reworded"):
    """
    Collect working solutions from every results file under root.

    Each entry of root matching solutions_glob is searched for
    "*.results.json.gz" files, and each such file contributes at most one
    solution (whatever find_working_solution produces for it). Files that
    yield None are skipped.

    Produces the list of solution dicts, in the order the files were visited.
    """
    # Lazily evaluated, so files are processed one by one in glob order.
    candidates = (
        find_working_solution(results_file)
        for experiment_dir in root.glob(solutions_glob)
        for results_file in experiment_dir.glob("*.results.json.gz")
    )
    return [found for found in candidates if found is not None]

Change the directory below if needed. You can also pass the `solutions_glob` argument to `gather_benchmarks` to select a different set of result directories.

In [3]:
# Collect the working solutions found under the raw-data directory, using the
# default solutions_glob.
results = gather_benchmarks(Path("/work/arjunguha-research-group/arjun/repos/hfdatasets/nuprl/MultiPL-E-raw-data"))
results_df = pd.DataFrame(results)
# Number of collected solutions per language; as the last expression in the
# cell, this table is what the notebook displays.
results_df[["name", "language"]].groupby("language").count()

Unnamed: 0_level_0,name
language,Unnamed: 1_level_1
cpp,145
cs,101
d,107
go_test.go,135
java,137
jl,136
js,154
lua,151
php,151
pl,146


Build the dataset and push to the hub. Sadly, this doesn't work on Discovery.

In [4]:
# Build a Dataset from the list of solution dicts and upload it directly to
# the "nuprl/MultiPL-E-synthetic-solutions" repository on the Hub.
datasets.Dataset.from_list(results).push_to_hub("nuprl/MultiPL-E-synthetic-solutions")

Alternatively, save the dataset to disk on Discovery, then run the next cell on another machine to push to the Hub.

In [None]:
# Build the same Dataset but write it to the local "./dataset" directory
# instead of uploading it.
datasets.Dataset.from_list(results).save_to_disk("./dataset")

In [2]:
# Reload the Dataset from "./dataset" (the path used with save_to_disk) and
# upload it to the "nuprl/MultiPL-E-synthetic-solutions" repository on the Hub.
datasets.Dataset.load_from_disk("./dataset").push_to_hub("nuprl/MultiPL-E-synthetic-solutions")