Reformat the output JSON & code from the preprocessing step in `notebooks/codeio/PreprocessCode.ipynb`.

The output format will align with the data we extract from the existing CodeI/O dataset in `notebooks/codeio.ipynb`.

In [1]:
import json
from pathlib import Path

# Load the preprocessed samples, one JSON object per line. The encoding is
# given explicitly so decoding does not depend on the platform's locale
# default, and blank lines (e.g. a trailing newline at end of file) are
# skipped instead of crashing json.loads.
with open(Path("output/processed_code.jsonl"), "r", encoding="utf-8") as f:
    samples = [json.loads(line) for line in f if line.strip()]

for sample in samples:
    # "reference_code" -> "code_sample", with the entry point renamed from
    # main() to main_solution(). str.replace is a no-op when the pattern is
    # absent, so no membership guard is needed.
    main_code = sample.pop("reference_code")
    main_code = main_code.replace("def main(", "def main_solution(")
    sample["code_sample"] = main_code

    # Rename the generator so it receives a Random instance as a parameter,
    # and drop its own `import random` (first the indented form, then the
    # plain form) so that `random` inside the body refers to that parameter.
    input_generator = sample["input_generator"]
    input_generator = input_generator.replace(
        "def input_generator()", "def generate_inputs(random: Random)"
    )
    input_generator = input_generator.replace("import random\n    ", "").replace("import random\n", "")
    sample["input_generator"] = input_generator

    # Plain key renames; pop-then-assign appends the new key at the end of
    # the dict, the same ordering the assign-then-delete form produces.
    sample["input_output_spec"] = sample.pop("parameters")
    sample["task_description"] = sample.pop("query")

with open(Path("output/formatted_code.jsonl"), "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")

Now we need to filter out unsuitable samples from the data. First we prioritise removing samples which are inherently random, reliant on external services (e.g. network requests), or whose input generators do not match the correct random usage requirements, as these could cause irreproducibility in RL training.

In [2]:
def verify_input_generator(input_generator_code: str) -> bool:
    """Return True if the generator source passes the random-usage checks.

    This is a plain substring heuristic over the source text; the code is
    neither parsed nor executed.

    Args:
        input_generator_code: Source code of the input generator.

    Returns:
        False if the source lacks a ``generate_inputs`` definition taking a
        ``random: Random`` or ``rng: Random`` parameter, or if it contains
        any marker of numpy usage or of importing the ``random`` module;
        True otherwise.
    """
    accepted_signatures = (
        "def generate_inputs(random: Random)",
        "def generate_inputs(rng: Random)",
    )
    if not any(signature in input_generator_code for signature in accepted_signatures):
        return False

    forbidden_markers = (
        "import numpy",
        # `from numpy.random import rand` contains neither "import numpy"
        # nor "np.random", so the from-import form is checked explicitly.
        "from numpy",
        "np.random",
        "import random",
        # Same check verify_main_solution applies: names imported straight
        # from the random module bypass the Random argument.
        "from random import",
    )
    return not any(marker in input_generator_code for marker in forbidden_markers)

def verify_main_solution(main_solution_code: str) -> bool:
    """Return True if the solution source passes the reproducibility checks.

    This is a plain substring heuristic over the source text; the code is
    neither parsed nor executed.

    Args:
        main_solution_code: Source code expected to define ``main_solution``.

    Returns:
        False if the source does not contain ``def main_solution(``, or if
        it contains any marker of using the ``random`` module, numpy's
        random module, or the ``requests`` library; True otherwise.
    """
    if "def main_solution(" not in main_solution_code:
        return False

    forbidden_markers = (
        "import random",
        "from random import",
        "np.random",
        # Unaliased form (`import numpy` + `numpy.random.rand()`), which the
        # "np.random" substring does not match.
        "numpy.random",
        "import requests",
        " requests.",
        "from requests import",
    )
    return not any(marker in main_solution_code for marker in forbidden_markers)

# Indices (into the unfiltered `samples` list) of entries that fail a check.
remove = set()
for i, sample in enumerate(samples):
    generator_ok = verify_input_generator(sample["input_generator"])
    # The main solution is only checked when the generator passed, so each
    # rejected sample is reported with exactly one reason.
    if generator_ok and verify_main_solution(sample["code_sample"]):
        continue
    remove.add(i)
    if generator_ok:
        print(f"Removing sample {i} due to bad main solution")
    else:
        print(f"Removing sample {i} due to bad input generator")

# Split into rejected and surviving samples, both in original order.
removed_samples = [samples[index] for index in sorted(remove)]
samples = [entry for index, entry in enumerate(samples) if index not in remove]
print(f"Removed {len(remove)} samples")

# Write the surviving samples back out, one JSON object per line.
with Path("output/filtered_code.jsonl").open("w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")

Removing sample 6 due to bad input generator
Removing sample 8 due to bad input generator
Removing sample 28 due to bad input generator
Removing sample 30 due to bad input generator
Removing sample 39 due to bad main solution
Removing sample 43 due to bad main solution
Removing sample 47 due to bad main solution
Removing sample 53 due to bad input generator
Removing sample 59 due to bad input generator
Removing sample 64 due to bad main solution
Removing sample 87 due to bad main solution
Removing sample 112 due to bad main solution
Removing sample 116 due to bad main solution
Removing sample 121 due to bad input generator
Removing sample 141 due to bad main solution
Removing sample 144 due to bad main solution
Removing sample 150 due to bad main solution
Removing sample 155 due to bad main solution
Removing sample 159 due to bad main solution
Removing sample 162 due to bad input generator
Removing sample 168 due to bad input generator
Removing sample 170 due to bad main solution
Remov

In [3]:
# Display the input generator source of the first entry in removed_samples.
removed_samples[0]["input_generator"]

'def generate_inputs(random: Random):\n    import numpy as np\n    \n    height = random.randint(10, 20)\n    width = random.randint(10, 20)\n    image0 = np.random.rand(height, width)\n    image1 = np.random.rand(height, width)\n    num_iter = random.randint(10, 100)\n    alpha = random.uniform(0.01, 1.0) if random.choice([True, False]) else None\n\n    return {"image0": image0, "image1": image1, "num_iter": num_iter, "alpha": alpha}'

In [4]:
# Display the solution code of the entry at position 43 in removed_samples.
# NOTE: 43 is a position within the removed list, not the original sample
# index printed in the "Removing sample ..." log lines.
removed_samples[43]["code_sample"]

'def main_solution(search_terms):\n    import requests\n    from bs4 import BeautifulSoup\n    from fake_useragent import UserAgent\n    import webbrowser\n\n    url = "https://www.google.com/search?q=" + " ".join(search_terms)\n    res = requests.get(url, headers={"UserAgent": UserAgent().random}, timeout=10)\n    soup = BeautifulSoup(res.text, "html.parser")\n    links = list(soup.select(".eZt8xd"))[:5]\n\n    opened_links = []\n    for link in links:\n        if link.text == "Maps":\n            opened_links.append(link.get("href"))\n            webbrowser.open(link.get("href"))\n        else:\n            opened_links.append(f"https://google.com{link.get(\'href\')}")\n            webbrowser.open(f"https://google.com{link.get(\'href\')}")\n\n    return {"opened_links": opened_links}'