In [1]:
import json
import re

def extract_python_code(jsonl_data):
    """
    Extract the Python code from the `solution` field in the JSONL input.

    Each non-empty line of the input is parsed as JSON. For every record that
    is a JSON object with a string `solution` field, the first
    ```python ... ``` fenced block inside that field is extracted and
    stripped of surrounding whitespace.

    Lines that are not valid JSON are reported on stdout and skipped.
    Records that are valid JSON but not objects (numbers, strings, lists),
    records without a `solution` key, records whose `solution` is not a
    string, and solutions without a fenced Python block are skipped silently.

    Args:
        jsonl_data (str): A JSONL-formatted string.

    Returns:
        list: A list of Python code snippets extracted from the JSONL input,
        in input order.
    """
    python_code_snippets = []

    for line in jsonl_data.strip().splitlines():
        # Keep the try block limited to parsing so that only genuine JSON
        # errors are reported as "invalid JSONL".
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSONL line: {e}")
            continue

        # A line can be valid JSON without being an object (e.g. `5` or
        # `"text"`); membership tests / indexing on those would raise.
        if not isinstance(data, dict):
            continue

        solution = data.get('solution')
        # `null` or other non-string values cannot be searched with a regex.
        if not isinstance(solution, str):
            continue

        # Extract Python code from the triple-backtick block
        match = re.search(r"```python\n(.*?)\n```", solution, re.DOTALL)
        if match:
            python_code_snippets.append(match.group(1).strip())

    return python_code_snippets

# Load the raw JSONL records. JSON Lines files are UTF-8 encoded, so decode
# explicitly instead of relying on the platform's default encoding.
with open("samples.jsonl", encoding="utf-8") as f:
    jsonl_data = f.read()

# Extract Python code from the JSONL input
python_code = extract_python_code(jsonl_data)

# Write one {"code": ...} record per extracted snippet to output.jsonl
with open("output.jsonl", "w", encoding="utf-8") as f:
    for code in python_code:
        f.write(json.dumps({"code": code}) + "\n")

print("Python code extracted and saved to output.jsonl")


Python code extracted and saved to output.jsonl


In [2]:
# Replace the `solution` field with the extracted Python code
# (JSON Lines files are UTF-8 encoded, so decode explicitly.)
with open("samples.jsonl", encoding="utf-8") as f:
    jsonl_data = f.read()

# Collect the rewritten records so the file written below contains the
# transformed data rather than the untouched input lines.
updated_lines = []
for line in jsonl_data.strip().splitlines():
    try:
        data = json.loads(line)
    except json.JSONDecodeError as e:
        print(f"Skipping invalid JSONL line: {e}")
        continue

    # Only JSON objects with a string `solution` can hold a fenced code block;
    # any other valid JSON value is passed through unchanged.
    if isinstance(data, dict) and isinstance(data.get('solution'), str):
        match = re.search(r"```python\n(.*?)\n```", data['solution'], re.DOTALL)
        if match:
            data['solution'] = match.group(1).strip()

    serialized = json.dumps(data)
    print(serialized)
    updated_lines.append(serialized)

# Write it to a new file final_output.jsonl
with open("final_output.jsonl", "w", encoding="utf-8") as f:
    for serialized in updated_lines:
        f.write(serialized + "\n")

print("Python code extracted and saved to final_output.jsonl")

{"task_id": "BigCodeBench/0", "solution": "import itertools\n\ndef task_func(numbers=list(range(1, 3))):\n    \"\"\"\n    Calculates the average of the sums of absolute differences between each pair of consecutive numbers \n    for all permutations of a given list.\n\n    Args:\n    - numbers (list): A list of numbers. Default is numbers from 1 to 2.\n    \n    Returns:\n    float: The average of the sums of absolute differences for each permutation of the list.\n    \"\"\"\n    all_permutations = list(itertools.permutations(numbers))\n    \n    total_diff_sum = 0\n    for perm in all_permutations:\n        total_diff_sum += sum(abs(perm[i] - perm[i + 1]) for i in range(len(perm) - 1))\n    \n    average_diff = total_diff_sum / len(all_permutations)\n    return average_diff", "completion": "\n```python\nimport itertools\n\ndef task_func(numbers=list(range(1, 3))):\n    \"\"\"\n    Calculates the average of the sums of absolute differences between each pair of consecutive numbers \n    