In [5]:
import json
import re

def extract_python_code(jsonl_data):
    """
    Extract the Python code from the `solution` field in the JSONL input.

    Each non-blank line is parsed as JSON. When the parsed value is an object
    whose `solution` is a string containing a ```python fenced block, the
    contents of the first such block are collected with surrounding
    whitespace stripped.

    Lines that are not valid JSON are reported on stdout and skipped. Lines
    that are valid JSON but carry no usable `solution` (non-object values, a
    missing key, a null/non-string value, or no fenced block) contribute
    nothing to the result.

    Args:
        jsonl_data (str): A JSONL-formatted string.

    Returns:
        list: A list of Python code snippets extracted from the JSONL input,
            in input order.
    """
    python_code_snippets = []

    for line in jsonl_data.strip().splitlines():
        if not line.strip():
            # Blank separator lines are not records; skip them without noise.
            continue

        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSONL line: {e}")
            continue

        # Valid JSON is not necessarily an object (e.g. `42` or `"text"`);
        # only objects can carry a `solution` field.
        if not isinstance(data, dict):
            continue

        # `solution` may be null or another non-string value, which the regex
        # search below cannot accept.
        solution = data.get('solution')
        if not isinstance(solution, str):
            continue

        # Extract Python code from the first triple-backtick block
        match = re.search(r"```python\n(.*?)\n```", solution, re.DOTALL)
        if match:
            python_code_snippets.append(match.group(1).strip())

    return python_code_snippets

# Example JSONL input
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open("samples.jsonl", encoding="utf-8") as f:
    jsonl_data = f.read()

# Extract Python code from the JSONL input
python_code = extract_python_code(jsonl_data)

# Output the extracted Python code in output.jsonl, one {"code": ...} object per line
with open("output.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps({"code": code}) + "\n" for code in python_code)

print("Python code extracted and saved to output.jsonl")


Python code extracted and saved to output.jsonl


In [6]:
import json
import re

# Read the JSONL file
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open("samples.jsonl", encoding="utf-8") as f:
    jsonl_data = f.read()

# Create a list to store updated JSON objects
updated_lines = []

for line in jsonl_data.strip().splitlines():
    if not line.strip():
        # Blank separator lines are not records; skip them without noise.
        continue

    try:
        data = json.loads(line)
    except json.JSONDecodeError as e:
        print(f"Skipping invalid JSONL line: {e}")
        continue

    # Only an object with a string `solution` can hold a fenced code block.
    # Any other valid JSON value is passed through unchanged rather than
    # aborting the whole run with a TypeError.
    solution = data.get('solution') if isinstance(data, dict) else None
    if isinstance(solution, str):
        match = re.search(r"```python\n(.*?)\n```", solution, re.DOTALL)
        if match:
            # Replace the full answer text with just the extracted code.
            data['solution'] = match.group(1).strip()

    updated_lines.append(json.dumps(data))

# Write the updated JSONL content to a new file
with open("final_output.jsonl", "w", encoding="utf-8") as f:
    f.writelines(updated_line + "\n" for updated_line in updated_lines)

print("Python code extracted and saved to final_output.jsonl")

Python code extracted and saved to final_output.jsonl
