In [6]:
import json
import os
from typing import Dict

from langchain.schema import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI

In [7]:
# The API key is read from the environment so that no credential is stored in
# the notebook (or its version history). Export GOOGLE_API_KEY before starting
# the kernel; a missing variable fails fast with a KeyError.
# NOTE(review): a key was previously committed in this cell — revoke it.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro-exp-03-25",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

In [8]:
def _looks_like_unified_diff(text):
    """Cheap heuristic: text starts with '--- a/' and contains at least one '@@' marker."""
    return text.startswith('--- a/') and '@@' in text


def _strip_markdown_fences(text):
    """Drop leading whitespace and a single surrounding Markdown code fence, if present."""
    body = text.strip()
    if not body.startswith("```"):
        # Only trim the front: trailing whitespace may be a blank context line of the diff.
        return text.lstrip()
    lines = body.splitlines()[1:]  # drop the opening fence line (it may carry a language tag)
    if lines and lines[-1].strip() == "```":
        lines.pop()
    return "\n".join(lines) + "\n"


def refine_patch_format(patch):
    """
    Ensures a patch is in valid unified-diff format. If malformed, uses an LLM to correct it.

    The patch is first checked as-is, then again after removing leading
    whitespace and a wrapping Markdown code fence (a common artifact of LLM
    output). Only if both checks fail is the module-level `llm` asked to
    rewrite it; the same fence clean-up is applied to the model's answer.

    Args:
        patch: Candidate patch text.

    Returns:
        The unchanged patch when it already passes the heuristic, the
        fence-stripped patch when that passes, otherwise the LLM's corrected
        version (not re-validated).
    """
    # quick heuristic: must start with '--- a/' and contain '@@'
    if _looks_like_unified_diff(patch):
        return patch

    # Cheap local repair before paying for an LLM round-trip.
    cleaned = _strip_markdown_fences(patch)
    if _looks_like_unified_diff(cleaned):
        return cleaned

    prompt = f"""You are a software engineer fixing an automated code patch.

The following patch is malformed or incomplete: {patch}

Please return a corrected patch using valid unified-diff format:
- It must start with '--- a/...' and '+++ b/...'
- It must contain at least one valid hunk beginning with '@@'
- Do not include explanations, comments, markdown, or any text outside the patch.
- Your response must be a pure, corrected unified diff.
- It should be in raw unified-diff format, without any markdown wrapping

Return a minimal valid patch only."""
    # `.invoke` is the supported entry point; calling the model object directly is deprecated.
    feedback = llm.invoke([HumanMessage(content=prompt)]).content
    # Models often wrap the answer in a fence despite the instructions above.
    if isinstance(feedback, str):
        return _strip_markdown_fences(feedback)
    return feedback


In [9]:
# Define the patch-generation function and its self-reflection loop
def generate_patch(instance: Dict, llm, max_reflections: int = 3):
    """
    Generate a patch with iterative self-reflection using an LLM.

    An initial patch is drafted from the issue text, then reviewed and
    possibly rewritten `max_reflections` times, and finally passed through
    `refine_patch_format`.

    Args:
        instance: Mapping with a 'problem_statement' entry holding the issue text.
        llm: Chat model; used via `.invoke([...])`, whose result must expose `.content`.
        max_reflections: Number of review rounds run after the initial draft.

    Returns:
        The patch text produced by the last round, after format refinement.

    Raises:
        KeyError: If `instance` has no 'problem_statement' entry.
    """

    problem = instance['problem_statement']

    # Initial patch generation (O1-style prompt)
    initial_prompt = f"""You are an expert software engineer.

Below is a problem description from a GitHub issue:
{problem}

Generate a fix for this issue in valid unified-diff format.
Your output must:
- Begin with '--- a/...' and '+++ b/...'
- Contain at least one '@@' hunk
- NOT include explanations, comments, or markdown
- Be minimal and syntactically correct

Output only the patch."""
    # `.invoke` is the supported entry point; calling the model object directly is deprecated.
    current_patch = llm.invoke([HumanMessage(content=initial_prompt)]).content

    # Reflection loop
    for _ in range(max_reflections):
        # The issue text is included so the reviewer can judge whether the patch
        # addresses it, and the no-issue case is spelled out so a good patch is
        # returned as-is instead of being replaced by commentary.
        reflection_prompt = f"""You are reviewing the following patch:
{current_patch}
The patch was intended to fix this issue:
{problem}

Check:
- Does it correctly address the issue?
- Is it syntactically valid unified-diff (starts with --- a/, contains @@)?
- Can it be applied cleanly (no formatting or logical errors)?

If any issue is found, rewrite the patch and return only the corrected unified diff. If no issue is found, return the patch unchanged. Do not include any explanations or markdown."""
        current_patch = llm.invoke([HumanMessage(content=reflection_prompt)]).content

    # Final formatting safety net for whatever the last round produced.
    current_patch = refine_patch_format(current_patch)

    return current_patch




In [10]:
from datasets import load_dataset

# Load the SWE-bench Lite test split and list the fields each instance carries.
dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
print(dataset.column_names)

# Produce one prediction per instance for the first two instances, echoing a
# 1-based progress counter after each one.
predictions = []
for i in range(2):
    instance = dataset[i]
    patch = generate_patch(instance, llm)
    record = dict(
        instance_id=instance["instance_id"],
        model="my-multi-llm-agent",
        prediction=patch,
    )
    predictions.append(record)
    print(i + 1)

# Persist the predictions as JSON Lines: one object per line.
with open("my_preds.jsonl", "w") as out_file:
    out_file.writelines(json.dumps(entry) + "\n" for entry in predictions)

['repo', 'instance_id', 'base_commit', 'patch', 'test_patch', 'problem_statement', 'hints_text', 'created_at', 'version', 'FAIL_TO_PASS', 'PASS_TO_PASS', 'environment_setup_commit']
1
2


In [11]:
import json

# Rewrite my_preds.jsonl into fixed_preds.jsonl, renaming the keys the harness
# expects. Renames are applied in this order so the output key order is
# instance_id, model_name_or_path, model_patch.
KEY_RENAMES = (
    ("model", "model_name_or_path"),
    ("prediction", "model_patch"),
)

with open("my_preds.jsonl") as source, open("fixed_preds.jsonl", "w") as target:
    for raw_line in source:
        entry = json.loads(raw_line)
        for old_key, new_key in KEY_RENAMES:
            # pop-then-assign moves the value under the new key (KeyError if absent)
            entry[new_key] = entry.pop(old_key)
        target.write(json.dumps(entry) + "\n")
