In [1]:
import json

In [3]:
# Load the pre-fine-tuning model outputs that the judge will score.
# JSON is UTF-8 by specification; pin the encoding so the result does not
# depend on the platform's default locale. No empty-list fallback is set up
# front: if the file is missing the cell should fail loudly rather than let
# later cells run on an empty list.
with open("result_before_fine_tunning.json", "r", encoding="utf-8") as f:
    base_result = json.load(f)

In [12]:
# Prompt template sent to the judge model. It is filled in with str.format,
# so the five brace-delimited fields are placeholders: full_function,
# added_code, deleted_code, raw_result and truncated_result. Any literal
# brace added to this template later must be doubled ({{ or }}).
# The template asks the judge to reply in a fixed layout:
# PREFERRED / REASON / SCORE_A / SCORE_B / CONFIDENCE.
# NOTE(review): Response A is labelled as the raw output and Response B as
# the cleaned output, and the order is fixed — this may prime the judge
# toward B; consider neutral labels or randomised order.
judge_prompts = """
You are a senior code review evaluator. You will compare two model responses for the same code change.

Your task:
Judge which response demonstrates better **understanding and communication** of the code review task.

Evaluate using these 3 weighted criteria (each 0–10):
1. Correctness — Does it identify real issues, improvements, or decisions relevant to the code change?
2. Completeness — Does it cover all important aspects (added/deleted code, logic impact)?
3. Clarity & Format — Is the response concise, well-structured, and easy to read?

Code context:
Full function:
{full_function}

Added code:
{added_code}

Deleted code:
{deleted_code}

Response A (raw model output, may include prompt repetition):
{raw_result}

Response B (cleaned model output, repetition removed):
{truncated_result}

Your response must follow this **exact format**:

PREFERRED: [A or B]
REASON: [1-2 sentences explaining why one is better, focusing on correctness and clarity]
SCORE_A: [0-10 average score of A across all criteria]
SCORE_B: [0-10 average score of B across all criteria]
CONFIDENCE: [Low / Medium / High — how confident you are in your evaluation]

Do not add any text outside this format.
"""

In [18]:
import os
from groq import Groq
from dotenv import load_dotenv

In [19]:
# Read a local .env file (if one exists) into the process environment so that
# GROQ_API_KEY is available on a fresh kernel. Variables that are already set
# in the environment are left untouched.
load_dotenv()

# Groq API client; the key comes from the environment and is never hardcoded.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Model used as the judge for every comparison in this notebook.
MODEL_NAME = "llama-3.3-70b-versatile"

In [22]:
def _first_code(entries):
    """Return the 'code' field of the first entry, or "" when there is none.

    A change may consist only of additions or only of deletions, in which
    case the corresponding list is empty (or missing a value) and indexing
    [0] would raise.
    """
    return entries[0]["code"] if entries else ""


# Ask the judge model to compare the raw and the truncated response of every
# record, and collect each record together with the judge's verdict.
judge_results = []
for item in base_result:
    # Only the first added / deleted hunk is shown to the judge.
    prompt = judge_prompts.format(
        full_function=item["full_function_code"],
        added_code=_first_code(item["added_code"]),
        deleted_code=_first_code(item["deleted_code"]),
        raw_result=item["raw_result"],
        truncated_result=item["truncated_result"],
    )

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that evaluates code review responses."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=512,
        temperature=0.2,
    )
    # The message content can be None; treat that as an empty verdict
    # instead of crashing on .strip().
    text = (response.choices[0].message.content or "").strip()
    print("Judge Response:\n", text)

    # Store the verdict on a copy so the loaded input records stay unchanged
    # and the cell can be re-run without accumulating state.
    judge_results.append({**item, "judge_response": text})

Judge Response:
 PREFERRED: B
REASON: Response B is preferred because it maintains the same content as Response A but without the unnecessary repetition of the prompt, making it more concise and easier to read. This improves its clarity and format.
SCORE_A: 6
SCORE_B: 7
CONFIDENCE: Medium
Judge Response:
 PREFERRED: B
REASON: Response B is preferred because it maintains the same content as Response A but without the unnecessary repetition of the prompt, making it more concise and easier to read. This improves its clarity and format, which are crucial for effective communication in a code review.
SCORE_A: 4
SCORE_B: 7
CONFIDENCE: Medium
Judge Response:
 PREFERRED: B
REASON: Response B is better because it maintains the same level of correctness and completeness as Response A but is more concise and easier to read, lacking the unnecessary repetition of the prompt. This improves its clarity and format.
SCORE_A: 8
SCORE_B: 9
CONFIDENCE: High
Judge Response:
 PREFERRED: B
REASON: Response B

In [24]:
# Persist every judged record as pretty-printed JSON.
with open("judge_results_base_model.json", "w") as out_file:
    out_file.write(json.dumps(judge_results, indent=4))