In [1]:
import configparser
import json
import random
import time

import numpy as np
import pandas as pd
from google import genai
from google.genai import errors as genai_errors

# Fixed seed for the `random.Random` instance that shuffles the few-shot
# examples inside `Untangler`, so the example order is the same on every call.
random_seed = 50

In [21]:
# Build the few-shot example file: `sample_count` distinct commits per class,
# drawn reproducibly from the gold set and stored as [Diff, Decision, CommitMessage].
sample_count = 5
df = pd.read_csv("./data/GoldSet.csv")
# Drop git's "no newline" marker from the diffs; the .str accessor leaves
# missing values untouched instead of raising on them.
df["Diff"] = (
    df["Diff"]
    .str.replace("\\ No newline at end of file", "", regex=False)
    .str.strip()
)

# Seeded generator so that re-running the cell writes the same FewShots.json.
few_shot_rng = np.random.default_rng(random_seed)

buggy_df = df[df.Decision == 'Buggy'][["Diff", "Decision", "CommitMessage"]]
# replace=False: a commit must not appear twice among the examples.
# Raises ValueError if the class has fewer than `sample_count` rows.
rows = few_shot_rng.choice(buggy_df.index.values, sample_count, replace=False)
buggy_df = buggy_df.loc[rows]

notbuggy_df = df[df.Decision == 'NotBuggy'][["Diff", "Decision", "CommitMessage"]]
rows = few_shot_rng.choice(notbuggy_df.index.values, sample_count, replace=False)
notbuggy_df = notbuggy_df.loc[rows]

few_shot_data = [*buggy_df.values.tolist(), *notbuggy_df.values.tolist()]
# Context manager guarantees the JSON is flushed and the handle is closed.
with open("./data/FewShots.json", "w") as few_shot_file:
    json.dump(few_shot_data, few_shot_file)

In [28]:
class Untangler:
    """Few-shot LLM classifier that labels a commit as ``Buggy`` or ``NotBuggy``.

    Typical use::

        untangler = Untangler("gemini")
        untangler.prepare_prompt(commit_message, diff)
        label = untangler.predict()

    Attributes:
        model_name: Backend identifier passed to the constructor.
        prompt: Prompt waiting to be sent; ``""`` when nothing is pending.
        last_prompt: The prompt sent by the most recent successful ``predict()``.
    """

    def __init__(self, model_name):
        """Create the API client for ``model_name``.

        Args:
            model_name: ``"gemini"`` is the only implemented backend.

        Raises:
            NotImplementedError: For ``"openai"`` and ``"deepseek"``.
            ValueError: For any other model name.
            KeyError: If ``.config`` has no ``[API_KEYS]`` section or lacks
                the key of the selected backend.
        """
        self.model_name = model_name
        # Initialised up front so get_prompt()/predict() called before
        # prepare_prompt() hit the intended checks instead of AttributeError.
        self.prompt = ""
        self.last_prompt = ""

        if self.model_name == "gemini":
            config = configparser.ConfigParser()
            config.read('.config')
            # Only the key of the selected backend is required.
            self.__client = genai.Client(api_key=config["API_KEYS"]["GEMINI_API_KEY"])
        elif self.model_name in ["openai", "deepseek"]:
            raise NotImplementedError()
        else:
            raise ValueError("Invalid model name")

    def __prepare_prompt_for_gemini(self, commitMessage, diff):
        """Build the full prompt: instructions, few-shot examples, then the question.

        The examples are read from ``./data/FewShots.json`` (a list of
        ``[diff, label, commit message]`` entries) and shuffled with the
        module-level ``random_seed``, so their order is identical on every call.

        Args:
            commitMessage: Commit message of the commit to classify.
            diff: Diff text of the commit to classify.

        Returns:
            The prompt string, ending in ``"Answer:"``.
        """
        initial_prompt = "You are a Git commit review assistant. Given a Java source code diff and its commit message, analyze both to determine if the changes align with the described bug fix. Assess the relevance between the commit message and code modifications, identifying patterns such as error-handling updates, logical corrections, exception handling improvements, and other indicators of bug-related changes."

        with open("./data/FewShots.json", "r") as few_shot_file:
            data = json.load(few_shot_file)
        random.Random(random_seed).shuffle(data)

        few_shots = "\n"
        for shot_diff, shot_label, shot_message in data:
            # Commit messages are flattened to one line to keep each example compact.
            shot_message = shot_message.replace("\n", " ").strip()
            few_shots += f"\nCommit Message: {shot_message}\nGit Diff:\n{shot_diff.strip()}\nAnswer: {shot_label}\n"

        question = f"\nCommit Message: {commitMessage}\nGit Diff:\n{diff}\nAnswer:"

        return initial_prompt + few_shots + question

    def prepare_prompt(self, commitMessage, diff):
        """Store the prompt for the given commit in ``self.prompt``.

        Raises:
            NotImplementedError: If the backend is not ``"gemini"``.
        """
        if self.model_name == "gemini":
            self.prompt = self.__prepare_prompt_for_gemini(commitMessage, diff)
        else:
            raise NotImplementedError()

    def get_prompt(self):
        """Print the pending prompt, or the last sent one if none is pending."""
        prompt = self.prompt
        if prompt == "":
            print("Printing last prompt...")
            prompt = self.last_prompt

        print("Prompt for:", self.model_name)
        print()
        print(prompt)

    def predict(self):
        """Send the pending prompt to the model and return its answer.

        On success the prompt is moved to ``last_prompt`` and ``prompt`` is
        reset to ``""``. If the API call raises, the pending prompt is left
        in place, so ``predict()`` can simply be called again.

        Returns:
            The response text with newlines removed, or ``""`` when the
            response carries no text.

        Raises:
            ValueError: If no prompt is pending.
        """
        if self.prompt == "":
            raise ValueError("Provide a new diff using prepare_prompt()")

        response = self.__client.models.generate_content(
            model='gemini-2.0-flash',
            config=genai.types.GenerateContentConfig(
                system_instruction='I provide you with Java source code diff of a commit, you say Buggy or NotBuggy',
                temperature=0.3,
                max_output_tokens=3
            ),
            contents=self.prompt
        )

        self.last_prompt = self.prompt
        self.prompt = ""

        # response.text is None when the response has no text parts.
        return (response.text or "").replace("\n", "")

In [29]:
# Evaluate the classifier on the gold set, one API call per commit.
MAX_RETRIES = 5          # attempts per commit before giving up on rate limiting
RETRY_BASE_DELAY_S = 5   # first back-off delay; doubled after every failed attempt

df = pd.read_csv("./data/GoldSet.csv")
df["Diff"] = (
    df["Diff"]
    .str.replace("\\ No newline at end of file", "", regex=False)
    .str.strip()
)

# The few-shot examples were sampled from this same gold set and are shown to
# the model together with their labels, so they must not be scored.
with open("./data/FewShots.json", "r") as few_shot_file:
    few_shot_diffs = {item[0] for item in json.load(few_shot_file)}
eval_df = df[~df["Diff"].isin(few_shot_diffs)]

# One client for the whole run instead of re-reading the config for every row.
untangler = Untangler("gemini")

y_pred = []
y_true = []
for index, row in eval_df.iterrows():
    untangler.prepare_prompt(row["CommitMessage"], row["Diff"])

    for attempt in range(MAX_RETRIES):
        try:
            prediction = untangler.predict()
            break
        except genai_errors.ClientError as exc:
            # Only quota errors (HTTP 429) are worth retrying. A failed
            # predict() keeps its pending prompt, so it can be called again.
            if getattr(exc, "code", None) != 429 or attempt == MAX_RETRIES - 1:
                raise
            time.sleep(RETRY_BASE_DELAY_S * 2 ** attempt)

    # Appended together so both lists stay aligned even if a later row fails.
    y_pred.append(prediction)
    y_true.append(row["Decision"])

ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}

In [30]:
# Display the ground-truth labels collected by the evaluation loop.
y_true

['Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'NotBuggy']

In [32]:
# Display the model's predictions collected by the evaluation loop.
y_pred

['Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'Buggy',
 'NotBuggy']