In [1]:
!pip install -q -U google-generativeai

In [None]:
from getpass import getpass

# Read the API key without echoing it, so the secret is not stored in the notebook's cell output.
key = getpass('Key:')

In [2]:
import google.generativeai as genai
import os

# Hand the API key entered in the previous cell to the google.generativeai client.
genai.configure(api_key=key)

In [3]:
# Model handle without a system instruction. `model` is rebound with the
# system prompt in the inference cell further down.
model = genai.GenerativeModel("gemini-2.5-flash-preview-04-17")

In [4]:
import pickle

In [5]:
# Path to the pickled change list of the training split.
pickle_file_path = 'data/changes_train_lst.pkl'


In [6]:
# Load the training change list.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open(pickle_file_path, 'rb') as file:
    semantic_data = pickle.load(file)

In [7]:
# Path to the pickled expert-feature table of the training split.
pickle_file_path = 'data/features_train.pkl'

In [8]:
# Load the training feature table (presumably a pandas DataFrame: it is later
# accessed with .loc and column names).
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open(pickle_file_path, 'rb') as file:
    features_data = pickle.load(file)

### Commit message + code + expert features

In [9]:
# System instruction sent with every request: describes the commit format
# (message, added/removed lines, expert features) and asks for a JSON answer
# with a "prediction" (YES/NO) and a "reason".
# NOTE(review): the "FIX (...>)" line contains a stray '>' — it is part of the
# prompt text and is left untouched here.
system_message = """You are a Just-In-Time Software Defect Prediction (JIT-SDP) system responsible for predicting 
software defects at the time they are about to occur.  To do that, you will receive a commit 
information. The commit information will have the following format:

COMMIT MESSAGE: <a commit message describing the intent of the commit>
ADDED LINES: {<a set of added lines>}
REMOVED LINES: {<a set of removed lines>}
NUMBER OF MODIFIED SUBSYSTEMS: <number of modified subsystems>
NUMBER OF MODIFIED DIRECTORIES: <number of modified directories>
NUMBER OF MODIFIED FILES: <number of modified files>
ENTROPY (DISTRIBUTION OF MODIFIED CODE ACROSS EACH FILE): <distribution of modified code across each file>
NUMBER OF LINES OF CODE ADDED: <number of lines of code added>
NUMBER OF LINES OF CODE DELETED: <number of lines of code deleted>
NUMBER OF LINES OF CODE IN A FILE BEFORE THE CHANGE: <number of lines of code in a file before the change>
FIX (True if the change is a defect fix, False otherwise>): <True if the change is a defect fix, False otherwise>
NUMBER OF DEVELOPERS THAT CHANGED THE MODIFIED FILES: <the number of developers that changed the modified files>
AVERAGE TIME BETWEEN THE LAST AND THE CURRENT CHANGE: <the average time between the last and the current change>
NUMBER OF UNIQUE CHANGES TO THE MODIFIED FILES: <the number of unique changes to the modified files>
DEVELOPER EXPERIENCE: <developer experience>
RECENT DEVELOPER EXPERIENCE: <recent developer experience>
DEVELOPER EXPERIENCE ON THE CURRENT SUBSYSTEM: <developer experience in a subsystem>

You will generate your answer by using commit messages, added/removed lines, and extra information.  You will answer with 
"YES" or "NO" if the commit may introduce a bug and the reason why you reached your conclusion.  
Generate your response in the following JSON format: 
{
    "prediction": <YES|NO>,
    "reason": <reason>
}
    """

In [10]:
# Mean of each developer-experience column of the feature table loaded from
# 'data/features_train.pkl'; used as the reference for get_exp_desc.
exp_mean, rexp_mean, sexp_mean = (
    features_data[column].astype(float).mean()
    for column in ('exp', 'rexp', 'sexp')
)

In [11]:
def get_exp_desc(mean, value):
    """Describe an experience value relative to a reference mean.

    Args:
        mean: Reference mean to compare against.
        value: Experience value; anything ``float()`` accepts (a number or a
            numeric string).

    Returns:
        'average' when ``value`` and ``mean`` have the same integer part
        (both truncated toward zero); otherwise 'below average' or
        'above average'.
    """
    value = float(value)
    # Values whose truncated integer part matches the mean's count as average.
    if int(value) == int(mean):
        return 'average'
    # The integer parts differ, so value != mean and exactly one side applies.
    return 'below average' if value < mean else 'above average'


In [12]:
# Display the three reference means computed above.
exp_mean, rexp_mean, sexp_mean

(206.33247831928668, 106.32190481395303, 194.21912657697746)

In [13]:
# Example of the expert-feature section of a prompt, built from row 0 of the
# feature table. `prompt` is reassigned for every commit inside the inference loop.
prompt = f"""
NUMBER OF MODIFIED SUBSYSTEMS: {features_data.loc[0,'ns']}
NUMBER OF MODIFIED DIRECTORIES: {features_data.loc[0,'nd']}
NUMBER OF MODIFIED FILES: {features_data.loc[0,'nf']}
ENTROPY (DISTRIBUTION OF MODIFIED CODE ACROSS EACH FILE): {features_data.loc[0,'entropy']}
NUMBER OF LINES OF CODE ADDED: {features_data.loc[0,'la']}
NUMBER OF LINES OF CODE DELETED: {features_data.loc[0,'ld']}
NUMBER OF LINES OF CODE IN A FILE BEFORE THE CHANGE: {features_data.loc[0,'lt']}
FIX (True if the change is a defect fix, False otherwise>): {features_data.loc[0,'fix']}
NUMBER OF DEVELOPERS THAT CHANGED THE MODIFIED FILES: {features_data.loc[0,'ndev']}
AVERAGE TIME BETWEEN THE LAST AND THE CURRENT CHANGE: {features_data.loc[0,'age']}
NUMBER OF UNIQUE CHANGES TO THE MODIFIED FILES: {features_data.loc[0,'nuc']}
DEVELOPER EXPERIENCE: {get_exp_desc(exp_mean, features_data.loc[0,'exp'])}
RECENT DEVELOPER EXPERIENCE: {get_exp_desc(rexp_mean, features_data.loc[0,'rexp'])}
DEVELOPER EXPERIENCE ON THE CURRENT SUBSYSTEM: {get_exp_desc(sexp_mean, features_data.loc[0,'sexp'])}
    """

In [14]:
import pickle

In [15]:
# Path to the pickled change list of the test split (rebinds pickle_file_path).
pickle_file_path = 'data/changes_test_lst.pkl'

In [16]:
# From here on `semantic_data` holds the test split, replacing the training data.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open(pickle_file_path, 'rb') as file:
    semantic_data = pickle.load(file)

In [17]:
# Path to the pickled expert-feature table of the test split.
pickle_file_path = 'data/features_test.pkl'

In [18]:
# From here on `features_data` holds the test split. The experience means
# computed earlier are not recomputed, so they still come from the training table.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open(pickle_file_path, 'rb') as file:
    features_data = pickle.load(file)

In [19]:
# Number of entries in the first list of the test data (used as commit ids below).
len(semantic_data[0])

5480

In [20]:
# Number of feature rows; should match the number of commits shown above.
len(features_data)

5480

In [21]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(y_true, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        Tuple ``(auc, accuracy, precision, recall, f1)`` produced by the
        scikit-learn scorers of the corresponding names, in that order.
    """
    scorers = (roc_auc_score, accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

## Zero-shot Chain-of-Thought

In [22]:
# Accumulators: numeric predictions and raw model responses.
# `results` may be replaced by a saved checkpoint in the following cell.
preds = []
results = []

In [23]:
import pickle

# Resume from a previously saved run when a checkpoint exists; on a fresh run
# (no checkpoint file yet) start with an empty list instead of failing.
if os.path.exists('results_cot_gemini_2-5.pkl'):
    # NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
    with open('results_cot_gemini_2-5.pkl', 'rb') as f:
        results = pickle.load(f)
else:
    results = []

In [25]:
# Ground-truth labels: second list of the test data (compared against 0/1 below).
labels = semantic_data[1] 

In [26]:
def label_to_str(label):
    """Translate a binary label into the answer token used by the model.

    Returns "YES" for a label equal to 1, "NO" for a label equal to 0 and
    None for any other value.
    """
    if label == 1:
        return "YES"
    return "NO" if label == 0 else None
In [27]:
# Rebuild the textual predictions from the stored responses. A dict response
# contributes its 'prediction'; any other entry (a response that could not be
# parsed) is scored as wrong by using the opposite of the true label.
preds = [r['prediction'] if isinstance(r, dict) else label_to_str(1 - labels[i]) for i,r in enumerate(results)]

In [28]:
# Encode the textual answers as floats: 1.0 for 'YES', 0.0 for anything else.
# NOTE: this rebinds `preds`, so running the cell a second time turns every entry into 0.0.
preds = [float(answer == 'YES') for answer in preds]

In [30]:
# NOTE(review): presumably a requests-per-day style quota figure — not referenced
# anywhere else in the visible notebook; confirm or remove.
rpd = len(features_data)//500 

In [31]:
# NOTE(review): presumably a requests-per-minute limit — not referenced anywhere
# else in the visible notebook (the loop sleeps a fixed 12 s); confirm or remove.
rpm = 10 

In [None]:
import json
from tqdm import tqdm
from json import JSONDecodeError
import time

# Unpack the parallel lists stored in the test data.
commit_ids = semantic_data[0]
labels = semantic_data[1]
comments = semantic_data[2]
code = semantic_data[3]

# Rebind `model` so that every request carries the JIT-SDP system instruction.
model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-04-17", system_instruction=system_message)
requests = 0

# Start at len(results) so that a run resumed from stored responses skips the
# commits that already have one.
for i in tqdm(range(len(results), len(commit_ids))):
    prompt = f"""[Q] COMMIT MESSAGE: {comments[i]}"""

    if len(code[i]['added_code']) > 0:
        prompt += "\nADDED LINES: " + str(code[i]['added_code']).replace('\'', '')

    if len(code[i]['removed_code']) > 0:
        # No space after the colon (unlike ADDED LINES): kept as-is so the prompt text does not change.
        prompt += "\nREMOVED LINES:" + str(code[i]['removed_code']).replace('\'', '')

    # The feature table and the commit list must be row-aligned.
    assert features_data.loc[i, 'commit_hash'] == commit_ids[i]

    prompt += f"""\n
NUMBER OF MODIFIED SUBSYSTEMS: {features_data.loc[i,'ns']}
NUMBER OF MODIFIED DIRECTORIES: {features_data.loc[i,'nd']}
NUMBER OF MODIFIED FILES: {features_data.loc[i,'nf']}
ENTROPY (DISTRIBUTION OF MODIFIED CODE ACROSS EACH FILE): {features_data.loc[i,'entropy']}
NUMBER OF LINES OF CODE ADDED: {features_data.loc[i,'la']}
NUMBER OF LINES OF CODE DELETED: {features_data.loc[i,'ld']}
NUMBER OF LINES OF CODE IN A FILE BEFORE THE CHANGE: {features_data.loc[i,'lt']}
FIX (True if the change is a defect fix, False otherwise>): {features_data.loc[i,'fix']}
NUMBER OF DEVELOPERS THAT CHANGED THE MODIFIED FILES: {features_data.loc[i,'ndev']}
AVERAGE TIME BETWEEN THE LAST AND THE CURRENT CHANGE: {features_data.loc[i,'age']}
NUMBER OF UNIQUE CHANGES TO THE MODIFIED FILES: {features_data.loc[i,'nuc']}
DEVELOPER EXPERIENCE: {get_exp_desc(exp_mean, features_data.loc[i,'exp'])}
RECENT DEVELOPER EXPERIENCE: {get_exp_desc(rexp_mean, features_data.loc[i,'rexp'])}
DEVELOPER EXPERIENCE ON THE CURRENT SUBSYSTEM: {get_exp_desc(sexp_mean, features_data.loc[i,'sexp'])}
    """

    # Zero-shot chain-of-thought trigger.
    prompt += "\n[A] Let's think step by step. "

    response = model.generate_content(prompt,
                                      generation_config=genai.GenerationConfig(temperature=0,))

    # Reset the per-commit state on every iteration so that a failed response
    # can never reuse the text or parsed answer of the previous commit.
    content = None
    prediction = None
    error = False

    try:
        content = response.candidates[0].content.parts[0].text
    except Exception:
        # No usable text in the response (e.g. no candidates/parts): log it and
        # count the commit as an error. `Exception` (not a bare except) lets
        # KeyboardInterrupt stop the run.
        print(response)
        error = True

    if not error:
        try:
            # Parse the outermost {...} span of the answer as JSON.
            parsed = json.loads(content[content.index("{"):content.rfind("}")+1].replace('\n', ' '))
            prediction = parsed["prediction"]
            content = parsed
        except (ValueError, KeyError):
            # If the output format is wrong (no braces, invalid JSON — JSONDecodeError
            # is a ValueError — or no "prediction" key), count it as an error.
            # `content` stays the raw text, so the stored entry is not a dict.
            print(content)
            error = True

    if i < 10:
        print(content)

    results.append(content)

    if error:
        # An unusable answer is scored as wrong: the label opposite to the expected one.
        pred = 1 - labels[i]
    else:
        pred = 1.0 if prediction == "YES" else 0.0

    preds.append(pred)

    # Report running metrics every 100 commits.
    if i > 0 and (i % 100) == 0:
        auc, accuracy, precision, recall, f1 = calculate_metrics(labels[:i+1], preds)
        print(f"auc = {auc}, accuracy = {accuracy}, precision = {precision}, recall = {recall}, f1 = {f1}")

    # Throttle: one request every 12 seconds (at most 5 requests per minute).
    time.sleep(12)
    

In [33]:
# Number of stored model responses.
len(results)

5480

In [34]:
# Responses and numeric predictions must have the same length.
len(results), len(preds)

(5480, 5480)

In [35]:
# Sum of the predictions; entries are 1.0 for a 'YES' answer, so this counts them.
sum(preds)

2865.0

In [36]:
# Numeric prediction for the last commit.
preds[-1]

1.0

In [37]:
# Stored response for the last commit.
results[-1]

{'prediction': 'YES',
 'reason': "The change involves removing deprecated API calls, which is a common refactoring task. While the code changes themselves are small and seem straightforward (replacing HTableDescriptor with TableDescriptor and fixing a typo in setTimestamp), the developer's experience is indicated as below average across the board (overall, recent, and on the subsystem). The modified files also show a high number of previous developers and unique changes, suggesting they are frequently modified and potentially complex or central. A less experienced developer working on files with high historical churn increases the risk of introducing subtle bugs, even with seemingly simple API updates."}

In [38]:
# Final metrics over every commit that has a stored response.
auc, accuracy, precision, recall, f1 = calculate_metrics(labels[:len(results)], preds[:len(results)])
print(f"auc = {auc}, accuracy = {accuracy}, precision = {precision}, recall = {recall}, f1 = {f1}")
    

auc = 0.6275450864924549, accuracy = 0.5215328467153285, precision = 0.12530541012216406, recall = 0.7557894736842106, f1 = 0.2149700598802395


In [39]:
# Persist the responses to the same file the checkpoint-loading cell reads.
# This is the only place in the visible notebook where results are written.
with open('results_cot_gemini_2-5.pkl', 'wb') as file:
    pickle.dump(results, file)