In [18]:
%load_ext autoreload
%autoreload 2

import json
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report


from modules.Untangler import Untangler, UntanglerOpenAI
# Presumably a seed for reproducible sampling — NOTE(review): no cell visible in
# this notebook reads random_seed; confirm it is consumed somewhere or remove it.
random_seed = 50

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Marker line that is removed from every Diff value before further use.
NO_NEWLINE_MARKER = "\\ No newline at end of file"


def _load_gold_csv(path):
    """Read one gold-set CSV and clean its ``Diff`` column.

    Parameters
    ----------
    path : str
        Location of a CSV file that contains a ``Diff`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame whose ``Diff`` values have the "No newline at end
        of file" marker removed and surrounding whitespace stripped.
        Missing ``Diff`` values are left as NaN (the previous per-element
        lambda raised AttributeError on them).
    """
    frame = pd.read_csv(path)
    # regex=False: the marker begins with a backslash and must match literally.
    frame["Diff"] = (
        frame["Diff"]
        .str.replace(NO_NEWLINE_MARKER, "", regex=False)
        .str.strip()
    )
    return frame


# One helper call per source file replaces three copies of the same cleaning code.
df1 = _load_gold_csv("./data/GoldSet.csv")
df2 = _load_gold_csv("./data/GoldSet_TrueBuggyMethods.csv")
df3 = _load_gold_csv("./data/GoldSet_TrueNotBuggyMethods.csv")

# Stack the three frames under a fresh 0..n-1 index and persist the combined set.
df = pd.concat([df1, df2, df3], ignore_index=True)
df.to_csv("./data/Complete_GoldSet.csv", index=False)

In [21]:
# Reload the combined gold set from disk so the cells below work from the saved file.
complete_goldset_csv = "./data/Complete_GoldSet.csv"
df = pd.read_csv(complete_goldset_csv)

# gpt-4o-mini 2 shots

In [24]:
# Run batch_detect over the gold-set frame (iteratively=False) and save the output.
# NOTE(review): the "2 shots" setting from the section title is not chosen in this
# cell — presumably it is configured inside modules.Untangler; confirm before re-running.
predictions_csv = "./Results/openai-4o-mini_2shot_result.csv"
untangler = Untangler(model_name="openai")
result = untangler.batch_detect(df, iteratively=False)
result.to_csv(predictions_csv, index=False)

1776it [00:00, 21403.08it/s]
1776it [43:36,  1.47s/it]


In [25]:
# Score the saved predictions: "Decision" is passed as y_true, "Detection" as y_pred.
result = pd.read_csv("./Results/openai-4o-mini_2shot_result.csv")
y_true, y_pred = result["Decision"], result["Detection"]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       Buggy       0.86      0.85      0.86       900
    NotBuggy       0.85      0.86      0.85       876

    accuracy                           0.85      1776
   macro avg       0.85      0.85      0.85      1776
weighted avg       0.85      0.85      0.85      1776



# gpt-4o-mini 2 shots, chain-of-thought (CoT)

In [None]:
# Run batch_detect over the gold-set frame with enable_cot=True and save the output.
untangler = Untangler(model_name="openai", enable_cot=True)
# The predictions must be bound to `result`: they were previously stored in `res`
# while the stale `result` from the preceding section was written to this CSV.
result = untangler.batch_detect(df)
result.to_csv("./Results/openai-4o-mini_2shot_cot_result.csv", index=False)

In [55]:
# Score the saved predictions: "Decision" is passed as y_true, "Detection" as y_pred.
result = pd.read_csv("./Results/openai-4o-mini_2shot_cot_result.csv")
y_true, y_pred = result["Decision"], result["Detection"]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       Buggy       0.78      0.90      0.84       900
    NotBuggy       0.88      0.75      0.81       876

    accuracy                           0.82      1776
   macro avg       0.83      0.82      0.82      1776
weighted avg       0.83      0.82      0.82      1776



# gpt-4o-mini 6 shots

In [None]:
# Run batch_detect over the gold-set frame (iteratively=False) and save the output.
# NOTE(review): apart from the output path this cell is identical to the 2-shot
# run, so the shot count is presumably configured inside modules.Untangler — confirm.
predictions_csv = "./Results/openai_4o-mini_result.csv"
untangler = Untangler(model_name="openai")
result = untangler.batch_detect(df, iteratively=False)
result.to_csv(predictions_csv, index=False)

In [15]:
# Score the saved predictions: "Decision" is passed as y_true, "Detection" as y_pred.
result = pd.read_csv("./Results/openai_4o-mini_result.csv")
y_true, y_pred = result["Decision"], result["Detection"]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       Buggy       0.77      0.91      0.84       901
    NotBuggy       0.89      0.73      0.80       877

    accuracy                           0.82      1778
   macro avg       0.83      0.82      0.82      1778
weighted avg       0.83      0.82      0.82      1778



# gemini-2.0-flash

In [None]:
# Run batch_detect over the gold-set frame with the "gemini" Untangler and save the output.
predictions_csv = "./Results/gemini_result.csv"
untangler = Untangler(model_name="gemini")
result = untangler.batch_detect(df)
result.to_csv(predictions_csv, index=False)

In [11]:
result = pd.read_csv("./Results/gemini_result.csv")
# Strip stray whitespace around the predicted label; the vectorised accessor
# leaves missing values as NaN instead of raising on them.
result["Detection"] = result["Detection"].str.strip()

# Only the two expected labels can be scored. Report anything else rather than
# discarding it silently, because dropped rows shrink the evaluated sample.
is_valid = result["Detection"].isin(['Buggy', 'NotBuggy'])
if not is_valid.all():
    unexpected = result.loc[~is_valid, "Detection"].unique().tolist()
    print(f"Dropping {int((~is_valid).sum())} rows with unexpected Detection values: {unexpected}")
result = result[is_valid]

# "Decision" is passed as y_true, "Detection" as y_pred.
print(classification_report(result["Decision"], result["Detection"]))

              precision    recall  f1-score   support

       Buggy       0.83      0.94      0.88       900
    NotBuggy       0.93      0.80      0.86       876

    accuracy                           0.87      1776
   macro avg       0.88      0.87      0.87      1776
weighted avg       0.88      0.87      0.87      1776



# gemini-2.0-flash 2 shots

In [28]:
# Run batch_detect over the gold-set frame with the "gemini" Untangler and save the output.
# NOTE(review): apart from the output path this cell is identical to the previous
# gemini run, so the shot count is presumably configured inside modules.Untangler — confirm.
predictions_csv = "./Results/gemini_2shot_result.csv"
untangler = Untangler(model_name="gemini")
result = untangler.batch_detect(df)
result.to_csv(predictions_csv, index=False)

1776it [22:44:36, 46.10s/it]   


In [30]:
result = pd.read_csv("./Results/gemini_2shot_result.csv")
# Strip stray whitespace around the predicted label; the vectorised accessor
# leaves missing values as NaN instead of raising on them.
result["Detection"] = result["Detection"].str.strip()

# Only the two expected labels can be scored. Report anything else rather than
# discarding it silently, because dropped rows shrink the evaluated sample.
is_valid = result["Detection"].isin(['Buggy', 'NotBuggy'])
if not is_valid.all():
    unexpected = result.loc[~is_valid, "Detection"].unique().tolist()
    print(f"Dropping {int((~is_valid).sum())} rows with unexpected Detection values: {unexpected}")
result = result[is_valid]

# "Decision" is passed as y_true, "Detection" as y_pred.
print(classification_report(result["Decision"], result["Detection"]))

              precision    recall  f1-score   support

       Buggy       0.74      0.98      0.84       900
    NotBuggy       0.97      0.65      0.78       876

    accuracy                           0.82      1776
   macro avg       0.85      0.81      0.81      1776
weighted avg       0.85      0.82      0.81      1776



# o3-mini

In [36]:
# Build the detector through UntanglerOpenAI with model_name="o3-mini"; the other
# sections of this notebook construct Untangler(model_name=...) instead.
untangler = UntanglerOpenAI(model_name="o3-mini")

In [37]:
# Uses the `untangler` created in the preceding cell; runs batch_detect over the
# gold-set frame with iteratively=True and saves the output.
predictions_csv = "./Results/openai_o3-mini_result.csv"
result = untangler.batch_detect(df, iteratively=True)
result.to_csv(predictions_csv, index=False)

1778it [5:11:40, 10.52s/it]


In [38]:
# Score the saved predictions: "Decision" is passed as y_true, "Detection" as y_pred.
result = pd.read_csv("./Results/openai_o3-mini_result.csv")
y_true, y_pred = result["Decision"], result["Detection"]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       Buggy       0.63      0.98      0.77       901
    NotBuggy       0.95      0.42      0.58       877

    accuracy                           0.70      1778
   macro avg       0.79      0.70      0.68      1778
weighted avg       0.79      0.70      0.68      1778



# gpt-4o

In [None]:
# Start from the "openai" Untangler, switch it to gpt-4o, then run batch_detect
# over the gold-set frame with iteratively=True and save the output.
predictions_csv = "./Results/openai_gpt-4o_result.csv"
untangler = Untangler(model_name="openai")
untangler.change_model("gpt-4o")
result = untangler.batch_detect(df, iteratively=True)
result.to_csv(predictions_csv, index=False)

1718it [1:50:58,  3.88s/it]


In [20]:
# Score the saved predictions: "Decision" is passed as y_true, "Detection" as y_pred.
result = pd.read_csv("./Results/openai_gpt-4o_result.csv")
y_true, y_pred = result["Decision"], result["Detection"]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       Buggy       0.89      0.85      0.87       901
    NotBuggy       0.85      0.89      0.87       877

    accuracy                           0.87      1778
   macro avg       0.87      0.87      0.87      1778
weighted avg       0.87      0.87      0.87      1778



# gpt-4o 2 shots

In [34]:
# Start from the "openai" Untangler, switch it to gpt-4o, then run batch_detect
# over the gold-set frame with iteratively=True and save the output.
# NOTE(review): apart from the output path this cell is identical to the previous
# gpt-4o run, so the shot count is presumably configured inside modules.Untangler — confirm.
predictions_csv = "./Results/openai_gpt-4o_2shot_result.csv"
untangler = Untangler(model_name="openai")
untangler.change_model("gpt-4o")
result = untangler.batch_detect(df, iteratively=True)
result.to_csv(predictions_csv, index=False)

1444it [1:40:13,  3.55s/it] 

Error - Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID req_b935c4e4cde17eaab10d2770a5415b67 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}.
Retrying in 1 minute
Error - Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID req_b8d008da8a14f8f8d25856886a5da74a in your email.)', 'type': 'server_error', 'param': None, 'code': None}}.
Retrying in 1 minute
Error - Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep

1776it [2:28:54,  5.03s/it] 


In [35]:
# Score the saved predictions: "Decision" is passed as y_true, "Detection" as y_pred.
result = pd.read_csv("./Results/openai_gpt-4o_2shot_result.csv")
y_true, y_pred = result["Decision"], result["Detection"]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       Buggy       0.89      0.87      0.88       900
    NotBuggy       0.87      0.89      0.88       876

    accuracy                           0.88      1776
   macro avg       0.88      0.88      0.88      1776
weighted avg       0.88      0.88      0.88      1776



# gpt-4o 2 shots, chain-of-thought (CoT)

In [72]:
# Build the "openai" Untangler with enable_cot=True, switch it to gpt-4o, then run
# batch_detect over the gold-set frame with iteratively=True and save the output.
predictions_csv = "./Results/openai_gpt-4o_2shot_cot_result.csv"
untangler = Untangler(model_name="openai", enable_cot=True)
untangler.change_model("gpt-4o")
result = untangler.batch_detect(df, iteratively=True)
result.to_csv(predictions_csv, index=False)

1776it [2:33:50,  5.20s/it]


In [73]:
# Score the saved predictions: "Decision" is passed as y_true, "Detection" as y_pred.
result = pd.read_csv("./Results/openai_gpt-4o_2shot_cot_result.csv")
y_true, y_pred = result["Decision"], result["Detection"]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       Buggy       0.83      0.93      0.88       900
    NotBuggy       0.92      0.81      0.86       876

    accuracy                           0.87      1776
   macro avg       0.88      0.87      0.87      1776
weighted avg       0.87      0.87      0.87      1776

