In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import difflib

In [3]:
files = os.listdir('logs')
log_files = [ f for f in files if f != ".gitignore" ]
log_files[:2]

['ReFAct-ab208055-1c82-4afb-a7ed-96356bfb9c9b.json',
 'ReFAct-f4de6a68-fc6c-48be-bba7-98ec9bc21217.json']

In [4]:
print(f"Found {len(log_files)} log files")

Found 94 log files


# Analyse the logs

In [5]:
all_log_file_contents = []
for log_file in log_files:
    with open(f'logs/{log_file}', 'r') as f:
        all_log_file_contents.append(json.load(f))
[d.keys() for d in all_log_file_contents[:2]]

[dict_keys(['agent_settings', 'execution', 'project', 'llm']),
 dict_keys(['agent_settings', 'execution', 'project', 'llm'])]

In [6]:
all_executions = []
for log_file in all_log_file_contents:
    applied_settings = log_file['agent_settings']
    for exec in log_file['execution']:
        all_executions.append((exec, applied_settings))

# get only the executions that are of type dict
all_executions = [e for e in all_executions if type(e[0]) == dict]

triples = []
# get pairs of tool: "open_file", "overwrite_file" and tool: "commit_changes"
for a, b, c in zip(all_executions, all_executions[1:], all_executions[2:]):

    # for a, b, c, the applied settings are in the second element of the tuple
    # check, that the settings are the same for all three executions
    if a[1] != b[1] or b[1] != c[1]:
        continue    

    if a[0].get("tool", None) == 'open_file' and b[0].get("action", None) == 'overwrite_file' and c[0].get("result", None) == 'Tests passed. Please commit your changes.':
        triples.append((a, b, c))

len(triples)

148

In [7]:
[d[0].keys() for d in triples[0]]

[dict_keys(['tool', 'result']),
 dict_keys(['thought', 'action', 'tools_input', 'observation']),
 dict_keys(['tool', 'result'])]

In [8]:
from collections import defaultdict

In [9]:
diff_percentages = defaultdict(list)
adds = defaultdict(list)
dels = defaultdict(list)
diffs = defaultdict(list)


for tuple in triples:

    settings = tuple[0][1]
    
    # get the applied settings, i.e. the settings where value is "True"
    applied_settings = {k: v for k, v in settings.items() if v == True}
    if len(applied_settings) == 0:
        applied_setting = "Default"
    if len(applied_settings) == 1:
        applied_setting = list(applied_settings.keys())[0]
    else:
        if "make_plan" in applied_settings and "dynamic_plan" in applied_settings:
            applied_setting = "dynamic_plan"


    t = [d[0] for d in tuple]
    prev_code = t[0]['result']
    new_code = t[1]['tools_input']['content']

    d = difflib.ndiff(prev_code.splitlines(), new_code.splitlines())

    # count the number of lines that are different
    num_adds = 0
    num_dels = 0

    for line in d:

        if line.startswith('+'):
            num_adds += 1
        elif line.startswith('-'):
            num_dels += 1

    num_diffs = num_adds + num_dels

    diff_percentages[applied_setting].append(num_diffs / len(prev_code.splitlines()))
    adds[applied_setting].append(num_adds)
    dels[applied_setting].append(num_dels)
    diffs[applied_setting].append(num_diffs)

In [10]:
diffs_agg = []
for v in diffs.values():
    diffs_agg.extend(v)

# get the average number of diffs
print(f"Average number of diffs: {sum(diffs_agg) / len(diffs_agg)}")

# get the median number of diffs
import statistics
print(f"Median number of diffs: {statistics.median(diffs_agg)}")

# get the maximum number of diffs
print(f"Maximum number of diffs: {max(diffs_agg)}")

# get the minimum number of diffs
print(f"Minimum number of diffs: {min(diffs_agg)}")

Average number of diffs: 63.28378378378378
Median number of diffs: 48.5
Maximum number of diffs: 282
Minimum number of diffs: 0


In [11]:
# get the average number of diffs

diff_percentages_agg = []
for v in diff_percentages.values():
    diff_percentages_agg.extend(v)

print(f"Average percentage of diffs: {sum(diff_percentages_agg) / len(diff_percentages_agg)}")

Average percentage of diffs: 1.5896618437306012


In [12]:
adds_agg = []
for v in adds.values():
    adds_agg.extend(v)
dels_agg = []
for v in dels.values():
    dels_agg.extend(v)

# average nunmber of added lines
print(f"Average number of added lines: {sum(adds_agg) / len(adds_agg)}")

# average nunmber of deleted lines
print(f"Average number of deleted lines: {sum(dels_agg) / len(dels_agg)}")

Average number of added lines: 33.46621621621622
Average number of deleted lines: 29.81756756756757


## Get the same statistics, but by setting

In [13]:
stats_by_setting = {}

In [14]:
for setting, s_diffs in diffs.items():
    # print(f"Setting: {setting}")
    # # get the average number of diffs
    # print(f"Average number of diffs: {sum(s_diffs) / len(s_diffs)}")

    # # get the median number of diffs
    # import statistics
    # print(f"Median number of diffs: {statistics.median(s_diffs)}")

    # # get the maximum number of diffs
    # print(f"Maximum number of diffs: {max(s_diffs)}")

    # # get the minimum number of diffs
    # print(f"Minimum number of diffs: {min(s_diffs)}")
    
    
    stats_dict = {
        "average_diffs": sum(s_diffs) / len(s_diffs),
        "median_diffs": statistics.median(s_diffs),
        "max_diffs": max(s_diffs),
        "min_diffs": min(s_diffs)
    }
    stats_by_setting[setting] = stats_dict
    

In [15]:
# get the average number of diffs
for setting, s_diffs in diff_percentages.items():
    # print(f"Setting: {setting}")
    # print(f"Average percentage of diffs: {sum(s_diffs) / len(s_diffs)}")
    
    stats_by_setting[setting]["average_diff_percentage"] = sum(s_diffs) / len(s_diffs)

In [16]:
for setting, s_diffs in adds.items():
    # print(f"Setting: {setting}")
    # average nunmber of added lines
    # print(f"Average number of added lines: {sum(s_diffs) / len(s_diffs)}")
    
    stats_by_setting[setting]["average_adds"] = sum(s_diffs) / len(s_diffs)

for setting, s_diffs in dels.items():
    # print(f"Setting: {setting}")
    # average nunmber of deleted lines
    # print(f"Average number of deleted lines: {sum(s_diffs) / len(s_diffs)}")
    
    stats_by_setting[setting]["average_dels"] = sum(s_diffs) / len(s_diffs)

In [17]:
def snake_to_upper(x):
    new_str = x.replace("_", " ").title()
    
    if new_str == "Consider Cyclomatic Complexity":
        return "Consider CYC"

    if new_str == "Make Incremental Changes":
        return "Incremental Changes"

    return new_str
    
stats_by_setting = {snake_to_upper(k): v for k, v in stats_by_setting.items()}

In [18]:
import pandas as pd

df = pd.DataFrame(stats_by_setting).T

# change the column names

lookup = {
    "average_diffs": "Avg. changed lines",
    "median_diffs": "Median changed lines",
    "max_diffs": "Max. changed lines",
    "min_diffs": "Min. changed lines",
    "average_diff_percentage": "Avg. changed lines (\%)",
    "average_adds": "Avg. added lines",
    "average_dels": "Avg. deleted lines",
}

# rename the columns
df.rename(columns=lookup, inplace=True)
df


Unnamed: 0,Avg. changed lines,Median changed lines,Max. changed lines,Min. changed lines,Avg. changed lines (\%),Avg. added lines,Avg. deleted lines
Clip Context Window,55.75,48.0,173.0,0.0,1.764829,31.125,24.625
Default,81.764706,89.0,282.0,0.0,1.885335,38.470588,43.294118
Incremental Changes,30.9,19.5,114.0,0.0,0.28697,13.3,17.6
Use Previous History,59.583333,46.5,224.0,2.0,0.981847,34.25,25.333333
Modify Specific Method,88.0,87.0,210.0,0.0,1.31391,41.666667,46.333333
Use Refactoring Tricks,81.6,67.0,280.0,0.0,1.346454,39.45,42.15
Clear Error After N,73.055556,64.0,173.0,9.0,2.693408,45.722222,27.333333
Dynamic Plan,51.0,37.5,183.0,0.0,0.914955,24.05,26.95
Consider CYC,65.6,61.5,160.0,16.0,1.831339,36.4,29.2
Make Plan,49.714286,40.0,110.0,0.0,1.068378,30.285714,19.428571


In [19]:
# soft by average diff percentage
df = df.sort_values(by="Avg. changed lines (\%)", ascending=False)
df

Unnamed: 0,Avg. changed lines,Median changed lines,Max. changed lines,Min. changed lines,Avg. changed lines (\%),Avg. added lines,Avg. deleted lines
Get Buddy Feedback,46.75,39.5,104.0,8.0,2.745426,28.166667,18.583333
Clear Error After N,73.055556,64.0,173.0,9.0,2.693408,45.722222,27.333333
Default,81.764706,89.0,282.0,0.0,1.885335,38.470588,43.294118
Consider CYC,65.6,61.5,160.0,16.0,1.831339,36.4,29.2
Clip Context Window,55.75,48.0,173.0,0.0,1.764829,31.125,24.625
Use Refactoring Tricks,81.6,67.0,280.0,0.0,1.346454,39.45,42.15
Modify Specific Method,88.0,87.0,210.0,0.0,1.31391,41.666667,46.333333
Make Plan,49.714286,40.0,110.0,0.0,1.068378,30.285714,19.428571
Use Previous History,59.583333,46.5,224.0,2.0,0.981847,34.25,25.333333
Dynamic Plan,51.0,37.5,183.0,0.0,0.914955,24.05,26.95


In [22]:
# add a new column "delta", which is the diff between Avg. added lines and Avg. deleted lines
df["Average delta"] = df["Avg. added lines"] - df["Avg. deleted lines"]
df = df.sort_values(by="Average delta", ascending=False)
df

Unnamed: 0,Avg. changed lines,Median changed lines,Max. changed lines,Min. changed lines,Avg. changed lines (\%),Avg. added lines,Avg. deleted lines,Average delta
Clear Error After N,73.055556,64.0,173.0,9.0,2.693408,45.722222,27.333333,18.388889
Make Plan,49.714286,40.0,110.0,0.0,1.068378,30.285714,19.428571,10.857143
Get Buddy Feedback,46.75,39.5,104.0,8.0,2.745426,28.166667,18.583333,9.583333
Use Previous History,59.583333,46.5,224.0,2.0,0.981847,34.25,25.333333,8.916667
Consider CYC,65.6,61.5,160.0,16.0,1.831339,36.4,29.2,7.2
Clip Context Window,55.75,48.0,173.0,0.0,1.764829,31.125,24.625,6.5
Use Refactoring Tricks,81.6,67.0,280.0,0.0,1.346454,39.45,42.15,-2.7
Dynamic Plan,51.0,37.5,183.0,0.0,0.914955,24.05,26.95,-2.9
Incremental Changes,30.9,19.5,114.0,0.0,0.28697,13.3,17.6,-4.3
Modify Specific Method,88.0,87.0,210.0,0.0,1.31391,41.666667,46.333333,-4.666667


In [124]:
df.to_latex("diffs.tex", float_format="%.2f", multicolumn_format="c")