In [None]:
# Step up to the parent directory so the relative paths used below (`data/...`, `experiments/...`)
# and the `src` imports resolve from there — presumably the repository root; confirm against
# where this notebook is stored.
# NOTE: not idempotent — every re-run of this cell moves one more level up.
%cd ..
%pwd

## Fetch alerts from Perfherder

In [None]:
import requests
import time

# requests applies no timeout by default; without one a stalled connection blocks the cell forever.
REQUEST_TIMEOUT_SECONDS = 30

url = "https://treeherder.mozilla.org/api/performance/alertsummary/"
alertsummaries = []
page_number = 0
# Walk the paginated endpoint: every response holds one page of 'results' plus the URL of the
# 'next' page, which is None on the last page.
while url is not None:
    page_number += 1
    print(f"{page_number}. GET {url}", end="\r")
    response = requests.get(url, headers={'User-Agent': 'basic'}, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail with the HTTP status instead of a confusing JSON/KeyError when the server returns an error page.
    response.raise_for_status()
    page = response.json()
    alertsummaries.extend(page['results'])
    url = page['next']
    if url is not None:
        # small pause between requests so the API is not hammered
        time.sleep(0.5)

In [None]:
import datetime

# The snapshot file is named after today's local date (YYYY-MM-DD).
# NOTE(review): `write_json_to_file` is not imported anywhere above this cell; it presumably comes
# from one of the star imports in the "SZZ Evaluation" section, so on a fresh kernel that import
# cell has to be executed first — confirm, and consider moving the imports to the top.
date = f'{datetime.datetime.now():%Y-%m-%d}'
write_json_to_file(alertsummaries, f'data/labeling/alerts_{date}.json')

## SZZ Evaluation

In [None]:
from src.utils import *
from src.labeling import *
from src.repo_miner import get_commit_log

In [None]:
# Load two previously saved alert snapshots (newer one first).
# Note that this rebinds `alertsummaries`, replacing whatever the fetch cell above produced.
alertsummaries, alertsummaries2 = map(
    read_data_from_json,
    ('data/labeling/alerts_2022-03-01.json', 'data/labeling/alerts_2021-09-13.json'),
)

In [None]:
# Ids of the alert summaries contained in each snapshot, in the same order as the snapshots.
alert_ids, alert_ids2 = (
    {summary['id'] for summary in snapshot}
    for snapshot in (alertsummaries, alertsummaries2)
)

In [None]:
# sizes of both snapshots and of their overlap (by alert summary id)
len(alert_ids), len(alert_ids2), len(alert_ids & alert_ids2)

In [None]:
# Add the alert summaries that only exist in the second snapshot to `alertsummaries`.
# The set of known ids is rebuilt from the current list and updated while appending, so that
#  - re-running this cell does not append the same summaries a second time, and
#  - an id that occurs more than once inside the second snapshot is only added once.
known_alert_ids = {alert['id'] for alert in alertsummaries}
for alert in alertsummaries2:
    if alert['id'] not in known_alert_ids:
        known_alert_ids.add(alert['id'])
        alertsummaries.append(alert)

In [None]:
# Commit-level inputs for the evaluation below.
# all_commits: used below as a DataFrame with (at least) 'revision' and 'bug_id' columns.
all_commits = get_all_commits()
# Lookups between Mercurial and Git commit hashes — presumably plain dicts; only git_to_hg is
# used in this notebook (indexed by git hash).
hg_to_git, git_to_hg = get_hg_git_mapping()
# commit_log: indexed with .loc by revision hash below; has 'id' and 'revision' columns.
commit_log = get_commit_log('data/repo_miner/commit_log.csv')

In [None]:
# SZZ outputs. The paths are plain string literals (no placeholders), so no f-prefix is needed.
# szz_labeling: used below via its 'performance' and 'revision' columns.
szz_labeling = pd.read_csv('data/labeling/fixed_defect_szz.csv')
# fix_bug_ids_by_kind['performance']: bug numbers for which an introducer was searched;
# the second return value is not needed in this notebook.
fix_bug_ids_by_kind, _ = get_defects_and_fixes()
# sequence of [fix, introducer] pairs of git hashes (converted to hg hashes in the next cell)
fix_and_introducers = read_data_from_json('data/labeling/fixed_defect_szz/results/fix_and_introducers_pairs.json')

In [None]:
# Translate both hashes of every [fix, introducer] pair from git to hg.
# NOTE: the pairs are rewritten in place, so running this cell a second time looks up the
# already-converted hashes in git_to_hg.
for pair in fix_and_introducers:
    for position in (0, 1):
        pair[position] = git_to_hg[pair[position]]

In [None]:
# number of (fix, introducer) pairs reported by SZZ
len(fix_and_introducers)

In [None]:
# One row per (fix, introducer) pair; both columns hold the hashes produced by the conversion cell above.
fix_and_introducers_df = pd.DataFrame(fix_and_introducers, columns=['fix', 'introducer'])
fix_and_introducers_df

In [None]:
# add bug id of fix
# Always merge from the two base columns: merging the already-merged frame again (cell re-run)
# would create suffixed bug_id_x / bug_id_y columns and break the later ['bug_id'] lookups.
# NOTE: this is an inner merge — pairs whose fix revision is missing from all_commits are dropped.
fix_and_introducers_df = (
    fix_and_introducers_df[['fix', 'introducer']]
    .merge(all_commits[['revision', 'bug_id']], left_on='fix', right_on='revision')
    .drop(columns='revision')
)
fix_and_introducers_df

In [None]:
# spot check: all fixes that SZZ attributes to this one introducer revision
fix_and_introducers_df.loc[fix_and_introducers_df['introducer'].eq('9ac290ec5884fd52bb6c16e9794da5b42f211cbb')]

In [None]:
# distinct introducer revisions found by SZZ (reused further below)
introducers = set(fix_and_introducers_df['introducer'])
# (number of distinct fixes, number of distinct introducers)
len(set(fix_and_introducers_df['fix'])), len(introducers)

In [None]:
# Summary of the 'performance' label in the SZZ labeling.
# NOTE(review): the exact statistics printed are defined in print_labeling_stats — not visible here.
print_labeling_stats(szz_labeling, 'performance')

In [None]:
# revisions whose 'performance' label equals 1 in the SZZ labeling
is_performance = szz_labeling['performance'].eq(1)
selected_introducers = set(szz_labeling.loc[is_performance, 'revision'])
len(selected_introducers)

In [None]:
# introducers found by SZZ that do not carry the performance label
not_selected_introducers = list(introducers - selected_introducers)
# show the 25 of them with the highest 'id' in the commit log
commit_log.loc[not_selected_introducers].sort_values('id').tail(25)

In [None]:
# fix_bug_ids_by_kind['performance'] are bug numbers for which we tried to find an introducer
# Materialise them as a set once: membership is tested for every alert summary, and a set makes
# each test a hash lookup on the bug numbers themselves (the same set(...) conversion is used
# for the subset check further below).
performance_fix_bug_ids = set(fix_bug_ids_by_kind['performance'])
alertsummaries_considered = [alert for alert in alertsummaries if alert['bug_number'] in performance_fix_bug_ids]

# total number of bug ids for which an alert exists and for which we tried to find an introducer
N = len({alert['bug_number'] for alert in alertsummaries_considered})

len(alertsummaries_considered), N

In [None]:
# bug ids of fixes for which SZZ actually reported an introducer
bug_ids_found = set(fix_and_introducers_df['bug_id'])
# sanity check: every one of them must be a bug we searched an introducer for
assert bug_ids_found.issubset(set(fix_bug_ids_by_kind['performance']))

# alert summaries that belong to those bugs
alertsummaries_found = [
    summary
    for summary in alertsummaries_considered
    if summary['bug_number'] in bug_ids_found
]

# bug ids that have both an alert and at least one introducer
alert_bug_ids = {summary['bug_number'] for summary in alertsummaries_found}

TP_plus_FP = len(alert_bug_ids)

len(alertsummaries_found), TP_plus_FP # *bug numbers*

In [None]:
# number of bug ids (with an alert) for which we tried to find an introducer but found none
# lower bound on false negative *commits*: each such bug presumably has at least one introducing commit
N - TP_plus_FP

In [None]:
TP = 0
FP = 0

# Evaluate SZZ against the Perfherder alerts, bug by bug:
#   TP: introducers reported by SZZ that lie inside a push range flagged by an alert of that bug
#   FP: introducers reported by SZZ that lie outside every such range
for bug_id in alert_bug_ids:
    introducers_for_bug = set(
        fix_and_introducers_df.loc[fix_and_introducers_df['bug_id'] == bug_id, 'introducer']
    )  # found introducers by SZZ
    assert len(introducers_for_bug) > 0

    # union of the candidate revisions over *all* alerts filed against this bug
    all_candidate_revisions = set()
    for alert in alertsummaries_considered:
        if alert['bug_number'] != bug_id:
            continue
        try:
            push_range = commit_log.loc[alert['prev_push_revision']:alert['revision'], 'revision']
        except KeyError:
            # The range could not be looked up (presumably a boundary revision that is missing
            # from commit_log). Skip this alert instead of falling through and re-using the
            # candidates left over from a previous alert.
            continue
        if alert['prev_push_revision'] != alert['revision']:
            # performance change detected in interval (alert['prev_push_revision'], alert['revision']]
            # exclude prev_push_revision commit
            push_range = push_range.iloc[1:]
        # otherwise both ends are the same revision (this is sometimes the case) and the single
        # commit is taken as is
        candidate_revisions = set(push_range)
        all_candidate_revisions |= candidate_revisions

    # Compare against the union over all alerts of the bug, not only the last alert's range.
    TP += len(introducers_for_bug & all_candidate_revisions) # number of found introducers in candidate revisions
    FP += len(introducers_for_bug - all_candidate_revisions)

print(f'{TP=}, {FP=}, {TP+FP=}')

In [None]:
# Second ground truth: bugbug's fix -> regressor data. Only the 4th return value is used; below it
# is indexed by the bug id of a fix and iterated to get the bug ids of the regressing bugs.
# NOTE(review): the meaning of the three discarded values is not visible in this notebook.
_, _, _, bugbug_fix_to_regressor = get_bugbug_regressors_and_fixes()


In [None]:
# bug ids of all fixes for which bugbug knows at least an entry in the fix -> regressor mapping
bugbug_fixes = set(bugbug_fix_to_regressor.keys())
len(bugbug_fixes)

In [None]:
# fixes known to bugbug for which we also tried to find an introducer
# (N is rebound here; it no longer refers to the alert-based count from the previous section)
performance_fix_ids = set(fix_bug_ids_by_kind['performance'])
bug_ids_considered = bugbug_fixes & performance_fix_ids
N = len(bug_ids_considered)
N

In [None]:
# bug ids for which SZZ actually reported an introducer
bug_ids_found = set(fix_and_introducers_df['bug_id'])
# sanity check: all of them are bugs we searched an introducer for
assert bug_ids_found.issubset(set(fix_bug_ids_by_kind['performance']))

# considered bugs that also have at least one introducer
regressed_by_bug_ids = bug_ids_considered & bug_ids_found
TP_plus_FP = len(regressed_by_bug_ids)
TP_plus_FP # *bug numbers*

In [None]:
# number of bug ids (known to bugbug) for which we tried to find an introducer but found none
# lower bound on false negative *commits*: each such bug presumably has at least one introducing commit
N - TP_plus_FP

In [None]:
TP = 0
FP = 0
TP_commits = set()

# Evaluate SZZ against bugbug's fix -> regressor data, bug by bug:
#   TP: introducers reported by SZZ that belong to one of the regressor bugs of the fixed bug
#   FP: introducers reported by SZZ that belong to none of them
for bug_id in regressed_by_bug_ids:
    introducers_for_bug = set(
        fix_and_introducers_df.loc[fix_and_introducers_df['bug_id'] == bug_id, 'introducer']
    )  # found introducers by SZZ
    assert len(introducers_for_bug) > 0

    # union of the commits of *all* regressor bugs recorded for this fix
    all_candidate_revisions = set()
    for regressors_bug_id in bugbug_fix_to_regressor[bug_id]:
        candidate_revisions = set(all_commits.loc[all_commits['bug_id'] == regressors_bug_id, 'revision'])
        all_candidate_revisions |= candidate_revisions

    # Compare against the union, not only against the commits of the last regressor bug
    # (which would also be stale if a fix had no regressor bugs at all).
    tp_commits = introducers_for_bug & all_candidate_revisions
    TP_commits |= tp_commits
    TP += len(tp_commits)
    FP += len(introducers_for_bug - all_candidate_revisions)

print(f'{TP=}, {FP=}, {TP+FP=}')

In [None]:
# Persist the true-positive introducers. A set has no stable iteration order between runs, so
# the revisions are sorted to make the written file reproducible and diff-friendly.
write_json_to_file(sorted(TP_commits), 'experiments/results/szz_regressed_by_tp.json')

## Get info about bugs and repo

In [None]:
# Single pass over the bug dump (one JSON document per line): collect the distinct values of
# three fields and keep one specific bug around for manual inspection.
resolutions, status, types = set(), set(), set()
my_bug = None
with open('data/bugbug/bugs.json', encoding="utf-8") as bugs_file:
    for raw_line in tqdm(bugs_file, desc='Get defects and fixes'):
        bug = json.loads(raw_line)
        if bug['id'] == 1717171:
            my_bug = bug
        for seen_values, field in ((resolutions, 'resolution'), (status, 'status'), (types, 'type')):
            seen_values.add(bug[field])

In [None]:
# distinct 'resolution' values seen in bugs.json
resolutions

In [None]:
# distinct 'status' values seen in bugs.json
status

In [None]:
# distinct 'type' values seen in bugs.json
types

In [None]:
# NOTE(review): what makes a commit "selected" is decided inside get_selected_commits — not visible here.
selected_commits = get_selected_commits()

# Reloads the commit log from the same CSV that the SZZ evaluation section already read into `commit_log`.
commit_log = get_commit_log('data/repo_miner/commit_log.csv')

In [None]:
# number of selected commits
len(selected_commits)

In [None]:
# the 'date' entry of the selected commits (presumably a DataFrame column — confirm)
selected_commits['date']