In [1]:
from graph_tool.all import *
from packaging import version
import datetime
import numpy as np
import pytz
from dateutil.parser import parse

In [2]:
repo = "hive"

In [3]:
g = load_graph("outputs/" + repo + "/my_graph.graphml")

In [4]:
g.list_properties()

affected_versions (vertex)  (type: string)
caused_by      (vertex)  (type: string)
commit_date    (vertex)  (type: string)
created_date   (vertex)  (type: string)
fixed_versions (vertex)  (type: string)
jira_type      (vertex)  (type: string)
jiraid         (vertex)  (type: string)
sha            (vertex)  (type: string)
summary        (vertex)  (type: string)


# Earliest bug appearance
**Disagreement**: bug-fix *affected-version* is older than bug-inducing *fixed version*

**Disagreement ratio**: R(S) = D(S) / B(S) 

S: project

D(S): number of disagreements in S

B(S): total number of bugs in S

In [14]:
def get_earliest_version(versions):
    earliest = version.parse(versions[0])
    for v in versions:
        v = version.parse(v)
        if v < earliest:
            earliest = v
    return earliest

def get_earliest(prop_name, v):
    versions = g.vertex_properties[prop_name][v]
    if versions is not "":
        all_versions = versions.split(", ")
#         print(all_versions)
        earliest = get_earliest_version(all_versions)
#         print(earliest)
        return earliest

In [15]:
bug_count = 0
disagreement_count = 0
for v in g.vertices():
    out_neighbors = list(v.out_neighbors())
    is_bug = len(out_neighbors) != 0
    if is_bug:
        bug_count += 1
        earliest_fixed = get_earliest("fixed_versions", v)
        for bugfix in out_neighbors:
            earliest_affected = get_earliest("affected_versions", bugfix)
            if earliest_affected and earliest_fixed and earliest_affected < earliest_fixed:
                disagreement_count += 1
            
print("Disagreements:", disagreement_count)
print("Bugs: ", bug_count)
print("% of disagreements: ", disagreement_count / bug_count)

Disagreements: 945
Bugs:  4667
% of disagreements:  0.20248553674737518


# Future impact of changes

**% of multiple future bugs**

future bug ~ fixed multiple times

**time span of future bugs (average in days)**

time passed between first bugfix and last bugfix

In [7]:
def get_created_date(v):
    return datetime.datetime.strptime(g.vertex_properties["created_date"][v], '%Y-%m-%dT%H:%M:%S.%f%z').astimezone(pytz.utc)


In [8]:
bugs = set()
future_bug = set()
time_span = []
for v in g.vertices():
    out_neighbors = list(v.out_neighbors())
    is_bug = len(out_neighbors) != 0
    if is_bug:
        sha = g.vertex_properties["sha"][v]
        if sha not in bugs:
            bugs.add(sha)
        if len(out_neighbors) > 1:
            future_bug.add(sha)
            earliest_created = get_created_date(out_neighbors[0])
            latest_created = get_created_date(out_neighbors[0])
            for bugfix in out_neighbors:
                created = get_created_date(bugfix)
                if created < earliest_created:
                    earliest_created = created
                elif created > latest_created:
                    latest_created = created
            diff = latest_created - earliest_created
            time_span.append(diff.days)
                
print("Future bug count:", len(future_bug))
print("Bugs: ", len(bugs))
print("% of future bugs: ", len(future_bug) / len(bugs))
print("Time span average (days): ", np.mean(np.array(time_span)))

Future bug count: 2661
Bugs:  4667
% of future bugs:  0.5701735590314978
Time span average (days):  837.3881999248403


# Realism of bug introduction

**Number of days between the first and last bug-introducing changes (median)**

In [9]:
def get_committed_date(v):
#     print(g.vertex_properties["commit_date"][v])
    return parse(g.vertex_properties["commit_date"][v]).astimezone(pytz.utc)

In [10]:
time_span_2 = []
for v in g.vertices():
    jira_type = g.vertex_properties["jira_type"][v]
    if jira_type == "Bug":
        in_neighbors = list(v.in_neighbors())
        if len(in_neighbors) != 0:
            earliest_created = get_committed_date(in_neighbors[0])
            latest_created = get_committed_date(in_neighbors[0])
            for buggy in in_neighbors:
                committed = get_created_date(bugfix)
                if committed < earliest_created:
                    earliest_created = committed
                elif committed > latest_created:
                    latest_created = committed
            diff = latest_created - earliest_created
            time_span_2.append(diff.days)

print("Time span median (days): ", np.median(np.array(time_span_2)))

Time span median (days):  1834.0


# Statistics

In [11]:
bug_fix_count = 0
bug_inducing_count = 0
both = 0
for v in g.vertices():
    jira_type = g.vertex_properties["jira_type"][v]
    is_bugfix = jira_type == "Bug"
    if is_bugfix:
        bug_fix_count += 1
    out_neighbors = list(v.out_neighbors())
    is_bug = len(out_neighbors) != 0
    if is_bug:
        bug_inducing_count += 1
    if is_bugfix and is_bug:
        both += 1
        
print("Number of bug-fixing commits: ", bug_fix_count)
print("Number of bug-inducing commits: ", bug_inducing_count)
print("Number of commits that are both: ", both)

Number of bug-fixing commits:  7345
Number of bug-inducing commits:  4667
Number of commits that are both:  2226


# % of linked bugs

In [12]:
bug_fix_count = 0
bug_fix_has_linked_bug = 0
for v in g.vertices():
    jira_type = g.vertex_properties["jira_type"][v]
    is_bugfix = jira_type == "Bug"
    if is_bugfix:
        bug_fix_count += 1
        in_neighbors = list(v.in_neighbors())
        has_linked_bug = len(in_neighbors) != 0
        if has_linked_bug:
            bug_fix_has_linked_bug += 1
            
print("Number of bug-fixing commits: ", bug_fix_count)
print("Number of bug-fixing commits that have linked bugs: ", bug_fix_has_linked_bug)
print("% of linked bugs: ", bug_fix_has_linked_bug / bug_fix_count)

Number of bug-fixing commits:  7345
Number of bug-fixing commits that have linked bugs:  4927
% of linked bugs:  0.6707964601769911


# Found bug-inducing commits

**Agreement**: when the algorithm finds the correct bug-inducing commit

**% of agreements:** agreements / bugs flagged by developers

**Average linked bugs:** number of candidate bugs the algorithm proposes on average

In [15]:
bug_count = 0
agreement_count = 0
linked_bugs = []
for i, v in enumerate(g.vertices()):
    jira_type = g.vertex_properties["jira_type"][v]
    is_bugfix = jira_type == "Bug"
    bugfix_jiraid = g.vertex_properties["jiraid"][v]
    bugfix_sha = g.vertex_properties["sha"][v]
    if is_bugfix:
        caused_by = g.vertex_properties["caused_by"][v].split(", ")
        if caused_by != ['']:
            bug_count += 1
            in_neighbors = list(v.in_neighbors())
            has_linked_bug = len(in_neighbors) != 0
            if has_linked_bug:
                linked_bugs.append(len(in_neighbors))
                for bug in in_neighbors:
                    bug_jira_id = g.vertex_properties["jiraid"][bug]
                    if bug_jira_id in caused_by:
                        agreement_count += 1
#                         print("%s fixed by %s (%s)" % (bug_jira_id, bugfix_jiraid, bugfix_sha))
            
print("Agreements:", agreement_count)
print("Bugs flagged by developers: ", bug_count)
print("% of agreements: ", agreement_count / bug_count)
print("Average linked bugs: ", np.mean(linked_bugs))

Agreements: 70
Bugs flagged by developers:  174
% of agreements:  0.40229885057471265
Average linked bugs:  2.1743119266055047


In [21]:
def bugs(jira_id):
    for v in g.vertices():
        bugfix_jira_id = g.vertex_properties["jiraid"][v]
        if bugfix_jira_id == jira_id:
            bugfix = v
    in_neighbors = list(bugfix.in_neighbors())
    blamed_ids = []
    for bug in in_neighbors:
        bug_jira_id = g.vertex_properties["jiraid"][bug]
        blamed_ids.append(bug_jira_id)
    print(blamed_ids)