# Inspect Various Attributes

In [1]:
import sqlite3

In [2]:
import scripts.buildSqlite

10231 entries added to table SStuBs
63923 entries added to table SStuBs Large
25539 entries added to table Bugs
153652 entries added to table Bugs Large
Created indices


In [3]:
conn = sqlite3.connect('../database/sstubs.db')
cursor = conn.cursor()

### Terminologies

DB Col | Dataset Col | Def
--- | --- | --- |
parent | fixCommitParentSHA1 | SHA1 of the parent commit
child | fixCommitSHA1 | SHA1 of the current commit
type | bugType | One of the given 16 bug types
project | projectName | The project's name in format '{repo_owner}.{repo_name}'
file | bugFilePath | Relative path to the buggy file
line | bugLineNum | Line number of the buggy AST node
before | sourceBeforeFix | The fixed AST node in textual format
after | sourceAfterFix | The buggy AST node in textual format

### 1. Unique SHA1

#### 1.1. Bugs

In [4]:
query = 'SELECT count(DISTINCT child), count() FROM bugs_large'
for unq, tot in cursor.execute(query):
    print(f'There are total {unq:,} unique fixCommitSHA1 among total {tot:,} entries')
    print(f'That is each commit fixes nearly {round(tot / unq, 2)} buggy lines')

There are total 66,261 unique fixCommitSHA1 among total 153,652 entries
That is each commit fixes nearly 2.32 buggy lines


#### 1.2. SStuBs

In [5]:
query = 'SELECT count(DISTINCT child), count() FROM sstubs_large'
for unq, tot in cursor.execute(query):
    print(f'There are total {unq:,} unique fixCommitSHA1 among total {tot:,} entries')
    print(f'That is each commit contains nearly {round(tot / unq, 2)} lines of stupid bugs')

There are total 24,486 unique fixCommitSHA1 among total 63,923 entries
That is each commit contains nearly 2.61 lines of stupid bugs


### 2. Chance of introducing new bug / skipping existing bug

#### 2.1. At commit level
If the same `fixCommitSHA1` is found as the `fixCommitParentSHA1` in another commit,
then the former commit needed a subsequent fix.

##### 2.1.1. Bugs

In [6]:
query = '''SELECT count(DISTINCT child) FROM bugs_large WHERE child IN (
    SELECT parent FROM bugs_large
)'''

for row, in cursor.execute(query):
    print(f'{row:,} fixes required another fix')

3,357 fixes required another fix


##### 2.1.2. SStubs

In [7]:
query = '''SELECT count(DISTINCT child) FROM sstubs_large WHERE child IN (
    SELECT parent FROM sstubs_large
)'''

for row, in cursor.execute(query):
    print(f'{row:,} fixes required another fix')

582 fixes required another fix


#### 2.2. At line level
If the same (line, file, project) sequence contains a `fixCommitSHA1`
that was `fixCommitParentSHA1` in a previous entry, that means that line required a subsequent fix

##### 2.2.1. Bugs

In [8]:
query = '''SELECT 1 as dummy_col FROM bugs_large WHERE (child, project, file, line) IN (
    SELECT parent, project, file, line FROM bugs_large
) GROUP BY child, project, file, line'''
print(f'{len(list(cursor.execute(query))):,} line-fixes required another fix')

846 line-fixes required another fix


##### 2.2.2. SStubs

In [9]:
query = '''SELECT 1 as dummy_col FROM sstubs_large WHERE (child, project, file, line) IN (
    SELECT parent, project, file, line FROM sstubs_large
) GROUP BY child, project, file, line'''
print(f'{len(list(cursor.execute(query))):,} line-fixes required another fix')

315 line-fixes required another fix


### 3. Fix-commit tree

In [10]:
from collections import deque

#### 3.1. Get necessary data

##### 3.1.1 Get parent-child relations

In [11]:
tree = {}
query = 'SELECT parent, child FROM sstubs'
for parent, child in cursor.execute(query):
    if parent not in tree:
        tree[parent] = set()
    tree[parent].add(child)

print(f'{len(tree):,} unique parentSHA1')

4,168 unique parentSHA1


##### 3.1.2. Get tree-roots
Imagine a forest of the following structure
```
    a        e      g
   / \      /      /
  b   c    f      h
     /           /
    d           i
```

Now, to find a path of length 3, we must start at either `a` or `g`.
However, `tree.keys()` will have every node except the child nodes.
Thus, we also defined a list `treeRoots` that contains nodes without any parents

In [12]:
query = '''
SELECT DISTINCT parent FROM sstubs WHERE parent NOT IN (
    SELECT child FROM sstubs
)
'''
treeRoots = [p for p, in list(cursor.execute(query))]
print(f'{len(treeRoots)} tree-roots')

4022 tree-roots


##### 3.1.3. Define utility function

In [13]:
def bfs(start, return_path=False, return_refix=False):
    visited_roots = set()
    distance = {start: 0}
    queue = deque()
    queue.append(start)
    # mark `start` as visited
    visited_roots.add(start)

    max_dist = 0
    distant_child = None
    # `parent_map` is used to trace back from leaf to root
    parent_map = {}
    while queue:
        p = queue.popleft()
        if p not in tree:
            continue
        for c in tree[p]:
            if c not in visited_roots:
                parent_map[c] = p
                visited_roots.add(c)
                distance[c] = distance[p] + 1
                queue.append(c)

                if distance[c] > max_dist:
                    max_dist = distance[c]
                    distant_child = c

    ret = [distant_child, max_dist]

    if return_path:
        fix_path = [distant_child]
        node = distant_child
        while node in parent_map:
            node = parent_map[node]
            fix_path.append(node)
        fix_path.reverse()
        ret.append(fix_path)

    if return_refix:
        # commits that are leaf of a path having more than 2 commits
        leaves_of_refix_paths = [
            node for node in distance.keys()
                # if `node` is not a key of `tree` then it does not have any child
                # and thus a leaf
                if distance[node] > 1 and node not in tree
        ]
        num_refix = len(leaves_of_refix_paths)
        ret.append(num_refix)
        ret.append(visited_roots)


    return ret

#### 4.2. Finding a commit sequence of length 5

In [14]:
path = []
for root in treeRoots:
    longestPathEnd, longestDistance = bfs(root)
    if longestDistance == 5:
        _, _, path = bfs(root, True)
        print('Path:')
        print(*path, sep='\n                   ⇓\n')
        break

Path:
063bc8616e8322dca47ae4b9d4860b864a61f215
                   ⇓
0b31e2f4558706b0831744485a80958c93524a44
                   ⇓
4eae69e20692a697a12a705155e972ddf448ca48
                   ⇓
9323424d263a1e573ab7edbfc69d67d8782ce36a
                   ⇓
6f74927366d17a4773006a094a7f0bc29c4b674b
                   ⇓
c32ad40f85061b84724dd9b5b8479eebea8675a0


#### 4.3. Files and corresponding changes of the found fix-path

In [15]:
parent_child = []
placeholder = []
for i in range(1, len(path)):
    query = f'''
    SELECT * FROM (
        SELECT file, group_concat(before) as before, group_concat(after) as after
        FROM bugs
        WHERE (parent=? AND child=?)
        GROUP BY parent, child, file, line
    )'''

    print(path[i-1], path[i], sep='\n', end='\n\n')
    for file, before, after in cursor.execute(query, (path[i-1], path[i])):
        print(file, before, after, sep='\n', end='\n\n')
    print('=========================================')


063bc8616e8322dca47ae4b9d4860b864a61f215
0b31e2f4558706b0831744485a80958c93524a44

hazelcast/src/test/java/com/hazelcast/topic/impl/reliable/ReliableTopicBasicDistributedTest.java
ReliableTopicBasicTest
ReliableTopicAbstractTest

hazelcast/src/test/java/com/hazelcast/topic/impl/reliable/ReliableTopicBasicLocalTest.java
ReliableTopicBasicTest
ReliableTopicAbstractTest

0b31e2f4558706b0831744485a80958c93524a44
4eae69e20692a697a12a705155e972ddf448ca48

hazelcast/src/test/java/com/hazelcast/concurrent/countdownlatch/CountDownLatchBasicDistributedTest.java
CountDownLatchBasicTest
CountDownLatchAbstractTest

hazelcast/src/test/java/com/hazelcast/concurrent/countdownlatch/CountDownLatchBasicLocalTest.java
CountDownLatchBasicTest
CountDownLatchAbstractTest

4eae69e20692a697a12a705155e972ddf448ca48
9323424d263a1e573ab7edbfc69d67d8782ce36a

hazelcast/src/test/java/com/hazelcast/concurrent/lock/ConditionBasicDistributedTest.java
ConditionBasicTest
ConditionAbstractTest

hazelcast/src/test/java/co

#### 4.4. Number of refix

In [16]:
total_refix = 0
visited = set()
for k in treeRoots:
    if k not in visited:
        _, _, num_refix, visited_now = bfs(k, False, True)
        total_refix += num_refix
        visited = visited.union(visited_now)
print(f'There are total {total_refix} paths in bugs that required at least one refix (have at least 2 child commits)')

There are total 127 paths in bugs that required at least one refix (have at least 2 child commits)


### 5. Number of Owner and Projects

In [17]:
query = '''SELECT count(DISTINCT project) FROM bugs_large'''
for row in cursor.execute(query):
    print(f'There are {row[0]} projects / repo')

There are 759 projects / repo


In [18]:
query = '''SELECT count(DISTINCT substr(project, 0, instr(project, '.'))) FROM bugs_large'''
for row in cursor.execute(query):
    print(f'There are {row[0]} repo owners')

There are 558 repo owners


In [19]:
query = '''
SELECT count()
FROM (
    SELECT *
    FROM (
        SELECT
           substr(project, 0, instr(project, '.')) as owner,
           substr(project, instr(project, '.') + 1) as repo
        FROM bugs_large) AS owners
    GROUP BY owner
    HAVING count(DISTINCT repo) > 1
)'''
for row in cursor.execute(query):
    print(f'There are {row[0]} repo owners having multiple projects / repo')

There are 57 repo owners having multiple projects / repo


In [20]:
query = '''
SELECT count()
FROM (
    SELECT DISTINCT owner
    FROM (
        SELECT
           substr(project, 0, instr(project, '.')) as owner,
           substr(project, instr(project, '.') + 1) as repo
        FROM bugs_large) AS owners
    WHERE owner = repo
)'''
for row in cursor.execute(query):
    print(f'There are {row[0]} repo having the same name as owner')

There are 65 repo having the same name as owner


### 6. Sources That Required Sub-Sequent Changes

In [21]:
import pandas as pd
pd.set_option('display.max_rows', 0)

In [22]:
query = '''
SELECT a.before as first, a.after as second, b.after as third
FROM sstubs AS a
    INNER JOIN sstubs as b on a.after = b.before
WHERE 5 < length(b.before) AND length(b.before) < 12
'''
pd.DataFrame(cursor.execute(query), columns=['First', 'Second', 'Third'])

Unnamed: 0,First,Second,Third
0,IOException e,Exception e,Throwable e
1,NumberFormatException nfe,Exception e,Throwable e
2,IllegalArgumentException e,Exception e,Throwable e
3,IllegalArgumentException e,Exception e,Throwable e
4,ReflectiveOperationException e,Exception e,Throwable e
5,ReflectiveOperationException e,Exception e,Throwable e
6,IOException e,Exception e,Throwable e
7,IOException e,Exception e,Throwable e
8,IOException e,Exception e,Throwable e
...,...,...,...


### 7. Length of SHA1 to Uniquely identify a commit

In [23]:
for table in ['bugs', 'bugs_large', 'sstubs', 'sstubs_large']:
    # noinspection SqlResolve
    query = f'''
    SELECT count()
    FROM (
         SELECT parent FROM {table} UNION SELECT child FROM {table}
    )
    '''
    numUnqCommits = next(cursor.execute(query))
    for i in range(40):
        # noinspection SqlResolve
        query = f'''
        SELECT count(DISTINCT substr(parent, 0, {i}))
        FROM (
             SELECT parent FROM {table} UNION SELECT child FROM {table}
        )
        '''
        numIdentifiableCommits = next(cursor.execute(query))
        if numUnqCommits == numIdentifiableCommits:
            print(f'For {table} {i} chars are enough to identify a commit SHA1')
            break

For bugs 9 chars are enough to identify a commit SHA1
For bugs_large 10 chars are enough to identify a commit SHA1
For sstubs 8 chars are enough to identify a commit SHA1
For sstubs_large 9 chars are enough to identify a commit SHA1
