# Inspect Various Attributes

In [1]:
import sqlite3

In [2]:
import scripts.buildSqlite

10231 entries added to DB SStuBs
63923 entries added to DB SStuBs Large
25539 entries added to DB Bugs
153652 entries added to DB Bugs Large


In [3]:
# Open the SQLite database file; the path is relative, so it resolves against
# the directory the notebook kernel is running in.
# NOTE(review): presumably this file is produced by `scripts.buildSqlite` imported above -- confirm.
conn = sqlite3.connect('../database/sstubs.db')
# One cursor is shared by all the query cells that follow.
cursor = conn.cursor()

### Unique SHA1

In [29]:
# Compare how many distinct `child` SHA1s exist with the total row count of bugs_large.
query = 'SELECT count(DISTINCT child), count() FROM bugs_large'
# An aggregate query without GROUP BY always yields exactly one row, so take it directly.
unq, tot = cursor.execute(query).fetchone()
print(f'There are total {unq:,} unique fix SHA1 among total {tot:,} entries')
print(f'That is each commit contains nearly {round(tot / unq, 2)} lines of bugs')

There are total 66,261 unique fix SHA1 among total 153,652 entries
That is each commit contains nearly 2.32 lines of bugs


In [30]:
# Same comparison as for bugs_large, this time on sstubs_large:
# distinct `child` SHA1s versus total rows.
query = 'SELECT count(DISTINCT child), count() FROM sstubs_large'
# The aggregate produces a single row; unpack its two columns.
unq, tot = cursor.execute(query).fetchone()
print(f'There are total {unq:,} unique fix SHA1 among total {tot:,} entries')
print(f'That is each commit contains nearly {round(tot / unq, 2)} lines of stupid bugs')

There are total 24,486 unique fix SHA1 among total 63,923 entries
That is each commit contains nearly 2.61 lines of stupid bugs


### Chance of introducing new bug / skipping existing bug

In [6]:
# Count rows whose `child` SHA1 also appears as the `parent` of some row,
# i.e. the commit recorded as a fix is itself listed as a commit that got fixed.
# NOTE(review): count(*) counts rows, not distinct `child` values, so a commit
# with several rows contributes once per row -- confirm that is the intent.
query = '''SELECT count(*) FROM bugs_large WHERE child IN (
    SELECT parent FROM bugs_large
)'''
# Single-row aggregate result: unpack the only column.
(num_refixed,) = cursor.execute(query).fetchone()
print(f'{num_refixed:,} fixes required another fix')
print('Need to make it a tree to see how long this fixing goes')

14,294 fixes required another fix
Need to make it a tree to see how long this fixing goes


### Unique Bug Entry

In [7]:
# Find groups of bugs_large rows that agree on (parent, child, file, line) and
# occur more than once, then report how many such groups exist and how many
# rows they cover in total.
query = '''SELECT count() as num_grp, sum(cnt) as num_entry
           FROM (
                SELECT *, count(*) as cnt
                FROM bugs_large
                GROUP BY parent, child, file, line
                HAVING count(*) > 1
            )'''
# The outer aggregate always returns exactly one row.
unq, tot = cursor.execute(query).fetchone()
# sum() is NULL (None in Python) when the inner query yields no groups;
# formatting None with ',' would raise, so report zero instead.
tot = tot or 0
# The message previously said "in sstubs" although this query reads bugs_large.
print(f'In bugs, {tot:,} entries shares the same data')
print(f'with {unq:,} unique sequences')

In bugs, 43,751 entries in sstubs shares the same data
with 21,626 unique sequences


In [8]:
# Find groups of sstubs_large rows sharing (parent, child, file, line, type)
# that occur more than once; report the number of groups and the rows they cover.
query = '''SELECT count() as num_grp, sum(cnt) as num_entry
           FROM (
                SELECT *, count(*) as cnt
                FROM sstubs_large
                GROUP BY parent, child, file, line, type
                HAVING count(*) > 1
            )'''
# The outer aggregate yields a single row: (group count, summed row count).
unq, tot = cursor.execute(query).fetchone()
print(f'In sstubs, {tot:,} entries in sstubs shares the same data')
print(f'with {unq:,} unique sequences')
print('Need to identify why the constitute different entry')

In sstubs, 21,356 entries in sstubs shares the same data
with 10,460 unique sequences
Need to identify why the constitute different entry


## Fix commit tree

In [23]:
from collections import deque

In [34]:
# Adjacency map over SHA1s: each SHA1 seen as `parent` or `child` maps to the
# set of SHA1s it is paired with in some bugs_large row.
roots = {}
query = 'SELECT parent, child FROM bugs_large'
for parent, child in cursor.execute(query):
    # Record the pair in both directions, so the map describes an undirected graph.
    roots.setdefault(parent, set()).add(child)
    roots.setdefault(child, set()).add(parent)

print(f'{len(roots):,} unique SHA1')

126,898 unique SHA1


In [41]:
def bfs(start, graph=None):
    """Breadth-first search from `start`, returning the farthest node found.

    Args:
        start: Node to search from; must be a key of the adjacency map.
        graph: Optional mapping of node -> iterable of neighbouring nodes.
            Defaults to the notebook-level `roots` map, so existing
            `bfs(node)` calls behave as before.

    Returns:
        A `(node, distance)` tuple: the first node discovered at the greatest
        hop count from `start`, and that hop count. When no other node is
        reachable the result is `(start, 0)`, so the returned node can always
        be passed back into `bfs`.

    Raises:
        KeyError: if `start`, or a neighbour listed in the map, is not a key
            of the map.
    """
    if graph is None:
        graph = roots

    # `distance` doubles as the visited set: a node has an entry once it is discovered.
    distance = {start: 0}
    queue = deque([start])

    # `start` is its own farthest node until something further away turns up.
    distantChild, maxDist = start, 0
    while queue:
        a = queue.popleft()
        for b in graph[a]:
            if b in distance:
                continue
            distance[b] = distance[a] + 1
            queue.append(b)

            # Strict '>' keeps the first node reached at the maximum distance.
            if distance[b] > maxDist:
                maxDist = distance[b]
                distantChild = b
    return distantChild, maxDist

# Longest shortest-path between any two connected SHA1s in `roots`.
# bfs(node)[1] is the greatest hop count reachable from `node`; the maximum of
# that value over every node is the longest distance overall, so one BFS per
# node is enough (a second BFS from the farthest node cannot raise the maximum).
# `default=0` keeps the result at 0 when `roots` is empty.
# NOTE(review): `roots` stores each parent/child pair in both directions, so
# this counts hops regardless of direction -- confirm that is the intended
# notion of "distance" for fix chains.
ultimateLongestDist = max((bfs(sha)[1] for sha in roots), default=0)

print(f'Ultimate longest distance: {ultimateLongestDist}')

Ultimate longest distance: 12
