# Generate Fix Commit Tree

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset file into a DataFrame; every later cell reads from `df`.
df = pd.read_json(path_or_buf='../dataset/sstubsLarge')

In [3]:
# df.shape is (number of rows, number of columns).
print('Rows: {}, Columns: {}'.format(df.shape[0], df.shape[1]))

Rows: 63923, Columns: 14


In [4]:
# Header line, then one column label per line.
print('**Columns:**')
print('\n'.join(map(str, df.columns)))

**Columns:**
bugType
fixCommitSHA1
fixCommitParentSHA1
bugFilePath
fixPatch
projectName
bugLineNum
bugNodeStartChar
bugNodeLength
fixLineNum
fixNodeStartChar
fixNodeLength
sourceBeforeFix
sourceAfterFix


In [5]:
import sqlite3
# Open the SQLite database file and create the cursor used by the
# query cells below.
conn = sqlite3.connect(database='../database/sstubs.db')
cursor = conn.cursor()

In [6]:
# Self-join formulation: pair each row b1 with rows b2 whose `parent` equals
# b1's `child` in the same project, file and line, collapse the pairs with
# GROUP BY on b1's columns, and count the resulting groups.
query = '''SELECT count(*)
    FROM (SELECT * 
    FROM sstubs_large AS b1
        INNER JOIN sstubs_large AS b2
        ON b1.child = b2.parent AND b1.project = b2.project AND b1.file = b2.file AND b1.line = b2.line 
    GROUP BY b1.child, b1.parent, b1.project, b1.line, b1.file, b1.type)'''

# The outer query is a bare aggregate, so it yields exactly one row.
print(*cursor.execute(query).fetchone())


399


In [7]:
# Row-value IN formulation: count the rows whose (child, project, file, line)
# appears as the (parent, project, file, line) of some row in the same table.
query = '''SELECT count(*) FROM sstubs_large WHERE (child, project, file, line) IN (
                SELECT parent, project, file, line FROM sstubs_large
           ) '''

# A bare aggregate yields exactly one row.
print(*cursor.execute(query).fetchone())


399


In [8]:
# Correlated EXISTS formulation: count the rows P for which some row C has
# C.parent = P.child in the same project, file and line.
query = '''SELECT count(*) FROM sstubs_large AS P WHERE EXISTS (
                SELECT parent
                FROM sstubs_large AS C
                WHERE C.parent = P.child AND C.project = P.project AND C.file = P.file AND C.line = P.line
           ) '''

# A bare aggregate yields exactly one row.
print(*cursor.execute(query).fetchone())

399


In [9]:
# child_df: every row, identified by its fix commit plus bug location.
child_df = df[['fixCommitSHA1', 'fixCommitParentSHA1', 'bugFilePath', 'projectName', 'bugLineNum', 'bugType']]

# parent_df: the same rows, but with the parent commit relabelled as
# `fixCommitSHA1` so it can be matched against child_df's fix commit.
parent_df = (
    df[['fixCommitParentSHA1', 'bugFilePath', 'projectName', 'bugLineNum', 'bugType']]
    .rename(columns={'fixCommitParentSHA1': 'fixCommitSHA1'})
)

# Inner merge keeps the child_df rows whose fix commit is the parent commit of
# some row with the same file, project and bug line. The parent side's bugType
# is dropped before merging, so only child_df's bugType appears in the result
# and no `_x` / `_y` suffixes are produced.
merged_df = pd.merge(
    child_df,
    parent_df.drop(columns='bugType'),
    how='inner',
    on=['fixCommitSHA1', 'bugFilePath', 'projectName', 'bugLineNum'],
)

In [10]:
columns = ['fixCommitSHA1', 'projectName', 'bugFilePath', 'bugLineNum', 'bugType']

# One row per distinct combination of the key columns.
# drop_duplicates replaces reading the keys of GroupBy.groups, which builds the
# row labels of every group only to throw them away. dropna and sort_values
# keep groupby's defaults (NA keys excluded, keys sorted), so the exported rows
# and their order stay the same.
tree_df = (
    merged_df[columns]
    .dropna()
    .drop_duplicates()
    .sort_values(columns)
    .reset_index(drop=True)
)

In [11]:
# Write the result to CSV without the DataFrame index column.
tree_df.to_csv(path_or_buf='../dataset/sequential_sstubs.csv', index=False)