In [1]:
import json
import pandas as pd    
import os
import json
import torch
from git import Repo
from git import GitCommandError
import git
import tqdm

In [2]:

def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of objects to a JSON lines file.

    Args:
        data: Iterable of JSON-serialisable objects. Each one is written on
            its own line. Lists as well as one-shot iterables such as
            generators are accepted.
        output_path: Path of the file to write.
        append: If True, add the records to the end of the file (creating it
            if it does not exist); otherwise the file is overwritten.
    """
    mode = 'a' if append else 'w'
    # Count while writing instead of calling len(data), so that iterables
    # without a length (e.g. generators) do not fail after the file is written.
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))


def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Args:
        input_path: Path of a UTF-8 encoded file with one JSON document per
            line.

    Returns:
        The decoded objects, in file order. Lines that are empty or contain
        only whitespace are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip surrounding whitespace only. The previous
            # rstrip('\n|\r') treated '|' as a character to remove as well.
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing empty line.
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data



In [12]:
def add_projs(x):
    """
    Build a dotted project key from a repository URL.

    The last two '/'-separated components of ``x`` are joined with a dot,
    e.g. ``'https://github.com/owner/repo'`` -> ``'owner.repo'``.
    Trailing slashes are ignored so that ``'.../owner/repo/'`` produces the
    same key instead of ``'repo.'``.

    Args:
        x: Repository URL (or any '/'-separated path) as a string.

    Returns:
        The dotted key as a string.
    """
    return '.'.join(x.rstrip('/').split('/')[-2:])

# Read the SStuBs dump (parsed as JSON) and put it in a DataFrame.
with open('./sstubsLarge', 'rb') as fp:
    sstubs = json.load(fp)
stubs_df = pd.DataFrame(sstubs)

# Give every top project a `projectName` key derived from its repository URL
# (see add_projs) so it can be joined against the SStuBs rows.
top_projs = pd.read_csv('./topProjects.csv')
repos_series = top_projs['repository_url'].apply(add_projs)
top_projs['projectName'] = repos_series

# Default (inner) merge: only SStuBs rows whose projectName also appears in
# top_projs are kept.
stubs_df_m = pd.merge(stubs_df, top_projs, on='projectName')

In [20]:
# how many characters to consider on either side of the patch (heuristic)
window_size = 512
def run_parallel(row):
    """
    Enrich one SStuBs row with data mined from the project's local git clone.

    The clone is expected under ``./repos/<owner>/<repo>``. The buggy file is
    read at the parent of the fix commit and at the fix commit, and
    ``git log -S`` is used to find the oldest commit that touched the text
    surrounding the bug (``window_size`` characters on either side).

    Args:
        row: Record exposing ``projectName``, ``fixCommitParentSHA1``,
            ``fixCommitSHA1``, ``bugFilePath``, ``bugNodeStartChar``,
            ``bugNodeLength``, ``fixNodeStartChar`` and ``fixNodeLength`` as
            attributes and supporting item assignment (e.g. a pandas Series).

    Returns:
        ``row`` with these string fields added: ``source_before_fix_minedbyKevin``,
        ``source_after_fix_minedbyKevin``, ``source_before``, ``source_after``,
        ``author_before``, ``author_after``, ``author_date_commit_before``,
        ``author_date_commit_after``, ``oldest_commit``, ``source_oldest_commit``,
        ``author_oldest_commit``, ``author_date_oldest_commit`` and
        ``number_of_commits_oldest``.
        ``None`` if the clone directory is missing, no matching commit is
        found, or any error occurs while mining.
    """
    # projectName is '<owner>.<repo>'. Only the first dot separates the owner
    # from the repository; later dots belong to the repository name itself.
    dir_name = './repos/' + row.projectName.replace('.', '/', 1)
    if not os.path.isdir(dir_name):
        return None

    try:
        #get basic info
        repo = Repo(dir_name)
        commit_before = repo.commit(row.fixCommitParentSHA1)
        commit_after = repo.commit(row.fixCommitSHA1)

        def read_text(commit):
            # Decode to text so the stored values are the real source code
            # rather than the "b'...'" repr that str(bytes) produces.
            # Undecodable bytes become U+FFFD instead of raising.
            raw = commit.tree[row.bugFilePath].data_stream.read()
            return raw.decode('utf-8', errors='replace')

        filea = read_text(commit_before)
        fileb = read_text(commit_after)

        # The *Char columns are taken to be character offsets (per their
        # names), so they are applied to decoded text, not raw bytes.
        bug_start = row.bugNodeStartChar
        bug_end = bug_start + row.bugNodeLength
        fix_start = row.fixNodeStartChar
        fix_end = fix_start + row.fixNodeLength

        window_context = filea[max(bug_start - window_size, 0):min(bug_end + window_size, len(filea))]
        row['source_before_fix_minedbyKevin'] = filea[bug_start:bug_end]
        row['source_after_fix_minedbyKevin'] = fileb[fix_start:fix_end]
        row['source_before'] = filea
        row['source_after'] = fileb
        row['author_before'] = str(commit_before.author.email)
        row['author_after'] = str(commit_after.author.email)
        row['author_date_commit_before'] = str(commit_before.authored_date) #won't be super informative
        row['author_date_commit_after'] = str(commit_after.authored_date)

        #get extra oldest commit with window of context
        # -S (pickaxe) lists commits that change the number of occurrences of
        # window_context; --reverse puts the oldest such commit first.
        loginfo = repo.git.log('--all','-p','--reverse','--source','-S', window_context, '--',str(row.bugFilePath))

        #extract the oldest commit from the git log info
        log_lines = loginfo.splitlines()
        if not log_lines or not log_lines[0].startswith('commit '):
            # No commit matched the context window.
            return None
        oldest_commit = log_lines[0].split()[1]
        oldest_commit_obj = repo.commit(oldest_commit)

        oldest_commit_contents = oldest_commit_obj.tree[row.bugFilePath].data_stream.read()
        number_of_commits = repo.git.rev_list('--count', oldest_commit+'..'+str(row.fixCommitSHA1))

        row['oldest_commit'] = str(oldest_commit)
        row['source_oldest_commit'] = oldest_commit_contents.decode('utf-8', errors='replace')
        row['author_oldest_commit'] = str(oldest_commit_obj.author.email)
        row['author_date_oldest_commit'] = str(oldest_commit_obj.authored_date)
        row['number_of_commits_oldest'] = str(number_of_commits)

    except Exception:
        # with so many different errors between various commits and branches
        # it does not make sense to try to handle them for only 1.4% of failures in the dataset
        return None
    return row

In [30]:
from joblib import Parallel, delayed

# Run run_parallel on every merged row using 32 joblib workers. Each entry of
# L is whatever run_parallel returned for that row (the enriched row or None).
jobs = (delayed(run_parallel)(row) for _, row in stubs_df_m.iterrows())
L = Parallel(n_jobs=32)(jobs)

In [34]:
# Drop the rows that could not be mined (None) and turn the remaining rows
# into plain dicts.
final = []
for git_item in L:
    if git_item is None:
        continue
    final.append(git_item.to_dict())

In [36]:
# Output format switch: True writes a CSV file, False writes JSON lines.
csv = True
if not csv:
    dump_jsonl(final, './mined_data_sstubs_feb3.jsonl')
else:
    pd.DataFrame(final).to_csv('./mined_data_sstubs_feb3.csv')

In [15]:
# Used this to preemptively list all GitHub URLs. In bash, I use xargs to git clone all the projects in parallel.
# with open('x_args_github.txt', 'w') as f:
#     l = []
#     for i, row in stubs_df_m.iterrows():
#         dir_name = './repos/'+'/'.join(row.projectName.split('.') )
#         # if not os.path.isdir(dir_name):
#         #     repo = Repo.clone_from('https://github.com/'+'/'.join(row.projectName.split('.')), dir_name)

#         l.append('git clone https://github.com/'+'/'.join(row.projectName.split('.'))+' '+dir_name)
#     l = list(set(l))
#     for element in l:
#         f.write(element + "\n")

In [16]:
# def fileInRepo(repo, filePath, commit):
#     '''
#     Useful feature but is not used in final function
#     '''
#     pathdir = os.path.dirname(filePath)
#     # Build up reference to desired repo path
#     rsub = repo.commit(commit).tree
#     for path_element in pathdir.split(os.path.sep):
#         # If dir on file path is not in repo, neither is file. 
#         try : 
#             rsub = rsub[path_element]
#         except KeyError : 
#             return False
#     return(filePath in rsub)

In [38]:
#
## Ignore: this is a sequential reference implementation, used to check that the parallel one works.
## It is very slow because it processes the rows one at a time.
#

# stubs_df_kevin_edits = []
# import time
# st = time.time()
# conts = 0
# window_size=512
# for i, row in tqdm.tqdm(stubs_df_m.iloc[:100].iterrows()):
#     # print("ITER")
#     try:
#         dir_name = './repos/'+'/'.join(row.projectName.split('.') )
#         if not os.path.isdir(dir_name):
#             continue
#         #     repo = Repo.clone_from('https://github.com/'+'/'.join(row.projectName.split('.')), dir_name)
#         # else:
#         repo = Repo(dir_name)
#         commit_before = repo.commit(row.fixCommitParentSHA1)
#         commit_after = repo.commit(row.fixCommitSHA1)


#         filea = commit_before.tree[row.bugFilePath].data_stream.read()
#         fileb = commit_after.tree[row.bugFilePath].data_stream.read()
#         window_context = filea[max(row.bugNodeStartChar-window_size,0):min(row.bugNodeStartChar+row.bugNodeLength+window_size,len(filea))].decode()
#         row['source_before_fix_minedbyKevin'] = str(filea[row.bugNodeStartChar:row.bugNodeStartChar+row.bugNodeLength])
#         row['source_after_fix_minedbyKevin'] = str(fileb[row.fixNodeStartChar:row.fixNodeStartChar+row.fixNodeLength])
#         row['source_before'] = str(filea)
#         row['source_after'] = str(fileb)
#         row['author_before'] = str(commit_before.author.email)
#         row['author_after'] = str(commit_after.author.email)
#         row['author_date_commit_before'] = str(commit_before.authored_date) #won't be super informative
#         row['author_date_commit_after'] = str(commit_after.authored_date)

#         #loginfo = repo.git.log('--all','-p','--reverse','--source','-S', str(row.sourceBeforeFix), '--',str(row.bugFilePath))
#         print(window_context)
#         loginfo = repo.git.log('--all','-p','--reverse','--source','-S', window_context, '--',str(row.bugFilePath))
#         print(loginfo)
#         import pdb
#         pdb.set_trace()
#         oldest_commit = None
#         try:
#             c_line = loginfo.splitlines()[0]
#             if 'commit' not in c_line:
#                 continue
#         except IndexError as e:
#             continue

#         if 'commit' in c_line:
#             oldest_commit = c_line.split()[1]

#         if not oldest_commit:
#             continue
#         oldest_commit_obj = repo.commit(oldest_commit)
#         try:
#         # oldest_commit_contents = repo.git.show('{}:{}'.format(oldest_commit, row.bugFilePath))
#             oldest_commit_contents = oldest_commit_obj.tree[row.bugFilePath].data_stream.read()
#             number_of_commits = repo.git.rev_list('--count', oldest_commit+'..'+str(row.fixCommitSHA1))
#         except Exception as ee:
#             print(ee)
#             continue
#             # return None

#         row['oldest_commit'] = str(oldest_commit)
#         row['source_oldest_commit'] = str(oldest_commit_contents)
#         row['author_oldest_commit'] = str(oldest_commit_obj.author.email)
#         row['author_date_oldest_commit'] = str(oldest_commit_obj.authored_date)
#         row['number_of_commits_oldest'] = str(number_of_commits)

#         stubs_df_kevin_edits.append(row)
#     except Exception as ee:
#         print(ee)
#         continue
# tot = time.time()-st
# print(tot)