ESE P6: An Empirical Study of GitHub code review tool 
by Eleonora Pura  and Melih Catal


The goal of the preprocessing steps is to building two datasets of triplets (<ms,rnl>->mr) and pairs (ms->mr) 

## Data Mining
This step is skipped because the data is already ready and given by the tutor.

## Methods Extraction and Linking Reviewer Comment
Parsing Java files using the Lizard Python library to extract the methods. We are only interested in the java files.
Link each comment to the specific method (if any) it refers to. If a comment cannot be linked to any method, it is discarded. After having linked comments to methods for each review round, we are in the situation in which we have, for each review round, a set of triplets <ms,mr and {Rnl}> where ms and mr represent the same method before and after the review round, and Rnl is a set of comments ms received in this round.

In [1]:
import lizard
import pandas as pd
import numpy as np
import subprocess
import re
import nltk
nltk.download('stopwords')  # download the stop words corpus
from nltk.corpus import stopwords
import os



[nltk_data] Downloading package stopwords to /Users/melih/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def find_function_body(source_code, start_line, end_line):
    lines = source_code.split('\n')[start_line-1: end_line+1]
    return "\n".join(lines)


def comment_in_method_body(comment_start_line, comment_end_line, function_start_line, function_end_line):
    comment_range = set(range(comment_start_line, comment_end_line+1))
    function_range = set(range(function_start_line, function_end_line+1))
    
    if comment_range.issubset(function_range):
        return True
    else:
        return False
    
# TODO: fix function, not working yet
def remove_unchanged_functions(functions_while, functions_after):
    if functions_while == [] or functions_after == []:
        return functions_while, functions_after
    
    for index, (function_while, function_after) in enumerate(zip(functions_while, functions_after)):
        if function_while == function_after:
            del functions_while[index]
            del functions_after[index]
            
    return functions_while, functions_after

def is_reviewer_commenting(owner_id, user_id):
    if owner_id != user_id:
        return True
    
    else:
        return False
    
    

In [3]:
# Extract all the function from a row of the df
def extract_row_functions(row, file_content):
    functions = []
    # if the comment start line is na that means the comment is a single line comment so we use the end line number as the start line number
    comment_start_line = row['original_start_line'] if not pd.isna(row['original_start_line']) else row['original_line']
    comment_end_line = row['original_line']
    # Check that the comment is given by a reviewer
    if is_reviewer_commenting(row['owner_id'], row['user_id']):
        # Check that both start and end line are not NaN floats
        if (not pd.isna(comment_start_line)) and (not pd.isna(comment_end_line)):
            # Convert all start and end line to int 
            comment_start_line = int(comment_start_line)
            comment_end_line = int(comment_end_line)

            # Extract all the functions from the file
            l = lizard.analyze_file.analyze_source_code(row['filename'], row[file_content])
            l_functions = l.function_list

            # 
            for function in l_functions:
                function_start_line = int(function.start_line)
                function_end_line = int(function.end_line)
                
                # Check that the comment is part of the body of the function
                if comment_in_method_body(comment_start_line, comment_end_line, function_start_line, function_end_line):
                    body = find_function_body(row[file_content], function_start_line, function_end_line)
                    
                    functions.append(
                        {"name": function.name,
                         "long_name": function.long_name,
                         "start_line": int(function.start_line),
                         "end_line": int(function.end_line),
                         "body": body})
    return functions
   

In [4]:
def methods_extraction_linking_reviewer_comments(df):
    # Make a copy of the original df
    df_copy = df.copy()
    functions_while = []
    functions_after = []
    for idx, row in df_copy.iterrows():
        row_functions_while = extract_row_functions(row, 'file_content_while')
        row_functions_after = extract_row_functions(row, 'file_content_after')
        
        functions_while.append(row_functions_while)
        functions_after.append(row_functions_after)
    # Add 2 new columns to df: functions_while and functions_after
    df_copy['functions_while'] = functions_while
    df_copy['functions_after'] = functions_after

    # Set all empty list values as np.nan in order to use pandas dropna function
    df_copy.functions_while = df_copy.functions_while.apply(lambda y: np.nan if len(y)==0 else y)
    df_copy.functions_after = df_copy.functions_after.apply(lambda y: np.nan if len(y)==0 else y)

    # Delete all entries in which there is no functions while or after
    df_copy = df_copy.dropna(subset=['functions_while', 'functions_after']).reset_index(drop=True)


    # If the functions while and after remains the same that means that no error was fixed: remove from df
    df_copy = df_copy[df_copy['functions_while'] != df_copy['functions_after']].reset_index(drop=True)

    # Turn the dataframe in a list of triplets: (function_while, comment, function_after)
    triplets = []

    for idx, row in df_copy.iterrows():
        if len(row['functions_while']) == len(row['functions_after']):
            for function_while, function_after in zip(row['functions_while'], row['functions_after']):
                triplets.append((function_while['body'], row['message'], function_after['body']))
    
    return triplets



## Abstraction
Use `src2abs` tool to abstract the methods. Triplets for which a parsing error occur during the abstraction process on the ms or on the mr methods are removed from the dataset. ms and mr, after the abstraction must be different. Idioms are not abstracted. During the abstraction code comments are removed. Using the pair abstraction mode, the same literals/identifiers in the two methods weill be abstracted using the same IDs. As output of the abstract process, the tool provides an abstraction map M linking the abstracted token to the raw token.

In [5]:
def create_java_file(content, filename):
    path = f"./abstraction/java_files/{filename}"
    with open(path, 'w') as f:
        f.write(content)

In [6]:
def create_abstraction_file():
    code_granularity = "method"
    mode = "pair"
    idioms_path = "./src2abs/idioms.csv"
    src2abs_path = "./src2abs/src2abs-0.1-jar-with-dependencies.jar"
    input_code_A_path = "./abstraction/java_files/ms.java"
    input_code_B_path = "./abstraction/java_files/mr.java"
    output_abstract_A_path = "./abstraction/abstraction_files/ms_abs.java"
    output_abstract_B_path = "./abstraction/abstraction_files/mr_abs.java"
    try :
        # java -jar src2abs-0.1-jar-with-dependencies.jar pair <code_granularity> <input_code_A_path> <input_code_B_path> <output_abstract_A_path> <output_abstract_B_path> <idioms_path>
        command = ['java', '-jar', src2abs_path, mode, code_granularity, input_code_A_path, input_code_B_path, output_abstract_A_path, output_abstract_B_path, idioms_path]
        # execute the command and capture the output
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        if stderr:
            #print("Error creating abstraction data")
            #print(stderr)
            pass
        #else :
            #print("Abstraction data created")
    except Exception as e:
        pass
        #print("Error creating abstraction data")
        #print(e)

In [7]:
def read_abstraction_file(filename):
    path = f"./abstraction/abstraction_files/{filename}"
    with open(path, 'r') as f:
        return f.read()

In [8]:
# read mapping file and create a dictionary with the mapping. key: concrete, value: abstract
def read_abstraction_mapping_file(filename):
    path = f"./abstraction/abstraction_files/{filename}"
    with open(path, 'r') as f:
        lines = f.readlines()
    is_abstract = False
    keys = []
    values = []
    for line in lines:
        words = line.strip().split(',')
        if len(words) > 1 :
            for word in words:
                if word == '':
                    continue
                if is_abstract:
                    values.append(word)
                else:
                    keys.append(word)
            is_abstract = not is_abstract
     # convert the lists to a dictionary
    maps = dict(zip(keys, values))


    return maps

In [9]:
def abstraction_filter_methods(ms,mr):
    # ms and mr should be different 
    if ms == mr:
        return False
    else:
        return True
        
def clean_rnl(rnl):
    # remove links
    rnl = re.sub(r'http\S+', '', rnl)
    # superfluous punctuation
    rnl = re.sub(r'[^\w\s]', '', rnl)
    return rnl

    

In [10]:
'''
template_triplets_data = {
    "ms": "",
    "comment": "",
    "mr": ""
    "ms_abs": "",
    "mr_abs": "",
    "map" : dict()
}
ms = methods while
mr = methods after change
rnl = review comment
'''
def create_abstraction_data(ms,mr,rnl):
    template_triplets_data = {}
    create_java_file(ms, "ms.java")
    create_java_file(mr, "mr.java")
    create_abstraction_file()
    ms_abs = read_abstraction_file("ms_abs.java")
    mr_abs = read_abstraction_file("mr_abs.java")
    mapping = read_abstraction_mapping_file("ms_abs.java.map")
    rnl = clean_rnl(rnl)
    
    template_triplets_data["ms"] = ms
    template_triplets_data["comment"] = rnl 
    template_triplets_data["mr"] = mr
    template_triplets_data["ms_abs"] = ms_abs
    template_triplets_data["mr_abs"] = mr_abs
    template_triplets_data["map"] = mapping
    if abstraction_filter_methods(ms_abs, mr_abs):
        #triplets_with_abstraction.append(template_triplets_data)
        return template_triplets_data

    

In [11]:
def methods_abstraction(triplets):
    triplets_with_abstraction = []
    for triplet in triplets:
        abstracted_method = create_abstraction_data(triplet[0], triplet[2], triplet[1])
        if abstracted_method:
            triplets_with_abstraction.append(abstracted_method)
    return triplets_with_abstraction
    

## Abstracting Reviewer Comments 
Abstract all code components mentioned in any comment using the abstraction map. Any camel casse identifier that is not matched in the abstraction map but that it is present in the comment, is replaced by the special token _CODE_ 

In [12]:
def is_camel_case(s):
    # example: "camelCase", "camelcase", "camelCaseCase", "camelCaseC"
    # not example: "camel", "camelCase_"
    pattern = r'^(?:[A-Z][a-z]*|[a-z]+)((?:[A-Z][a-z]*))+$'
    return bool(re.match(pattern, s))

def remove_stop_words(comment):
    stop_words = set(stopwords.words('english'))
    words = comment.split()
    filtered_comment = [w for w in words if not w in stop_words]
    return " ".join(filtered_comment)


def abstraction_filter_rnl(rnl):
    if len(rnl.split()) > 100:
        return False
    else :
        return True

def abstract_reviewer_comments(triplets_with_abstraction):
    camel_case_abs = "_CODE_"
    for triplet in triplets_with_abstraction:
        comment = triplet["comment"]
        mapping = triplet["map"]
        comment_words = comment.split()
        for word in comment_words:
            # remove any special characters from the word
            word = re.sub(r'[^\w\s]', '', word)
            if word in mapping:               
                comment = comment.replace(word, mapping[word])
            elif is_camel_case(word):
                comment = comment.replace(word, camel_case_abs)
        comment = remove_stop_words(comment)

        if abstraction_filter_rnl(comment):
            triplet["comment_abs"] = comment
        else :
            # delete the triplet
            triplets_with_abstraction.remove(triplet)

    return triplets_with_abstraction


    


## Filtering out Noisy Comments
Using heuristics to filter out noisy comments.


In [13]:
def is_relevant(comment):
    comment_size = len(comment.split())

    is_relevant = True
    # Useless comments, no content after removing stopwords
    if len(comment) == 0:
        is_relevant = False

    # Useless comments, one word, no action required or unclear action
    elif comment_size == 1:
        if comment.__contains__('nice') or comment.__contains__('pleas') \
            or comment.__contains__('ditto') or comment.__contains__('thank') \
            or comment.__contains__('ditto2') or comment.__contains__('fine') \
            or comment.__contains__('agew') or comment.__contains__('hahaha') \
            or comment.__contains__('yeh') or comment.__contains__('lol'):
            is_relevant = False

    elif comment_size == 2:
        if comment.__contains__('ack'):
            is_relevant =False

    # Request to change formatting, no impact on code
    elif comment.__contains__('indent') and comment_size < 5:
        is_relevant = False

    # Likely a thank you message
    elif (comment.__contains__('works for me') or comment.__contains__('sounds good') \
        or comment.__contains__('makes sense') or comment.__contains__('smile') \
        or comment.__contains__('approv')) and comment_size < 5:
        is_relevant = False

    # Request to add test code, no impact on the reviewed code
    elif (comment.__contains__('test') and comment_size < 5) \
        or (comment.__contains__('add') and comment.__contains__('test')):
        is_relevant = False

    # Request for clarification
    elif ((comment.__contains__('please explain') or comment.__contains__('explan') \
            or comment.__contains__('wat') or comment.__contains__('what')) and comment_size < 5) \
        or ((comment.__contains__('understand') or comment.__contains__('meant')) \
            and comment.__contains__('not sure')):
        is_relevant = False

    # Refers to previous comment or external resource with unclear action point
    elif (comment.__contains__('same as') or comment.__contains__('same remark') \
            or comment.__contains__('said above') or comment.__contains__('do the same')) \
        and comment_size < 5:
        is_relevant = False

    # Refers to web pages
    elif (comment.__contains__('like') or comment.__contains__('see')) \
        and comment.__contains__('http'):
        is_relevant = False

    # Request to add comment
    elif comment.__contains__('document') or comment.__contains__('javadoc') \
            or comment.__contains__('comment'):
        is_relevant =  False

    # Feedback about reorganizing the PR
    elif comment.__contains__('pr') and comment_size < 5:
        is_relevant = False

    # Comment contains a +1 to support previous comment.
    # It may be accompanied by another word, like agree or a smile.
    # This is the reason for < 3
    elif comment.__contains__('+1') and comment_size < 3:
        is_relevant = False

    # The code is ok for now
    elif comment.__contains__('for now') and comment_size < 5:
        is_relevant = False

    # Answers
    elif (comment.__contains__('fixed') or comment.__contains__('thank') \
            or comment.__contains__('youre right')) and comment_size < 3:
        is_relevant = False
    
    return is_relevant

In [14]:
# this cell is for testing the is_relevant function and calculating the percentage of irrelevant comments among all comments
# TODO: remove this cell when done testing

# run is_relevant on all comments and get number of irrelevant comments 
def get_irrelevant_comments(triplets_with_abstraction):
    irrelevant_comments = 0
    for triplet in triplets_with_abstraction:
        # check if the key exists
        if "comment_abs" not in triplet:
            continue
        comment_abs = triplet["comment_abs"]
        if not is_relevant(comment_abs):
            irrelevant_comments += 1
    return irrelevant_comments


#irrelevant_comments = get_irrelevant_comments(triplets_with_abstraction)
#print(irrelevant_comments)
#print(len(triplets_with_abstraction) - irrelevant_comments)
# percentage of irrelevant comments, the paper says in their dataset it is around 11%
#print(irrelevant_comments / len(triplets_with_abstraction) * 100)


In [15]:
# remove irrelevant comments
def remove_irrelevant_comments(triplets_with_abstraction):
    for triplet in triplets_with_abstraction:
        # check if the key exists
        if "comment_abs" not in triplet:
            continue
        comment_abs = triplet["comment_abs"]
        if not is_relevant(comment_abs):
            triplets_with_abstraction.remove(triplet)
    return triplets_with_abstraction


## Running the pipeline over all the GitHub Projects

We ran the pipeline over all the GitHub projects. As a first approach, we used multithreading to make the process faster. However, we were skeptical about the correctness of the results. For this reason, we decided to run the pipeline sequentially. And we compared the results. The dataset obtained using the multithreading approach has 175829 triplets and the dataset obtained using sequential approach has 171669 triplets. The difference is not so big, but we decided to use the dataset obtained using the sequential approach since we were able to find some mistakes in the multithreading approach during the comparison. Both approaches and the dataset are available.

In [16]:
# create <ms,mr,rnl> triple from the triplets_with_abstraction
def create_ms_mr_rnl_triple(triplets_with_abstraction):
    ms_mr_rnl_triplets = []
    for triplet in triplets_with_abstraction:
        try: 
            ms_mr_rnl_triplets.append({
                "ms": triplet["ms_abs"],
                "mr": triplet["mr_abs"],
                "rnl": triplet["comment_abs"]
            })
        except Exception as e:
            pass
    return ms_mr_rnl_triplets




In [17]:
def read_csv(path):
    df = pd.read_csv(path)
    return df

In [18]:
'''
This cell is used to run the pipeline over all the files in the github projects folder using multithreading. 

import os
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_file(filename):
    triplets = []
    try:
        path = os.path.join(folder, filename)
        df = read_csv(path)
        triplets = methods_extraction_linking_reviewer_comments(df)
        triplets_with_abstracted_methods = methods_abstraction(triplets)
        triplets_with_abstraction = abstract_reviewer_comments(triplets_with_abstracted_methods)
        # remove irrelevant comments
        triplets_with_abstraction = remove_irrelevant_comments(triplets_with_abstraction)
        # create <ms,mr,rnl> triple from the triplets_with_abstraction
        current_ms_mr_rnl_triplet = create_ms_mr_rnl_triple(triplets_with_abstraction)
        triplets.extend(current_ms_mr_rnl_triplet)
    except Exception as e:
        print("e")
        #return triplets, 1, filename
    return triplets, 0, filename

def parallel_process_files(folder, num_threads=4):
    filepaths = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
    ms_mr_rnl_triplets = []
    num_errors = 0
    total_files = len(filepaths)
    completed_files = 0
    
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = {executor.submit(process_file, f): f for f in filepaths}
        
        for future in as_completed(futures):
            triplets, errors, filename = future.result()
            ms_mr_rnl_triplets.extend(triplets)
            num_errors += errors
            completed_files += 1
            print(f"Finished processing file: {filename} ({completed_files}/{total_files})")
            
    return ms_mr_rnl_triplets, num_errors

folder = "data3"
num_threads = 3

final_triple, num_errors = parallel_process_files(folder, num_threads)
print("Number of errors: " + str(num_errors))
''' 


'ArithmeticError\nimport os\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\ndef process_file(filename):\n    triplets = []\n    try:\n        path = os.path.join(folder, filename)\n        df = read_csv(path)\n        triplets = methods_extraction_linking_reviewer_comments(df)\n        triplets_with_abstracted_methods = methods_abstraction(triplets)\n        triplets_with_abstraction = abstract_reviewer_comments(triplets_with_abstracted_methods)\n        # remove irrelevant comments\n        triplets_with_abstraction = remove_irrelevant_comments(triplets_with_abstraction)\n        # create <ms,mr,rnl> triple from the triplets_with_abstraction\n        current_ms_mr_rnl_triplet = create_ms_mr_rnl_triple(triplets_with_abstraction)\n        triplets.extend(current_ms_mr_rnl_triplet)\n    except Exception as e:\n        print("e")\n        #return triplets, 1, filename\n    return triplets, 0, filename\n\ndef parallel_process_files(folder, num_threads=4):\n    filepaths

In [21]:

# this cell is used to run the pipeline over all the files in the github projects folder. It is not using multithreading
def main():
    #path = './data/gh_apache_!_accumulo.csv'
    folder = "data"
    # number of files in the folder
    num_files = len(os.listdir(folder))
    # current file number
    file_num = 1
    ms_mr_rnl_triplets = []
    num_erros = 0
    # for each file in the folder run the process
    for filename in os.listdir(folder):
        # skip the first 550 files and end the process at 600 files
        try:
            print ("Processing file " + str(file_num) + " out of " + str(num_files))
            path = os.path.join(folder, filename)
            df = read_csv(path)
            triplets = methods_extraction_linking_reviewer_comments(df)
            triplets_with_abstracted_methods =  methods_abstraction(triplets)
            triplets_with_abstraction = abstract_reviewer_comments(triplets_with_abstracted_methods)
            # remove irrelevant comments
            triplets_with_abstraction = remove_irrelevant_comments(triplets_with_abstraction)

            # create <ms,mr,rnl> triple from the triplets_with_abstraction
            current_ms_mr_rnl_triplet = create_ms_mr_rnl_triple(triplets_with_abstraction)
            ms_mr_rnl_triplets.extend(current_ms_mr_rnl_triplet)
            print("Finished processing file " + str(file_num) + " out of " + str(num_files))
            print("*******************")
            file_num += 1
        except Exception as e:
            num_erros += 1
            file_num += 1
            continue
    print("Number of errors: " + str(num_erros))
    return ms_mr_rnl_triplets

final_triple = main()



Processing file 1 out of 1764
Finished processing file 1 out of 1764
*******************
Processing file 2 out of 1764
Finished processing file 2 out of 1764
*******************
Processing file 3 out of 1764
Finished processing file 3 out of 1764
*******************
Processing file 4 out of 1764
Finished processing file 4 out of 1764
*******************
Processing file 5 out of 1764
Finished processing file 5 out of 1764
*******************
Processing file 6 out of 1764
Finished processing file 6 out of 1764
*******************
Processing file 7 out of 1764
Finished processing file 7 out of 1764
*******************
Processing file 8 out of 1764
Finished processing file 8 out of 1764
*******************
Processing file 9 out of 1764
Finished processing file 9 out of 1764
*******************
Processing file 10 out of 1764
Finished processing file 10 out of 1764
*******************
Processing file 11 out of 1764
Finished processing file 11 out of 1764
*******************
Processing file 1

  df = read_csv(path)


Processing file 1313 out of 1764
Finished processing file 1313 out of 1764
*******************
Processing file 1314 out of 1764
Finished processing file 1314 out of 1764
*******************
Processing file 1315 out of 1764
Finished processing file 1315 out of 1764
*******************
Processing file 1316 out of 1764
Finished processing file 1316 out of 1764
*******************
Processing file 1317 out of 1764
Finished processing file 1317 out of 1764
*******************
Processing file 1318 out of 1764
Finished processing file 1318 out of 1764
*******************
Processing file 1319 out of 1764
Finished processing file 1319 out of 1764
*******************
Processing file 1320 out of 1764
Finished processing file 1320 out of 1764
*******************
Processing file 1321 out of 1764
Finished processing file 1321 out of 1764
*******************
Processing file 1322 out of 1764
Finished processing file 1322 out of 1764
*******************
Processing file 1323 out of 1764
Finished processi

In [22]:
print(len(final_triple))


171669


## Export the final dataset

In [30]:

# export the data to a json file
import json
with open('ms_mr_rnl_dataset.json', 'w') as fp:
    json.dump(final_triple, fp)



In [26]:
df = pd.DataFrame(final_triple, columns=['ms', 'mr', 'rnl'])
df.head()

Unnamed: 0,ms,mr,rnl
0,public VAR_1 ( ) { this . VAR_2 = true ; },public VAR_1 ( ) { this ( true ) ; },The oneargument constructor called instead thi...
1,public void METHOD_1 ( ) throws TYPE_1 { for (...,public void METHOD_1 ( ) throws TYPE_1 { TYPE_...,I think able use two required VAR_2 ie VAR_2 u...
2,private static void METHOD_1 ( ) { TYPE_1 VAR_...,public static void METHOD_3 ( ) throws TYPE_2 ...,Just minor need call dirmkdirs This clean meth...
3,public void METHOD_1 ( ) throws TYPE_1 { for (...,public static void METHOD_23 ( ) throws TYPE_1...,No need call clean method since test run file ...
4,public void METHOD_1 ( ) throws TYPE_1 { for (...,public static void METHOD_23 ( ) throws TYPE_1...,Only _CODE_ needed create credential store It ...


In [31]:
df.to_csv('ms_mr_rnl_dataset.csv', index=False)
