## Import libraries

In [None]:
import pandas as pd
import lizard 


## Read data

In [None]:
data = pd.read_csv("./data/gh_apache_!_accumulo.csv")

In [None]:
data.head()


## Data preprocessing

In [None]:
def extract_function_annotation(file_content, start_line, end_line):
    """Placeholder for annotation extraction; not implemented yet.

    The notebook declared only this signature with no body, which is a
    syntax error. Until the real logic is written the function fails loudly
    instead of silently returning ``None``.

    Args:
        file_content: Full text of the source file.
        start_line: First line of the function (intended meaning; unused).
        end_line: Last line of the function (intended meaning; unused).

    Raises:
        NotImplementedError: Always.
    """
    # TODO: implement; the intended behaviour is not specified anywhere in this notebook.
    raise NotImplementedError("extract_function_annotation is not implemented yet")

In [None]:
def extract_function(file_content, start_line, end_line):
    """Return lines ``start_line``..``end_line`` of the file as one string.

    Both bounds are treated as 1-based and inclusive. The selected lines are
    joined with a single space, so the result contains no line breaks.

    Args:
        file_content: Full text of the source file.
        start_line: 1-based number of the first line to keep.
        end_line: 1-based number of the last line to keep.

    Returns:
        The selected lines joined by ``" "``.
    """
    # start_line - 1 converts the 1-based line number into a 0-based index;
    # the slice end is exclusive, so the line numbered end_line is included.
    first_index = start_line - 1
    selected_lines = file_content.splitlines()[first_index:end_line]
    return " ".join(selected_lines)


In [None]:
def parse_functions(file_content_type, *, max_rows=None):
    """Extract every function lizard finds in one content column of ``data``.

    Iterates over the module-level ``data`` dataframe, hands the text stored
    in ``file_content_type`` to lizard and records one dict per detected
    function.

    Args:
        file_content_type: Name of the ``data`` column that holds the file
            text to analyse (e.g. ``'file_content_while'``).
        max_rows: Optional cap on the number of dataframe rows to visit,
            useful for a quick trial run. ``None`` (the default) processes
            every row.

    Returns:
        A list of dicts with the keys ``file_name``, ``dicussion``,
        ``method_name``, ``method``, ``start_line`` and ``end_line``.
    """
    extracted_functions = []
    for visited, (_, row) in enumerate(data.iterrows()):
        if max_rows is not None and visited >= max_rows:
            break
        file_content = row[file_content_type]
        # pandas.read_csv stores NaN (a float) for empty cells; lizard needs
        # text, so rows without content are skipped instead of crashing.
        if not isinstance(file_content, str):
            continue
        # lizard requires a file name alongside the source text, so the
        # row's own file name is passed.
        parse_result = lizard.analyze_file.analyze_source_code(row['filename'], file_content)
        for function in parse_result.function_list:
            extracted_functions.append({
                "file_name": row['filename'],
                # NOTE: the key is misspelled ("dicussion"); it is kept
                # unchanged because code outside this cell may read it.
                "dicussion": row['discussion'],
                "method_name": function.long_name,
                # Function text cut out of the file by its line range.
                "method": extract_function(file_content, function.start_line, function.end_line),
                "start_line": function.start_line,
                "end_line": function.end_line,
            })
    return extracted_functions




In [None]:
# Run the extraction once per content column, in the same order as before:
# first 'file_content_while', then 'file_content_after'.
extracted_functions_while, extracted_functions_after = (
    parse_functions(content_column)
    for content_column in ('file_content_while', 'file_content_after')
)


### Linking Reviewer Comments

In [None]:
def get_unique_methods(extracted_functions):
    """Drop repeated (method_name, file_name) records, keeping the first.

    The same file shouldn't list the same method name more than once, so
    later records with an already-seen method name and file name are removed.

    Args:
        extracted_functions: List of method records (dicts) containing at
            least the keys ``method_name`` and ``file_name``.

    Returns:
        A list of dicts in the original order, without the duplicates.
    """
    return (
        pd.DataFrame(extracted_functions)
        .drop_duplicates(subset=['method_name', 'file_name'])
        .reset_index(drop=True)
        .to_dict('records')
    )

In [None]:
# De-duplicate both extraction results, "while" first and "after" second.
unique_extracted_functions_while, unique_extracted_functions_after = map(
    get_unique_methods,
    (extracted_functions_while, extracted_functions_after),
)


In [None]:
def filter_methods(filename, methods=None):
    """Return the method records that belong to ``filename``.

    Args:
        filename: Value compared for equality with each record's
            ``'file_name'`` entry.
        methods: Optional iterable of method records (dicts with a
            ``'file_name'`` key). When omitted, the module-level
            ``unique_extracted_functions_while`` list is searched, as before.

    Returns:
        A new list with the matching records in their original order.
    """
    if methods is None:
        methods = unique_extracted_functions_while
    return [method for method in methods if method['file_name'] == filename]

In [None]:
def check_comment_in_line( commnet_start_line, comment_end_line, function_start_line, function_end_line):
    """Tell whether a comment's line range touches a function's line range.

    The previous check was only true when the whole function lay inside the
    comment range, so a comment placed inside a method body never matched.
    An inclusive overlap test is used instead: it still matches every case
    the old check matched (for well-formed ranges) and additionally matches
    comments that sit inside, or partially cover, the function.

    Args:
        commnet_start_line: First line of the comment (parameter name kept,
            misspelling included, for backward compatibility).
        comment_end_line: Last line of the comment.
        function_start_line: First line of the function.
        function_end_line: Last line of the function.

    Returns:
        ``True`` if the two inclusive ranges share at least one line,
        otherwise ``False``.
    """
    # Two inclusive ranges overlap exactly when each one starts no later
    # than the other one ends. bool() keeps the return a plain Python bool
    # even when the inputs are numpy scalars taken from a dataframe row.
    return bool(
        commnet_start_line <= function_end_line
        and comment_end_line >= function_start_line
    )


In [None]:
def filter_raw_dataframe(data):
    """Keep only the rows that carry comment line info from another user.

    A row is kept when its ``start_line`` is not NaN and its ``owner_id``
    differs from its ``user_id``.

    Args:
        data: Dataframe with the columns ``start_line``, ``owner_id`` and
            ``user_id``.

    Returns:
        A dataframe with the matching rows; the original index labels are
        preserved and ``data`` itself is left untouched.
    """
    has_start_line = data['start_line'].notna()
    # presumably owner_id is the author and user_id the commenter; verify against the dataset
    ids_differ = data['owner_id'] != data['user_id']
    return data[has_start_line & ids_differ]

In [None]:
df = filter_raw_dataframe(data)

# For every remaining comment row, look up the methods extracted from the
# same file and print one line per method whose line range matches the
# comment's line range according to check_comment_in_line.
for index, row in df.iterrows():
    comment_start_line = row['start_line']
    comment_end_line = row['line']
    filename = row['filename']
    filtered_methods = filter_methods(filename)
    for method in filtered_methods:
        if not check_comment_in_line(comment_start_line, comment_end_line, method['start_line'], method['end_line']):
            continue
        # An f-string formats every field itself, so a non-string cell
        # (e.g. NaN in 'message' or 'url') no longer raises TypeError the
        # way "+" concatenation did; output for string cells is unchanged.
        print(
            f"{method['method_name']} {method['start_line']} {method['end_line']} "
            f"{row['url']} {row['message']} {row['filename']} "
            f"{comment_start_line} {comment_end_line}"
        )
        
    


In [None]:
# this cell is not related to the project. Just to test the code
def get_method_by_file_name(file_name):
    """Return the de-duplicated "while" method records for ``file_name``.

    Thin alias of ``filter_methods``, which performs the same lookup over
    ``unique_extracted_functions_while``.
    """
    return filter_methods(file_name)

# Manual spot check: list the methods extracted for one known file.
target_file_name = "core/src/main/java/org/apache/accumulo/core/client/admin/PluginConfig.java"
target_file_methods = get_method_by_file_name(target_file_name)
# Bare expression left as the last statement so the notebook displays it.
target_file_methods