### Setup

In [1]:
# Libraries
import os
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 20000)

import pickle

In [2]:
# Specify root directory.
# The default is a machine-specific absolute Windows path. Set the
# CODE_ANALYSIS_ROOTDIR environment variable to analyse a different tree
# without editing the notebook; when it is unset the default is used.
rootdir = os.environ.get(
    'CODE_ANALYSIS_ROOTDIR',
    r'C:\Users\ED61UW\Downloads\Work\Repos\P02763-Project_Finance_Simulation_Engine',
)

### Functions

In [3]:
def get_filepaths(directory, file_extention):
    """
    Collect the paths of all files below `directory` (the directory itself
    included) whose name ends with the requested extension.

    Parameters
    ----------
    directory : str
        Root of the directory tree that is walked with os.walk.
    file_extention : str
        Extension to keep, given without the leading dot (e.g. 'py').

    Returns
    -------
    list of str
        One entry per matching file, built as
        os.path.join(<directory containing the file>, <file name>).
    """
    suffix = '.' + file_extention
    file_paths = []  # List which will store all of the matching filepaths.

    # Walk the tree.
    for root, _directories, files in os.walk(directory):
        for filename in files:
            # Restrict to the requested file type. Only the END of the name is
            # compared: a substring test would also accept 'module.pyc' or
            # 'notes.py.bak' when asked for 'py'.
            if filename.endswith(suffix):
                file_paths.append(os.path.join(root, filename))

    return file_paths

In [4]:
def read_input(path):
    """
    Open a file and read its whole content into a single string.

    Parameters
    ----------
    path : str
        Path of the file to read. It is opened in text mode with the
        platform's default encoding.

    Returns
    -------
    str
        The complete text of the file.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error).
    with open(path) as text_file:
        return text_file.read()

In [5]:
def process_code_lines(code_lines):
    """
    Tag every source line as import, function, class or plain code.

    The classification is a line-based heuristic (no real parsing):

    - a line containing 'import ' is an 'Import'; if it opens a parenthesis
      that is not closed on the same line, the following lines up to and
      including the one holding ')' are 'Import' as well and share the same
      non-zero `import_pair` number;
    - a line starting with 'def ' opens 'Function <name>'; indented lines,
      blank lines and a lone '):' that follow belong to that function;
    - a line starting with 'class ' opens 'Class <name>' with the same
      continuation rule;
    - anything else is 'Code' and closes the current function/class.

    Parameters
    ----------
    code_lines : list of str
        Lines of one source file, without line terminators.

    Returns
    -------
    dict
        Column-oriented dict with the keys 'line_number' (1-based), 'code',
        'code_type' and 'import_pair' (0 for lines that are not part of a
        parenthesised import).
    """

    # output Dict
    code_dict = {'line_number': [],
                 'code': [],
                 'code_type': [],
                 'import_pair': []
                }

    # Technical
    func_name = ""
    class_name = ""
    import_group_flag = 0  # Indicator for multiline import
    import_pair = 0

    for count, line in enumerate(code_lines):
        element_type = ''

        # Code part of the line: parentheses inside a trailing comment must
        # not open or close an import group.
        statement = line.split('#', 1)[0]

        # -----------------------------------------------------
        # Identify import statements
        if 'import ' in line:
            # Technical
            func_name = ""
            class_name = ""

            if '(' in statement:
                import_pair += 1
                import_pair_num = import_pair

                # Only a parenthesis that stays open continues on the next
                # lines. 'from x import (a, b)' is complete on its own; if
                # the flag were raised for it, every following line up to
                # the next ')' would be tagged as an import.
                if ')' not in statement.split('(', 1)[1]:
                    import_group_flag = 1
            else:
                import_pair_num = 0

            # Output
            element_type = 'Import'

        # Multiline import: closing line
        elif import_group_flag == 1 and ')' in statement:
            import_group_flag = 0

            # Output
            element_type = 'Import'
            import_pair_num = import_pair

        # Multiline import: continuation line
        elif import_group_flag == 1:
            # Output
            element_type = 'Import'
            import_pair_num = import_pair

        # -----------------------------------------------------
        # Identify functions
        elif line.startswith('def '):
            # Technical
            class_name = ""

            # Fetch function name
            if '(' in line:
                func_name = re.findall(r'def (.*?)\(', line)[0]
            else:
                func_name = re.findall(r'def (.*?):', line)[0]

            # Output
            element_type = 'Function '+str(func_name)
            import_pair_num = 0

        elif (line.startswith('    ') or line.strip() == '' or line.strip() == '):') and func_name != "":
            # Technical
            class_name = ""

            # Output
            element_type = 'Function '+str(func_name)
            import_pair_num = 0

        # -----------------------------------------------------
        # Identify Classes
        elif line.startswith('class '):
            # Technical
            func_name = ""

            # Fetch class name
            if '(' in line:
                class_name = re.findall(r'class (.*?)\(', line)[0]
            else:
                class_name = re.findall(r'class (.*?):', line)[0]

            # Output
            element_type = 'Class '+str(class_name)
            import_pair_num = 0

        elif (line.startswith('    ') or line.strip() == '' or line.strip() == '):') and class_name != "":
            # Technical
            func_name = ""

            # Output
            element_type = 'Class '+str(class_name)
            import_pair_num = 0

        # -----------------------------------------------------
        # Identify raw code
        else:
            # Technical
            func_name = ""
            class_name = ""

            # Output
            element_type = 'Code'
            import_pair_num = 0

        # -----------------------------------------------------
        # Output
        code_dict['line_number'].append(count+1)
        code_dict['code'].append(line)
        code_dict['code_type'].append(element_type)
        code_dict['import_pair'].append(import_pair_num)

    return code_dict

In [6]:
def fetch_imports(df):
    """
    Break down the import statements of one file.

    Handled forms:
    - 1a: from <library> import <function>[, <function>...]   (also the first
          line of a parenthesised, multi-line import)
    - 1b: continuation lines of a parenthesised import (import_pair != 0);
          their names are added to the entry opened by the 1a line
    - 2 : import <library>.<library>.<function>
    - 3 : import <library>

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'code', 'code_type' and 'import_pair'. Only rows
        with code_type == 'Import' are used.

    Returns
    -------
    (list, dict)
        - the imported function names, de-duplicated, in first-seen order;
        - a dict mapping library name -> list of names imported from it
          (an empty list for a plain 'import <library>').
    """
    # A Python identifier. The digits matter: without them 'sqlite3' would be
    # read as 'sqlite' and 'h5py' as the two words 'h' and 'py'.
    identifier = r"[A-Za-z_][A-Za-z0-9_]*"

    # output structures
    output_file_function_pair = dict()
    output_import_list = []

    # State carried from row to row (continuation lines extend the list of
    # the row that opened the group). Initialised here so that a first row
    # matching no scenario cannot hit an unbound variable.
    library_name = ''
    function_list = []

    # fetch all imports
    df_import = df[df.code_type == 'Import']

    # break down the import statements and store them
    for index, row in df_import.iterrows():

        # clean comments and aliases. Only the alias itself is dropped, so
        # 'import a as b, c' keeps 'c'.
        import_element = re.sub(r'#(.*)\n?', '', row['code'])
        import_element = re.sub(r"\s+as\s+\w+", "", import_element)

        # check for scenario 1a:
        if 'from ' in import_element:
            # fetch library:
            library_name = " ".join(re.findall("from (.*) import", import_element))

            # Fetch imported functions
            functions_all = " ".join(re.findall("import (.*)", import_element))
            function_list = re.findall(identifier, functions_all)

        # check for scenario 1b:
        elif row['import_pair'] != 0:
            # Fetch imported functions and add them to the open group
            function_list.extend(re.findall(identifier, import_element))

        # Check for scenario 2
        elif "." in import_element:
            # Dotted name right after 'import'. It may be the last token of
            # the line, so no trailing space is required.
            dotted_parts = " ".join(re.findall(r"\bimport\s+(\S+)", import_element)).split(".")

            # fetch library:
            # NOTE(review): the package path is joined with spaces here
            # ('a.b.c' -> 'a b'), unlike scenario 1a which keeps the dots.
            # Kept as-is because consumers of the output are not visible.
            library_name = " ".join(dotted_parts[:-1])

            # Fetch imported functions
            function_list = dotted_parts[-1:]

        # check for scenario 3
        elif "import " in import_element:
            # Text after the import keyword. Splitting on the bare word
            # 'import' would also cut module names such as 'importlib'.
            all_functions = " ".join(re.findall(r"\bimport\s+(.*)", import_element))

            # Fetch library:
            library_name = " ".join(re.findall(identifier, all_functions))

            # Fetch imported functions
            function_list = []

        else:
            # Nothing recognisable left (e.g. the row was only a comment)
            continue

        # Output
        if library_name != '':
            output_file_function_pair[library_name] = function_list
            output_import_list.extend(function_list)

    # remove duplicates
    output_import_list = list(dict.fromkeys(output_import_list))

    return output_import_list, output_file_function_pair

In [7]:
def fetch_functions(df, file_name, path_name):
    """
    Describe every function and class found in one file.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'code' and 'code_type'; rows whose code_type is
        'Function <name>' or 'Class <name>' are grouped by that label.
    file_name : str
        Stored under 'File_name' in every record.
    path_name : str
        Stored under 'Location' in every record.

    Returns
    -------
    (list of dict, list of str)
        - one record per function/class as built by
          extract_func_information, extended with 'File_name', 'Location'
          and 'Type', followed by one extra record of Type 'File' for the
          file itself;
        - the function/class names, in the same order (no entry for the
          file record).
    """
    # output
    output_list = []
    output_names = []

    # Select all functions and classes. The label is tested by its PREFIX:
    # a substring test would file 'Class FunctionWrapper' as a function.
    code_types = df['code_type']
    is_function = code_types.str.startswith('Function ')
    is_class = code_types.str.startswith('Class ')
    df_functions = df[is_function | is_class]

    # Create list of all functions:
    function_list = df_functions['code_type'].unique()

    for function in function_list:

        # filter info for that function:
        df_one_func = df_functions[df_functions['code_type'] == function]

        # breakdown the function code
        function_dict, function_name = extract_func_information(df_one_func)

        # Add file info
        function_dict['File_name'] = file_name
        function_dict['Location'] = path_name

        if function.startswith('Function '):
            function_dict['Type'] = 'Function'
        elif function.startswith('Class '):
            function_dict['Type'] = 'Class'

        # Store results
        output_list.append(function_dict)
        output_names.append(function_name)

    # Add info about file
    output_list.append({
                            'Function_name' : file_name,
                            'Input_var' : "",
                            'Code_comments' : "",
                            'Output_var' : "",
                            'File_name': file_name,
                            'Location': path_name,
                            'Type':'File'
                        })

    return output_list, output_names

In [8]:
def extract_func_information(df):
    """
    Extract name, inputs, comments and returned names of one function/class.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single function or class. Needs the columns 'code' (one
        source line per row, with or without a trailing newline) and
        'code_type' ('Function <name>' or 'Class <name>').

    Returns
    -------
    (dict, str)
        - dict with the keys 'Function_name' (lower-cased), 'Input_var'
          (identifiers found between the parentheses of the last line
          containing 'def '), 'Code_comments' (every '#...' fragment) and
          'Output_var' (identifiers found in return statements);
        - the name as written in the code_type label (original case).
    """
    # A Python identifier; the digits keep 'x1' from being cut down to 'x'.
    identifier = r"[A-Za-z_][A-Za-z0-9_]*"

    # Function dictionary:
    function_dict = {
        'Function_name' : "",
        'Input_var' : [],
        'Code_comments' : [],
        'Output_var' : []
    }

    # Technical: number of parentheses a return statement has left open.
    # While it is positive the following lines still belong to that return.
    open_parens = 0

    #------------------------------------------
    # fetch function name
    function_name = df['code_type'].iloc[0].split()[1]
    # output
    function_dict['Function_name'] = function_name.lower()

    # break down the code lines and store the findings
    for index, row in df.iterrows():
        code = row['code']

        #------------------------------------------
        # Fetch input fields
        if 'def ' in code:
            input_str = ''.join(re.findall(r'\((.*?)\)', code))
            function_dict['Input_var'] = re.findall(identifier, input_str)

        #------------------------------------------
        # Fetch comments
        if '#' in code:
            function_dict['Code_comments'].extend(re.findall(r'#[^\r\n]*', code))

        #------------------------------------------
        # fetch output fields
        if 'return ' in code:
            # '.' never matches a newline, so this works both for lines that
            # kept their '\n' and for lines produced by str.splitlines().
            # A trailing comment is not part of the returned expression.
            output_str = " ".join(re.findall(r'return (.*)', code)).split('#', 1)[0]
            function_dict['Output_var'].extend(re.findall(identifier, output_str))

            # Multi-line only when a parenthesis is still open at the end of
            # the line; 'return foo(x)' is complete.
            open_parens = max(output_str.count('(') - output_str.count(')'), 0)

        # multiline approach: continuation of an unfinished return
        elif open_parens > 0:
            function_dict['Output_var'].extend(re.findall(identifier, code))

            # Stop collecting once the parentheses are balanced again
            open_parens = max(open_parens + code.count('(') - code.count(')'), 0)

    return function_dict, function_name

In [9]:
def fetch_dependencies(df, all_dependencies):
    """
    Find which code elements mention which known functions/classes.

    Every function, every class and the plain code of every file is one
    element (named after the function/class, or after the file for plain
    code). The source text of each element is searched for each name in
    `all_dependencies`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'code', 'code_type' and 'File_name'. Rows tagged
        'Import' are ignored.
    all_dependencies : iterable of str
        Names to look for.

    Returns
    -------
    list of [str, str, list of str]
        One entry [element, dependency, parameters] per element that
        mentions a dependency other than itself. `parameters` holds the
        identifiers found between the parentheses of all
        '<dependency>(...)' occurrences in the element.
    """
    # A Python identifier; the digits keep 'x1' from being cut down to 'x'.
    identifier = r"[A-Za-z_][A-Za-z0-9_]*"

    # Output
    dependencies_list = []

    # fetch all functions, classes and plain code present in the .py files
    code_types = df['code_type']
    df_imports = df[(code_types.str.contains('Function')) | (code_types.str.contains('Class')) | (code_types.str.contains('Code'))]

    # Element name per row: the file for plain code, otherwise the second
    # word of the label. Built as a plain list so an empty frame works too.
    field_names = [
        file_name if code_type == 'Code' else code_type.split()[1]
        for code_type, file_name in zip(df_imports['code_type'], df_imports['File_name'])
    ]
    df_imports = df_imports.assign(field_name=field_names)
    import_list = df_imports['field_name'].unique()

    # One pair of patterns per dependency, compiled once. The word boundaries
    # make 'get' match only the name 'get', not 'get_all' or 'target'.
    patterns = [
        (name,
         re.compile(r"\b{0}\b".format(re.escape(name))),
         re.compile(r"\b{0}\(([^]]*?)\)".format(re.escape(name))))
        for name in all_dependencies
    ]

    # Identify dependencies for all elements
    for element in import_list:

        # filter info for that element:
        df_one_element = df_imports[df_imports['field_name'] == element]

        # Combine code into one string
        element_str = df_one_element['code'].str.cat(sep='\n')

        for dependent_import, mention, invocation in patterns:
            # An element always mentions itself (its own def/class line)
            if element == dependent_import:
                continue

            # Check if function invoked
            if mention.search(element_str):
                # Fetch info about invocation parameters:
                invocation_str = ''.join(invocation.findall(element_str))
                invocation_parameters = re.findall(identifier, invocation_str)

                # Output
                dependencies_list.append([element, dependent_import, invocation_parameters])

    return dependencies_list

In [10]:
# 1. Define output containers
file_frames = []  # one line-level DataFrame per analysed file
output_func = []
output_elements_list = []
output_library_function_pair = []

# 2. Create list of all subdirectories + file names
full_file_paths = get_filepaths(rootdir,
                                file_extention = 'py')

# 3. loop through all files
for folder_directory in full_file_paths:

    # 4. Fetch file name (path relative to the root, and bare file name).
    # os.path.relpath works with any path separator, not only '\\'.
    path_name = os.path.relpath(folder_directory, rootdir)
    file_name = os.path.split(folder_directory)[-1]

    # 5. extract data from file
    file_string = read_input(folder_directory)

    # 6. Split code by lines
    code_lines = file_string.splitlines()

    # 7. identify elements in code
    code_dict = process_code_lines(code_lines)
    df = pd.DataFrame(code_dict)

    # 8. Fetch information about imported functions
    import_list, library_function_pair = fetch_imports(df)

    # 9. Fetch information about the function structure
    function_list, function_names = fetch_functions(df, file_name, path_name)

    # 10. Store data
    # Dataframe
    df['File'] = path_name
    df['File_name'] = file_name
    file_frames.append(df)

    # Output all elements for search
    output_elements_list.extend(function_names)

    # Import pairs
    output_library_function_pair.append(library_function_pair)

    # Functions
    output_func.extend(function_list)

# Combine the per-file frames once: concatenating inside the loop copies the
# accumulated frame on every iteration.
output_df = pd.concat(file_frames) if file_frames else pd.DataFrame()

# 11. remove duplicates
output_elements_list = list(dict.fromkeys(output_elements_list))

# 12. Fetch dependencies between functions
# (skipped when no file was found: the empty frame has no columns to search)
output_dependencies = fetch_dependencies(output_df, output_elements_list) if file_frames else []

In [11]:
# Store results
# All files are written to the current working directory. The context
# managers close each file even if pickling fails part-way.

# Dataframe
output_df.to_csv('code structure.csv', index=False)

# Import
with open(r'library_function_pair_info.pkl', 'wb') as import_file:
    pickle.dump(output_library_function_pair, import_file)

# Functions
with open(r'function_info.pkl', 'wb') as functions_file:
    pickle.dump(output_func, functions_file)

# Dependencies
with open(r'dependency_info.pkl', 'wb') as dependency_file:
    pickle.dump(output_dependencies, dependency_file)