# Explore c2nes Parser
(C) Maxim Gansert, 2020


In [None]:
import sys

sys.path.insert(0,'../src')

from com.github.c2nes.javalang import tokenizer, parser, ast
from com.github.c2nes.javalang.tree import ClassDeclaration, ClassCreator


In [None]:
def runTokenizerForFile(filename):
    """Read a Java source file and return its tokens as a list.

    The file is opened in binary mode, every line is decoded as UTF-8 and
    the joined text is passed to ``tokenizer.tokenize`` with
    ``ignore_errors=False``.

    Args:
        filename: Path of the source file to tokenize.

    Returns:
        A list holding everything ``tokenizer.tokenize`` yields for the file.
    """
    with open(filename, "rb") as source_file:
        raw_lines = source_file.readlines()
        source_text = "".join(raw_line.decode('utf-8') for raw_line in raw_lines)
        # Materialise the tokens so callers can iterate them more than once.
        return list(tokenizer.tokenize(source_text, ignore_errors=False))


In [None]:
# Root folder of the local dataset copy (Windows path, trailing separator included
# so that relative paths can simply be appended).
dataset_directory = 'D:\\Downloads\\Big-Code-full\\java_projects\\'

# Candidate input files follow; un-comment exactly one assignment to pick the
# Java file that the remaining cells tokenize and parse.

# only one class
# some_source_filename = dataset_directory+'Algorithms\\src\\org\\rekdev\\trees\\BinaryTreeNode.java'

# has multiple classes parallel in one compilation unit
# some_source_filename = dataset_directory+'CSSMin\\CSSMin.java'

# has nested classes
# some_source_filename = dataset_directory+'cvs-plugin\\src\\main\\java\\hudson\\scm\\CVSChangeLogSet.java'

# inner and/or anonymous classes
some_source_filename = dataset_directory+'emf\\plugins\\org.eclipse.emf.codegen\\src\\org\\eclipse\\emf\\codegen\\CodeGen.java'


In [None]:
# Tokenize the selected Java file; java_tokenlist is reused by the parsing and
# method-extraction cells below.
java_tokenlist = runTokenizerForFile(some_source_filename)
# Plain token values only (each token's .value attribute), handy for inspection.
java_value_tokens = [x.value for x in java_tokenlist]


# print (java_value_tokens)
# print (java_tokenlist)

In [None]:
# Build the syntax tree from the token list produced above.
parsed_tree = parser.parse(java_tokenlist)

# Show which node type the parser returned for the whole file.
print (type(parsed_tree))


In [None]:
def extract_method_body(tokens):
    """Return the tokens enclosed by the first balanced ``{ ... }`` pair.

    Tokens in front of the first opening brace are skipped. Everything after
    it is collected, nested braces included, until the closing brace that
    brings the nesting depth back to zero. The outermost braces themselves
    are not part of the result.

    Args:
        tokens: Iterable of objects exposing a ``value`` attribute.

    Returns:
        A list of the collected tokens. It is empty when there is no opening
        brace, or when a closing brace shows up before any opening brace
        (e.g. the token stream of a method without a body). If the opening
        brace is never balanced, every token after it is returned.
    """
    body_tokens = []
    inside_body = False
    depth = 0

    for token in tokens:
        # Compare by value: identity checks against literals only work by
        # accident of interpreter caching.
        if token.value == '}':
            depth -= 1
            # depth == 0: the body's own closing brace (not collected).
            # depth < 0: more closing than opening braces, so there is no
            # body here and scanning on would pick up unrelated code.
            if depth <= 0:
                break

        if inside_body:
            body_tokens.append(token)

        # The opening brace is handled after the append so that the first
        # one is not collected, while nested ones are.
        if token.value == '{':
            depth += 1
            inside_body = True

    return body_tokens

In [None]:
def collect_method_tokens(index, collected_start_positions, java_tokenlist):
    """Collect the tokens that belong to the method at ``index``.

    Collection starts at the token whose position equals
    ``collected_start_positions[index]`` and ends right before the token
    whose position equals the next start position. For the last entry there
    is no next start position, so collection runs to the end of the token
    list.

    Args:
        index: Index into ``collected_start_positions`` of the wanted method.
        collected_start_positions: Start positions of the methods; assumed
            to be in source order (TODO confirm against the parser).
        java_tokenlist: Iterable of tokens exposing a ``position`` attribute.

    Returns:
        A list of the collected tokens; empty if no token has the start
        position.

    Raises:
        IndexError: If ``index`` is out of range for
            ``collected_start_positions``.
    """
    start_position = collected_start_positions[index]
    # Bounds check: testing ``index + 1 in collected_start_positions`` would
    # look for the integer among the positions and never find it.
    has_next = index + 1 < len(collected_start_positions)
    stop_position = collected_start_positions[index + 1] if has_next else None

    collecting = False
    collected_method_tokens = []

    for token in java_tokenlist:
        # Positions are compared by value; equal positions need not be the
        # very same object.
        if token.position == start_position:
            collecting = True
        if collecting and has_next and token.position == stop_position:
            # The next method begins here; nothing further belongs to us.
            break
        if collecting:
            collected_method_tokens.append(token)

    return collected_method_tokens

In [None]:
def extract_method(method_index, collected_start_positions, java_tokenlist):
    """Return the body tokens of the method at ``method_index``.

    Works in two steps: ``collect_method_tokens`` gathers the tokens for the
    method, then ``extract_method_body`` reduces them to the method body.

    Note from the original author: the start positions are off by the
    modifiers, and start at the type signature.
    """
    # Could be merged into a single pass: the first step collects a longer
    # list which the second step immediately shortens again.
    method_tokens = collect_method_tokens(method_index, collected_start_positions, java_tokenlist)
    return extract_method_body(method_tokens)

# print(tokenizer.reformat_tokens(extract_method(10, collected_start_positions, java_tokenlist)))

In [None]:
def calculate_method_start_indexes_for_class(class_declaration):
    """Return the start positions and names of a class's methods.

    Args:
        class_declaration: Object exposing ``methods``, a sequence of items
            that each have a ``position`` and a ``name`` attribute.

    Returns:
        A tuple ``(collected_start_positions, collected_method_names)`` of
        two lists of equal length, ordered like ``class_declaration.methods``.
    """
    # Read the attribute once; it may be recomputed on every access.
    methods = class_declaration.methods

    collected_start_positions = [method.position for method in methods]
    collected_method_names = [method.name for method in methods]

    return collected_start_positions, collected_method_names


def extract_methods_from_class(class_declaration, java_tokenlist):
    """Extract every method of a class as a name/body record.

    Args:
        class_declaration: Class node whose methods are extracted.
        java_tokenlist: Tokens of the file the class was parsed from.

    Returns:
        A list with one dict per method, holding the method's body tokens
        under ``'method_body'`` and its name under ``'method_name'``.
    """
    start_positions, method_names = calculate_method_start_indexes_for_class(class_declaration)

    # Both lists are index-aligned, so the enumerate index also addresses
    # the matching start position.
    return [
        {
            'method_body': extract_method(position_index, start_positions, java_tokenlist),
            'method_name': method_name,
        }
        for position_index, method_name in enumerate(method_names)
    ]

In [None]:
def extract_classes_from_compilation_unit(compilation_unit_ast):
    """Collect every ``ClassDeclaration`` node found in the given tree.

    Walks the tree with ``ast.walk_tree``, keeps the nodes that are
    ``ClassDeclaration`` instances, prints the name of each one and
    returns them in the order the walk produced them.
    """
    classes = [
        node
        for _, node in ast.walk_tree(compilation_unit_ast)
        if isinstance(node, ClassDeclaration)
    ]

    for clazz in classes:
        print(clazz.name)

    return classes

# Try it out on the parsed file: prints each class name and, as the last
# expression of the cell, shows the returned list of class nodes.
extract_classes_from_compilation_unit(parsed_tree)

In [None]:
def extract_allmethods_from_compilation_unit(compilation_unit_ast, java_tokenlist):
    """Print every method of every class found in a compilation unit.

    For each class returned by ``extract_classes_from_compilation_unit`` the
    methods are extracted from ``java_tokenlist``; each method is printed as
    a ``==[name]==`` header followed by its body, formatted by
    ``tokenizer.reformat_tokens``. Nothing is returned.
    """
    for class_declaration in extract_classes_from_compilation_unit(compilation_unit_ast):
        for method_entry in extract_methods_from_class(class_declaration, java_tokenlist):
            print("==["+method_entry['method_name']+"]==")
            print(tokenizer.reformat_tokens(method_entry['method_body']))


# Run the full extraction on the parsed file and print every method body.
extract_allmethods_from_compilation_unit(parsed_tree, java_tokenlist)