# Parse Java Methods
----
(C) Maxim Gansert, 2020, Mindscan Engineering

In [None]:
import sys
sys.path.insert(0,'../src')

import os
import datetime

In [None]:
from com.github.c2nes.javalang import tokenizer, parser, ast
from de.mindscan.fluentgenesis.dataprocessing.method_extractor import tokenize_file, extract_allmethods_from_compilation_unit


In [None]:
from de.mindscan.fluentgenesis.bpe.bpe_model import BPEModel
from de.mindscan.fluentgenesis.bpe.bpe_encoder_decoder import SimpleBPEEncoder
from de.mindscan.fluentgenesis.dataprocessing.method_dataset import MethodDataset

In [None]:
def split_methodbody_into_multiple_lines(method_body):
    """Group the token values of a method body by the source line they sit on.

    Each item of ``method_body`` must expose ``position`` (whose first element
    is used as the line number) and ``value``. Consecutive tokens that share a
    line number end up in the same inner list; whenever the line number
    changes, a new inner list is started.

    Returns a list of lists of token values, in input order. An empty
    ``method_body`` yields an empty list.
    """
    lines = []
    previous_line = -1
    for tok in method_body:
        line_no = tok.position[0]
        # Start a fresh bucket for the very first token and on every change
        # of line number; otherwise keep filling the most recent bucket.
        if not lines or line_no != previous_line:
            lines.append([])
            previous_line = line_no
        lines[-1].append(tok.value)
    return lines


In [None]:
def process_source_file(dataset_directory, source_file_path, encoder, dataset):
    """BPE-encode every method found in one Java source file and store it in ``dataset``.

    Parameters:
        dataset_directory: base directory that ``source_file_path`` is joined onto.
        source_file_path: path of the Java file relative to ``dataset_directory``;
            it is stored unchanged in every record.
        encoder: object providing ``encode(list)`` and
            ``encode_multi_line(list_of_lists)``.
        dataset: object providing ``add_method_data(dict)``.

    Failures while tokenizing or parsing the file itself propagate to the
    caller. A failure while handling an individual method is reported on
    stderr and that method is skipped, so one problematic method does not
    abort the rest of the file.
    """
    # derive the full source file path
    full_source_file_path = os.path.join(dataset_directory, source_file_path)

    # Work on the source file
    java_tokenlist = tokenize_file(full_source_file_path)
    parsed_compilation_unit = parser.parse(java_tokenlist)

    # collect file names, line numbers, method names, class names etc
    all_methods_per_source = extract_allmethods_from_compilation_unit(parsed_compilation_unit, java_tokenlist)

    for single_method in all_methods_per_source:
        # Placeholder so the skip message below still works when the failure
        # happens before the real method name could be read.
        method_name = '<unknown>'
        try:
            method_name = single_method['method_name']
            method_class_name = single_method['class_name']
            method_body = single_method['method_body']

            multi_line_body = split_methodbody_into_multiple_lines(method_body)
            # Flattened token view, only used for the notebook output below.
            one_line = [item for sublist in multi_line_body for item in sublist]
            print(one_line)

            # encode body code and methodnames using the bpe-vocabulary
            bpe_encoded_methodname = encoder.encode([method_name])
            bpe_encoded_methodbody_ml = encoder.encode_multi_line(multi_line_body)

            # do some calculations on the tokens and on the java code, so selection of smaller datasets is possible
            bpe_encoded_method_name_length = len(bpe_encoded_methodname)
            bpe_encoded_method_body_length = sum(len(line) for line in bpe_encoded_methodbody_ml)

            # save this into dataset
            method_data = {
                "source_file_path": source_file_path,
                "method_class_name": method_class_name,
                "method_name": method_name,
                "encoded_method_name_length": bpe_encoded_method_name_length,
                "encoded_method_name": bpe_encoded_methodname,
                "encoded_method_body_length": bpe_encoded_method_body_length,
                "encoded_method_body": bpe_encoded_methodbody_ml,
                "method_body": method_body,
            }
            dataset.add_method_data(method_data)
        except Exception as error:
            # Best effort: skip the problematic method, but say so instead of
            # hiding it. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit still stop the run.
            print(
                "skipping method {!r} in {!r}: {!r}".format(method_name, source_file_path, error),
                file=sys.stderr,
            )


In [None]:

# Driver cell: encode the methods of one Java file and write them to a dataset.

# Create the BPE model object for "16K-full" and load its hyperparameters.
model = BPEModel("16K-full", "../src/de/mindscan/fluentgenesis/bpe/")
model.load_hparams()

# NOTE(review): hard-coded, machine-specific Windows path - adjust before running elsewhere.
dataset_directory = 'D:\\Downloads\\Big-Code-excerpt\\'

# Both results are handed to the encoder constructor below.
model_vocabulary = model.load_tokens()
model_bpe_data = model.load_bpe_pairs()

encoder = SimpleBPEEncoder(model_vocabulary, model_bpe_data)

# The same directory serves as the dataset location and as the base for the source file path.
method_dataset = MethodDataset(dataset_name='parseMethodPythonNotebook1.jsonl')
method_dataset.prepareNewDataset(dataset_directory)
    
# Path of the Java file is relative to dataset_directory.
process_source_file(dataset_directory,'wordhash/WordMap.java' ,encoder, method_dataset )

# NOTE(review): not reached if process_source_file raises (e.g. on a parse
# error); presumably finish() finalizes/closes the dataset - confirm whether
# it should run in a try/finally.
method_dataset.finish()
