In [1]:
import sys
sys.path.append('./utils/')
from tokenization_utils import *
from extract_function_utils import *
from execution_utils import *
# from compilation_utils import *

import time
from tqdm.auto import tqdm


In [2]:
%load_ext autoreload
%autoreload 2

### Read Programs before tokenization

In [3]:
# Load the program file (the untokenized variant, per this section's heading) and derive
# the program and program-id dictionaries from its id/language mapping.
# NOTE(review): the following cell rebinds program_json_dict, program_id_lang_dic,
# pids_dict and programs_dict, so the values produced here only live until that cell runs.
program_json_dict, program_id_lang_dic = read_program_file()
programs_dict, pids_dict = get_all_programs(program_id_lang_dic)

### Read Programs

In [4]:
# Load the tokenized program file, then fetch four dictionaries (ids, raw programs,
# tokenized programs, detokenized programs) through the *_from_cache helper.
# NOTE(review): the four *_path names are not assigned in any visible cell — presumably
# they are exported by the star imports in the first cell; confirm.
program_json_dict, program_id_lang_dic = read_program_tok_file()
pids_dict, programs_dict, programs_toked_dict, programs_detoked_dict = get_all_programs_detok_from_cache(
                                                                                    program_id_lang_dic,
                                                                                    pids_dict_path, 
                                                                                    programs_dict_path, 
                                                                                    programs_toked_dict_path,
                                                                                    programs_detoked_dict_path)

### Execute programs

#### Linear way to execute (has progress bar)

In [None]:
# Execute up to the first 1000 programs one at a time (tqdm shows progress), keeping
# every result and accumulating the pass flags.
# NOTE(review): `programs_list` is never assigned in the visible cells and `lang` is only
# assigned in later cells — both must come from the star imports or from leftover kernel
# state; confirm before running on a fresh kernel.
success_count = 0
result_dict = {}  # loop index -> result returned by exec_single_code_util
for i, pp in enumerate(tqdm(programs_list[:1000])):
    # The trailing 3 is passed straight to the helper; presumably a timeout or retry
    # limit — TODO confirm against exec_single_code_util.
    result, is_pass = exec_single_code_util(pp, lang, i, 3)
    success_count += is_pass  # assumes is_pass is a bool/int so it can be summed
    result_dict[i] = result

#### Run single code (reliable)

In [None]:
# Run the program at index `key` through every language runner.
# Each output is printed right after its run so that no result is silently
# overwritten by the next runner before anyone can see it.
# NOTE(review): the *_programs_list names are not assigned in any visible cell —
# confirm where they come from before running on a fresh kernel.
key = 0
output = run_exec_python(py_programs_list[key])
print("Python:", output)
output = run_exec_java(java_programs_list[key])
print("Java:", output)
output = run_exec_cpp(cpp_programs_list[key])
print("C++:", output)
output = run_exec_csharp(csharp_programs_list[key])
print("C#:", output)
output = run_exec_js(js_programs_list[key])
print("Javascript:", output)
output = run_exec_c(c_programs_list[key])
print("C:", output)
# `output` still ends up holding the PHP result, as before.
output = run_exec_php(php_programs_list[key])
print("PHP:", output)

In [None]:
# Run only the C# runner on the program at index `key` and print what it returns.
# NOTE(review): csharp_programs_list is not assigned in any visible cell — confirm its source.
key = 0
output = run_exec_csharp(csharp_programs_list[key])
print(output)

### Explore Parallelism

#### Use p_tqdm (supports progress bar)

In [None]:
# Before and after fixing formatting error (||)
# C++ Java Python C# Javascript PHP C
# 0.88 0.87 0.88 0.88 0.89 0.91 0.70 
# 0.96 0.95 0.89 0.96 0.97 0.98 0.76
# 0.87 0.94 0.88 0.96 0 1 0.61 (after detok)

### Generate non-buggy pairwise data

In [7]:
# Prepare the inputs for the pairwise dataset: evaluation splits, cached execution
# results, preprocessed programs, and the execution-filtered subsets of both.
split_dict = load_split_dict()
test_list, val_list = get_eval_list(split_dict)
# Reload the pickled per-model execution results from the cache directory.
# NOTE(review): `pickle` and `cached_path` are not imported/assigned in the visible
# cells — presumably they arrive through the star imports; confirm.
with open(cached_path + "xlcost_tokenizer_results_dict.pkl", 'rb') as infile:
    tokenizer_results_dict = pickle.load(infile)
# Only the 'codet5' entry of the cached results feeds the result mapping.
result_key_lang_dict, error_type_lang_dict = single_result_mapping(tokenizer_results_dict['codet5'], pids_dict)

prepro_program_dict = data_prepro_notok(programs_dict, programs_toked_dict)
# Filter ids and preprocessed programs using the result mapping built above.
exec_pids_dict, exec_prepro_program_dict = get_exec_filtered_dict(pids_dict, 
                                                                  result_key_lang_dict, prepro_program_dict)
# The dataset-writing call is currently disabled:
# get_pair_data_notok(data_path, "pair_data_notok_exec_full", 
#                     exec_pids_dict, exec_prepro_program_dict, test_list, val_list)
# should generate another dataset just for plbart


C++ 10597 601
Java 10278 750
Python 9558 1255
C# 10262 473
Javascript 9673 278
PHP 3304 249
C 431 143


In [10]:
# Tab-separated program counts from programs_dict, one column per entry of `langs`.
stat_line = ""
for lang in langs:
    # Every count is followed by a tab, including the last one.
    stat_line += f"{len(programs_dict[lang])}\t"
print(stat_line)

11198	11028	10813	10735	9951	3553	574	


In [11]:
# Tab-separated program counts from exec_prepro_program_dict, one column per entry of `langs`.
stat_line = ""
for lang in langs:
    # Every count is followed by a tab, including the last one.
    stat_line += f"{len(exec_prepro_program_dict[lang])}\t"
print(stat_line)

10597	10278	9558	10262	9673	3304	431	


### Generate separate dataset for plbart (it has special token "java")

In [None]:
# Rebuild the preprocessed programs with is_plbart=True, re-apply the execution filter and
# hand the result to get_pair_data_notok under the name "pair_data_notok_exec_full_plbart".
# NOTE(review): this rebinds prepro_program_dict, exec_pids_dict and
# exec_prepro_program_dict, so any cell run afterwards sees the plbart variants.
prepro_program_dict = data_prepro_notok(programs_dict, programs_toked_dict, is_plbart=True)
exec_pids_dict, exec_prepro_program_dict = get_exec_filtered_dict(pids_dict, 
                                                                  result_key_lang_dict, prepro_program_dict)
get_pair_data_notok(data_path, "pair_data_notok_exec_full_plbart", 
                    exec_pids_dict, exec_prepro_program_dict, test_list, val_list)

In [None]:
# sanity check
# path = CodeModel/g4g/clean_xlcost.ipynb

# filter by buggy and length
# Uses whichever prepro_program_dict is currently bound (an earlier cell may have
# replaced it with the is_plbart=True variant).
exec_pids_dict, exec_prepro_program_dict = get_exec_filtered_dict(pids_dict, 
                                                                  result_key_lang_dict, prepro_program_dict)
# NOTE(review): `tokenizer` is only bound as the loop variable of the tokenizer loop in a
# later cell; on a fresh kernel it is undefined here — confirm which tokenizer is intended.
length_lang_dict = get_length_lang_dict(prepro_program_dict, tokenizer) 
# is length filtering necessary??
len_exec_pids_dict, len_exec_prepro_program_dict = get_len_exec_filtered_dict(pids_dict, 
                                                                              result_key_lang_dict, 
                                                                              length_lang_dict,
                                                                             prepro_program_dict)


### Test how different models' tokenization affects execution rate

In [None]:
from transformers import AutoTokenizer

# Baseline models whose tokenizers are compared.
baseline_models = ['codebert', 'plbart', 'codet5']

# Model name -> tokenizer loaded from the identifier looked up in baseline_model_dict.
tokenizer_dict = {
    name: AutoTokenizer.from_pretrained(baseline_model_dict[name])
    for name in baseline_models
}

In [None]:
# For every baseline tokenizer: preprocess the programs, run the tokenizer check on them,
# execute the returned programs and store the execution results under the model name.
# prepro_program_dict = data_prepro_notok(programs_dict, programs_toked_dict)
tokenizer_results_dict = {}
for model_type, tokenizer in tokenizer_dict.items():
    # Rebuilt on every iteration with identical arguments.
    # NOTE(review): could be hoisted out of the loop if tokenizer_exec_check does not
    # mutate its input — confirm before changing.
    prepro_program_dict = data_prepro_notok(programs_dict, programs_toked_dict)
    # num_dp=-1 is passed through unchanged; presumably "no data-point limit" — TODO confirm.
    decode_program_dict = tokenizer_exec_check(tokenizer, prepro_program_dict, model_type, num_dp=-1)
    results_dict = get_exec_results(decode_program_dict)
    tokenizer_results_dict[model_type] = results_dict

In [None]:
# Print each model name and call show_result_summary on that model's results.
# NOTE(review): result_type_dict is reassigned on every iteration, so after the loop it
# (and results_dict) refer only to the last model iterated.
for model_type, results_dict in tokenizer_results_dict.items():
    print(model_type)
    result_type_dict = show_result_summary(results_dict)

In [None]:
# Pickle exec_pids_dict to "xlcost_exec_pids_dict.pkl" under cached_path.
# The names on the next line are presumably other cache files this cell has been used
# for — TODO confirm.
# xlcost_detok_gold_result_dict xlcost_gold_result_dict xlcost_tokenizer_results_dict
with open(cached_path + "xlcost_exec_pids_dict.pkl", 'wb') as outfile:
    pickle.dump(exec_pids_dict, outfile)

In [None]:
# Reload tokenizer_results_dict from its pickle under cached_path, replacing whatever
# value the name currently holds.
with open(cached_path + "xlcost_tokenizer_results_dict.pkl", 'rb') as infile:
    tokenizer_results_dict = pickle.load(infile)

In [None]:
# updated xlcost data to fix the java error
# result_type_dict = show_result_summary(results_dict)
# Map the 'codet5' results onto program ids, then display the second returned dict
# (error_type_lang_dict) as the cell output.
result_key_lang_dict, error_type_lang_dict = single_result_mapping(tokenizer_results_dict['codet5'], pids_dict)
error_type_lang_dict

In [None]:
# Print every Java entry whose error text contains "main(String[])".
# NOTE(review): `result_type_dict` and `results_dict` are not assigned in this cell; they
# are whatever an earlier loop cell left bound on its last iteration — confirm they refer
# to the intended model.
# NOTE(review): zip pairs the k-th program id with the k-th error index; if the error list
# holds positions into pids_dict[lang], the printed pid may not belong to idx — verify.
lang = "Java"
for pid, idx in zip(pids_dict[lang], result_type_dict['error'][lang]):
    error = results_dict[lang][idx]
    if "main(String[])" in error:
        print(pid, idx, error)

#### Use ProcessPoolExecutor 

In [None]:
# this is the code that saves return values in order
import time
# Import the submodule explicitly: `import concurrent` alone does not load
# `concurrent.futures`, so the `concurrent.futures.ProcessPoolExecutor` lookup later in
# this cell would only work if some other module had already imported it.
import concurrent.futures
def useless_function(sec = 1):
    """Sleep for ``sec`` seconds and hand ``sec`` back.

    Prints one line before sleeping and one line after waking up.

    Args:
        sec: Duration passed to ``time.sleep``; defaults to 1.

    Returns:
        The same ``sec`` value that was passed in.
    """
    print(f'Sleeping for {sec} second(s)')
    time.sleep(sec)
    # No placeholders in this message, so a plain string literal is enough.
    print('Done sleeping')
    return sec

def exec_program(program):
    """Run ``program`` through ``run_exec`` and pair the source with its output.

    Args:
        program: The value handed to ``run_exec`` unchanged.

    Returns:
        A dict with the original input under ``"code_string"`` and the value
        returned by ``run_exec`` under ``"output"``.
    """
    return {"code_string": program, "output": run_exec(program)}


# Time a process-pool run of run_exec over up to the first 100 programs.
start = time.perf_counter()
# ProcessPoolExecutor() with no arguments: the worker count is left to the library default.
with concurrent.futures.ProcessPoolExecutor() as executor:
    # Executor.map yields results in the same order as the inputs.
    results = executor.map(run_exec, programs_list[:100])
    for res in results:
        print(f'Return Value: {res}')
end = time.perf_counter()
# Elapsed wall-clock time, rounded to two decimals.
print(f'Finished in {round(end-start, 2)} second(s)') 

#### Using imap (with progress bar)

In [None]:
from multiprocessing import Pool
# Bind `tqdm` to the progress-bar callable, exactly as the first cell does. A bare
# `import tqdm` would rebind the name to the module and break the `tqdm(...)` call in the
# linear-execution cell when that cell is re-run afterwards.
from tqdm.auto import tqdm

n = 1000
# Slice once so the progress-bar total matches the number of programs actually submitted,
# even when fewer than `n` are available.
batch = programs_list[:n]
with Pool() as pool:
    # imap yields results lazily and in input order, so the bar advances per finished program.
    results = list(tqdm(pool.imap(run_exec, batch), total=len(batch)))
