In [1]:
import os
import torch
# Restrict CUDA to the GPU with index 1 for this process. The variable is read
# when CUDA is first initialised, so it has to be set before any CUDA call.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
# Generic CUDA device handle; with one visible GPU it refers to that GPU.
device = torch.device("cuda")

In [2]:
import sys
sys.path.append('./huggingface_models/')
sys.path.append('./utils/')
from sample_utils import *
from inference_utils import *
from codenet_process_utils import *
from self_training_utils import *

In [3]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Remove solutions that are in other languages
See the bottom cell ("Clean CodeNet Accepted Files").

### Collect accepted problems
Get problems_dict: get_codenet_dict
```
problems_dict['p00001'].keys(): ['desc', 'io', 'solutions']
```
A few problems also have a 'meta' key.
```
problems_dict['p00001']['io'].keys(): ['output', 'input']
```
The 'io' seems to be extracted from the 'desc', but not exhaustively. The 'desc' usually contains more input-output pairs than what's in 'io'.

### Parse the programs into codedict
Get code_dict: get_codenet_code_dict
```
codes_dict.keys(): ['C++', 'Java', 'Python', 'C#', 'C']
codes_dict['Java'][0].keys(): ['functions', 'program_pieces', 'function_names', 'parameter_lists', 'return_types', 'target_call', 'target_call_args', 'target_call_params', 'target_call_return_type', 'idx', 'pid', 'program_formatted', 'io']
codes_dict['Java'][0]['idx']:'s150444541.java'
codes_dict['Java'][0]['pid']:'p00100'
```

### Filter programs by function and compilation
1. Filter programs that have functions (other than main/Main): get_nonempty_functions
2. Filter by compilation: get_codenet_call_dict. Note that in this step, we don't compile the original program. Instead, we combine the import_str extracted from the original program with the functions into a new program, and compile this new program.
3. Get filtered programs: get_compiled_functions
We get call_dict in this step. \
```
call_dict[lang] = [programs, processed_results, result_keys, error_type_dict]
```
We also get filtered_dict in this step.\
```
filtered_dict["Java"][0].keys(): ['code_dic_id', "import_str", "function", "pid"]
```

### Merge filtered program
Merge all the filtered programs into one dict (merged_filtered_dict).
```
merged_filtered_dict.keys(): ['C++', 'Java', 'Python', 'C#', 'C']
merged_filtered_dict["Java"][0].keys(): ['code_dic_id', 'import_str', 'function', 'pid', 'code_dic', 'batch_id']
```

### No-tok preprocessing
Process the filtered data for model training.
1. Remove comments and empty lines: `format_codestring_codenet(codestring, lang)`
2. Replace newlines: `notok_prepro(codestring, lang, is_plbart)`
3. After decoding, apply notok_detok: `notok_detok(codestring, lang, is_plbart)`
4. Run `detok_format(codestring, detokenizer)` to get the detokenized version for Java and Python

### Cached Files
codenet/codenet_problems_dict_i.json\
codenet/codenet_codedict_i.json\
codenet/codenet_call_dict_i.json\
codenet/codenet_filtered_dict_i.json\
codenet_merged_filtered_dict.json\
codenet_merged_filtered_dict_notok.json

Since "java" is a special token in plbart, we have to create input data for plbart separately.\
codenet_merged_filtered_dict_notok_plbart.json\


### Codenet data preprocessing

#### Get codedicts

In [None]:
# Parse each raw problem batch into a per-language code dict and cache it as
# JSON. A batch whose cache file already exists is skipped entirely, so it is
# not re-parsed and is NOT appended to code_lang_dict_list.
num_batch = 41
code_lang_dict_list = []
for i in tqdm(range(num_batch)):
    print(i)
    batch_suffix = str(i) + '.json'
    codedict_path = cached_path + 'codenet/codenet_codedict_' + batch_suffix
    if not os.path.exists(codedict_path):
        problems_dict_path = cached_path + 'codenet/codenet_problems_dict_' + batch_suffix
        with open(problems_dict_path) as infile:
            codenet_problems_dict_batch = json.load(infile)
        parsed = get_codenet_programs(codenet_problems_dict_batch, new_langs)
        programs_dict, programs_idx_dict, program_id_dict = parsed
        code_lang_dict = get_codenet_code_dict(
            programs_dict,
            programs_idx_dict,
            program_id_dict,
            codenet_problems_dict_batch,
        )
        # Persist the freshly parsed batch under the same path that was checked above.
        with open(codedict_path, 'w') as outfile:
            json.dump(code_lang_dict, outfile)
        code_lang_dict_list.append(code_lang_dict)

#### Keep only programs that compile

In [None]:
# For every batch: load its cached codedict, keep programs that define
# functions, compile the extracted functions, and cache both the compilation
# results (call_dict) and the surviving programs (filtered_dict) as JSON.
num_batch = 41
todo_idx = list(range(num_batch))
for i in todo_idx:
    print(i)
    codedict_path = cached_path + 'codenet/codenet_codedict_' + str(i) + '.json'
    if not os.path.exists(codedict_path):
        # Without this guard the loop would carry on with the code_lang_dict
        # left over from the previous batch (or hit a NameError on the first
        # batch) and write its results under the wrong batch index.
        print('codedict not found, skipping batch', i, codedict_path)
        continue
    with open(codedict_path) as infile:
        code_lang_dict = json.load(infile)
    func_id_dict, program_dict, imports_dict = get_nonempty_functions(code_lang_dict, new_langs)
    call_dict = get_codenet_call_dict(program_dict, imports_dict, new_langs)
    filtered_dict = get_compiled_functions(call_dict, func_id_dict, imports_dict, program_dict, 
                                       code_lang_dict)
    call_dict_path = cached_path + 'codenet/codenet_call_dict_' + str(i) + '.json'
    with open(call_dict_path, 'w') as outfile:
        json.dump(call_dict, outfile)
    filtered_dict_path = cached_path + 'codenet/codenet_filtered_dict_' + str(i) + '.json'
    with open(filtered_dict_path, 'w') as outfile:
        json.dump(filtered_dict, outfile)

#### Merge into one dataset

In [None]:
# Load the three cached JSON artefacts (codedict, call_dict, filtered_dict) of
# every batch into parallel lists indexed by batch number.
codedict_list, call_dict_list, filtered_dict_list = [], [], []
for i in range(41):
    batch_suffix = str(i) + '.json'
    codedict_path = cached_path + 'codenet/codenet_codedict_' + batch_suffix
    call_dict_path = cached_path + 'codenet/codenet_call_dict_' + batch_suffix
    filtered_dict_path = cached_path + 'codenet/codenet_filtered_dict_' + batch_suffix
    # Read all three files first; the lists are only extended once every
    # file of the batch has been loaded successfully.
    loaded = []
    for path in (codedict_path, call_dict_path, filtered_dict_path):
        with open(path) as infile:
            loaded.append(json.load(infile))
    codedict, call_dict, filtered_dict = loaded
    codedict_list.append(codedict)
    call_dict_list.append(call_dict)
    filtered_dict_list.append(filtered_dict)

In [None]:
# Concatenate the per-batch filtered programs into one list per language.
# Each entry is annotated in place with the code dict it came from
# ('code_dic') and with the index of its batch ('batch_id').
merged_filtered_dict = {lang: [] for lang in new_langs}
for batch_id, filtered_dict in tqdm(enumerate(filtered_dict_list)):
    code_lang_dict = codedict_list[batch_id]
    for lang in new_langs:
        entries = filtered_dict[lang]
        for fd in entries:
            fd['code_dic'] = code_lang_dict[lang][fd['code_dic_id']]
            fd['batch_id'] = batch_id
        merged_filtered_dict[lang].extend(entries)

# Report the merged size per language.
for lang in new_langs:
    print(lang, len(merged_filtered_dict[lang]))

merged_path = cached_path + 'codenet_merged_filtered_dict_full.json'
with open(merged_path, 'w') as outfile:
    json.dump(merged_filtered_dict, outfile)

### No-tok preprocessing

#### No Tokenization Processing Steps:
1. Remove comments and empty lines: `format_codestring_codenet(codestring, lang)`
2. Replace newlines: `notok_prepro(codestring, lang, is_plbart)`
3. After decoding, apply notok_detok: `notok_detok(codestring, lang, is_plbart)`
4. Run `detok_format(codestring, detokenizer)` to get the detokenized version for Java and Python


In [None]:
# Build the PLBART variant of the no-tok data and cache it as JSON.
# NOTE(review): this rebinds merged_filtered_dict to the helper's return
# value, so any later cell that reads merged_filtered_dict sees that result
# rather than the dict that was passed in — confirm this is intended.
is_plbart = True
merged_filtered_dict = get_prepro_filtered_dict(merged_filtered_dict, is_plbart)
dic_path = cached_path + "codenet_merged_filtered_dict_notok_plbart.json"
with open(dic_path, 'w') as outfile:
    json.dump(merged_filtered_dict, outfile)

In [None]:
# Build the non-PLBART variant of the no-tok data and cache it as JSON.
# NOTE(review): the input is whatever merged_filtered_dict currently holds.
# If the PLBART cell was run just before, that is the PLBART cell's output,
# not the raw merged dict — confirm get_prepro_filtered_dict tolerates this.
is_plbart = False
merged_filtered_dict = get_prepro_filtered_dict(merged_filtered_dict, is_plbart)
dic_path = cached_path + "codenet_merged_filtered_dict_notok.json"
with open(dic_path, 'w') as outfile:
    json.dump(merged_filtered_dict, outfile)

### Get input stats

In [None]:
# For every batch that has a cached codedict, record how many raw programs
# each language contributes (len_dict[batch][lang]) and keep the cached
# codedict in code_lang_dict_list for the later statistics cells.
num_batch = 41
code_lang_dict_list = []
len_dict = {}
for i in tqdm(range(num_batch)):
    codedict_path = cached_path + 'codenet/codenet_codedict_' + str(i) + '.json'
    if not os.path.exists(codedict_path):
        continue

    problems_dict_path = cached_path + 'codenet/codenet_problems_dict_' + str(i) + '.json'
    with open(problems_dict_path) as infile:
        codenet_problems_dict_batch = json.load(infile)
    programs_dict, programs_idx_dict, program_id_dict = get_codenet_programs(
        codenet_problems_dict_batch, new_langs)

    batch_counts = {}
    len_dict[i] = batch_counts
    for lang in new_langs:
        batch_counts[lang] = len(programs_dict[lang])
        print(lang, len(programs_dict[lang]))

    # The parsed code dict is read back from the cache instead of being
    # recomputed from the raw programs.
    with open(codedict_path) as infile:
        code_lang_dict = json.load(infile)
    code_lang_dict_list.append(code_lang_dict)

In [5]:
# Sum the per-batch program counts into one total per language.
len_lang_dict = {
    lang: sum(batch_counts[lang] for batch_counts in len_dict.values())
    for lang in new_langs
}

In [6]:
# Number of raw input programs per language, summed over all loaded batches.
print(len_lang_dict)

{'C++': 4353049, 'Java': 354982, 'Python': 1796563, 'C#': 125580, 'C': 313360}


In [7]:
# Per language, count the parsed programs that contain at least one function.
count_lang_dict = dict.fromkeys(new_langs, 0)
for code_lang_dict in code_lang_dict_list:
    for lang in new_langs:
        # TODO: also count the entries that have more than one function.
        count_lang_dict[lang] += sum(
            1 for dic in code_lang_dict[lang] if len(dic['functions']) > 0
        )
count_lang_dict
    

{'C++': 1927089, 'Java': 140105, 'Python': 268647, 'C#': 67692, 'C': 80661}

In [8]:
# Number of programs per language that compile.
# NOTE(review): None is passed instead of an in-memory dict; presumably
# get_prepro_filtered_dict then loads its data from the cached file — confirm.
is_plbart = True
merged_filtered_dict = get_prepro_filtered_dict(None, is_plbart)
for lang in new_langs:
    print(lang, len(merged_filtered_dict[lang]))

C++ 195942
Java 32053
Python 261486
C# 17716
C 26547


### Clean CodeNet Accepted Files

In [None]:
# Inside every entry of codenet_data_path, delete each sub-directory whose
# name is not in `langs` (the original note speaks of "the 7 langs" — TODO
# confirm that `langs` is that list).
# WARNING: this is destructive; shutil.rmtree removes the directories for good.
for problem_dir in os.listdir(codenet_data_path):
    lang_path = codenet_data_path + problem_dir + '/'
    for lang_fn in os.listdir(lang_path):
        print(lang_fn)
        if lang_fn in langs:
            continue
        shutil.rmtree(lang_path + lang_fn)