In [1]:
import os

# Restrict this process to physical GPU 1. The variable only takes effect if
# it is in the environment before CUDA is initialised, so it is set ahead of
# importing torch instead of after it.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

import torch

# Generic CUDA device handle; with the mask above only one GPU is visible.
device = torch.device("cuda")

In [2]:
import sys
# Make the project-local packages importable from the notebook's working directory.
sys.path.append('./huggingface_models/')
sys.path.append('./utils/')
# NOTE(review): later cells use names that are never assigned in this notebook
# (e.g. json, pickle, cached_path, new_langs, ast_parsers, model_type, tag,
# exp_suffix); presumably they arrive through these wildcard imports — confirm,
# and prefer explicit imports so the origin of each name is visible.
from sample_utils import *
from inference_utils import *
from codenet_process_utils import *
from self_training_utils import *

In [3]:
%load_ext autoreload
%autoreload 2

### This notebook contains:
- Load Filtered Data
- Show Graph
- Split into batches for self-training
- Create Parallel Dataset (train/val/test split of the hypotheses, then a smaller dataset)

### Load Filtered Data (Filtered accepted codenet data)

In [4]:
# Read the cached, filtered CodeNet dictionary from disk.
filtered_json_path = cached_path + 'codenet_merged_filtered_dict.json'
with open(filtered_json_path) as infile:
    merged_filtered_dict = json.load(infile)

# Report how many entries are available for each language.
for lang in new_langs:
    lang_entries = merged_filtered_dict[lang]
    print(lang, len(lang_entries))

C++ 195942
Java 32053
Python 261486
C# 17716
C 26547


#### Get programs_dict

In [None]:
# Derive `programs_dict` from the filtered data with the project helper.
# NOTE(review): the helper is defined outside this notebook, so the layout of
# its return value cannot be confirmed from here.
programs_dict = get_codenet_programs_dict(merged_filtered_dict)

#### Get import_str_dict

In [None]:
# Collect one import preamble string per language.
import_str_dict = {}
for lang in new_langs:
    # The helper returns a pair; only the second element (the string) is stored.
    all_imports, import_str = get_common_imports(lang, merged_filtered_dict)
    import_str_dict[lang] = import_str

# Java and C# do not keep the computed string: they are replaced by the
# predefined preambles.
import_str_dict.update({"Java": java_imports_str, "C#": csharp_imports_str})

### Show Graph

In [None]:
# Select one entry to inspect.
lang = "Java"
key = 9981
code_dic = merged_filtered_dict[lang][key]['code_dic']

# Pull the individual fields out of the entry.
program = code_dic['program_formatted']
paras = code_dic['parameter_lists']
return_types = code_dic['return_types']
function_names = code_dic['function_names']
functions = code_dic['functions']
pieces = code_dic['program_pieces']
target_call = code_dic['target_call']

# Functions are shown one per line; program pieces are concatenated as-is.
function = "\n".join(functions)
piece = "".join(pieces)

# One item per output line (`program` itself is intentionally not printed).
print(
    function,
    piece,
    function_names,
    return_types,
    paras,
    target_call,
    sep="\n",
)

In [None]:
lang1 = "Java"
# `code1` was never assigned anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel. It is bound here to the full formatted program
# selected in the previous cell.
# NOTE(review): if the graph was meant to show only the extracted function(s),
# use `code1 = function` instead.
code1 = program
# Build the graphs for the code and draw the unpruned one.
root1, graph1, graph_pruned1, graph_sibs1, graph_pruned_sibs1 = pipeline(code1, ast_parsers[lang1], lang1)
# Optional refinement step, currently disabled:
# root1, graph1, graph_pruned1, graph_sibs1, graph_pruned_sibs1 = refine_graphs(root1, graph_pruned_sibs1)
show_graph(root1, graph1)

### Split into batches for self-training

In [15]:
# Java-Python
# Cut every language's entries into `num_batchs` contiguous index ranges.
num_batchs = 3
batch_split_dict = {}
for lang in new_langs:
    length = len(merged_filtered_dict[lang])
    batch_size = length // num_batchs
    # Boundaries are 0, batch_size, 2*batch_size, ...; the final boundary is
    # the full length, so the last batch absorbs the division remainder.
    batch_list = [bid * batch_size for bid in range(num_batchs)] + [length]
    batch_split_dict[lang] = batch_list
    print(lang, batch_list)

C++ [0, 12375, 24750, 37126]
Java [0, 3503, 7006, 10511]
Python [0, 8970, 17940, 26911]
C# [0, 1033, 2066, 3101]
C [0, 4141, 8282, 12424]


In [18]:
# For each language, gather the 'function_notok' field of every entry,
# grouped by the batch boundaries stored in `batch_split_dict`.
batch_programs_dict = {}
for lang in new_langs:
    batch_list = batch_split_dict[lang]
    lang_entries = merged_filtered_dict[lang]
    batch_programs_dict[lang] = [
        [dic['function_notok'] for dic in lang_entries[batch_list[bid]:batch_list[bid + 1]]]
        for bid in range(num_batchs)
    ]

In [27]:
# Prepare inference inputs for one batch of one translation direction.
lang1 = "Java"
lang2 = "Python"
batch_id = 0
# Source side: the functions of the chosen batch; no target codes are supplied.
src_codes = batch_programs_dict[lang1][batch_id]
tgt_codes = []
# infer with src and tgt
# NOTE(review): `model_type`, `tag` and `exp_suffix` are not assigned anywhere
# in this notebook; presumably they come from the wildcard imports or from
# earlier kernel state — confirm before running on a fresh kernel.
eval_examples, eval_features, eval_dataloader, model, tokenizer, args, decoder_sid = inference_prepro(
 lang1, lang2, model_type, device, src_codes, tgt_codes, None, tag, exp_suffix)

### Create Parallel Dataset

#### Split the hypos into train/val/test

In [5]:
# Load the cached source/hypothesis pairs.
# NOTE(review): pickle.load executes arbitrary code embedded in the file —
# only load pickles that were produced by this project.
with open(cached_path + "plbart_full_codenet_src_hypo_pair_dict_plbart.pkl", 'rb') as infile:
    lang_pair_dict = pickle.load(infile)

In [None]:
# Merge the per-direction pair data with the project helper.
# NOTE(review): the merge policy lives in the helper, which is not visible here.
merged_lang_pair_dict = get_merged_lang_pair_dict(lang_pair_dict)

In [7]:
# Are there any constraints on how to split? Keep it simple: split at the
# problem level, i.e. each problem id is assigned to exactly one split.
# Collect the problem ids to be split.
all_problem_ids = get_all_problem_ids(merged_lang_pair_dict, merged_filtered_dict)

In [8]:
# Number of problem ids that take part in the split.
len(all_problem_ids)

3086

In [12]:
# Load the previously saved split; it has the keys 'train', 'test' and 'val'.
with open(cached_path + "codenet_hypo_split_dict.json", 'r') as infile:
    codenet_hypo_split_dict = json.load(infile)

In [14]:
# Turn each saved split into a set so overlaps can be computed.
train_set, test_set, val_set = (
    set(codenet_hypo_split_dict[split_name])
    for split_name in ('train', 'test', 'val')
)

# Sanity check: every pairwise intersection printed below should be empty.
print(train_set.intersection(test_set))
print(train_set.intersection(val_set))
print(test_set.intersection(val_set))

set()
set()
set()


In [10]:
# Flatten the saved split (train, then test, then val) into a single list.
old_all_problem_ids_list = [
    problem_id
    for split_name in ('train', 'test', 'val')
    for problem_id in codenet_hypo_split_dict[split_name]
]
old_all_problem_ids = set(old_all_problem_ids_list)

# Equal counts mean no problem id occurs more than once in the saved split.
print(len(old_all_problem_ids_list), len(old_all_problem_ids))

1617 1617


In [11]:
# Problem ids that are not yet part of the saved split.
all_problem_ids_rem = all_problem_ids - old_all_problem_ids

In [17]:
import random

# Share of the *new* problem ids given to each split. `val_ratio` is kept for
# reference only: validation receives whatever remains after the train and
# test slices have been taken.
train_ratio = 0.85
val_ratio = 0.05
test_ratio = 0.1
# Seed for ordering the new ids, so the split is the same on every run.
split_seed = 0

# Snapshot of the split as it is now. Copies are taken so the lists loaded
# from disk are never modified in place.
previous_split = {
    split_name: list(codenet_hypo_split_dict[split_name])
    for split_name in ('train', 'test', 'val')
}
already_assigned = set().union(*previous_split.values())

# Only distribute ids that have no split yet. This makes the cell safe to
# re-run: a second execution finds nothing new and leaves the split unchanged
# instead of appending the same ids again.
# The ids come from a set, whose iteration order is not guaranteed to be the
# same from one interpreter run to the next, so they are sorted first and then
# shuffled with a fixed seed to get an arbitrary but reproducible order.
all_problem_ids_list = sorted(
    (pid for pid in all_problem_ids_rem if pid not in already_assigned),
    key=str,
)
random.Random(split_seed).shuffle(all_problem_ids_list)

num_problems = len(all_problem_ids_list)
train_num = int(train_ratio*num_problems)
test_num = int(test_ratio*num_problems)

# Train takes the head, test the tail, validation the middle remainder.
train_proids = all_problem_ids_list[:train_num]
test_proids = all_problem_ids_list[num_problems-test_num:]
val_proids = all_problem_ids_list[train_num:num_problems-test_num]

# New ids first, followed by the ids that already belonged to each split.
train_proids += previous_split['train']
test_proids += previous_split['test']
val_proids += previous_split['val']
codenet_hypo_split_dict = {'train':train_proids, 'test':test_proids, 'val':val_proids}

In [18]:
# Persist the extended split. This overwrites the same JSON file that the
# split was loaded from earlier in the notebook.
with open(cached_path + "codenet_hypo_split_dict.json", 'w') as outfile:
    json.dump(codenet_hypo_split_dict, outfile)

#### Create a smaller dataset

In [None]:
# Inputs for the smaller dataset. This rebinds `merged_filtered_dict`,
# `programs_dict` and `lang_pair_dict`, which earlier cells also assign, so
# cells above this point see different data if they are re-run afterwards.
is_plbart = True
merged_filtered_dict = get_prepro_filtered_dict(None, is_plbart)
programs_dict = get_codenet_programs_dict(merged_filtered_dict)
# Note: this is a different pickle from the "plbart_full_..." file loaded earlier.
# NOTE(review): pickle.load executes arbitrary code embedded in the file —
# only load pickles that were produced by this project.
with open(cached_path + "codenet_src_hypo_pair_dict_plbart.pkl", 'rb') as infile:
    lang_pair_dict = pickle.load(infile)

In [None]:
# Build one entry per unordered language pair, stored under (lang1, lang2).
# For each pair, the direction with fewer source codes is kept (ties keep the
# lang1 -> lang2 direction). A direction missing from `lang_pair_dict` counts
# as empty, so a pair with a missing direction ends up with an empty entry.
merged_lang_pair_dict = {}
seen_pairs = set()
for lang1 in new_langs:
    for lang2 in new_langs:
        if lang1 == lang2:
            continue
        forward = (lang1, lang2)
        if forward in seen_pairs:
            continue
        backward = (lang2, lang1)
        seen_pairs.update((forward, backward))

        # Fresh empty lists are created for every missing direction.
        fwd_src, fwd_tgt, fwd_pids = lang_pair_dict.get(forward, ([], [], []))
        bwd_src, bwd_tgt, bwd_pids = lang_pair_dict.get(backward, ([], [], []))

        if len(fwd_src) > len(bwd_src):
            # The backward data is stored as lang2 -> lang1, so its source and
            # target are exchanged to fit the (lang1, lang2) key. Its ids are
            # tagged with lang2, the language they were generated from.
            src_codes, target_codes = bwd_tgt, bwd_src
            pids = [lang2 + "-" + str(x) for x in bwd_pids]
        else:
            src_codes, target_codes = fwd_src, fwd_tgt
            pids = [lang1 + "-" + str(x) for x in fwd_pids]

        merged_lang_pair_dict[forward] = [src_codes, target_codes, pids]
        print(forward, len(pids))


In [None]:
# Assign every pair to train/val/test according to the problem-level split.
split_lang_pair_dict = get_split_lang_pair_dict(
    merged_lang_pair_dict, merged_filtered_dict, codenet_hypo_split_dict)

# Output directory for the smaller pair dataset. makedirs(exist_ok=True)
# replaces the exists()/mkdir() pair: it has no check-then-create race and it
# also creates missing parent directories.
codenet_pair_path = codenet_processed_data_path + "codenet_function_pairs_small/"
os.makedirs(codenet_pair_path, exist_ok=True)

write_codenet_pairdata(merged_lang_pair_dict, split_lang_pair_dict, codenet_pair_path)