In [1]:
import os

# Pin the process to GPU 0. This is set before torch is imported so the
# variable is guaranteed to be in place before any CUDA initialisation.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import torch

# Fall back to the CPU when no GPU is visible instead of handing out a CUDA
# device that cannot actually be used.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# `json` and `pickle` are used directly by later cells, so import them
# explicitly rather than relying on a star import to leak them. They are
# imported BEFORE the star imports so that any same-named binding exported by
# the helper modules still takes precedence exactly as before.
import json
import pickle
import sys

# Make the project-local helper directories importable.
sys.path.append('./huggingface_models/')
sys.path.append('./utils/')

# NOTE(review): the helpers and constants used throughout this notebook
# (e.g. get_prepro_filtered_dict, new_langs, cached_path, tags) are not defined
# in the notebook, so they presumably come from these star imports; which
# module provides which name cannot be told from here.
from sample_utils import *
from inference_utils import *
from codenet_process_utils import *
from self_training_utils import *

In [3]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to the helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Hypo Processing
1. Preprocess filtered hypos: get_lang_pair_dict\
    1.1 No-tok preprocessing
2. Merge lang1-lang2 and lang2-lang1
3. Split into train/val/test: get_split_lang_pair_dict
4. Write into parallel files: write_codenet_pairdata

The preprocessing step (`get_lang_pair_dict`) produces `lang_pair_dict`, which is structured as follows:
```
lang_pair_list = [src_codes, target_codes, pids]
lang_pair_dict[(lang1, lang2)] = lang_pair_list
```

### Cached Files

- Hypo call_dict\
    plbart_codenet_lang_pair_call_dict.pkl
- Generated Parallel data\
    codet5_codenet_src_hypo_pair_dict_plbart.pkl
- Hypo split_dict\
    codenet_hypo_split_dict.json

### Load No-tok Preprocessed Data

In [4]:
# Flag forwarded to the preprocessing helpers below.
# NOTE(review): presumably selects PLBART-style preprocessing — confirm in the helper.
is_plbart = True
# NOTE(review): the meaning of the first argument (None here) is not visible
# in this notebook.
merged_filtered_dict = get_prepro_filtered_dict(None, is_plbart)
# Per-language programs derived from the filtered data; indexed by language
# name in later cells (programs_dict[lang1]).
programs_dict = get_codenet_programs_dict(merged_filtered_dict)

#### Get Import Str

In [5]:
# Map every language to its import string, i.e. the second value returned by
# get_common_imports for that language.
import_str_dict = {
    lang_name: get_common_imports(lang_name, merged_filtered_dict)[1]
    for lang_name in new_langs
}
# Java and C# use the predefined import strings instead of the collected ones.
import_str_dict.update({"Java": java_imports_str, "C#": csharp_imports_str})

### Filtered Hypo Postprocessing

#### Get all the filtered hypos 

In [6]:
# Load the cached call_dict; it is keyed by (lang1, lang2) and each value
# unpacks into (new_preds, functions, function_id_dict, call_list) below.
# NOTE: pickle.load can execute arbitrary code — only load cache files that
# were produced by this project.
with open(cached_path + "plbart_full_codenet_lang_pair_call_dict.pkl", 'rb') as infile:
    call_dict = pickle.load(infile)

In [7]:
# For each (source, target) language pair, store what get_compiled_hypos
# returns for that pair's call list and function-id mapping.
filtered_lang_dict = {}
for (lang1, lang2), (_preds, _functions, function_id_dict, call_list) in call_dict.items():
    filtered_lang_dict[(lang1, lang2)] = get_compiled_hypos(
        call_list, function_id_dict, merged_filtered_dict
    )

#### Separate pids that have filtered hypos and pids that don't

In [8]:
# Partition the pids of every language pair by whether at least one filtered
# hypothesis index is recorded for them, and report how many pids have one.
non_empty_dict, empty_dict = {}, {}
for (lang1, lang2), pid_to_inds in filtered_lang_dict.items():
    kept_pids, dropped_pids = [], []
    for pid, inds in pid_to_inds.items():
        bucket = kept_pids if len(inds) > 0 else dropped_pids
        bucket.append(pid)
    non_empty_dict[(lang1, lang2)] = kept_pids
    empty_dict[(lang1, lang2)] = dropped_pids
    print(lang1, lang2, len(kept_pids))

C++ Java 69896
C++ C# 69052
C++ Python 188378
Java C++ 7411
Java C# 10871
Java Python 27955
C# C++ 2149
C# Java 2043
C# Python 16534


### Check filtered hypo quality

In [9]:
# compare input output and check quality
# Quality is very good!
# numerics translation is not accurate.
lang1 = 'C++'
lang2 = 'Python'
new_preds, functions, function_id_dict, call_list = call_dict[(lang1, lang2)] 
filtered_dict = filtered_lang_dict[(lang1, lang2)]
src_codes = programs_dict[lang1]
src_codes_formatted = [x['function'] for x in merged_filtered_dict[lang1]]
non_empty_list = non_empty_dict[(lang1, lang2)]

In [None]:
# Print one (hypothesis, source) sample for every translation direction that
# has an entry in call_dict, using the last pid that kept a hypothesis.
for lang1 in new_langs:
    # Built lazily and at most once per source language: it does not depend
    # on the target language.
    src_codes_formatted = None
    for lang2 in new_langs:
        if lang2 == lang1 or (lang1, lang2) not in call_dict:
            continue
        non_empty_list = non_empty_dict[(lang1, lang2)]
        if len(non_empty_list) == 0:
            # No pid kept a hypothesis for this direction; indexing [-1]
            # would raise IndexError, so there is nothing to show.
            continue
        # Only the generated functions (second entry of the tuple) are needed.
        functions = call_dict[(lang1, lang2)][1]
        filtered_dict = filtered_lang_dict[(lang1, lang2)]
        if src_codes_formatted is None:
            src_codes_formatted = [x['function'] for x in merged_filtered_dict[lang1]]
        key = non_empty_list[-1]
        if len(filtered_dict[key]) > 0:
            print(lang1, lang2)
            # First kept hypothesis for this pid, detokenized for the target language.
            print(detok_format(functions[filtered_dict[key][0]], file_detokenizers[lang2]))
            print(src_codes_formatted[key])

### Generate Parallel Training Data from Filtered Hypo

#### Preprocess Filtered Hypos
1. Apply `notok_prepro(codestring, lang, is_plbart)` to the functions
2. remove empty lines (caused by tokenization)
3. save into paired files; create map_file

In [None]:
# Build the parallel data: lang_pair_dict[(lang1, lang2)] holds
# [src_codes, target_codes, pids] (unpacked that way in later cells).
# The recorded output shows one progress bar per language pair.
lang_pair_dict = get_lang_pair_dict(call_dict, merged_filtered_dict, programs_dict, is_plbart)

C++ Java


HBox(children=(FloatProgress(value=0.0, max=69896.0), HTML(value='')))


C++ C#


HBox(children=(FloatProgress(value=0.0, max=69052.0), HTML(value='')))


C++ Python


HBox(children=(FloatProgress(value=0.0, max=188378.0), HTML(value='')))


Java C++


HBox(children=(FloatProgress(value=0.0, max=7411.0), HTML(value='')))


Java C#


HBox(children=(FloatProgress(value=0.0, max=10871.0), HTML(value='')))


Java Python


HBox(children=(FloatProgress(value=0.0, max=27955.0), HTML(value='')))


C# C++


HBox(children=(FloatProgress(value=0.0, max=2149.0), HTML(value='')))


C# Java


HBox(children=(FloatProgress(value=0.0, max=2043.0), HTML(value='')))


C# Python


In [13]:
# Cache the generated source/hypothesis pairs on disk so the expensive
# preprocessing step above does not have to be repeated.
with open(cached_path + "plbart_full_codenet_src_hypo_pair_dict_plbart.pkl", 'wb') as outfile:
    pickle.dump(lang_pair_dict, outfile)

#### Check alignment

In [14]:
# Reload the cached source/hypothesis pairs written by the previous cell.
# NOTE: pickle.load can execute arbitrary code — only load cache files that
# were produced by this project.
with open(cached_path + "plbart_full_codenet_src_hypo_pair_dict_plbart.pkl", 'rb') as infile:
    lang_pair_dict = pickle.load(infile)

In [16]:
# Show which (source, target) language pairs are present in the cached data.
lang_pair_dict.keys()

dict_keys([('C++', 'Java'), ('C++', 'C#'), ('C++', 'Python'), ('Java', 'C++'), ('Java', 'C#'), ('Java', 'Python'), ('C#', 'C++'), ('C#', 'Java'), ('C#', 'Python')])

In [None]:
# Pick one direction and its reverse for the alignment check.
lang_pair1 = ("Java", "Python")
lang_pair2 = ("Python", "Java")
src_codes1, target_codes1, pids1 = lang_pair_dict[lang_pair1]
# The reverse direction may be absent from the cache (the key listing of
# lang_pair_dict contains no pair with Python as the source), so fall back to
# empty lists instead of raising KeyError.
src_codes2, target_codes2, pids2 = lang_pair_dict.get(lang_pair2, ([], [], []))

In [18]:
# Define the pair before printing it so this cell does not depend on a
# variable left behind by a previously executed cell.
lang_pair1 = ("Java", "Python")
print(lang_pair1)
src_codes1, target_codes1, pids1 = lang_pair_dict[lang_pair1]
lang1, lang2 = lang_pair1
# Arbitrary sample index used for the visual alignment check.
key = 123
# Source and hypothesis at the same index should be translations of each other.
print(detok_format(src_codes1[key], file_detokenizers[lang1]))
print(detok_format(target_codes1[key], file_detokenizers[lang2]))

('Java', 'Python')
private static ArrayList < Integer > sum_dig ( ArrayList < Integer > a ) {
  ArrayList < Integer > b = new ArrayList < Integer > ( ) ;
  for ( int i = 0 ;
  i < a . size ( ) / 2 ; i ++ ) {
    int sum = 0 ;
    if ( i == 0 ) sum = a . get ( 0 ) + a . get ( 1 ) ;
    sum = a . get ( i * 2 ) + a . get ( i * 2 + 1 ) ;
    String sums ;
    sums = String . valueOf ( sum ) ;
    b . add ( sums . length ( ) ) ;
  }
  return b ;
}
private static void print_dig ( ArrayList < Integer > a ) {
  for ( int i = 0 ;
  i < a . size ( ) ; i ++ ) {
    System . out . println ( a . get ( i ) ) ;
  }
}
def sum_dig ( a ) :
    b = [ ]
    for i in range ( 0 , int ( len ( a ) / 2 ) , 1 ) :
        sum = 0
        if ( i == 0 ) :
            sum = a [ 0 ] + a [ 1 ]
        sum = a [ i * 2 ] + a [ i * 2 + 1 ]
        sums = str ( sum )
        b.append ( len ( sums ) )
    return b
def print_dig ( a ) :
    for i in range ( 0 , len ( a ) , 1 ) :
        print ( a [ i ] , end = "" )


#### Merge lang1-lang2 and lang2-lang1

In [19]:
# Merge the lang1-lang2 and lang2-lang1 directions into one entry per pair.
# The recorded output lists each merged pair followed by a count.
# Values unpack into (src_codes, target_codes, pids) in later cells.
merged_lang_pair_dict = get_merged_lang_pair_dict(lang_pair_dict)

('C++', 'Java') 77307
('C++', 'Python') 188378
('C++', 'C#') 71201
('C++', 'C') 0
('Java', 'Python') 27955
('Java', 'C#') 12914
('Java', 'C') 0
('Python', 'C#') 16534
('Python', 'C') 0
('C#', 'C') 0


#### Load Hypo split_dict

In [22]:
# Any constraints?
# Simple. Just split at problem level
with open(cached_path + "codenet_hypo_split_dict.json") as infile:
    codenet_hypo_split_dict = json.load(infile)

In [23]:
# Assign the merged pairs to the splits defined by codenet_hypo_split_dict.
# The result is indexed as split_lang_pair_dict[(lang1, lang2)][tag] and yields
# the indices belonging to that split (see the alignment check further down).
# The recorded output prints three counts per language pair.
split_lang_pair_dict = get_split_lang_pair_dict(merged_lang_pair_dict, merged_filtered_dict, 
                                                codenet_hypo_split_dict)

C++ Java
C++ Java [65579, 5413, 6315]
C++ Python
C++ Python [161332, 11832, 15214]
C++ C#
C++ C# [60566, 5128, 5507]
C++ C
C++ C [0, 0, 0]
Java Python
Java Python [23589, 1774, 2592]
Java C#
Java C# [10748, 899, 1267]
Java C
Java C [0, 0, 0]
Python C#
Python C# [14292, 871, 1371]
Python C
Python C [0, 0, 0]
C# C
C# C [0, 0, 0]


#### Write into parallel files

In [136]:
# Destination directory for the parallel files.
# NOTE(review): the directory name says "non_plbart" while is_plbart is True
# earlier in this notebook — confirm the intended target.
codenet_pair_path = codenet_processed_data_path + "codenet_function_pairs_non_plbart/"
# Make sure the output directory exists before writing, as the summary section
# of this notebook does; this is a no-op when it is already there.
os.makedirs(codenet_pair_path, exist_ok=True)
write_codenet_pairdata(merged_lang_pair_dict, split_lang_pair_dict, codenet_pair_path)

#### Check Parallel File Alignment

In [None]:
# Check last line
for lang1, lang2 in merged_lang_pair_dict.keys():
    src_codes, target_codes, pids = merged_lang_pair_dict[(lang1, lang2)]
    for tag in tags:
        tag_indices = split_lang_pair_dict[(lang1, lang2)][tag]
        tag_i = tag_indices[-1]
        src_code = src_codes[tag_i]
        target_code = target_codes[tag_i]
        print(lang1, lang2)
        print(src_code)
        print(target_code)

### Create Parallel Dataset (summary of the above steps)

In [None]:
# End-to-end setup for the CodeT5 data: load the preprocessed programs and the
# cached call_dict, then build (or reload) the source/hypothesis pairs.
is_plbart = False
merged_filtered_dict = get_prepro_filtered_dict(None, is_plbart)
programs_dict = get_codenet_programs_dict(merged_filtered_dict)
# NOTE: pickle.load can execute arbitrary code — only load cache files that
# were produced by this project.
with open(cached_path + "codet5_codenet_lang_pair_call_dict.pkl", 'rb') as infile:
    call_dict = pickle.load(infile)

codenet_pair_path = codenet_processed_data_path + "codet5_codenet_function_pairs/"
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists.
os.makedirs(codenet_pair_path, exist_ok=True)

# Reuse the cached pairs when available; otherwise compute and cache them.
lang_pair_dict_path = cached_path + "codet5_codenet_src_hypo_pair_dict_plbart.pkl"
if os.path.exists(lang_pair_dict_path):
    with open(lang_pair_dict_path, 'rb') as infile:
        lang_pair_dict = pickle.load(infile)
else:
    lang_pair_dict = get_lang_pair_dict(call_dict, merged_filtered_dict, programs_dict, is_plbart)
    with open(lang_pair_dict_path, 'wb') as outfile:
        pickle.dump(lang_pair_dict, outfile)

In [None]:
# Report, for every language pair, the length of the first list stored for it
# (the source programs).
for pair_key in lang_pair_dict:
    print(pair_key, len(lang_pair_dict[pair_key][0]))

In [None]:
# Load the problem-level split definition here so this summary section does
# not depend on a variable created in an earlier section of the notebook.
with open(cached_path + "codenet_hypo_split_dict.json") as infile:
    codenet_hypo_split_dict = json.load(infile)

# Merge both directions of each pair, split them, and write the parallel files.
merged_lang_pair_dict = get_merged_lang_pair_dict(lang_pair_dict)
split_lang_pair_dict = get_split_lang_pair_dict(merged_lang_pair_dict, merged_filtered_dict, 
                                                codenet_hypo_split_dict)
write_codenet_pairdata(merged_lang_pair_dict, split_lang_pair_dict, codenet_pair_path)