In [None]:
from codegen_sources.model.translate import *

In [None]:
cs_code = "using System;using System.Linq;namespace AOJ{class Program{public static void Main(string[] args){foreach(var i in Enumerable.Range(1, 9)){foreach(var j in Enumerable.Range(1, 9)){Console.WriteLine('{0}x{1}={2}', i, j, i*j);}}}}}"
cpp_code = "#include<iostream>using namespace std;int main(){cin.tie(0);ios::sync_with_stdio(false);for(int i=1;i<10;i++)for(int j=1;j<10;j++)cout << i << 'x' << j << '='' << i*j << '\n';}"
java_code = "class Main {public static void main(String[] args){for(int i = 1; i < 10; ++i){for(int j = 1; j < 10; ++j){System.out.println(Integer.toString(i)+'x'+Integer.toString(j)+'='+Integer.toString(i*j));}}}} "
python_code = """for i in range(1, n):\n\tcounter = 1\n\twhile ((i - counter) >= 0 and\n\t\tA[i] >= A[i - counter]):\n\t\tcounter += ans[i - counter]\n\tans[i] = counter"""
js_code = "prefix[0] = prefix[1] = 0;for (let p = 2; p <= MAX; p++) {prefix[p] = prefix[p - 1];if (prime[p])prefix[p]++;}}"
php_code = "for ($p = 2; $p <= $MAX; $p++){$prefix[$p] = $prefix[$p - 1];if ($prime[$p])$prefix[$p]++;}}"
c_code = """void print(struct Node *root){if (root != NULL){print(root->left);printf("%d ",root->data);print(root->right);}}"""



In [None]:
from tokenization_utils import *
import json
from tqdm.notebook import tqdm
import jsonlines

In [None]:
from codegen_sources.model.classification_model import Classifier
import torch

artifacts_path="/home/mingzhu/CodeModel/CodeGen/code_corrption/last_token_segmented/java"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cls_model = Classifier(artifacts_path, device)

seqs = ["while ( n < 10 )",
"import java . import",
"import java . util .* ;",
"import java . util .* ; class",
"static void printTwoOdd static",
"static void void ( int arr, x ("]


preds, probs, logits = cls_model.classify(seqs)

print("Preds: ", preds)
print("Probs: ", probs)

In [None]:
class Translator:
    def __init__(self, model_path, BPE_path):
        # reload model
        reloaded = torch.load(model_path, map_location="cpu")
        # change params of the reloaded model so that it will
        # relaod its own weights and not the MLM or DOBF pretrained model
        reloaded["params"]["reload_model"] = ",".join([model_path] * 2)
        reloaded["params"]["lgs_mapping"] = ""
        reloaded["params"]["reload_encoder_for_decoder"] = False
        self.reloaded_params = AttrDict(reloaded["params"])

        # build dictionary / update parameters
        self.dico = Dictionary(
            reloaded["dico_id2word"], reloaded["dico_word2id"], reloaded["dico_counts"]
        )
        assert self.reloaded_params.n_words == len(self.dico)
        assert self.reloaded_params.bos_index == self.dico.index(BOS_WORD)
        assert self.reloaded_params.eos_index == self.dico.index(EOS_WORD)
        assert self.reloaded_params.pad_index == self.dico.index(PAD_WORD)
        assert self.reloaded_params.unk_index == self.dico.index(UNK_WORD)
        assert self.reloaded_params.mask_index == self.dico.index(MASK_WORD)

        # build model / reload weights (in the build_model method)
        encoder, decoder = build_model(self.reloaded_params, self.dico)
        self.encoder = encoder[0]
        self.decoder = decoder[0]
        self.encoder.cuda()
        self.decoder.cuda()
        self.encoder.eval()
        self.decoder.eval()

        # reload bpe
        if getattr(self.reloaded_params, "roberta_mode", False):
            print('roberta')
            self.bpe_model = RobertaBPEMode()
        else:
            print('non roberta')
            self.bpe_model = FastBPEMode(
                codes=os.path.abspath(BPE_path), vocab_path=None
            )

    def translate(
        self,
        all_data,
        lang1,
        lang2,
        precondition_topk, 
        condition_lambda,
        suffix1="_sa",
        suffix2="_sa",
        n=1,
        beam_size=1,
        sample_temperature=None,
        device="cuda:0",
        cont=False,
        
    ):

        # Build language processors
        assert lang1 in {"cpp", "java", "python", "csharp", 'javascript', 'php', 'c'}, lang1
        assert lang2 in {"cpp", "java", "python", "csharp", 'javascript', 'php', 'c'}, lang2
        so_path = "/home/mingzhu/CodeModel/CodeGen/codegen_sources/preprocessing/lang_processors"
        src_lang_processor = LangProcessor.processors[lang1](
            root_folder=so_path
        )
        tokenizer = src_lang_processor.tokenize_code
        tgt_lang_processor = LangProcessor.processors[lang2](
            root_folder=so_path
        )
        detokenizer = tgt_lang_processor.detokenize_code

        lang1 += suffix1
        lang2 += suffix2

#         assert (
#             lang1 in self.reloaded_params.lang2id.keys()
#         ), f"{lang1} should be in {self.reloaded_params.lang2id.keys()}"
#         assert (
#             lang2 in self.reloaded_params.lang2id.keys()
#         ), f"{lang2} should be in {self.reloaded_params.lang2id.keys()}"
        
        

        with torch.no_grad():

            lang1_id = self.reloaded_params.lang2id[lang1]
            lang2_id = self.reloaded_params.lang2id[lang2]

            results_l = []
            for i, input in tqdm(enumerate(all_data)):
                # Convert source code to ids
                tokens = [t for t in tokenizer(input)]
#                 print(f"Tokenized {params.src_lang} function:")
#                 print("before bpe", tokens)
                tokens = self.bpe_model.apply_bpe(" ".join(tokens)).split()
#                 print("after bpe", tokens)
                tokens = ["</s>"] + tokens + ["</s>"]
                input = " ".join(tokens)
#                 inputs.append(input_toks)
            
                # Create torch batch
                len1 = len(input.split())
                len1 = torch.LongTensor(1).fill_(len1).to(device)
                inds = [self.dico.index(w) for w in input.split()]
#                 print('inds', inds)
                x1 = torch.LongTensor(inds).to(
                    device
                )[:, None]
                langs1 = x1.clone().fill_(lang1_id)

                # Encode
                enc1 = self.encoder("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
                enc1 = enc1.transpose(0, 1)
                if n > 1:
                    enc1 = enc1.repeat(n, 1, 1)
                    len1 = len1.expand(n)

                # Decode
                if beam_size == 1:
                    if cont:
                        x2, len2 = self.decoder.generate_cont(
                            cont,
                            cls_model,
                            detok, 
                            self.dico,
                            detokenizer,
                            precondition_topk, 
                            condition_lambda,
                            enc1,
                            len1,
                            lang2_id,
                            max_len=int(
                                min(self.reloaded_params.max_len, 3 * len1.max().item() + 10)
                            ),
                            sample_temperature=sample_temperature,
                        )
                    else:
                        x2, len2 = self.decoder.generate(
                            enc1,
                            len1,
                            lang2_id,
                            max_len=int(
                                min(self.reloaded_params.max_len, 3 * len1.max().item() + 10)
                            ),
                            sample_temperature=sample_temperature,
                        )
                else:
                    x2, len2, _ = self.decoder.generate_beam(
                        enc1,
                        len1,
                        lang2_id,
                        max_len=int(
                            min(self.reloaded_params.max_len, 3 * len1.max().item() + 10)
                        ),
                        early_stopping=False,
                        length_penalty=1.0,
                        beam_size=beam_size,
                    )

                # Convert out ids to text
                tok = []
                for i in range(x2.shape[1]):
                    wid = [self.dico[x2[j, i].item()] for j in range(len(x2))][1:]
                    wid = wid[: wid.index(EOS_WORD)] if EOS_WORD in wid else wid
                    if getattr(self.reloaded_params, "roberta_mode", False):
                        tok.append(restore_roberta_segmentation_sentence(" ".join(wid)))
                    else:
                        tok.append(" ".join(wid).replace("@@ ", ""))
                results = []
                for t in tok:
                    results.append(fix_format(detokenizer(t)))
                results_l.append(results)
            return results_l



In [None]:
import re
def fix_format(y):
    y = y.replace(" @ @", "@@").replace("@ @", "@@")
    x = re.sub('[\n]{2,}','\n',y)
    xs = x.split('\n')
#     remove empty lines
    s = "\n".join([t for t in xs if len(t.strip()) > 0])
#     remove linebreak between ()
    s = re.sub(r'\n(?=[^()]*\))', '', s)
#     remove linebreak between []
    s = re.sub(r'\n(?=[^\[\]]*\])', '', s)
    s = re.sub(r'[\t| ]+(?=[^()]*\))', ' ', s)
    s = re.sub(r'[\t| ]+(?=[^\[\]]*\])', ' ', s)
#     reduce unnecessary black space
    s = re.sub('[ ]{2,}',' ',s)
    return s

def detok(x2, dico, detokenizer):
#     x2: bs * candidates * seq_len
#     print(x2)
    with torch.no_grad():
        tok = []
        bz = x2.shape[0]
        cand_size = x2.shape[1]
        seq_len = x2.shape[2]
        for i in range(bz):
            tok_cand = []
            for j in range(cand_size):
                wid = [dico[x2[i, j, k].item()] for k in range(seq_len)][1:]
#                 print("wid", wid)
                wid = wid[: wid.index(EOS_WORD)] if EOS_WORD in wid else wid
                dec_seq = " ".join(wid).replace("@@ ", "")
#                 print("dec_seq", dec_seq)
                detoc_seq = detokenizer(dec_seq)
#                 print("detoc_seq",detoc_seq)
                fixed_seq = fix_format(detoc_seq)
#                 print("fixed_seq",fixed_seq)
                tok_cand.append(fixed_seq)
                
            tok.append(tok_cand)
        return tok

In [107]:
def group_list(a, precondition_topk, batch_size):
    batches = np.array_split(a, (precondition_topk//batch_size) + 1, axis=-1)
    max_tok_size = 5
    for i, batch in enumerate(batches):
        batch = batch.reshape(-1)
        batches[i] = batch
        if len(batch[0].split()) > max_tok_size:
            for j, seq in enumerate(batch):
                lines = seq.split("\n")
                batches[i][j] = " ".join(lines[-4:])
    return batches

In [121]:
def group_list(a, precondition_topk, bsize, cls_bsize, max_tok_size=20):
    num_batch = precondition_topk//cls_bsize
    if precondition_topk%cls_bsize != 0:
        num_batch += 1
#     num_batch * (bsize * cls_bsize)
    batches = np.array_split(a, num_batch, axis=-1)
    new_batches = []
    for i, batch in enumerate(batches):
#         print("batch.shape", batch.shape)
        batch_flat = batch.reshape(-1)
        batches[i] = batch_flat
        if len(batch_flat[0].split()) > max_tok_size:
            for j, seq in enumerate(batch_flat):
                lines = seq.split("\n")
                new_seq = " ".join(lines[-4:])
                batches[i][j] = new_seq
            new_batches.append(batches[i].reshape((bsize, cls_bsize)))
    a_trunc = np.concatenate(new_batches, axis=1)
    return batches, a_trunc

In [124]:
probs1.amax()

AttributeError: 'numpy.ndarray' object has no attribute 'amax'

In [59]:
probs1 = np.random.rand(13, 10, 2)
probs2 = np.random.rand(13, 10, 2)
np.concatenate((probs1, probs2), axis=1).shape
np.concatenate([probs1], axis=1).shape

(13, 10, 2)

In [41]:
import numpy as np
seqs = np.random.rand(13, 150)
precondition_topk = 150
cutoff = 100
seqs_batches = []
if precondition_topk > cutoff:
    for i in range(precondition_topk//cutoff):
        seqs_batches.append(seqs[:, i*cutoff:(i+1) * cutoff])
    remaining = precondition_topk%cutoff
    if remaining > 0:
        seqs_batches.append(seqs[:, -remaining:])
    
# cls_preds, cls_probs, cls_logits = [], [], []
# for seqs_batch in seqs_batches:
#     seqs_reshape = [seq for batch in seqs_batch for seq in batch]
# #                     assert len(seqs_reshape) == precondition_topk * bsize
#     max_tok_size = 20
#     if len(seqs_reshape[0].split()) > max_tok_size:
#         for i, seq in enumerate(seqs_reshape):
#             lines = seq.split("\n")
#             seqs_reshape[i] = " ".join(lines[-4:])

#     cls_preds_batch, cls_probs_batch, cls_logits_batch = cls_model.classify(seqs_reshape)
#     cls_preds += cls_preds_batch
#     cls_probs += cls_probs_batch
# pos_probs = cls_probs.reshape((bsize, -1, 2))[:, :, 0]

In [42]:
len(seqs_batches[0][0])
len(seqs_batches[1][1])
# remaining

50

In [None]:
import sys
lang1 = "cpp" #
lang2 = "java"
sys.argv = ['codegen_sources.model.translate', 
            '--src_lang', lang1, '--tgt_lang', lang2, 
            '--model_path', '<model_path>', '--beam_size', '1']
parser = get_parser()
params = parser.parse_args()

In [None]:
model_path = 'dumppath1/transcoder_dobf_g4g_Python_Java/nzl4eeae7s/best-valid_python_sa-java_sa_mt_bleu.pth'
# dumppath1/transcoder_g4g_1_Python_Java/s9pbal578r/best-valid_python_sa-java_sa_mt_bleu.pth
model_path = "dumppath1/transcoder_dobf_g4g_beam_10_C++_Java/q3sr66wo0u/best-valid_cpp_sa-java_sa_mt_bleu.pth"
# dumppath1/transcoder_g4g_1_C++_Java/8npxtk0klm/best-valid_cpp_sa-java_sa_mt_bleu.pth 
model_path = "dumppath1/transcoder_g4g_program_transfer_C++_Java/m9f4n3mlpl/best-valid_cpp_sa-java_sa_mt_bleu.pth"
# model_path = "TransCoder_model_2.pth"
translator = Translator(
    model_path, 
    Fast_BPE_path)

In [None]:
cpp_code = """#include <bits/stdc++.h> NEW_LINE using namespace std ; void printTwoOdd ( int arr [ ] , int size ) { int xor2 = arr [ 0 ] ; int set_bit_no ; int i ; int n = size - 2 ; int x = 0 , y = 0 ; for ( i = 1 ; i < size ; i ++ ) xor2 = xor2 ^ arr [ i ] ; set_bit_no = xor2 & ~ ( xor2 - 1 ) ; for ( i = 0 ; i < size ; i ++ ) { if ( arr [ i ] & set_bit_no ) x = x ^ arr [ i ] ; else y = y ^ arr [ i ] ; } cout << " The ▁ two ▁ ODD ▁ elements ▁ are ▁ " << x << " ▁ & ▁ " << y ; } int main ( ) { int arr [ ] = { 4 , 2 , 4 , 5 , 2 , 3 , 3 , 1 } ; int arr_size = sizeof ( arr ) / sizeof ( arr [ 0 ] ) ; printTwoOdd ( arr , arr_size ) ; return 0 ; }"""

In [None]:
data_path = "../g4g/pair_data_tok_full/Java-C++/test-Java-C++-tok.cpp"
with open(data_path) as infile:
    lines = infile.readlines()
    cpp_code = [line.strip() for line in lines]

In [None]:
all_data = cpp_code[:10]
beam_size = 1
with torch.no_grad():
    outputs_cont = translator.translate(
            all_data,
            lang1=lang1,
            lang2=lang2,
            beam_size=beam_size,
            precondition_topk=10, 
            condition_lambda=0.5,
            cont=True,
            
        )
for batch in outputs_cont:
    for seq in batch:
        print(seq)
        print("------------------")

In [None]:
all_data = cpp_code
beam_size = 5
with torch.no_grad():
    outputs = translator.translate(
            all_data,
            lang1=lang1,
            lang2=lang2,
            beam_size=beam_size,
            precondition_topk=10, 
            condition_lambda=0.5,
            cont=False,
        )
for batch in outputs:
    for seq in batch:
        print(seq)
        print("------------------")

In [None]:
ref_path = "../g4g/pair_data_tok_full/Java-C++/test-Java-C++-tok.java"
with open(ref_path) as infile:
    lines = infile.readlines()
    java_code = [line.strip() for line in lines]

In [None]:
references_corpus = []
for line in java_code:
    references_corpus.append([line.split(" ")])
#     print(references_corpus)

outputs_eval = [seq[0].split(" ") for seq in outputs]
# outputs_cont_eval = [seq[0].split(" ") for seq in outputs_cont]

In [None]:
len(references_corpus)

In [None]:
from torchtext.data.metrics import bleu_score

b1 = bleu_score(outputs_eval[:-1], references_corpus)
# b2 =  bleu_score(outputs_cont_eval, references_corpus)

In [None]:
b1

In [None]:
print(b1, b2)

In [None]:
lang1 = 'python'
lang2 = 'java'
so_path = "/home/mingzhu/CodeModel/CodeGen/codegen_sources/preprocessing/lang_processors"
src_lang_processor = LangProcessor.processors[lang1](
    root_folder=so_path
)
tokenizer = src_lang_processor.tokenize_code
tgt_lang_processor = LangProcessor.processors[lang2](
    root_folder=so_path
)
detokenizer = tgt_lang_processor.detokenize_code
suffix1="_sa"
suffix2="_sa"
lang1 += suffix1
lang2 += suffix2

In [None]:
all_data = [python_code]
with torch.no_grad():

    lang1_id = self.reloaded_params.lang2id[lang1]
    lang2_id = self.reloaded_params.lang2id[lang2]

    results_l = []
    for i, input in tqdm(enumerate(all_data)):
        # Convert source code to ids
        tokens = [t for t in tokenizer(input)]
        print(f"Tokenized {params.src_lang} function:")
#                 print("before bpe", tokens)
        tokens = self.bpe_model.apply_bpe(" ".join(tokens)).split()
        print("after bpe", tokens)
        tokens = ["</s>"] + tokens + ["</s>"]
        input = " ".join(tokens)
#                 inputs.append(input_toks)

        # Create torch batch
        len1 = len(input.split())
        len1 = torch.LongTensor(1).fill_(len1).to(device)
        inds = [self.dico.index(w) for w in input.split()]
#                 print('inds', inds)
        x1 = torch.LongTensor(inds).to(
            device
        )[:, None]
        langs1 = x1.clone().fill_(lang1_id)

        # Encode
        enc1 = self.encoder("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
        enc1 = enc1.transpose(0, 1)
        if n > 1:
            enc1 = enc1.repeat(n, 1, 1)
            len1 = len1.expand(n)

        # Decode
        if beam_size == 1:
            x2, len2 = self.decoder.generate(
                enc1,
                len1,
                lang2_id,
                max_len=int(
                    min(self.reloaded_params.max_len, 3 * len1.max().item() + 10)
                ),
                sample_temperature=sample_temperature,
            )
        else:
            x2, len2, _ = self.decoder.generate_beam(
                enc1,
                len1,
                lang2_id,
                max_len=int(
                    min(self.reloaded_params.max_len, 3 * len1.max().item() + 10)
                ),
                early_stopping=False,
                length_penalty=1.0,
                beam_size=beam_size,
            )

        # Convert out ids to text
        tok = []
        for i in range(x2.shape[1]):
            wid = [self.dico[x2[j, i].item()] for j in range(len(x2))][1:]
            wid = wid[: wid.index(EOS_WORD)] if EOS_WORD in wid else wid
            if getattr(self.reloaded_params, "roberta_mode", False):
                tok.append(restore_roberta_segmentation_sentence(" ".join(wid)))
            else:
                tok.append(" ".join(wid).replace("@@ ", ""))
        results = []
        for t in tok:
            results.append(detokenizer(t))
        results_l.append(results)