In [1]:
import os
# Restrict CUDA to device 1. Assign through os.environ rather than os.putenv:
# the assignment also calls putenv() under the hood, but additionally keeps the
# os.environ mapping in sync, so code that inspects or inherits os.environ sees it.
# This has to run before any CUDA-using library initialises the driver.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


from transformers import BertConfig, EncoderDecoderModel, EncoderDecoderConfig
from dataclasses import dataclass
from typing import Union, Iterable, List
from pathlib import Path
from tensor2tensor.data_generators import text_encoder
import json
import torch
from catalyst import dl
import re
from catalyst.utils import set_global_seed
import random

from token_to_index import TokenToIndexConverter


def read_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the file (anything accepted by ``open``).

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(path, 'r', encoding='utf-8') as istream:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting json.loads fail on them.
        return [json.loads(line) for line in istream if line.strip()]


# Experiment identifier; used below to build the default log directory name.
EXPERIMENT_NAME = "seq2seq-transformer"


@dataclass
class Config:
    """Hyperparameters and run settings for the seq2seq transformer experiment.

    ``vocab_size`` and ``pad_token_id`` have no defaults and must be supplied
    when the config is created; every other field falls back to the default
    declared here.
    """

    #  bert config: these values are forwarded to the BertConfig objects built
    #  in make_model (shared by encoder and decoder).
    vocab_size: int
    pad_token_id: int
    hidden_size: int = 1024
    num_attention_heads: int = 16
    intermediate_size: int = 4096
    max_position_embeddings: int = 512

    #  Depth is the only setting that differs between encoder and decoder.
    encoder_num_hidden_layers: int = 6
    decoder_num_hidden_layers: int = 2

    #  optimization: not referenced in the visible part of this notebook;
    #  presumably consumed by the training code -- TODO confirm.
    max_lr: float = 5e-4
    batch_size: int = 32
    accumulation_steps: int = 16

    weight_decay: float = 0

    num_epochs: int = 50
    patience: int = 5

    #  lr scheduling:
    warmup_prop: float = 0.15

    #  generation parameters:
    #  eval_set_size is the number of validation examples sampled for the
    #  beam-search evaluation subset.
    eval_set_size: int = 2000
    num_return_sequences: int = 5

    #  Directory the best checkpoint is read from (``<logdir>/checkpoints/best.pth``).
    logdir: str = f'logdir_{EXPERIMENT_NAME}'
    #  NOTE(review): defaults to None although annotated as str; presumably a
    #  checkpoint path to resume from -- confirm against the training code.
    resume: str = None

    #  Value passed to set_global_seed.
    seed: int = 19


def make_model(config):
    """Build an encoder-decoder model whose two halves are BERT stacks.

    Encoder and decoder share every hyperparameter taken from ``config``
    except the number of hidden layers.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``num_attention_heads``, ``intermediate_size``,
            ``max_position_embeddings``, ``pad_token_id``,
            ``encoder_num_hidden_layers`` and ``decoder_num_hidden_layers``.

    Returns:
        An ``EncoderDecoderModel`` constructed from the combined configuration.
    """

    def bert_config(depth):
        # One place for the settings both stacks have in common.
        return BertConfig(
            vocab_size=config.vocab_size,
            hidden_size=config.hidden_size,
            num_hidden_layers=depth,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            max_position_embeddings=config.max_position_embeddings,
            pad_token_id=config.pad_token_id,
        )

    encoder_part = bert_config(config.encoder_num_hidden_layers)
    decoder_part = bert_config(config.decoder_num_hidden_layers)
    combined = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_part, decoder_part)
    return EncoderDecoderModel(config=combined)


# Directory under the user's home that holds the vocabulary file used below.
MODELS_DIR = Path.home() / "models/cubert"


# Token <-> index converter built from the vocabulary file; it supplies the
# vocabulary size and the special-token indices used throughout the notebook.
token_to_index = TokenToIndexConverter(
    (MODELS_DIR / "github_python_minus_ethpy150open_deduplicated_vocabulary.txt").as_posix()
)
# Only the vocabulary-dependent fields are set explicitly; every other
# hyperparameter keeps its Config default.
config = Config(vocab_size=token_to_index.vocab_size, pad_token_id=token_to_index.pad_index)

# Seed via catalyst's set_global_seed (presumably covers the Python, NumPy and
# torch RNGs -- confirm for the installed catalyst version).
set_global_seed(config.seed)

In [2]:
# Folder with the preprocessed method-name-prediction splits.
DATA_FOLDER = Path.home() / "data/method_name_prediction/python/final/jsonl"


# Load the three splits in train / valid / test order, one list of records each.
train, valid, test = (
    read_jsonl(DATA_FOLDER / split_file)
    for split_file in (
        "train_preprocessed.jsonl",
        "valid_preprocessed.jsonl",
        "test_preprocessed.jsonl",
    )
)

In [3]:
from torch.utils.data import Dataset, DataLoader
from torch import Tensor, LongTensor
from torch.nn.utils.rnn import pad_sequence
from typing import Callable, Iterable, Optional


class SequenceToSequenceDataset(Dataset):
    """In-memory dataset of (source, reference) tensor pairs.

    Both streams are encoded eagerly at construction time. ``collate_fn``
    pads a list of pairs into a batch dictionary with teacher-forcing style
    decoder inputs and labels.
    """

    def __init__(
        self,
        src_stream: Iterable['T'],
        src_encoder: Callable[['T'], Tensor],
        ref_stream: Iterable['T'],
        ref_encoder: Callable[['T'], Tensor],
        src_pad_index: int,
        ref_pad_index: Optional[int] = None
    ):
        """Encode both streams and remember the padding indices.

        Args:
            src_stream: Raw source items; consumed completely, first.
            src_encoder: Maps one raw source item to a tensor.
            ref_stream: Raw reference items; consumed completely, second.
            ref_encoder: Maps one raw reference item to a tensor.
            src_pad_index: Padding value for source batches.
            ref_pad_index: Padding value for reference batches; when ``None``
                the source padding value is reused.

        Raises:
            AssertionError: If the two streams yield a different number of items.
        """
        self.src = list(map(src_encoder, src_stream))
        self.ref = list(map(ref_encoder, ref_stream))
        assert len(self.src) == len(self.ref)
        self.src_pad_index = src_pad_index
        if ref_pad_index is None:
            self.ref_pad_index = src_pad_index
        else:
            self.ref_pad_index = ref_pad_index

    def __len__(self):
        """Number of (source, reference) pairs."""
        return len(self.src)

    def __getitem__(self, idx):
        """Return the encoded ``(source, reference)`` pair at position ``idx``."""
        return self.src[idx], self.ref[idx]

    def collate_fn(self, data):
        """Pad a list of pairs into a single batch dictionary.

        The padded reference is split into decoder inputs (last position
        dropped) and labels (first position dropped). Both attention masks
        are ``True`` wherever the value differs from the padding index.

        Args:
            data: Sequence of ``(source, reference)`` tensor pairs.

        Returns:
            Dict with keys ``input_ids``, ``attention_mask``,
            ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels``.
        """
        sources, references = zip(*data)
        padded_sources = pad_sequence(
            sources, batch_first=True, padding_value=self.src_pad_index
        )
        padded_references = pad_sequence(
            references, batch_first=True, padding_value=self.ref_pad_index
        )
        # Decoder sees positions [0, n-1); labels are positions [1, n).
        decoder_inputs = padded_references[:, :-1]
        return {
            'input_ids': padded_sources,
            'attention_mask': padded_sources != self.src_pad_index,
            'decoder_input_ids': decoder_inputs,
            'decoder_attention_mask': decoder_inputs != self.ref_pad_index,
            'labels': padded_references[:, 1:],
        }

    def make_loader(self, *args, **kwargs):
        """Create a ``DataLoader`` over this dataset that batches via ``collate_fn``.

        Positional and keyword arguments are forwarded to ``DataLoader``.
        """
        return DataLoader(self, *args, collate_fn=self.collate_fn, **kwargs)


def get_method_name_dataset(data, token_to_index, pad_index, max_length):
    """Build a body -> name dataset from preprocessed examples.

    Args:
        data: Iterable of dicts with the keys ``'function_body_tokenized'``
            (source) and ``'function_name_tokenized'`` (reference).
        token_to_index: Object whose ``encode_code`` turns a tokenized item
            into a sliceable sequence of ids (presumably a list of ints).
        pad_index: Padding index used for both source and reference batches.
        max_length: Encoded sequences are cut to at most this many ids.

    Returns:
        A ``SequenceToSequenceDataset`` whose items are ``LongTensor`` pairs.
    """
    encode = token_to_index.encode_code

    def encode_clipped(*args, **kwargs):
        # Encode, keep the first ``max_length`` ids, then convert to a tensor.
        token_ids = encode(*args, **kwargs)
        return LongTensor(token_ids[:max_length])

    bodies = (example['function_body_tokenized'] for example in data)
    names = (example['function_name_tokenized'] for example in data)

    return SequenceToSequenceDataset(
        src_stream=bodies,
        src_encoder=encode_clipped,
        ref_stream=names,
        ref_encoder=encode_clipped,
        src_pad_index=pad_index,
    )


In [4]:
# Instantiate the encoder-decoder network from the hyperparameters in `config`;
# at this point it only holds freshly initialised weights.
model = make_model(config)

In [5]:
def _make_dataset(examples):
    """Encode ``examples`` with the shared converter, truncated to the encoder's position limit."""
    return get_method_name_dataset(
        examples,
        token_to_index,
        token_to_index.pad_index,
        model.encoder.config.max_position_embeddings,
    )


train_dataset = _make_dataset(train)
valid_dataset = _make_dataset(valid)

# Re-seed right before sampling so the evaluation subset does not depend on how
# much randomness was consumed earlier (presumably set_global_seed seeds
# Python's `random` -- confirm for the installed catalyst version).
set_global_seed(config.seed)
# random.sample raises ValueError when asked for more items than exist, so cap
# the subset size at the size of the validation split.
beam_subset = random.sample(valid, min(config.eval_set_size, len(valid)))
beam_dataset = _make_dataset(beam_subset)

test_dataset = _make_dataset(test)

In [6]:
import numpy as np


def beam_search(src, model, bos_id, pad_id, end_id, device, max_len=10, k=5):
    """Decode one source sequence with beam search.

    Hypotheses are scored by the sum of token log-probabilities (no length
    normalisation). A hypothesis whose last token is ``end_id`` is finished:
    it is carried over unchanged and never expanded again.

    The caller is expected to put the model in eval mode and to disable
    gradient tracking (e.g. ``torch.no_grad()``).

    Args:
        src: 1-D tensor of source token ids; reshaped to a batch of one.
        model: Callable invoked with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids``, optionally ``encoder_outputs``, and
            ``return_dict=False``; must return a tuple whose first element
            is indexed as ``[batch, position, vocab]`` logits.
        bos_id: Token id every hypothesis starts with.
        pad_id: Padding id used to build the source attention mask.
        end_id: Token id that marks a hypothesis as finished.
        device: Device the input tensors are moved to.
        max_len: Maximum number of decoding steps.
        k: Beam width (also the number of expansions tried per hypothesis).

    Returns:
        Up to ``k`` ``(token_ids, score)`` pairs sorted by descending score.
        ``token_ids`` is a list of Python ints and ``score`` a Python float,
        so the result can be serialised (e.g. with ``json``) directly.
    """
    src = src.view(1, -1).to(device)
    src_mask = (src != pad_id).to(device)

    # Output tuple of the previous model call (None before the first call).
    memory = None

    beam = [([bos_id], 0.0)]
    for _step in range(max_len):
        # Once every hypothesis has emitted the end token nothing can change
        # any more, so further iterations would only reshuffle the same beam.
        if all(tokens[-1] == end_id for tokens, _score in beam):
            break

        candidates = []
        candidates_proba = []
        for snt, snt_proba in beam:
            if snt[-1] == end_id:
                # Finished hypotheses compete with their final score.
                candidates.append(snt)
                candidates_proba.append(snt_proba)
                continue

            snt_tensor = torch.tensor(snt).view(1, -1).long().to(device)

            if memory is None:
                memory = model(
                    input_ids=src,
                    attention_mask=src_mask,
                    decoder_input_ids=snt_tensor,
                    return_dict=False
                )
            else:
                # NOTE(review): this feeds elements 1 and -1 of the previous
                # output tuple back as `encoder_outputs`, presumably so the
                # encoder is not re-run. It relies on the tuple layout that
                # `return_dict=False` produces -- verify against the installed
                # transformers version.
                memory = model(
                    input_ids=src,
                    attention_mask=src_mask,
                    decoder_input_ids=snt_tensor,
                    encoder_outputs=(memory[1], memory[-1]),
                    return_dict=False
                )

            # Slice out the last position first so only one vocabulary-sized
            # vector is copied to the CPU instead of the whole logits tensor.
            last_logits = memory[0][0, -1, :].detach().cpu()
            proba = torch.log_softmax(last_logits, dim=-1).numpy()

            # argpartition needs kth < len, so never ask for more than the vocabulary.
            top = min(k, proba.shape[0])
            best_k = np.argpartition(-proba, top - 1)[:top]

            for tok in best_k:
                # Store native Python numbers: NumPy scalars would leak into the
                # returned beam and are not reliably JSON-serialisable.
                candidates.append(snt + [int(tok)])
                candidates_proba.append(snt_proba + float(proba[tok]))

        keep = min(k, len(candidates))
        best_candidates = np.argpartition(-np.array(candidates_proba), keep - 1)[:keep]
        beam = sorted(
            ((candidates[j], candidates_proba[j]) for j in best_candidates),
            key=lambda pair: -pair[1]
        )

    return beam

In [7]:
# Display the log directory the checkpoint is read from.
config.logdir

'logdir_seq2seq-transformer'

In [8]:
# Deserialize onto the CPU: the model has not been moved to a device yet, and
# this keeps loading independent of the device the checkpoint was saved from.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
checkpoint = torch.load(Path(config.logdir) / "checkpoints/best.pth", map_location="cpu")
# Left as the last expression so the key-matching report is displayed.
model.load_state_dict(checkpoint["model_state_dict"])

<All keys matched successfully>

In [9]:
from tqdm import tqdm
import pandas as pd
from utils import compute_metrics


# Prefer the GPU, but fall back to the CPU so the cell still runs without CUDA.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(DEVICE).eval()

# One compute_metrics result per test example; turned into a DataFrame at the end.
metrics = []

# Each output line is a JSON object with the reference name and the scored
# beam-search candidates for one test example.
with open("cubert-generated-as-lists.jsonl", "w", encoding="utf-8") as ostream, torch.no_grad():
    for i in tqdm(range(len(test_dataset))):
        src, ref = test_dataset[i]
        name = token_to_index.decode_list(ref)
        gen = beam_search(
            src,
            model,
            bos_id=token_to_index.bos_index,
            pad_id=token_to_index.pad_index,
            end_id=token_to_index.eos_index,
            device=DEVICE
        )
        # float(...) guarantees a native number: a NumPy scalar score would
        # make json.dumps fail.
        generated = sorted(
            [{"cand": token_to_index.decode_list(t), "score": float(s)} for t, s in gen],
            key=lambda e: e["score"],
            reverse=True
        )
        entry = {
            "original": name,
            "generated": generated
        }
        ostream.write(f"{json.dumps(entry)}\n")

        # Candidates are passed best-first, as sorted above.
        candidates = [g["cand"] for g in generated]
        metrics.append(compute_metrics(name, candidates))

metrics = pd.DataFrame(metrics)

100%|██████████| 21877/21877 [42:33<00:00,  8.57it/s] 


In [10]:
# Average every metric column over all evaluated test examples.
metrics.mean()

exact-top-1        0.044933
exact-top-5        0.082278
precision-top-1    0.158400
precision-top-5    0.300582
recall-top-1       0.135528
recall-top-5       0.277397
f1-top1            0.141630
f1-top5            0.280047
dtype: float64