In [1]:
import sys  
sys.path.insert(0, '..')

In [2]:
import torch
import torch_xla.core.xla_model as xm

ModuleNotFoundError: No module named 'torch_xla'

In [25]:
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaModel
from dataclasses import dataclass
from typing import Union
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from tqdm import tqdm

In [3]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import torch.nn as nn
    
class Model(nn.Module):   
    def __init__(self, encoder):
        super(Model, self).__init__()
        self.encoder = encoder
      
    def forward(self, code_inputs=None, nl_inputs=None): 
        if code_inputs is not None:
            return self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[1]
        else:
            return self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[1]

In [11]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("../code-search-net/codebert/Siamese-model/demo/python_model")


In [12]:
query = "set a variable as hello world"
query_vec = model(tokenizer(query,return_tensors='pt')['input_ids'])[1]
code_1="print('hello world')"
code1_vec = model(tokenizer(code_1,return_tensors='pt')['input_ids'])[1]
code_2="s = 'hello world'"
code2_vec = model(tokenizer(code_2,return_tensors='pt')['input_ids'])[1]
code_3="hello world"
code3_vec = model(tokenizer(code_3,return_tensors='pt')['input_ids'])[1]
code_vecs=torch.cat((code1_vec,code2_vec,code3_vec),0)
codes = [code_1,code_2,code_3]
scores=torch.einsum("ab,cb->ac",query_vec,code_vecs)
scores=torch.softmax(scores,-1)
print("Query:",query)
for i in range(3):
    print("Code:",codes[i])
    print("Score:",scores[0,i].item())

In [15]:
ds = load_dataset("csv", data_files={"test" : "../data/codesearchnet_test.csv.gz"}, split="test")

Using custom data configuration default-2ad89bbbeda92323
Reusing dataset csv (/home/pascal_voitot/.cache/huggingface/datasets/csv/default-2ad89bbbeda92323/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23)


In [None]:
model.load_state_dict(torch.load("../code-search-net/codebert/Siamese-model"),strict=False) 

In [45]:
def eval(model, model_inputs1, model_inputs2):
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def _forward(model_input):
        model_output = model(**model_input)
        embeddings = mean_pooling(model_output, model_input['attention_mask'])
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings

    embedding1, embedding2 = _forward(model_inputs1), _forward(model_inputs2)
    return embedding1, embedding2

In [44]:
def get_batched_dataset(dataset, batch_size, seed=None):
    if seed is not None:
        dataset = dataset.shuffle(seed=seed)
    for i in range(len(dataset) // batch_size):
        batch = dataset[i*batch_size: (i+1)*batch_size]
        yield dict(batch)

@dataclass
class DataCollator:
    tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer]
    input1_maxlen: int = 128
    input2_maxlen: int = 128

    def __call__(self, batch):
        # Currently only static padding; TODO: change below for adding dynamic padding support
        model_input1 = self.tokenizer(batch["docstring"], return_tensors="pt", max_length=self.input1_maxlen, truncation=True, padding="max_length")
        model_input2 = self.tokenizer(batch["code"], return_tensors="pt", max_length=self.input2_maxlen, truncation=True, padding="max_length")
        model_input1, model_input2 = dict(model_input1), dict(model_input2)
        return model_input1, model_input2
        # return model_input1, model_input2

In [51]:
from torch_impl.util import cos_sim
import numpy as np

def batch_accuracy(embeddings_a: np.ndarray, embeddings_b: np.ndarray,
                   similarity_fct=cos_sim):
    """

    :param embeddings_a:
    :param embeddings_b: if passing additional hard negatives, use jnp.concatenate([positives, negatives], axis=0) as input.
    :param similarity_fct:
    :return:
    """
    assert (len(embeddings_a) <= len(embeddings_b))
    scores = similarity_fct(embeddings_a, embeddings_b)
    assert scores.shape == (len(embeddings_a), len(embeddings_b))

    indices = np.argmax(scores, axis=1)

    labels = np.arange(len(scores), dtype=np.int32)

    return np.sum(indices == labels)

    

In [41]:
data_collator = DataCollator(
    tokenizer=tokenizer,
    input1_maxlen=200,
    input2_maxlen=200,
)

In [56]:
from itertools import islice

batch_size = 32

total = len(ds) // batch_size
batch_iterator = get_batched_dataset(ds, batch_size, seed=None)

queries = []
codes = []
accs = 0
for j, batch in tqdm(enumerate(islice(batch_iterator, 2)), desc=f"Compute", total=total):
    model_input1, model_input2 = data_collator(batch)
    emb1, emb2 = eval(model, model_input1, model_input2)

    emb1 = emb1.detach().numpy()
    emb2 = emb2.detach().numpy()
    batch_accs = batch_accuracy(emb1, emb2)
    print(emb1.shape, emb2.shape, batch_accs)
    accs += batch_accs
    
    # done in eval
    # emb1 = normalize_L2(emb1)
    # emb2 = normalize_L2(emb2)
    queries.append(emb1)
    codes.append(emb2)

accs = accs / len(ds)
print("accs", accs)



Compute:   0%|          | 1/1113 [00:01<26:57,  1.45s/it](32, 768) (32, 768) 0
Compute:   0%|          | 2/1113 [00:02<26:19,  1.42s/it](32, 768) (32, 768) 0
accs 0.0



In [None]:
codes_all = np.vstack(codes)
print(codes_all.shape)

queries_all = np.vstack(queries)
print(queries_all.shape)

import faiss

index = faiss.IndexFlatL2(768) 
index.add(codes_all)