In [2]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch.nn as nn
import torch.nn.functional as F
import torch
from tqdm import tqdm

tqdm.pandas()

### Load Dataset

In [3]:
df = pd.read_pickle('data/processed.pickle')
df.head()

Unnamed: 0,docstring,code,relevance,repo,func_url,split
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,train
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...,train
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...,train


### Calculate cosine sim

In [4]:
MAX_TENSOR_SIZE = 514

def get_average_embeddings(natural_language, code, tokenizer, model):
    nl_tokens = tokenizer.tokenize(natural_language)
    code_tokens = tokenizer.tokenize(code)

    if len(nl_tokens) > MAX_TENSOR_SIZE or len(code_tokens) > MAX_TENSOR_SIZE:
        return None, None

    nl_tokens_ids = tokenizer.convert_tokens_to_ids(nl_tokens)
    code_token_ids = tokenizer.convert_tokens_to_ids(code_tokens)

    nl_embeddings = model(torch.tensor(nl_tokens_ids)[None, :])[0]
    code_embeddings = model(torch.tensor(code_token_ids)[None, :])[0]


    if nl_embeddings.size()[1] < code_embeddings.size()[1]:
        nl_embeddings = F.pad(nl_embeddings, (0, 0, code_embeddings.size()[1] - nl_embeddings.size()[1], 0))
    elif code_embeddings.size()[1] < nl_embeddings.size()[1]:
        code_embeddings = F.pad(code_embeddings, (0, 0, nl_embeddings.size()[1] - code_embeddings.size()[1], 0))


    nl_agg = torch.mean(nl_embeddings, 2)
    code_agg = torch.mean(code_embeddings, 2)

    return nl_agg, code_agg

In [8]:
def get_cosine_sim(natural_language, code, tokenizer, model):

    nl_agg, code_agg = get_average_embeddings(natural_language, code, tokenizer, model)

    if nl_agg is None or code_agg is None:
        return None

    cos = nn.CosineSimilarity(dim=1)
    sim = cos(nl_agg, code_agg).item()

    return nl_agg.detach().numpy(), code_agg.detach().numpy(), sim

#### Load Embedding Model

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
bert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
bert_model.to(device)

print()




In [9]:
sim_df = df.copy()

sim_df[['nl_embed', 'code_embed', 'sim']] = df.progress_apply(lambda x: get_cosine_sim(x['docstring'], x['code'], tokenizer=bert_tokenizer, model=bert_model), axis=1, result_type='expand')
sim_df.head()

100%|██████████| 315/315 [00:44<00:00,  7.09it/s]


Unnamed: 0,docstring,code,relevance,repo,func_url,split,nl_embed,code_embed,sim
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.049218524, 0.046266314, 0.05542605, 0.0503...",0.685977
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.050279554, 0.048305128, 0.046424236, 0.054...",0.469626
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.049495775, 0.04716861, 0.055548936, 0.0603...",0.49025
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.04833452, 0.0510515, 0.056628924, 0.054732...",0.568199
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.05180261, 0.053425696, 0.05358086, 0.04882...",0.166293


In [10]:
sim_df.to_pickle('data/sim.pickle')