In [2]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch.nn as nn
import torch.nn.functional as F
import torch

### Load Dataset

In [3]:
df = pd.read_csv('data/processed.csv')

### Calculate cosine sim

In [4]:
MAX_TENSOR_SIZE = 514

def get_average_embeddings(natural_language, code, tokenizer, model):
    nl_tokens = tokenizer.tokenize(natural_language)
    code_tokens = tokenizer.tokenize(code)

    if len(nl_tokens) > MAX_TENSOR_SIZE or len(code_tokens) > MAX_TENSOR_SIZE:
        return None, None

    nl_tokens_ids = tokenizer.convert_tokens_to_ids(nl_tokens)
    code_token_ids = tokenizer.convert_tokens_to_ids(code_tokens)

    nl_embeddings = model(torch.tensor(nl_tokens_ids)[None, :])[0]
    code_embeddings = model(torch.tensor(code_token_ids)[None, :])[0]


    if nl_embeddings.size()[1] < code_embeddings.size()[1]:
        nl_embeddings = F.pad(nl_embeddings, (0, 0, code_embeddings.size()[1] - nl_embeddings.size()[1], 0))
    elif code_embeddings.size()[1] < nl_embeddings.size()[1]:
        code_embeddings = F.pad(code_embeddings, (0, 0, nl_embeddings.size()[1] - code_embeddings.size()[1], 0))


    nl_agg = torch.mean(nl_embeddings, 2)
    code_agg = torch.mean(code_embeddings, 2)

    return nl_agg, code_agg

In [5]:
def get_cosine_sim(natural_language, code, tokenizer, model):

    nl_agg, code_agg = get_average_embeddings(natural_language, code, tokenizer, model)

    if nl_agg is None or code_agg is None:
        return None

    cos = nn.CosineSimilarity(dim=1)

    return cos(nl_agg, code_agg).item()

#### Load Model

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
bert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
bert_model.to(device)

print()




In [7]:
results_df = df.copy(deep=True)

results_df['sim'] = df.apply(lambda x: get_cosine_sim(x['docstring'], x['code'], tokenizer=bert_tokenizer, model=bert_model), axis=1)
results_df.head()

Unnamed: 0.1,Unnamed: 0,docstring,code,stars,repo,sim
0,0,Bind indexed elements to the supplied collecti...,protected final void bindIndexed(Configuration...,68742,spring-projects/spring-boot,0.714404
1,1,Set {@link ServletRegistrationBean}s that the ...,public void setServletRegistrationBeans(\n\t\t...,68742,spring-projects/spring-boot,0.595693
2,2,Add {@link ServletRegistrationBean}s for the f...,public void addServletRegistrationBeans(\n\t\t...,68742,spring-projects/spring-boot,0.688578
3,3,Set servlet names that the filter will be regi...,public void setServletNames(Collection<String>...,68742,spring-projects/spring-boot,0.912222
4,4,Add servlet names for the filter.\n@param serv...,public void addServletNames(String... servletN...,68742,spring-projects/spring-boot,0.579986


### Analysis on data

In [8]:
print("Total number of docstring/code pairs", results_df.size)
print("Number of failed embeddings: ", results_df.sim.isna().sum())
print("Average docstring length: ", results_df.docstring.apply(len).mean())
print("Average code length:", results_df.code.apply(len).mean())
print("Average sim score:", results_df.sim.mean())
print("Average number of max stars", results_df.stars.mean())

Total number of docstring/code pairs 600
Number of failed embeddings:  1
Average docstring length:  234.22
Average code length: 358.51
Average sim score: 0.6886076997927945
Average number of max stars 68742.0


In [9]:
results_df.loc[results_df.sim.notnull()].reset_index(drop=True).to_csv('data/sim.csv')