In [1]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch.nn as nn
import torch.nn.functional as F
import torch

### Load Dataset

In [None]:
df = pd.read_csv('data/processed.csv')

### Calculate cosine sim

In [102]:
MAX_TENSOR_SIZE = 514

def get_average_embeddings(natural_language, code, tokenizer, model):
    nl_tokens = tokenizer.tokenize(natural_language)
    code_tokens = tokenizer.tokenize(code)

    if len(nl_tokens) > MAX_TENSOR_SIZE or len(code_tokens) > MAX_TENSOR_SIZE:
        return None, None

    nl_tokens_ids = tokenizer.convert_tokens_to_ids(nl_tokens)
    code_token_ids = tokenizer.convert_tokens_to_ids(code_tokens)

    nl_embeddings = model(torch.tensor(nl_tokens_ids)[None, :])[0]
    code_embeddings = model(torch.tensor(code_token_ids)[None, :])[0]


    if nl_embeddings.size()[1] < code_embeddings.size()[1]:
        nl_embeddings = F.pad(nl_embeddings, (0, 0, code_embeddings.size()[1] - nl_embeddings.size()[1], 0))
    elif code_embeddings.size()[1] < nl_embeddings.size()[1]:
        code_embeddings = F.pad(code_embeddings, (0, 0, nl_embeddings.size()[1] - code_embeddings.size()[1], 0))


    nl_agg = torch.mean(nl_embeddings, 2)
    code_agg = torch.mean(code_embeddings, 2)

    return nl_agg, code_agg

In [103]:
def get_cosine_sim(natural_language, code, tokenizer, model):

    nl_agg, code_agg = get_average_embeddings(natural_language, code, tokenizer, model)

    if nl_agg is None or code_agg is None:
        return None

    cos = nn.CosineSimilarity(dim=1)

    return cos(nl_agg, code_agg).item()

#### Load Model

In [104]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
bert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
bert_model.to(device)

print()




In [105]:
results_df = df.copy(deep=True)

results_df['sim'] = df.apply(lambda x: get_cosine_sim(x['docstring'], x['code'], tokenizer=bert_tokenizer, model=bert_model), axis=1)
results_df.head()

Unnamed: 0,docstring,code,stars,repo,sim
0,Sets the {@link Converter} used for converting...,public final void setAuthorizationServerMetada...,2210,chegekinuthia/spring-authorization-server,0.912524
1,Sets the {@link Converter} used for converting...,public final void setAuthorizationServerMetada...,2210,chegekinuthia/spring-authorization-server,
2,Returns the ClassName object referenced by a c...,public static ClassName getClassName(String cl...,1056,timfel/netbeans,
3,Create a ClassName object via its internal typ...,private ClassName(String type) {\n this...,1056,timfel/netbeans,0.703959
4,"Returns the type string of this class, as stor...",public String getType() {\n return type...,1056,timfel/netbeans,0.569791


### Analysis on data

In [106]:
print("Total number of docstring/code pairs", results_df.size)
print("Number of failed embeddings: ", results_df.sim.isna().sum())
print("Average docstring length: ", results_df.docstring.apply(len).mean())
print("Average code length:", results_df.code.apply(len).mean())
print("Average sim score:", results_df.sim.mean())
print("Average number of max stars", results_df.stars.mean())

Total number of docstring/code pairs 1210
Number of failed embeddings:  31
Average docstring length:  145.3305785123967
Average code length: 766.5702479338843
Average sim score: 0.6756840025856031
Average number of max stars 3879.8429752066118


In [107]:
results_df.loc[results_df.sim.notnull()].reset_index(drop=True).to_csv('data/sim.csv')