In [43]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from transformers import RobertaTokenizer, RobertaModel
import torch.nn as nn
import torch.nn.functional as F
import torch
from scipy.stats import pearsonr, spearmanr

### Load Dataset

In [4]:
df = pd.read_csv('data/processed.csv', index_col=0)
df.head()

Unnamed: 0,docstring,code,relevance,repo,func_url
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...


#### Scale Relevance

In [15]:
sc = MinMaxScaler()

df['relevance_scaled'] = sc.fit_transform(df.relevance.values.reshape(-1, 1))
df.head()

Unnamed: 0,docstring,code,relevance,repo,func_url,relevance_scaled
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,0.666667
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,0.666667
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,0.666667
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...,1.0
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...,1.0


### Calculate cosine sim

In [16]:
MAX_TENSOR_SIZE = 514

def get_average_embeddings(natural_language, code, tokenizer, model):
    nl_tokens = tokenizer.tokenize(natural_language)
    code_tokens = tokenizer.tokenize(code)

    if len(nl_tokens) > MAX_TENSOR_SIZE or len(code_tokens) > MAX_TENSOR_SIZE:
        return None, None

    nl_tokens_ids = tokenizer.convert_tokens_to_ids(nl_tokens)
    code_token_ids = tokenizer.convert_tokens_to_ids(code_tokens)

    nl_embeddings = model(torch.tensor(nl_tokens_ids)[None, :])[0]
    code_embeddings = model(torch.tensor(code_token_ids)[None, :])[0]


    if nl_embeddings.size()[1] < code_embeddings.size()[1]:
        nl_embeddings = F.pad(nl_embeddings, (0, 0, code_embeddings.size()[1] - nl_embeddings.size()[1], 0))
    elif code_embeddings.size()[1] < nl_embeddings.size()[1]:
        code_embeddings = F.pad(code_embeddings, (0, 0, nl_embeddings.size()[1] - code_embeddings.size()[1], 0))


    nl_agg = torch.mean(nl_embeddings, 2)
    code_agg = torch.mean(code_embeddings, 2)

    return nl_agg, code_agg

In [17]:
def get_cosine_sim(natural_language, code, tokenizer, model):

    nl_agg, code_agg = get_average_embeddings(natural_language, code, tokenizer, model)

    if nl_agg is None or code_agg is None:
        return None

    cos = nn.CosineSimilarity(dim=1)

    return cos(nl_agg, code_agg).item()

#### Load Model

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
bert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
bert_model.to(device)

print()




In [19]:
results_df = df.copy(deep=True)

results_df['sim'] = df.apply(lambda x: get_cosine_sim(x['docstring'], x['code'], tokenizer=bert_tokenizer, model=bert_model), axis=1)
results_df.head()

Unnamed: 0,docstring,code,relevance,repo,func_url,relevance_scaled,sim
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,0.666667,0.685977
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,0.666667,0.469626
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,0.666667,0.49025
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...,1.0,0.568199
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...,1.0,0.166293


### Analysis on data

In [20]:
print("Total number of docstring/code pairs", results_df.size)
print("Number of failed embeddings: ", results_df.sim.isna().sum())
print("Average docstring length: ", results_df.docstring.apply(len).mean())
print("Average code length:", results_df.code.apply(len).mean())
print("Average sim score:", results_df.sim.mean())
print("Average scaled relevance", results_df.relevance_scaled.mean())

Total number of docstring/code pairs 2037
Number of failed embeddings:  37
Average docstring length:  193.03436426116838
Average code length: 654.9003436426117
Average sim score: 0.528663301913757
Average scaled relevance 0.38144329896907214


In [21]:
results_df.loc[results_df.sim.notnull()].reset_index(drop=True).to_csv('data/sim.csv')

### Evaluation

In [49]:
results_df.sim = results_df.sim.fillna(0)
print("PEARSON CORRELATION: ", pearsonr(results_df.relevance_scaled, results_df.sim))
print("SPEARMAN CORRELATION: ", spearmanr(results_df.relevance_scaled, results_df.sim))

PEARSON CORRELATION:  PearsonRResult(statistic=-0.06970411319006932, pvalue=0.23586422386790534)
SPEARMAN CORRELATION:  SignificanceResult(statistic=-0.042779913137257895, pvalue=0.4672488906281026)
