In [20]:
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaModel
import torch.nn as nn
import torch.nn.functional as F
import torch
import re
import seaborn as sns

# JavaDoc-Code Similarity
### Login to Huggingface

In [2]:
with open('secrets/hugging_face_key.txt') as f:
    login(f.read())

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/marcus/.cache/huggingface/token
Login successful


### Load dataset

In [3]:
ds = load_dataset("bigcode/starcoderdata", data_dir="java", split="train", streaming=True)
ds_filtered = ds.filter(lambda s: s['max_stars_count'] > 1000)

ds_filtered.column_names

Downloading readme: 0.00B [00:00, ?B/s]

In [6]:
row = next(iter(ds_filtered))
print(row.keys())
print(row['content'])

dict_keys(['max_stars_repo_path', 'max_stars_repo_name', 'max_stars_count', 'id', 'content'])
/*
 * Copyright 2020-2021 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.springframework.security.oauth2.core.http.converter;

import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.springframework.core.ParameterizedTypeReference;
import org.springframework.core.convert.TypeDescriptor;
imp

### Preprocess data

In [7]:
def preprocess(row):
    # Remove licence, imports and packages
    preproc_row = re.split(r'import [A-Za-z.]*;', row['content'])

    if len(preproc_row) >= 1:
        content = preproc_row[len(preproc_row) - 1]

        # Remove JavaDoc tags and compiler annotations
        # content = re.sub(r'\{?@.*', '<annotation>', content)

        class_level = re.split(r'\*/', content, 1)

        if len(class_level) > 1:
            function_level = re.split(r'/\*\*', class_level[1].strip()[:-1])
        else:
            function_level = re.split(r'/\*\*', class_level[0].strip()[:-1])

        docstrings = []
        codes = []

        for func in function_level:
            doc_code = func.split('*/')

            if len(doc_code) <= 1:
                continue

            docstring = doc_code[0].replace('*', '').strip()
            docstrings.append(docstring)

            code = doc_code[1].strip()
            codes.append(code)

        return pd.DataFrame({"docstring": docstrings, "code": codes, 'stars': row['max_stars_count'], 'repo': row['max_stars_repo_name']})

In [8]:
test = preprocess(row)

if test.size != 0:
    print(test['docstring'][0], ":", test['code'][0])
else:
    print("No Docstring/Code pairs")

Sets the {@link Converter} used for converting the OAuth 2.0 Authorization Server Metadata
	  parameters to an {@link OAuth2AuthorizationServerMetadata}.
	 
	  @param authorizationServerMetadataConverter the {@link Converter} used for converting to
	  an {@link OAuth2AuthorizationServerMetadata}. : public final void setAuthorizationServerMetadataConverter(Converter<Map<String, Object>, OAuth2AuthorizationServerMetadata> authorizationServerMetadataConverter) {
		Assert.notNull(authorizationServerMetadataConverter, "authorizationServerMetadataConverter cannot be null");
		this.authorizationServerMetadataConverter = authorizationServerMetadataConverter;
	}


### Process Data

In [9]:
# Used to check of docstring is written in a different language other than English.
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

In [10]:
NUMBER_OF_CLASSES = 100

df = pd.DataFrame()

for i, row in enumerate(iter(ds_filtered)):
    if not is_ascii(row['content']):
        continue

    proc_df = preprocess(row)
    df = pd.concat([df, proc_df], ignore_index=True)

    if i == NUMBER_OF_CLASSES:
        break

df.head()

Unnamed: 0,docstring,code,stars,repo
0,Sets the {@link Converter} used for converting...,public final void setAuthorizationServerMetada...,2210,chegekinuthia/spring-authorization-server
1,Sets the {@link Converter} used for converting...,public final void setAuthorizationServerMetada...,2210,chegekinuthia/spring-authorization-server
2,Returns the ClassName object referenced by a c...,public static ClassName getClassName(String cl...,1056,timfel/netbeans
3,Create a ClassName object via its internal typ...,private ClassName(String type) {\n this...,1056,timfel/netbeans
4,"Returns the type string of this class, as stor...",public String getType() {\n return type...,1056,timfel/netbeans


In [11]:
df.to_csv('data/processed.csv')

### Calculate cosine sim

In [12]:
MAX_TENSOR_SIZE = 514

def get_average_embeddings(natural_language, code, tokenizer, model):
    nl_tokens = tokenizer.tokenize(natural_language)
    code_tokens = tokenizer.tokenize(code)

    if len(nl_tokens) > MAX_TENSOR_SIZE or len(code_tokens) > MAX_TENSOR_SIZE:
        return None, None

    nl_tokens_ids = tokenizer.convert_tokens_to_ids(nl_tokens)
    code_token_ids = tokenizer.convert_tokens_to_ids(code_tokens)

    nl_embeddings = model(torch.tensor(nl_tokens_ids)[None, :])[0]
    code_embeddings = model(torch.tensor(code_token_ids)[None, :])[0]


    if nl_embeddings.size()[1] < code_embeddings.size()[1]:
        nl_embeddings = F.pad(nl_embeddings, (0, 0, code_embeddings.size()[1] - nl_embeddings.size()[1], 0))
    elif code_embeddings.size()[1] < nl_embeddings.size()[1]:
        code_embeddings = F.pad(code_embeddings, (0, 0, nl_embeddings.size()[1] - code_embeddings.size()[1], 0))


    nl_agg = torch.mean(nl_embeddings, 2)
    code_agg = torch.mean(code_embeddings, 2)

    return nl_agg, code_agg

In [13]:
def get_cosine_sim(natural_language, code, tokenizer, model):

    nl_agg, code_agg = get_average_embeddings(natural_language, code, tokenizer, model)

    if nl_agg is None or code_agg is None:
        return None

    cos = nn.CosineSimilarity(dim=1)

    return cos(nl_agg, code_agg).item()

#### Load Model

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
bert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
bert_model.to(device)

print()




In [15]:
results_df = df.copy(deep=True)

results_df['sim'] = df.apply(lambda x: get_cosine_sim(x['docstring'], x['code'], tokenizer=bert_tokenizer, model=bert_model), axis=1)
results_df.head()

Unnamed: 0,docstring,code,stars,repo,sim
0,Sets the {@link Converter} used for converting...,public final void setAuthorizationServerMetada...,2210,chegekinuthia/spring-authorization-server,0.912524
1,Sets the {@link Converter} used for converting...,public final void setAuthorizationServerMetada...,2210,chegekinuthia/spring-authorization-server,
2,Returns the ClassName object referenced by a c...,public static ClassName getClassName(String cl...,1056,timfel/netbeans,
3,Create a ClassName object via its internal typ...,private ClassName(String type) {\n this...,1056,timfel/netbeans,0.703959
4,"Returns the type string of this class, as stor...",public String getType() {\n return type...,1056,timfel/netbeans,0.569791


### Analysis on data

In [18]:
print("Total number of docstring/code pairs", results_df.size)
print("Number of failed embeddings: ", results_df.sim.isna().sum())
print("Average docstring length: ", results_df.docstring.apply(len).mean())
print("Average code length:", results_df.code.apply(len).mean())
print("Average sim score:", results_df.sim.mean())
print("Average number of max stars", results_df.stars.mean())

Total number of docstring/code pairs 1210
Number of failed embeddings:  31
Average docstring length:  145.3305785123967
Average code length: 766.5702479338843
Average sim score: 0.6756840025856031
Average number of max stars 3879.8429752066118


In [17]:
results_df.loc[results_df.sim.notnull()].reset_index(drop=True).to_csv('data/sim.csv')