In [1]:
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset
from tqdm import tqdm

### Get Relevance Data
From https://github.com/github/CodeSearchNet#human-relevance-judgements

In [2]:
# Read the human relevance annotations and keep only the rows judged for Java.
relevance_df = (
    pd.read_csv('data/annotationStore.csv')
    .loc[lambda annotations: annotations['Language'] == 'Java']
)
relevance_df.head()

Unnamed: 0,Language,Query,GitHubUrl,Relevance,Notes
166,Java,fuzzy match ranking,https://github.com/spotbugs/spotbugs/blob/f636...,0,
167,Java,create cookie,https://github.com/apache/spark/blob/25ee0474f...,2,
168,Java,parse query string in url,https://github.com/tanhaichao/leopard-lang/blo...,0,
169,Java,convert int to string,https://github.com/hankcs/HanLP/blob/a538d0722...,0,
170,Java,deducting the median from each column,https://github.com/datacleaner/AnalyzerBeans/b...,0,


In [3]:
def get_relevance(repo_url, annotations=None):
    """Look up the human relevance judgement recorded for a function URL.

    Args:
        repo_url: URL of the function, compared for exact equality against
            the ``GitHubUrl`` column.
        annotations: Optional DataFrame with ``GitHubUrl`` and ``Relevance``
            columns to search. Defaults to the notebook-level ``relevance_df``.

    Returns:
        The ``Relevance`` value of the first matching row, or ``None`` when
        no row has that URL.
    """
    if annotations is None:
        annotations = relevance_df

    matches = annotations.loc[annotations['GitHubUrl'] == repo_url, 'Relevance']

    # Boolean indexing always yields a (possibly empty) Series, never None,
    # so emptiness is the only reliable "not found" signal. Without this
    # guard, .iloc[0] raises IndexError for an unannotated URL.
    if matches.empty:
        return None

    # A URL may appear in more than one row; the first one wins.
    return matches.iloc[0]

In [4]:
# Spot-check the lookup with one Spring Boot function URL.
get_relevance('https://github.com/spring-projects/spring-boot/blob/0b27f7c70e164b2b1a96477f1d9c1acba56790c1/spring-boot-project/spring-boot/src/main/java/org/springframework/boot/info/GitProperties.java#L106-L118')

2

# JavaDoc-Code Similarity
### Login to Huggingface

In [5]:
# Read the access token from a local file so it never appears in the notebook.
with open('secrets/hugging_face_key.txt') as f:
    # Strip surrounding whitespace: a trailing newline added by an editor
    # would otherwise be passed to login() as part of the token.
    login(f.read().strip())

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/marcus/.cache/huggingface/token
Login successful


### Load dataset

In [21]:
# Build the URL lookup once. The filter runs for every streamed row, so
# rebuilding a list from the column inside the lambda (and scanning it
# linearly) repeats the same work per row; a set gives constant-time lookups.
annotated_urls = set(relevance_df['GitHubUrl'])


def load_annotated_split(split):
    """Stream one CodeSearchNet Java split, keeping only annotated functions.

    Args:
        split: Name of the split to load ('train', 'test' or 'validation').

    Returns:
        The streaming dataset filtered to rows whose ``func_code_url`` is
        present in ``annotated_urls``.
    """
    return load_dataset("code_search_net", "java", split=split, streaming=True).filter(
        lambda row: row['func_code_url'] in annotated_urls
    )


ds_train = load_annotated_split('train')
ds_test = load_annotated_split('test')
ds_validation = load_annotated_split('validation')

ds = [ds_train, ds_test, ds_validation]

In [14]:
# Take the first annotated training example and print the fields used below.
row = next(iter(ds_train))
print(row.keys())

for field in (
    'func_documentation_string',
    'func_code_string',
    'repository_name',
    'func_code_url',
    'split_name',
):
    print(row[field])

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'])
Attempt to convert the specified value to epoch time. Git properties information
are known to be specified either as epoch time in seconds or using a specific date
format.
@param s the value to coerce to
@return the epoch time in milliseconds or the original value if it couldn't be
converted
private static String coerceToEpoch(String s) {
		Long epoch = parseEpochSecond(s);
		if (epoch != null) {
			return String.valueOf(epoch);
		}
		SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
		try {
			return String.valueOf(format.parse(s).getTime());
		}
		catch (ParseException ex) {
			return s;
		}
	}
spring-projects/spring-boot
https://github.com/spring-projects/spring-boot/blob/0b27f7c70e164b2b1a96477f1d9c1acba56790c1/spring-boot-projec

In [15]:
# Spot-check the lookup with one Netflix Eureka function URL.
get_relevance('https://github.com/Netflix/eureka/blob/48446d956be09df6650a3c00b7ebd7e2d1e1544f/eureka-client/src/main/java/com/netflix/discovery/shared/resolver/ResolverUtils.java#L88-L102')

0

### Preprocess data

In [16]:
def preprocess(row):
    """Turn one dataset record into a single-row DataFrame.

    Args:
        row: Mapping that provides the keys ``func_documentation_string``,
            ``func_code_string``, ``repository_name``, ``func_code_url`` and
            ``split_name``.

    Returns:
        A DataFrame with index ``[0]`` and the columns ``docstring``,
        ``code``, ``relevance``, ``repo``, ``func_url`` and ``split``, where
        ``relevance`` is whatever ``get_relevance`` returns for the row's URL.
    """
    judgement = get_relevance(row['func_code_url'])

    record = {
        "docstring": row['func_documentation_string'],
        "code": row['func_code_string'],
        'relevance': judgement,
        'repo': row['repository_name'],
        'func_url': row['func_code_url'],
        'split': row['split_name'],
    }

    return pd.DataFrame(record, index=[0])

In [17]:
# Run preprocess on the example row fetched earlier and display the result.
test = preprocess(row)
test

Unnamed: 0,docstring,code,relevance,repo,func_url,split
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,train


### Process Data

In [18]:
# Used to check if a docstring is written in a language other than English.
def is_ascii(s):
    """Return True when every character in ``s`` has a code point below 128.

    An empty input yields True, because no character violates the condition.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

In [23]:
# Collect the single-row frames in a list and concatenate once at the end.
# Calling pd.concat inside the loop copies every previously collected row on
# each iteration, which makes the loop quadratic in the number of rows.
frames = []

for dataset in ds:
    # The for statement already stops when the stream is exhausted, so no
    # explicit StopIteration handling is needed.
    for row in tqdm(dataset):
        # Skip rows whose docstring contains non-ASCII characters.
        if not is_ascii(row['func_documentation_string']):
            continue

        frames.append(preprocess(row))

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no row passed the filter.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df.head()

306it [01:58,  2.58it/s]
17it [00:07,  2.35it/s]
10it [00:04,  2.05it/s]


Unnamed: 0,docstring,code,relevance,repo,func_url,split
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,train
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...,train
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...,train


In [24]:
# Persist the processed DataFrame to disk as a pickle file.
df.to_pickle('data/processed.pickle')