In [38]:
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset
from tqdm import tqdm

# JavaDoc-Code Similarity
### Login to Huggingface

In [27]:
with open('secrets/hugging_face_key.txt') as f:
    login(f.read())

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/marcus/.cache/huggingface/token
Login successful


### Load dataset

In [28]:
ds = load_dataset("code_search_net", "java", split='train', streaming=True).shuffle(buffer_size=10_000, seed=42)

In [29]:
row = next(iter(ds))
print(row.keys())
print(row['func_documentation_string'])
print(row['func_code_string'])
print(row['repository_name'])

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'])
Creates a new instance of MediaType by parsing the supplied string.

@param type the media type string
@return the newly created MediaType
@throws IllegalArgumentException if the supplied string cannot be parsed
or is null
public static MediaType valueOf(String type) throws IllegalArgumentException {
        if (type == null || type.trim().isEmpty() || type.startsWith(SUBTYPE_SEPARATOR) || type.endsWith(SUBTYPE_SEPARATOR)) {
            throw new IllegalArgumentException("Invalid mime type '" + type + "'");
        }
        String[] splitType = type.split(SUBTYPE_SEPARATOR);
        if (splitType.length == 2) {
            nonEmpty(type, splitType[0]);
            nonEmpty(type, splitType[1]);
            Map<String, String> parameters = new HashMap<>();
 

### Get Relevance Data
From https://github.com/github/CodeSearchNet#human-relevance-judgements

In [30]:
relevance_df = pd.read_csv('data/annotationStore.csv')
relevance_df = relevance_df[relevance_df['Language'] == 'Java']
relevance_df.head()

Unnamed: 0,Language,Query,GitHubUrl,Relevance,Notes
166,Java,fuzzy match ranking,https://github.com/spotbugs/spotbugs/blob/f636...,0,
167,Java,create cookie,https://github.com/apache/spark/blob/25ee0474f...,2,
168,Java,parse query string in url,https://github.com/tanhaichao/leopard-lang/blo...,0,
169,Java,convert int to string,https://github.com/hankcs/HanLP/blob/a538d0722...,0,
170,Java,deducting the median from each column,https://github.com/datacleaner/AnalyzerBeans/b...,0,


In [31]:
def get_relevance(repo_url):
    row = relevance_df[relevance_df['GitHubUrl'] == repo_url]

    if row is not None:
        return row.Relevance

    return None

In [32]:
get_relevance('https://github.com/ontop/ontop/blob/ddf78b26981b6129ee9a1a59310016830f5352e4/core/optimization/src/main/java/it/unibz/inf/ontop/iq/optimizer/FlattenUnionOptimizer.java#L45-L50')

964    0
Name: Relevance, dtype: int64

### Preprocess data

In [33]:
def preprocess(row):
    relevance = get_relevance(row['func_code_url'])

    return pd.DataFrame({"docstring": row['func_documentation_string'], "code": row['func_code_string'], 'relevance': relevance, 'repo': row['repository_name']}, index=[0])

In [34]:
test = preprocess(row)
test

Unnamed: 0,docstring,code,relevance,repo
0,Creates a new instance of MediaType by parsing...,public static MediaType valueOf(String type) t...,,josueeduardo/snappy


### Process Data

In [35]:
# Used to check of docstring is written in a different language other than English.
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

In [36]:
NUMBER_OF_PAIRS = 100
df = pd.DataFrame()

for i, row in tqdm(enumerate(iter(ds))):
    if not is_ascii(row['func_documentation_string']):
        continue

    proc_df = preprocess(row)
    df = pd.concat([df, proc_df], ignore_index=True)

    if i == NUMBER_OF_PAIRS:
        break

df.head()

100it [05:01,  3.01s/it]


Unnamed: 0,docstring,code,relevance,repo
0,Creates a new instance of MediaType by parsing...,public static MediaType valueOf(String type) t...,,josueeduardo/snappy
1,Generates a Predicate that only accepts the Ma...,"public static <T extends MatchResult, S extend...",,kmi/iserve
2,Move a raster layer down (=back) one place. No...,public boolean moveRasterLayerDown(RasterLayer...,,geomajas/geomajas-project-client-gwt
3,Calculates the cross product of this vector wi...,public Vector crossProduct(Vector o) {\n ...,,greatman/GreatmancodeTools
4,Filter the specified string for characters tha...,"protected void filter(String value, AbstractRe...",,moparisthebest/beehive


In [37]:
df.to_csv('data/processed.csv')