In [9]:
# Install the notebook's third-party dependencies into the kernel's environment.
%pip install plotnine numpy pandas swifter tqdm sentencepiece
# NOTE(review): absolute, machine-specific path; every relative data/ path in this
# notebook resolves against this working directory.
%cd /work/paras/code/contracode
%pwd
%matplotlib inline
# Create data/plots (and any missing parents) if it does not exist yet.
!mkdir -p data/plots
import jsonlines
import sentencepiece as spm

import pandas as pd
import numpy as np
import re
import os
import plotnine as p9
from tqdm import tqdm
# Register tqdm's pandas integration; this is what makes `progress_apply` available.
tqdm.pandas()

# Load the SentencePiece model file from disk.
# NOTE(review): `sp` is not referenced in the cells visible here; presumably it is
# used further down the notebook -- confirm before removing.
sp = spm.SentencePieceProcessor()
sp.Load('data/codesearchnet_javascript/csnjs_8k_9995p_unigram_url.model')





Note: you may need to restart the kernel to use updated packages.
/work/paras/code/contracode


True

In [15]:
# Lazily matches one camelCase word at a time. A word ends at a lower->Upper
# boundary ("fooBar" -> "foo" | "Bar"), before the last capital of an acronym
# that is followed by a lowercase letter ("HTTPServer" -> "HTTP" | "Server"),
# or at the end of the string.
camel_case_re = re.compile(r".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)")

# One pattern per truncated spelling of the "function" keyword, anchored at the
# very start of the string and followed by whitespace or "(" (captured as
# group 1 so a substitution can re-emit it).
_fix_function_crop_regexes = [
    re.compile(cropped_keyword + r"(\s+|\()")
    for cropped_keyword in (
        r"\A^unction",
        r"\A^nction",
        r"\A^ction",
        r"\A^tion",
        r"\A^ion",
        r"\A^on",
        r"\A^n",
    )
]

# A whole-string identifier: a letter, "_" or "$", then letters, digits, "_" or "$".
_valid_identifier_regex = re.compile(r"^[a-zA-Z_$][0-9a-zA-Z_$]*$")

# An http:// or https:// URL: runs to the next whitespace, ending on a word boundary.
_url_regex = re.compile(r"https?://\S+\b")


def camel_case_split(identifier):
    """Split a camelCase identifier into its words, keeping their original case.

    Returns the successive matches of ``camel_case_re`` over ``identifier``;
    an empty string yields an empty list.
    """
    words = []
    for match in camel_case_re.finditer(identifier):
        words.append(match.group(0))
    return words

def snake_case_split(identifier):
    """Split an identifier on every underscore.

    Leading, trailing and doubled underscores produce empty strings in the
    result (e.g. ``"__init__"`` -> ``['', '', 'init', '', '']``).
    """
    pieces = identifier.split("_")
    return pieces

def split_method_name(method_name: str):
    """Break a method name into lower-cased sub-tokens.

    The name is split on underscores first, then each piece is split on
    camelCase boundaries; every resulting word is lower-cased. Empty pieces
    produced by the underscore split contribute nothing to the result.
    """
    subtokens = []
    for snake_piece in snake_case_split(method_name):
        for camel_piece in camel_case_split(snake_piece):
            subtokens.append(camel_piece.lower())
    return subtokens

def filtered_row(pd_row):
    """Return the row's source code with its declared function name masked as ``x``.

    ``pd_row`` must support lookup of ``'original_string'`` (the source text)
    and ``'func_name'`` (the name to mask).
    """
    source = pd_row['original_string']

    # Restore a "function" keyword whose leading characters were cropped off
    # the beginning of the source string.
    for crop_fix in _fix_function_crop_regexes:
        source = crop_fix.sub(r"function\1", source, count=1)

    # Mask only the first "function <name>" occurrence (the declaration);
    # any later use of the name inside the body is left untouched.
    declaration = re.compile(r"(function\s*)" + re.escape(pd_row['func_name']))
    return declaration.sub(r"\1x", source, count=1)

def tok_and_pack_string(string):
    """Create the sub-token input as specified by the paper.

    The string is split on whitespace, each token is broken into lower-cased
    snake_case / camelCase pieces by ``split_method_name``, and everything is
    re-joined with single spaces.
    """
    return ' '.join(' '.join(split_method_name(tok)) for tok in string.split())

def process_data(jsonl_file):
    """Load a JSON Lines file of functions and derive method-naming inputs and labels.

    Rows whose ``func_name`` or ``original_string`` is empty or missing are
    dropped. Three columns are then added to the frame:

    * ``func_name_pieces``: the function name split into lower-cased
      sub-tokens, space-joined.
    * ``code_cleaned``: the source with the declared name masked (see
      ``filtered_row``) and all whitespace runs collapsed to single spaces.
    * ``code_cleaned_subtoken``: ``code_cleaned`` passed through
      ``tok_and_pack_string``.

    Args:
        jsonl_file: Path to a JSON Lines file whose records contain
            ``func_name`` and ``original_string`` fields.

    Returns:
        A tuple ``(programs, programs_subtok, labels, df)`` where ``programs``
        is the ``code_cleaned`` column as a list, ``programs_subtok`` the
        ``code_cleaned_subtoken`` column, ``labels`` the ``func_name_pieces``
        column, and ``df`` the processed DataFrame (original index kept).
    """
    with jsonlines.open(jsonl_file) as reader:
        df = pd.DataFrame(list(reader))

    required = ['func_name', 'original_string']
    # Assign the replaced column back instead of calling
    # ``df[col].replace(..., inplace=True)``: mutating a column selection in
    # place does not update ``df`` under pandas Copy-on-Write, which would let
    # empty strings slip past the dropna below.
    for col in required:
        df[col] = df[col].replace('', np.nan)
    df.dropna(subset=required, inplace=True)

    df['func_name_pieces'] = df['func_name'].apply(split_method_name).apply(' '.join)
    df['code_cleaned'] = (
        df[['original_string', 'func_name']]
        .progress_apply(filtered_row, axis=1)
        .apply(lambda code: ' '.join(code.split()))  # collapse all whitespace runs
    )
    df['code_cleaned_subtoken'] = df['code_cleaned'].progress_apply(tok_and_pack_string)

    # ``programs`` is the cleaned, non-sub-tokenized code; previously it was
    # read from the sub-token column, making it a duplicate of ``programs_subtok``.
    programs = df['code_cleaned'].tolist()
    programs_subtok = df['code_cleaned_subtoken'].tolist()
    labels = df['func_name_pieces'].tolist()

    return programs, programs_subtok, labels, df

In [16]:
# Input directory holding the JSONL splits, and output root for the exported files.
prefix_in = 'data/codesearchnet_javascript/'
prefix_out = 'data/neuralcodesum/'

# Output split name -> input JSONL file name (relative to prefix_in).
datasets = {
    'train': 'javascript_train_supervised.jsonl',
    'dev': 'javascript_valid_0.jsonl',
    'test': 'javascript_test_0.jsonl',
}

for setname, fname in datasets.items():
    print("Processing data from", setname)
    out_dir = os.path.join(prefix_out, setname)
    os.makedirs(out_dir, exist_ok=True)
    programs, programs_subtok, labels, df = process_data(os.path.join(prefix_in, fname))

    # One example per line in each file; the three files are line-aligned.
    outputs = {
        'code.original': programs,
        'code.original_subtoken': programs_subtok,
        'javadoc.original': labels,
    }
    for out_name, lines in outputs.items():
        # Explicit UTF-8: the source code contains non-ASCII text, and the
        # platform default encoding may be unable to represent it.
        with open(os.path.join(out_dir, out_name), 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

Processing data from train


100%|██████████| 81487/81487 [00:18<00:00, 4526.84it/s]
100%|██████████| 81487/81487 [00:24<00:00, 3328.31it/s]


Processing data from dev


100%|██████████| 5501/5501 [00:00<00:00, 5600.81it/s]
100%|██████████| 5501/5501 [00:02<00:00, 2460.43it/s]


Processing data from test


100%|██████████| 4441/4441 [00:00<00:00, 4700.30it/s]
100%|██████████| 4441/4441 [00:01<00:00, 3502.33it/s]


In [17]:
# Shows the frame left over from the final iteration of the export loop (the 'test' split).
df

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition,func_name_pieces,code_cleaned,code_cleaned_subtoken
0,axios/axios,lib/axios.js,createInstance,function createInstance(defaultConfig) {\n va...,javascript,function createInstance(defaultConfig) {\n va...,"[function, createInstance, (, defaultConfig, )...",Create an instance of Axios\n\n@param {Object}...,"[Create, an, instance, of, Axios]",92d231387fe2092f8736bc1746d4caa766b675f5,https://github.com/axios/axios/blob/92d231387f...,test,create instance,function x(defaultConfig) { var context = new ...,function x(default config) { var context = new...
1,axios/axios,lib/cancel/CancelToken.js,CancelToken,function CancelToken(executor) {\n if (typeof...,javascript,function CancelToken(executor) {\n if (typeof...,"[function, CancelToken, (, executor, ), {, if,...",A `CancelToken` is an object that can be used ...,"[A, CancelToken, is, an, object, that, can, be...",92d231387fe2092f8736bc1746d4caa766b675f5,https://github.com/axios/axios/blob/92d231387f...,test,cancel token,function x(executor) { if (typeof executor !==...,function x(executor) { if (typeof executor !==...
2,axios/axios,lib/utils.js,isArrayBufferView,function isArrayBufferView(val) {\n var resul...,javascript,function isArrayBufferView(val) {\n var resul...,"[function, isArrayBufferView, (, val, ), {, va...",Determine if a value is a view on an ArrayBuff...,"[Determine, if, a, value, is, a, view, on, an,...",92d231387fe2092f8736bc1746d4caa766b675f5,https://github.com/axios/axios/blob/92d231387f...,test,is array buffer view,function x(val) { var result; if ((typeof Arra...,function x(val) { var result; if ((typeof arra...
3,axios/axios,lib/utils.js,isStandardBrowserEnv,function isStandardBrowserEnv() {\n if (typeo...,javascript,function isStandardBrowserEnv() {\n if (typeo...,"[function, isStandardBrowserEnv, (, ), {, if, ...",Determine if we're running in a standard brows...,"[Determine, if, we, re, running, in, a, standa...",92d231387fe2092f8736bc1746d4caa766b675f5,https://github.com/axios/axios/blob/92d231387f...,test,is standard browser env,function x() { if (typeof navigator !== 'undef...,function x() { if (typeof navigator !== 'undef...
4,axios/axios,lib/utils.js,forEach,"function forEach(obj, fn) {\n // Don't bother...",javascript,"function forEach(obj, fn) {\n // Don't bother...","[function, forEach, (, obj, ,, fn, ), {, // Do...",Iterate over an Array or an Object invoking a ...,"[Iterate, over, an, Array, or, an, Object, inv...",92d231387fe2092f8736bc1746d4caa766b675f5,https://github.com/axios/axios/blob/92d231387f...,test,for each,"function x(obj, fn) { // Don't bother if no va...","function x(obj, fn) { // don't bother if no va..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,greatbsky/ES6MVC,src/main/Project.js,requireAllInterceptor,function requireAllInterceptor(dir) {\n if ...,javascript,function requireAllInterceptor(dir) {\n if ...,"[function, requireAllInterceptor, (, dir, ), {...",/*自动require interceptors,"[/, *, 自动require, interceptors]",b28ebd9931a0228e2ef098ff6578b3b0607a4da8,https://github.com/greatbsky/ES6MVC/blob/b28eb...,test,require all interceptor,function x(dir) { if (!fs.existsSync(dir)) { r...,function x(dir) { if (!fs.exists sync(dir)) { ...
6476,tuddman/passport-wink,lib/passport-wink/strategy.js,Strategy,"function Strategy(options, verify) {\n option...",javascript,"function Strategy(options, verify) {\n option...","[function, Strategy, (, options, ,, verify, ),...",`Strategy` constructor.\n\nThe wink authentica...,"[Strategy, constructor, .]",010d2a843af645dad931a414605029c81649d59c,https://github.com/tuddman/passport-wink/blob/...,test,strategy,"function x(options, verify) { options = option...","function x(options, verify) { options = option..."
6477,farmdawgnation/vain,lib/vain.js,processParams,function processParams(paramsString) {\n var ...,javascript,function processParams(paramsString) {\n var ...,"[function, processParams, (, paramsString, ), ...",Utility method to process a params string in a...,"[Utility, method, to, process, a, params, stri...",0a30de09f443c9c313223b338a8892b6ef9ddf1d,https://github.com/farmdawgnation/vain/blob/0a...,test,process params,function x(paramsString) { var individualParam...,function x(params string) { var individual par...
6480,beyo/model,lib/collection.js,Collection,function Collection(options) {\n if (!(this i...,javascript,function Collection(options) {\n if (!(this i...,"[function, Collection, (, options, ), {, if, (...",Create a new collection\n\n@param {Object} opt...,"[Create, a, new, collection]",625b386b6e1141e1869ede753640850c0c9f0971,https://github.com/beyo/model/blob/625b386b6e1...,test,collection,function x(options) { if (!(this instanceof Co...,function x(options) { if (!(this instanceof co...
