In [3]:
import json
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd


def read_json_dataset(path):
    with path.open('r') as istream:
        return [json.loads(l) for l in istream]


def concat_json_datasets(data_folder):
    data_folder = Path(data_folder)
    return [
        l for file in data_folder.glob("*.jsonl")
        for l in read_json_dataset(file)
    ]


DATA_FOLDER = Path.home() / "data/method_name_prediction/python/final/jsonl"

In [2]:
train = concat_json_datasets(DATA_FOLDER / "train")
valid = concat_json_datasets(DATA_FOLDER / "valid")
test = concat_json_datasets(DATA_FOLDER / "test")

print("Train: ", len(train))
print("Val: ", len(valid))
print("Test: ", len(test))

Train:  412178
Val:  23107
Test:  22176


In [3]:
import re
from tqdm import tqdm


def remove_docstring(code: str, docstring: str):
    doc_start_exp = re.compile(r"^\s*[ur]?[ur]?('''|\"\"\")")
    docstring = doc_start_exp.sub("", docstring)
    assert docstring in code
    code = code.replace(docstring, "")
    quotes_exp = re.compile(r"[ur]?[ur]?('''\s*'''|\"\"\"\s*\"\"\")")
    return quotes_exp.sub("", code)

        
def add_field(dicts: 'List[dict]', key, func):
    for d in tqdm(dicts):
        d[key] = func(d)


def get_nth_token_drop_async(entry, n):
    if entry['code_tokens'][0] == 'async':
        return entry['code_tokens'][n + 1]
    return entry['code_tokens'][n]


def get_function_name_from_code_tokens(entry):
    return get_nth_token_drop_async(entry, 1)


def replace_with_placeholder(code, name, placeholder='_'):
    name_exp = re.compile(name)  # should we ignorecase or smth like that?
    return name_exp.sub(placeholder, code)


def preprocess_function_body(entry, drop_docstring = True):
    code = entry['code']
    if drop_docstring:
        code = remove_docstring(code, entry['docstring'])
    try:
        code = replace_with_placeholder(code, entry['function_name'])
    except Exception:
        return "FAILED_REPLACING_PLACEHOLDER" + code
    return code


In [4]:
func_name_key = 'function_name'
add_field(train, func_name_key, get_function_name_from_code_tokens)

add_field(valid, func_name_key, get_function_name_from_code_tokens)

add_field(test, func_name_key, get_function_name_from_code_tokens)

100%|██████████| 412178/412178 [00:00<00:00, 754737.34it/s]
100%|██████████| 23107/23107 [00:00<00:00, 683839.12it/s]
100%|██████████| 22176/22176 [00:00<00:00, 786491.85it/s]


In [5]:
body_key = 'function_body'
add_field(train, body_key, preprocess_function_body)

add_field(valid, body_key, preprocess_function_body)

add_field(test, body_key, preprocess_function_body)

100%|██████████| 412178/412178 [00:44<00:00, 9189.00it/s] 
100%|██████████| 23107/23107 [00:02<00:00, 8314.68it/s]
100%|██████████| 22176/22176 [00:02<00:00, 8920.90it/s]


In [6]:
print("Train: ", len(train))
print("Val: ", len(valid))
print("Test: ", len(test))

Train:  412178
Val:  23107
Test:  22176


In [7]:
def filter_failed(entry, body_key=body_key):
    body = entry[body_key]
    return not (body.startswith("FAILED_REMOVING_COMMENTS") 
                or body.startswith("FAILED_REPLACING_PLACEHOLDER"))


train = list(filter(filter_failed, train))
valid = list(filter(filter_failed, valid))
test = list(filter(filter_failed, test))

print("Train: ", len(train))
print("Val: ", len(valid))
print("Test: ", len(test))

Train:  412173
Val:  23107
Test:  22176


In [8]:
def filter_self_ref(entry, body_key=body_key, name_key=func_name_key):
    return not (entry[name_key] in entry[body_key])


train = list(filter(filter_self_ref, train))
valid = list(filter(filter_self_ref, valid))
test = list(filter(filter_self_ref, test))

print("Train: ", len(train))
print("Val: ", len(valid))
print("Test: ", len(test))

Train:  412160
Val:  23107
Test:  22176


In [9]:
from cubert.python_tokenizer import PythonTokenizer
from cubert.code_to_subtokenized_sentences import code_to_cubert_sentences
from tensor2tensor.data_generators import text_encoder


MODELS_DIR = Path.home() / "models/cubert"
VOCAB_PATH = MODELS_DIR / "github_python_minus_ethpy150open_deduplicated_vocabulary.txt"


python_tokenizer = PythonTokenizer()
subword_tokenizer = text_encoder.SubwordTextEncoder(VOCAB_PATH.as_posix())

In [10]:
from functools import partial


def tokenize_cubert(key, entry):
    return code_to_cubert_sentences(entry[key], python_tokenizer, subword_tokenizer)


tokenize_body = partial(tokenize_cubert, body_key)
tokenize_name = partial(tokenize_cubert, func_name_key)

In [11]:
problem = [e for e in train if 'Ne = len(magnetic_states)' in e['code']]

In [12]:
add_field(train, body_key + "_tokenized", tokenize_body)
add_field(valid, body_key + "_tokenized", tokenize_body)
add_field(test, body_key + "_tokenized", tokenize_body)

          schema=None):
    
    def _(func):
        def wrapper(self, *args, **kwargs):
            # "test" argument means no wrap func this time,
            # return original func immediately.
            if kwargs.get("test", False):
                kwargs.pop("test")
                func(self, *args, **kwargs)

            _methods = methods
            if isinstance(methods, str):
                _methods = [methods]
             = self.r.(resource)
            for method in _methods:
                getattr(, method)(func, schema)
        # Ordered by declare sequence
        # http://stackoverflow.com/questions/4459531/how-to-read-class-attributes-in-the-same-order-as-declared
        f_locals = sys._getframe(1).f_locals
        _order = len([v for v in f_locals.itervalues()
                     if hasattr(v, '__call__') and
                     hasattr(v, '__name__') and
                     v.__name__ == "wrapper"])
        wrapper.__dict__["_order"] = _order
        return

In [13]:
add_field(train, func_name_key + "_tokenized", tokenize_name)
add_field(valid, func_name_key + "_tokenized", tokenize_name)
add_field(test, func_name_key + "_tokenized", tokenize_name)

100%|██████████| 412160/412160 [01:08<00:00, 5983.05it/s]
100%|██████████| 23107/23107 [00:03<00:00, 6385.77it/s]
100%|██████████| 22176/22176 [00:03<00:00, 6687.18it/s]


In [18]:
def save_jsonl(path, json_list):
    with open(path, 'x') as istream:
        for j in json_list:
            istream.write(f"{json.dumps(j)}\n")

In [19]:
save_jsonl(DATA_FOLDER / "train_preprocessed.jsonl", train)
save_jsonl(DATA_FOLDER / "valid_preprocessed.jsonl", valid)
save_jsonl(DATA_FOLDER / "test_preprocessed.jsonl", test)

In [2]:
def read_jsonl(path):
    with open(path, 'r') as istream:
        return [json.loads(line) for line in istream]

In [54]:
train_p = read_jsonl(DATA_FOLDER / "train_preprocessed.jsonl")
valid_p = read_jsonl(DATA_FOLDER / "valid_preprocessed.jsonl")
test_p = read_jsonl(DATA_FOLDER / "test_preprocessed.jsonl")

In [55]:
import ast


def is_correct_python_code(code: str) -> bool:
    try:
        ast.parse(code)
        return True
    except:
        return False


In [23]:
from tqdm import tqdm


failed_train = [
    e for e in tqdm(train_p)
    if not is_correct_python_code(e['function_body'])
]
len(failed_train)

100%|██████████| 412160/412160 [00:43<00:00, 9369.34it/s] 


In [43]:
failed_valid = [
    e for e in tqdm(valid_p)
    if not is_correct_python_code(e['function_body'])
]
len(failed_valid)

100%|██████████| 23107/23107 [00:02<00:00, 10005.86it/s]


345

In [42]:
failed_test = [
    e for e in tqdm(test_p)
    if not is_correct_python_code(e['function_body'])
]
len(failed_test)

100%|██████████| 22176/22176 [00:02<00:00, 10831.69it/s]


299

In [56]:
def body_is_correct_python_code(entry):
    return is_correct_python_code(entry['function_body'])


train_p = list(filter(body_is_correct_python_code, train_p))
valid_p = list(filter(body_is_correct_python_code, valid_p))
test_p = list(filter(body_is_correct_python_code, test_p))

In [58]:
print("Train: ", len(train_p))
print("Val: ", len(valid_p))
print("Test: ", len(test_p))

Train:  407429
Val:  22762
Test:  21877


In [59]:
def save_jsonl(path, json_list):
    with open(path, 'w') as istream:
        for j in json_list:
            istream.write(f"{json.dumps(j)}\n")

In [61]:
save_jsonl(DATA_FOLDER / "train_preprocessed.jsonl", train_p)
save_jsonl(DATA_FOLDER / "valid_preprocessed.jsonl", valid_p)
save_jsonl(DATA_FOLDER / "test_preprocessed.jsonl", test_p)