# Function Names suggestion

Today we are going to show how to:
* Extract function definitions
* Highlight names and identifiers in functions
* Extract features and labels
* Train a tokenizer (BPE)
* Prepare train & validation dataset for training a seq2seq model
* Train seq2seq NMT model
* Prediction

In [None]:
import os
import logging
import warnings
import base64
from bz2 import open as bz2_open
from json import dumps as json_dumps, loads as json_loads

import coloredlogs
import pandas as pd
import youtokentome as yttm

from utils import DirsABC, FilesABC, Run, SUPPORTED_LANGUAGES, query_gitbase

from enum import Enum
from os import makedirs
from os.path import join as path_join
from typing import Union

# Install the colored log handler with a WARNING threshold.
coloredlogs.install(level="WARNING")
# Set the threshold of the "matplotlib.axes._base" logger to INFO.
logging.getLogger("matplotlib.axes._base").setLevel(logging.INFO)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")


class Files(FilesABC, Enum):
    """Names of the files this notebook reads and writes (resolved via ``run.path``).

    Every value is a list of strings. Multi-element lists such as
    ``["model", "config.yml"]`` look like nested path components.
    NOTE(review): how the list is turned into a path is implemented in
    ``utils`` (``FilesABC`` / ``Run.path``) -- confirm there.
    """

    # Raw function dump and the extracted (identifiers, name) pairs.
    FUNCTIONS = ["functions.jsonl.bz2"]
    FUNC_ID_NAME = ["functions_identifers_names.pkl.bz2"]
    # BPE tokenizer model and the plain-text file it is trained on.
    BPE_MODEL = ["bpe.model"]
    BPE_INPUT = ["bpe_input.txt"]
    # Plain-text train/validation splits (src = bodies, tgt = names).
    TRAIN_BODIES = ["train.src"]    
    TRAIN_NAMES = ["train.tgt"]
    VAL_BODIES = ["val.src"]
    VAL_NAMES = ["val.tgt"]
    # The same splits after BPE encoding.
    ENC_TRAIN_BODIES = ["train.bpe.src"]
    ENC_TRAIN_NAMES = ["train.bpe.tgt"]
    ENC_VAL_BODIES = ["val.bpe.src"]
    ENC_VAL_NAMES = ["val.bpe.tgt"]
    # Vocabularies passed to the model config as target/source vocabulary.
    TGT_VOCABULARY = ["tgt.vocab"]
    SRC_VOCABULARY = ["src.vocab"]
    # Model configuration file and the fallback checkpoint used for prediction.
    MODEL_CONFIG = ["model", "config.yml"]    
    MODEL_PRETRAINED = ["pretrained", "ckpt-25000"]
    # Prediction output and the sampled validation inputs it is compared against.
    ENC_VAL_NAMES_PRED = ["val.bpe.pred.tgt"]
    SAMPLE_ENC_VAL_BODIES = ["sample_val.bpe.src"]
    SAMPLE_ENC_VAL_NAMES = ["sample_val.bpe.tgt"]

class Dirs(DirsABC, Enum):
    """Names of the directories used by this notebook (resolved via ``run.path``).

    Values are lists of strings, in the same form as the ``Files`` members.
    NOTE(review): path construction is implemented in ``utils`` -- confirm there.
    """

    TF_MODELS = ["tf", "models"]
    # Used as ``model_dir`` in the generated model configuration.
    MODEL_RUN = ["model", "run"]

    
# Uncomment this at the end to play with the larger pre-processed data:
# run = Run("name-suggestion", "java-full")

# Default run, using the "java-small" configuration.
run = Run("name-suggestion", "java-small")

## Extract function definitions

In [None]:
def extract_function_group(functions_path: str, limit: int = 0):
    """Query gitbase for Java function groups and store them as bz2-compressed JSON lines.

    The query selects, for files on the ``HEAD`` reference that are detected
    as Java and are neither vendored nor binary, the repository id, the file
    path, the blob content and the ``//uast:FunctionGroup`` UAST nodes.

    Args:
        functions_path: Destination file; one JSON document is written per row.
        limit: Maximum number of rows to fetch. Values <= 0 mean "no limit".

    The ``content`` and ``functions`` values of each row are base64-encoded
    before serialization so that they fit into JSON.
    """
    # An empty clause leaves the query unbounded.
    limit_clause = "LIMIT %d" % limit if limit > 0 else ""
    sql = """SELECT
        files.repository_id as repository_id,
        files.file_path as path,
        files.blob_content as content,
        UAST(files.blob_content, LANGUAGE(files.file_path, files.blob_content), '//uast:FunctionGroup') as functions
    FROM files
    NATURAL JOIN commit_files
    NATURAL JOIN commits
    NATURAL JOIN refs
    WHERE
        refs.ref_name= 'HEAD' and functions IS NOT NULL
        AND LANGUAGE(files.file_path, files.blob_content) = 'Java'
        AND NOT IS_VENDOR(file_path)
        AND NOT IS_BINARY(file_path)
    %s
    """ % limit_clause

    with bz2_open(functions_path, "wt", encoding="utf8") as fh:
        for row in query_gitbase(sql):
            # Binary columns are converted to base64 text, in place.
            for column in ("content", "functions"):
                row[column] = base64.b64encode(row[column]).decode("utf-8")
            fh.write(json_dumps(row) + "\n")


# Quick run: the query is limited to 3 rows.
extract_function_group(run.path(Files.FUNCTIONS), 3) # 21374 total

## Extract function names and identifiers

In [None]:
def get_function_name(function_node):
    """Return the name of a function node and the position of that name.

    Scans ``function_node["Nodes"]`` for entries whose ``@type`` is
    ``uast:Alias``; ``None`` entries and entries without ``@type`` are
    ignored. When several aliases are present, the last one wins.

    Returns:
        ``(name, (start_offset, end_offset))``, or ``(None, None)`` when no
        alias node is found.
    """
    found = (None, None)
    for child in function_node["Nodes"]:
        is_alias = (child is not None
                    and "@type" in child
                    and child["@type"] == 'uast:Alias')
        if not is_alias:
            continue
        identifier = child["Name"]
        name = identifier["Name"]
        span = identifier["@pos"]
        found = (name, (span["start"]["offset"], span["end"]["offset"]))
    return found

def get_identifiers(node):
    """Yield ``(name, (start_offset, end_offset))`` for every identifier under ``node``.

    The tree is walked depth-first through dict values and list/tuple items.
    A dict whose ``@type`` is ``uast:Identifier`` is reported and not
    descended into; any other kind of value (``None``, strings, numbers, ...)
    yields nothing.
    """
    if isinstance(node, dict):
        if '@type' in node and node['@type'] == 'uast:Identifier':
            name = node["Name"]
            span = node["@pos"]
            yield name, (span["start"]["offset"], span["end"]["offset"])
            return
        children = node.values()
    elif isinstance(node, (list, tuple)):
        children = node
    else:
        return

    for child in children:
        yield from get_identifiers(child)

### Inspect the data

Inspect what is going to be a model feature and labels.

In [None]:
from utils import colored_text_by_pos, Colored, RED, GREEN
from bblfsh.pyuast import decode as uast_decode

def highlight_function_name_and_identifiers(functions_path: str, limit: int = 0):
    """Print functions from a dump with their name and body identifiers colored.

    Reads the bz2-compressed JSON-lines file at ``functions_path``, decodes the
    source text and the UAST function group of every row, and prints each
    function's source with the function name in RED and the identifiers of
    its body in GREEN.

    Args:
        functions_path: Path to the dump (``content`` and ``functions`` are
            base64-encoded in each row).
        limit: Maximum number of functions to print. Values <= 0 mean
            "no limit".

    Functions without a ``uast:Alias`` node have neither a name nor a body to
    highlight; they are skipped and do not count towards ``limit``.
    """
    with bz2_open(functions_path, "rt", encoding="utf8") as fh_functions:
        processed = 0
        for row_str in fh_functions:
            row = json_loads(row_str)
            content = base64.b64decode(row["content"]).decode('utf-8', 'replace')
            func_group = uast_decode(base64.b64decode(row["functions"]), format=0).load()

            for func in func_group:
                if limit > 0 and processed >= limit:
                    return

                # Reset for every function: otherwise a function without an
                # alias node would hit an undefined variable, or silently
                # reuse the identifiers of the previous function.
                body_identifiers = None
                for node in func["Nodes"]:
                    if node is None or "@type" not in node:
                        continue
                    if node["@type"] == 'uast:Alias':
                        func_body = node["Node"]["Body"]
                        # Sort by start offset.
                        body_identifiers = sorted(get_identifiers(func_body), key=lambda x: x[1][0])
                if body_identifiers is None:
                    continue
                processed += 1

                _, func_name_pos = get_function_name(func)
                print("-" * 20)

                # NOTE(review): positions come from the UAST while `content`
                # is a decoded str; if the offsets are byte offsets, files
                # with non-ASCII characters would be sliced at the wrong
                # place -- confirm the offset unit.
                start_offset = func["@pos"]["start"]["offset"]
                end_offset = func["@pos"]["end"]["offset"]
                colored_texts = [Colored(color=RED, position=func_name_pos, start_offset=start_offset)]
                colored_texts.extend(
                    Colored(color=GREEN, position=position, start_offset=start_offset)
                    for _, position in body_identifiers)

                print(colored_text_by_pos(content[start_offset:end_offset], colored_texts))


# Preview: the limit argument is 3.
highlight_function_name_and_identifiers(run.path(Files.FUNCTIONS), 3)

## Features and labels extraction

Prepare raw model input: 
 - X: identifiers from the function body,
 - Y: the label, i.e. the name of the function.


In [None]:
import itertools
from joblib import Parallel, delayed

def extract_functions_parallel(functions_path: str, limit: int = 0):
    """Extract (body identifiers, function name) pairs and pickle them as a DataFrame.

    Every row of the dump at ``functions_path`` holds one base64-encoded UAST
    function group. The groups are decoded in parallel with joblib; for every
    function with a ``uast:Alias`` node, the identifiers of its body (sorted
    by start offset) and its name are collected. Duplicate pairs and pairs
    with no identifiers are dropped, and the result is written to
    ``run.path(Files.FUNC_ID_NAME)`` with the columns ``function_identifiers``
    (tuple of str) and ``function_name``.

    Args:
        functions_path: bz2-compressed JSON-lines dump to read.
        limit: Maximum number of rows (function groups) to process. Values
            <= 0 mean "no limit".
    """

    def read_function_group(functions_path: str, limit: int = 0):
        """Yield the raw (base64-decoded) function group of each row."""
        with bz2_open(functions_path, "rt", encoding="utf-8") as fh_functions:
            for processed, row_str in enumerate(fh_functions):
                # Check the limit before parsing so no row is decoded in vain.
                if limit > 0 and processed >= limit:
                    break
                yield base64.b64decode(json_loads(row_str)["functions"])

    def process_function_group(func_group):
        """Return the (identifier names, function name) pairs of one group."""
        res = []
        try:
            func_group = uast_decode(func_group, format=0).load()

            for func in func_group:
                body_identifiers = None
                for node in func["Nodes"]:
                    if node is None or "@type" not in node:
                        continue
                    if node["@type"] == 'uast:Alias':
                        func_body = node["Node"]["Body"]
                        body_identifiers = sorted(get_identifiers(func_body), key=lambda x: x[1][0])
                if body_identifiers is None:
                    continue
                res.append(([i[0] for i in body_identifiers], get_function_name(func)[0]))
        except Exception as exc:
            # Best effort: a group that fails to decode is reported and the
            # pairs collected so far are kept. `Exception` (rather than a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            print("decoding error: %r" % (exc,))
        return res

    # Read from the path that was passed in, not from a hard-coded location.
    function_group_res = Parallel(n_jobs=-1, verbose=10)(
        delayed(process_function_group)(fg) for fg in read_function_group(functions_path, limit))

    pairs = ((tuple(identifiers), name)
             for identifiers, name in itertools.chain.from_iterable(function_group_res))
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # row order (and any seeded split made from it later) is reproducible;
    # iteration order of a set of strings changes between interpreter runs.
    deduplicated_res = [pair for pair in dict.fromkeys(pairs) if pair[0]]

    df = pd.DataFrame(deduplicated_res, columns=['function_identifiers', 'function_name'])
    df.to_pickle(run.path(Files.FUNC_ID_NAME))


# Quick run: the limit argument is 3.
extract_functions_parallel(run.path(Files.FUNCTIONS), 3)

# Train Byte Pair Encoding (BPE)

In order to feed text data (identifiers) into the model, we need to represent it in vector form.

There are multiple ways to do so:
 1. **word level**, assign a unique number to every identifier
    * *pro*: easy to implement (hashtable)
    * *con*: huge (and rapidly growing) vocabulary size
    * *con*: how to deal with Out Of Vocabulary (OOV) tokens? E.g. by replacing them with "-UNK-"
 2. **char level**, assign a unique number to every char
    * *pro*: small vocabulary size
    * *pro*: no OOV
    * *con*: the model needs to "learn" much more, e.g. to compose words first :/
 3. **sub-word level**, assign a unique number to every sub-word (based on frequency)
    * *pro*: small vocabulary size (hyperparameter)
    * *pro*: easy to deal with OOV
    * *con*: additional "training" step, harder to implement
    
 
We are going to use one particular sub-word level tokenization algorithm called [Byte Pair Encoding](https://en.wikipedia.org/wiki/Byte_pair_encoding) (BPE).

## Prepare BPE training data

We use a single vocabulary for both identifiers and function names. In order to do so, we need to train the BPE tokenizer on a file that contains all identifiers and function names in plain text.

In [None]:
%env LANG=en_US.UTF-8
%env LC_ALL=en_US.UTF-8

def prepare_bpe_text_file(pkl_functions_path: str, text_file_path: str, limit: int = 0):
    """Write the plain-text corpus used to train the BPE tokenizer.

    Each output line holds the space-separated identifiers of one function
    followed by a space and the function name.

    Args:
        pkl_functions_path: Pickled DataFrame with the columns
            ``function_identifiers`` and ``function_name``.
        text_file_path: Destination text file (overwritten).
        limit: Maximum number of rows to write. Values <= 0 mean "no limit".
    """
    # NOTE: unpickling executes arbitrary code; only load files produced by
    # this pipeline.
    df = pd.read_pickle(pkl_functions_path)
    rows = zip(df["function_identifiers"], df["function_name"])
    # The encoding is explicit so the output does not depend on the locale of
    # the Python process; identifiers may contain non-ASCII characters.
    with open(text_file_path, "wt", encoding="utf-8") as text_file_fh:
        # Count rows with enumerate: index labels are not guaranteed to be
        # 0..n-1, so they cannot be compared to `limit`.
        for written, (identifiers, name) in enumerate(rows):
            if limit > 0 and written >= limit:
                break
            text_file_fh.write("{} {}\n".format(" ".join(identifiers), name))

# No limit is passed here, so every row of the pickle is written.
prepare_bpe_text_file(run.path(Files.FUNC_ID_NAME), run.path(Files.BPE_INPUT))

## Train BPE tokenizer

Out of multiple BPE implementations, we are going to use the optimized C++ one from https://github.com/VKCOM/YouTokenToMe, using its CLI interface and Python bindings.

In [None]:
# Vocabulary size passed to the tokenizer; also the default `vocab_size` of
# generate_build_vocab() later in the notebook.
vocab_size = 10000

# Shell command (IPython `!` magic); the {...} expressions are filled in from Python.
!yttm bpe --data {run.path(Files.BPE_INPUT)} --model {run.path(Files.BPE_MODEL)} --vocab_size {vocab_size}

# Split dataset into Training / Validation

Create a dataset for training and a separate, holdout one for validation using handy [`model_selection` scikit-learn helper](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).

In [None]:
from sklearn.model_selection import train_test_split

df = pd.read_pickle(run.path(Files.FUNC_ID_NAME))
# Hold out 10% of the rows for validation; random_state is fixed to 1989.
train_bodies, val_bodies, train_names, val_names = train_test_split(df.function_identifiers, df.function_name, 
                                                                    test_size=0.1, random_state=1989)
# The DataFrame name is not used again below; only the four splits are.
del(df)

## Save dataset splits

In the plain text format, suitable for further processing by [OpenNMT](http://opennmt.net/OpenNMT-tf).

In [None]:
from tqdm import tqdm_notebook as tqdm

def save_as_text(output_path: str, data: pd.Series):
    """Write ``data`` to ``output_path`` as text, one element per line.

    String elements are written as they are; any other element is treated as
    an iterable of strings and joined with single spaces. Progress is shown
    with tqdm.

    Args:
        output_path: Destination text file (overwritten).
        data: Elements to write.
    """
    # The encoding is explicit so the file does not depend on the locale of
    # the Python process; identifiers may contain non-ASCII characters.
    with open(output_path, "wt", encoding="utf-8") as output_fd:
        for row in tqdm(data):
            line = row if isinstance(row, str) else " ".join(row)
            output_fd.write(line + "\n")
            
# Write the four splits to their plain-text files.
save_as_text(run.path(Files.TRAIN_BODIES), train_bodies)
save_as_text(run.path(Files.TRAIN_NAMES), train_names)
save_as_text(run.path(Files.VAL_BODIES), val_bodies)
save_as_text(run.path(Files.VAL_NAMES), val_names)

# Drop the in-memory splits; the following cells read them back from the files.
del(train_bodies); del(val_bodies); del(train_names); del(val_names)

# Encode dataset

Get a vector representation using the vocabulary from the trained BPE tokenizer, in the format compatible with [OpenNMT](http://opennmt.net/OpenNMT-tf/data.html#vocabulary).

## Encode dataset splits using BPE

In [None]:
# Tokenizer loaded from the model file produced by the `yttm bpe` training step.
bpe = yttm.BPE(model=run.path(Files.BPE_MODEL))


def bpe_encode(input_path: str, output_path: str):
    """Encode a text file line by line into BPE token ids.

    Each output line holds the space-separated integer ids of the
    corresponding input line, as returned by the module-level ``bpe``
    tokenizer.

    Args:
        input_path: Text file to encode, one sample per line.
        output_path: Destination file (overwritten).
    """
    # Read the input first, so a missing input file does not leave behind a
    # truncated output file. Both files use an explicit encoding instead of
    # the locale default.
    with open(input_path, "rt", encoding="utf-8") as input_fd:
        data = input_fd.readlines()
    with open(output_path, "w", encoding="utf-8") as output_fd:
        for row in bpe.encode(data, output_type=yttm.OutputType.ID):
            output_fd.write(" ".join(map(str, row)) + "\n")

# Encode each plain-text split into its ".bpe" counterpart.
bpe_encode(run.path(Files.TRAIN_BODIES), run.path(Files.ENC_TRAIN_BODIES))
bpe_encode(run.path(Files.TRAIN_NAMES), run.path(Files.ENC_TRAIN_NAMES))
bpe_encode(run.path(Files.VAL_BODIES), run.path(Files.ENC_VAL_BODIES))
bpe_encode(run.path(Files.VAL_NAMES), run.path(Files.ENC_VAL_NAMES))

# Train seq2seq model

* we will use [openNMT-tf](http://opennmt.net/OpenNMT-tf/)
* prepare vocabularies (we will use functionality to train translation model from identifiers to function names)
* train the model

In [None]:
# OpenNMT requires to provide explicit vocabularies, so we build them out of BPE-encoded data
def generate_build_vocab(save_vocab_loc, input_text, vocab_size=vocab_size):
    """Return the ``onmt-build-vocab`` command line as a single string.

    Args:
        save_vocab_loc: Value for ``--save_vocab``.
        input_text: Input file given as the positional argument.
        vocab_size: Value for ``--size``; defaults to the module-level
            ``vocab_size`` at the time this function is defined.

    The arguments are inserted as they are (no shell quoting).
    """
    parts = [
        "onmt-build-vocab",
        "--size", vocab_size,
        "--save_vocab", save_vocab_loc,
        input_text,
    ]
    return " ".join(map(str, parts))

# Both vocabularies are generated together; only the source vocabulary file is
# checked to decide whether they already exist.
if not os.path.exists(run.path(Files.SRC_VOCABULARY)):
    print("Generating vocabularies")
    # in case of pretrained model we reuse vocabulary
    # NOTE(review): the reason for the "+ 10" on the size is not visible here --
    # presumably headroom for special tokens; confirm.
    cmd = generate_build_vocab(save_vocab_loc=run.path(Files.SRC_VOCABULARY),
                               input_text=run.path(Files.ENC_TRAIN_BODIES),
                               vocab_size=vocab_size + 10)
    ! {cmd}

    cmd = generate_build_vocab(save_vocab_loc=run.path(Files.TGT_VOCABULARY),
                               input_text=run.path(Files.ENC_TRAIN_NAMES),
                               vocab_size=vocab_size + 10)
    ! {cmd}

In [None]:
# Build the YAML configuration for the model as one string and write it to
# the MODEL_CONFIG file.
model_dir = run.path(Dirs.MODEL_RUN)

# prepare config file for model
config_yaml = run.path(Files.MODEL_CONFIG)
# this directory will contain evaluation results of the model, checkpoints and so on
yaml_content = "model_dir: %s \n" % model_dir

# where the data is: BPE-encoded train/validation files and both vocabularies
yaml_content += """
data:
  train_features_file: %s
  train_labels_file: %s
  eval_features_file: %s
  eval_labels_file: %s
  source_vocabulary: %s
  target_vocabulary: %s
""" % (run.path(Files.ENC_TRAIN_BODIES), 
       run.path(Files.ENC_TRAIN_NAMES),
       run.path(Files.ENC_VAL_BODIES), 
       run.path(Files.ENC_VAL_NAMES),
       run.path(Files.SRC_VOCABULARY), 
       run.path(Files.TGT_VOCABULARY))

# other configurations that affect training process
# (the "#" lines inside the literal below are YAML comments and end up in the file)
yaml_content += """
train:
  # (optional when batch_type=tokens) If not set, the training will search the largest
  # possible batch size.
  batch_size: 32

eval:
  # (optional) The batch size to use (default: 32).
  batch_size: 128

  # (optional) Evaluate every this many steps (default: 5000).
  steps: 5000

  # (optional) Save evaluation predictions in model_dir/eval/.
  save_eval_predictions: false
  # (optional) Evalutator or list of evaluators that are called on the saved evaluation predictions.
  # Available evaluators: bleu, rouge
  external_evaluators: bleu

  # (optional) Export a SavedModel when a metric has the best value so far (default: null).
  export_on_best: bleu

  # (optional) Early stopping condition.
  # Should be read as: stop the training if "metric" did not improve more
  # than "min_improvement" in the last "steps" evaluations.
  early_stopping:
    # (optional) The target metric name (default: "loss").
    metric: bleu
    # (optional) The metric should improve at least by this much to be considered as an improvement (default: 0)
    min_improvement: 0.01
    steps: 2
"""

# Write the assembled configuration to disk (overwrites an existing file).
with open(config_yaml, "w") as f:
    f.write(yaml_content)

## Training

Using a 2-layer encoder-decoder LSTM model architecture by setting `--model_type LuongAttention`, as described by [Minh-Thang Luong et al., 2015](https://arxiv.org/abs/1508.04025).

### Performance on GPU vs CPU:
* CPU with 4 cores: `source words/s = 104, target words/s = 34`
* 1080 GPU: `source words/s = 6959, target words/s = 1434`

In [None]:
# How to launch training.
# Set GPU_USE to True to run the GPU command below instead of the CPU one.
GPU_USE = False
if not GPU_USE:
    # The backslash at the end of a line inside the literal is a line
    # continuation, so the command stays on one line.
    train_cmd = """
    onmt-main --model_type LuongAttention \
    --config %s --auto_config train --with_eval""" % config_yaml
    ! {train_cmd}

# in case of GPU you can specify CUDA_VISIBLE_DEVICES & number of GPUs to use
# (here: devices "0,1" and --num_gpus 2)
if GPU_USE:
    cmd_gpu = """
    CUDA_VISIBLE_DEVICES=%s onmt-main --model_type LuongAttention \
    --config %s --auto_config train --with_eval --num_gpus %s""" % ("0,1", config_yaml, 2)
    ! {cmd_gpu}

In [None]:
!ls -la {model_dir}

# Predict
* we will use a model pretrained on several GPUs to save time
* predictions will be saved to file 
* predicted BPE ids will be converted back to text

In [None]:
pretrained_model = None

# Put your checkpoint number in place of the 0 in "ckpt-0".
# Comment out the next line in order to use the already pre-trained model instead.
pretrained_model = "{}/ckpt-0".format(model_dir)

# Fall back to the Files.MODEL_PRETRAINED checkpoint when nothing was selected above.
if pretrained_model is None:
    pretrained_model = run.path(Files.MODEL_PRETRAINED)

In [None]:
# Limit the number of samples to process: keep only the first 50 lines of the
# encoded validation bodies and names.
!head -50 {run.path(Files.ENC_VAL_BODIES)} > {run.path(Files.SAMPLE_ENC_VAL_BODIES)}
!head -50 {run.path(Files.ENC_VAL_NAMES)} > {run.path(Files.SAMPLE_ENC_VAL_NAMES)}

In [None]:
# Inference command: reads the sampled validation bodies and writes the
# predictions to the ENC_VAL_NAMES_PRED file. The backslashes inside the
# literal are line continuations.
predict_cmd = """onmt-main \
--config %s --auto_config --model_type LuongAttention \
--checkpoint_path %s \
infer \
--features_file %s \
--predictions_file %s
""" % (config_yaml, pretrained_model,
                           run.path(Files.SAMPLE_ENC_VAL_BODIES), 
                           run.path(Files.ENC_VAL_NAMES_PRED),
                           )
! {predict_cmd}

In [None]:
!cat {run.path(Files.ENC_VAL_NAMES_PRED)}

In [None]:
# Parse the predicted BPE ids (one whitespace-separated sequence per line)
# and decode them back to text.
with open(run.path(Files.ENC_VAL_NAMES_PRED), "r") as f:
    pred_ids = [[int(token) for token in line.split()] for line in f.readlines()]

pred_val_function_names = bpe.decode(pred_ids)

In [None]:
# Parse the ground-truth BPE ids of the sampled validation names (one
# whitespace-separated sequence per line) and decode them back to text.
with open(run.path(Files.SAMPLE_ENC_VAL_NAMES), "r") as f:
    # Iterate over the file directly: no line index is needed and the lines
    # do not have to be held in a separate list first.
    gt_ids = [list(map(int, line.split())) for line in f]
gt_val_function_names = bpe.decode(gt_ids)

# And finally let's see the results!

In [None]:
# One line per sample: ground-truth name on the left, predicted name on the right.
for name_pair in zip(gt_val_function_names, pred_val_function_names):
    print("%s | %s" % name_pair)

# Quality

This is a very simplistic baseline model, which misses a lot of the context information needed to make a decision:
* roles of identifiers
* structural information 
* arguments to function

Many more improvements were proposed recently: [code2vec](https://github.com/tech-srl/code2vec), GGNNs, etc.

Check [github.com/src-d/awesome-machine-learning-on-source-code](https://github.com/src-d/awesome-machine-learning-on-source-code) to learn about State Of the Art (SOtA) models.