In [2]:
from vocabularies import VocabType
from config import Config
from interactive_predict import InteractivePredictor
from model_base import Code2VecModelBase
from tensorflow_model import Code2VecModel as model
import tensorflow as tf
import numpy as np
import time
from typing import Dict, Optional, List, Iterable
from collections import Counter
from functools import partial

from path_context_reader import PathContextReader, ModelInputTensorsFormer, ReaderInputTensors, EstimatorAction
from common import common
from vocabularies import VocabType
from config import Config
from model_base import Code2VecModelBase, ModelEvaluationResults, ModelPredictionResults

from math import ceil
from typing import Optional
import logging

In [5]:
# ---------------------------------------------------------------------------
# Notebook-level configuration: every tunable constant used by the cells below.
# ---------------------------------------------------------------------------

# Training / evaluation schedule.
NUM_TRAIN_EPOCHS = 10  # 20
SAVE_EVERY_EPOCHS = 1
TRAIN_BATCH_SIZE = 512  # 1024
TEST_BATCH_SIZE = TRAIN_BATCH_SIZE
TOP_K_WORDS_CONSIDERED_DURING_PREDICTION = 10
NUM_BATCHES_TO_LOG_PROGRESS = 10
NUM_TRAIN_BATCHES_TO_EVALUATE = 1800
READER_NUM_PARALLEL_BATCHES = 6  # cpu cores [for tf.contrib.data.map_and_batch() in the reader]
SHUFFLE_BUFFER_SIZE = 10000
CSV_BUFFER_SIZE = 100 * 1024 * 1024  # 100 MB
MAX_TO_KEEP = 10

# Automatically filled by `Code2VecModelBase._init_num_of_examples()`.
# NOTE(review): nothing in this cell fills them; they are still 0 when the
# derived value below is computed.
NUM_TRAIN_EXAMPLES: int = 0
NUM_TEST_EXAMPLES: int = 0

# Model hyper-params.
MAX_CONTEXTS = 200
MAX_TOKEN_VOCAB_SIZE = 13011
MAX_TARGET_VOCAB_SIZE = 2612
MAX_PATH_VOCAB_SIZE = 9114
DEFAULT_EMBEDDINGS_SIZE = 128
TOKEN_EMBEDDINGS_SIZE = DEFAULT_EMBEDDINGS_SIZE
PATH_EMBEDDINGS_SIZE = DEFAULT_EMBEDDINGS_SIZE
# One path embedding plus two token embeddings.
CODE_VECTOR_SIZE = PATH_EMBEDDINGS_SIZE + 2 * TOKEN_EMBEDDINGS_SIZE
TARGET_EMBEDDINGS_SIZE = CODE_VECTOR_SIZE
DROPOUT_KEEP_RATE = 0.75
SEPARATE_OOV_AND_PAD = False

# Number of batches that make up one epoch. Rounded up so that a trailing
# partial batch still counts as a step, and kept as an int because it is used
# as a batch count. The zero guard avoids a ZeroDivisionError if the batch size
# is edited to 0.
# NOTE: this is evaluated once, right here. While NUM_TRAIN_EXAMPLES is still 0
# the result is 0; re-run this assignment after the example count is known.
train_steps_per_epoch = ceil(NUM_TRAIN_EXAMPLES / TRAIN_BATCH_SIZE) if TRAIN_BATCH_SIZE else 0

# Automatically filled by `args`.
PREDICT: bool = False   # TODO: update README;
MODEL_SAVE_PATH: Optional[str] = None
MODEL_LOAD_PATH: Optional[str] = None
TRAIN_DATA_PATH_PREFIX: Optional[str] = None
TEST_DATA_PATH: Optional[str] = ''
RELEASE: bool = False
EXPORT_CODE_VECTORS: bool = False
SAVE_W2V: Optional[str] = None   # TODO: update README;
SAVE_T2V: Optional[str] = None   # TODO: update README;
VERBOSE_MODE: int = 0
LOGS_PATH: Optional[str] = None
DL_FRAMEWORK: str = 'tensorflow'  # in {'keras', 'tensorflow'}
USE_TENSORBOARD: bool = False


__logger: Optional[logging.Logger] = None

# This cell reads `vocabs`, `config` and `create_needed_vocabs_lookup_tables`,
# none of which is defined or imported by the visible cells above. Check for all
# of them up front so a fresh-kernel run reports every missing prerequisite in
# one message instead of failing on the first undefined name.
# NOTE(review): `create_needed_vocabs_lookup_tables` is presumably a helper of
# the reader module -- confirm where it should be imported from.
_required_names = ('vocabs', 'config', 'create_needed_vocabs_lookup_tables')
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        'name(s) not defined: {}. Define them in an earlier cell before running this one.'.format(
            ', '.join(_missing_names)))

# A padding context is "<token PAD>,<path PAD>,<token PAD>".
CONTEXT_PADDING = ','.join([
    vocabs.token_vocab.special_words.PAD,
    vocabs.path_vocab.special_words.PAD,
    vocabs.token_vocab.special_words.PAD,
])

# Per-column defaults for a CSV record: the first column defaults to the target
# vocabulary's OOV word, followed by `config.MAX_CONTEXTS` columns that each
# default to the padding context.
csv_record_defaults = [[vocabs.target_vocab.special_words.OOV]] + \
                      ([[CONTEXT_PADDING]] * config.MAX_CONTEXTS)

# initialize the needed lookup tables (if not already initialized).
create_needed_vocabs_lookup_tables(vocabs)

# No dataset has been built yet.
_dataset: Optional[tf.data.Dataset] = None

NameError: name 'vocabs' is not defined

In [4]:
# Training cell: builds the input pipeline and training graph, then runs batches
# until the reader is exhausted, logging, checkpointing and evaluating as it goes.
#
# NOTE(review): besides the constants cell, this cell relies on names that the
# visible cells never define (`vocabs`, `config`, `sess`,
# `_TFTrainModelInputTensorsFormer`, `_build_tf_training_graph`,
# `_initialize_session_variables`, `_load_inner_model`, `_trace_training`,
# `save`, `evaluate`, `_save_inner_model`). They must be provided by an earlier
# cell for a Restart & Run All to succeed.
print('Starting training')
start_time = time.time()

batch_num = 0
sum_loss = 0
multi_batch_start_time = time.time()
# How many batches to run between checkpoints/evaluations (never fewer than 1).
# If `train_steps_per_epoch` is 0 this collapses to 1, i.e. save and evaluate
# after every single batch.
num_batches_to_save_and_eval = max(int(train_steps_per_epoch * SAVE_EVERY_EPOCHS), 1)

train_reader = PathContextReader(vocabs=vocabs,
                                 model_input_tensors_former=_TFTrainModelInputTensorsFormer(),
                                 config=config, estimator_action=EstimatorAction.Train)
input_iterator = tf.compat.v1.data.make_initializable_iterator(train_reader.get_dataset())
input_iterator_reset_op = input_iterator.initializer
input_tensors = input_iterator.get_next()

optimizer, train_loss = _build_tf_training_graph(input_tensors)
saver = tf.compat.v1.train.Saver(max_to_keep=MAX_TO_KEEP)

# Report the size of the model: total parameter count, then one line per variable.
print('Number of trainable params: {}'.format(
    np.sum([np.prod(v.get_shape().as_list()) for v in tf.compat.v1.trainable_variables()])))
for variable in tf.compat.v1.trainable_variables():
    print("variable name: {} -- shape: {} -- #params: {}".format(
        variable.name, variable.get_shape(), np.prod(variable.get_shape().as_list())))

_initialize_session_variables()

# There is no `self` at cell scope; use the notebook-level `sess`, as the
# iterator reset below already does.
if MODEL_LOAD_PATH:
    _load_inner_model(sess)

sess.run(input_iterator_reset_op)
time.sleep(1)
print('Started reader...')
# Run training in a loop until the iterator is exhausted.
try:
    while True:
        # Each iteration = batch. We iterate as long as the tf iterator (reader) yields batches.
        batch_num += 1

        # Actual training for the current batch.
        _, batch_loss = sess.run([optimizer, train_loss])

        sum_loss += batch_loss
        if batch_num % NUM_BATCHES_TO_LOG_PROGRESS == 0:
            _trace_training(sum_loss, batch_num, multi_batch_start_time)
            # Uri: the "shuffle_batch/random_shuffle_queue_Size:0" op does not exist since the
            # migration to the new reader, so the queue size is no longer printed here.
            sum_loss = 0
            multi_batch_start_time = time.time()
        if batch_num % num_batches_to_save_and_eval == 0:
            epoch_num = int((batch_num / num_batches_to_save_and_eval) * SAVE_EVERY_EPOCHS)
            # MODEL_SAVE_PATH defaults to None; skip the periodic checkpoint in that
            # case (as the final save below does) instead of failing on None + str.
            if MODEL_SAVE_PATH:
                model_save_path = MODEL_SAVE_PATH + '_iter' + str(epoch_num)
                save(model_save_path)
                print('Saved after %d epochs in: %s' % (epoch_num, model_save_path))
            evaluation_results = evaluate()
            # Replace the generic 'topk' label with the configured k.
            evaluation_results_str = (str(evaluation_results).replace('topk', 'top{}'.format(
                TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
            print('After {nr_epochs} epochs -- {evaluation_results}'.format(
                nr_epochs=epoch_num,
                evaluation_results=evaluation_results_str
            ))
except tf.errors.OutOfRangeError:
    pass  # The reader iterator is exhausted and has no more batches to produce.

print('Done training')

if MODEL_SAVE_PATH:
    _save_inner_model(MODEL_SAVE_PATH)
    print('Model saved in file: %s' % MODEL_SAVE_PATH)

elapsed = int(time.time() - start_time)
print("Training time: %sH:%sM:%sS\n" % ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))

Starting training


NameError: name 'train_reader' is not defined