data generator notebook

In [None]:
import io
import os
import signal
import warnings
from collections import OrderedDict
from functools import partial
from multiprocessing import Process, Queue

from random import shuffle
from time import sleep

import numpy as np
from tqdm import tqdm

from backend.special_tokens import SPECIAL_TOKENS

import os
import pickle
import sys
from argparse import ArgumentParser
from functools import partial

import numpy as np
import tensorflow as tf
tf.enable_eager_execution()
from nltk.translate.bleu_score import sentence_bleu
from tensorflow import logging as logging

from backend.container import ArgContainer
from backend.config import (load_estimator_config, load_graph_params,
                            load_input_fn_params)
from backend.input_functions import (eval_input_fn,  # noqa
                                     train_generator_input_fn,
                                     train_generator_input_fn_v2,
                                     train_static_input_fn)
from backend.utils import check_response, convert_int2word, make_model_dir

In [None]:

class GeneratorInputV3(object):
    """
    Sample generator backed by worker processes that fill a shared queue.

    Besides the encoder/decoder inputs and the target sequence, every sample
    carries a start and a stop position (word offsets) identifying where the
    question starts and ends inside the input sequence.

    ** V2 results in approximately 70% faster data loading than v1 **
    Side processes keep a queue of samples filled; tensorflow can grab items
    from the queue generator at will.

    Usage:

    gen_obj = GeneratorInputV3(**kwargs)  # blocks until the queue is prefilled
    generator = gen_obj.from_queue_generator()
    ...
    gen_obj.close()  # stop the workers when finished

    """
    def __init__(self,
                 Q_size,
                 num_proc,
                 questions,
                 non_questions,
                 word2index,
                 max_seq_length,
                 max_num_questions,
                 max_num_elements,
                 randomize_num_questions=False):
        """
        Start ``num_proc`` worker processes and wait until the queue is full.

        Args:
            Q_size: capacity of the sample queue.
            num_proc: number of worker processes to start.
            questions: collection of question sentences (strings).
            non_questions: collection of non-question sentences (strings).
            word2index: vocabulary mapping forwarded to the encoding helpers.
            max_seq_length: maximum number of words in a generated sequence.
            max_num_questions: number of questions per sample.
            max_num_elements: total number of sentences per sample.
            randomize_num_questions: draw the number of questions at random
                for every sample instead of using ``max_num_questions``.

        Raises:
            RuntimeError: if every worker exits before the queue is full.
        """
        print('Initializing generator')
        self.global_sleep_clock = 2  # seconds between queue polls

        self.Q_size = Q_size
        self.data_que = Queue(Q_size)

        self.processes = [Process(target=self.queue_builder,
                                  args=(questions,
                                        non_questions,
                                        word2index,
                                        max_seq_length,
                                        max_num_questions,
                                        max_num_elements,
                                        randomize_num_questions)) for _ in range(num_proc)]
        self.initialized = False

        # cleanup from previous iteration
        self.kill_prev_pids(self.read_prev_pids())

        # start the engines
        self.start_processes()
        self.save_pids()

        print('Process started')
        # Only observe the queue here: consuming items while waiting would
        # throw samples away and delay (or prevent) the queue from filling.
        while not self.data_que.full():
            if not any(proc.is_alive() for proc in self.processes):
                # Without this check a crashed worker pool hangs the caller.
                raise RuntimeError('all queue workers exited before the queue was filled')
            size = self.queue_size()
            print('Queues initializing...{} of {}'.format(str(size), self.Q_size))
            sleep(self.global_sleep_clock)
        self.initialized = True

    def save_pids(self):
        """Write the worker PIDs, one per line, to the file 'temp_pid'."""
        with open('temp_pid', 'w+') as pout:
            for proc in self.processes:
                pout.write(str(proc.pid) + '\n')

    def read_prev_pids(self):
        """
        Read and delete the PID file left behind by a previous run.

        Returns:
            A list of ints, or None when the file is missing or unreadable.
        """
        try:
            with open('temp_pid', 'r') as pin:
                pid_list = [int(line) for line in pin if line.strip()]
            os.remove('temp_pid')
        except (IOError, ValueError):
            # Missing file or malformed content: nothing reliable to clean up.
            pid_list = None
        return pid_list

    def kill_prev_pids(self, pid_list):
        """
        Best-effort SIGTERM of every PID in ``pid_list`` (may be None/empty).

        NOTE: the PIDs come from a file written by an earlier run; the OS may
        have reused them for unrelated processes in the meantime.
        """
        if not pid_list:
            return
        for pid in pid_list:
            try:
                os.kill(pid, signal.SIGTERM)
            except (OSError, OverflowError):
                # Process already gone / not ours / bogus value: keep going so
                # one stale entry does not protect the remaining ones.
                continue

    def terminate_processes(self):
        """Terminate every worker process."""
        for proc in self.processes:
            proc.terminate()

    def start_processes(self):
        """Start every worker process."""
        for proc in self.processes:
            proc.start()

    def queue_size(self):
        """Return the approximate number of samples currently queued."""
        return self.data_que.qsize()

    def kill_builder(self):
        """Terminate the workers (alias of ``terminate_processes``)."""
        # self.processes is a list, so it has no terminate() of its own.
        self.terminate_processes()

    def close(self):
        """Stop the workers and release the queue's resources."""
        self.terminate_processes()
        self.data_que.close()
        self.data_que.join_thread()

    def make_breaks(self, input_list):
        """
        Locate the first question in an ordered list of sentences.

        Args:
            input_list: iterable of ``(length, sentence)`` pairs in sequence
                order; question sentences end with '<QQQ>'.

        Returns:
            ``(start, stop)``: inclusive offsets of the first marked sentence.

        Raises:
            IndexError: if no sentence ends with the marker.
        """
        starts = []
        stops = []

        offset = 0
        for length, sentence in input_list:
            if sentence.endswith('<QQQ>'):
                starts.append(offset)
                stops.append(offset + length - 1)
            offset += length

        # Only the first question is reported, even if several are present.
        return starts[0], stops[0]

    def _generate_sequence_(self, questions, non_questions, max_seq_length, num_questions, max_num_elements):
        """
        Build one shuffled input sequence and its question-only target.

        - Aim to generate seq shorter than max_seq_length by filtering as data is processed

        Returns:
            ``(input_sequence, target_sequence, num_questions, start, stop)``
            where ``start``/``stop`` are the inclusive word offsets of the
            first question inside ``input_sequence``.
        """
        assert max_num_elements >= num_questions, 'must have more elements than questions'

        num_non_questions = max_num_elements - num_questions

        # keep track of questions for label making
        pre_choices = np.random.choice(questions, num_questions, replace=False).tolist()
        question_choices = [''.join([x, '<QQQ>']) for x in pre_choices]
        length_of_questions = sum(len(x.split()) for x in question_choices)

        if num_non_questions > 0:
            # Word budget per non-question, computed once rather than per item.
            max_words = int((max_seq_length - length_of_questions) / num_non_questions)
            filtered_non_questions = [x for x in non_questions if len(x.split()) <= max_words]
            # choice(replace=False) already samples uniformly; no pre-shuffle needed.
            non_question_choices = np.random.choice(filtered_non_questions, num_non_questions, replace=False).tolist()
        else:
            # All elements are questions: avoid dividing by zero above.
            non_question_choices = []

        input_list = [(len(x.split()), x) for x in question_choices + non_question_choices]
        shuffle(input_list)

        start, stop = self.make_breaks(input_list)
        sentences = [sentence for _, sentence in input_list]

        input_sequence = ' '.join(sentences).replace('<QQQ>', '')
        target_sequence = ' '.join(
            [x.replace('<QQQ>', '') for x in sentences if '<QQQ>' in x])
        return input_sequence, target_sequence, num_questions, start, stop

    def queue_builder(self,
                      questions,
                      non_questions,
                      word2index,
                      max_seq_length,
                      max_num_questions,
                      max_num_elements,
                      randomize_num_questions):
        """
        Worker loop: runs in its own process and keeps the queue filled.

        Relies on ``_prepare_encoder_input_``, ``_prepare_decoder_input_`` and
        ``_prepare_target_sequences_`` being available at module level
        (NOTE(review): they are not defined in this notebook view).
        """
        # Forked workers inherit the parent's NumPy RNG state; reseed from OS
        # entropy so the workers do not all produce the same samples.
        np.random.seed()

        # to ensure no two questions together reach the max_seq_length
        max_question_words = max_seq_length // max_num_elements
        filtered_questions = [q for q in questions if len(q.split()) <= max_question_words]
        shuffle(filtered_questions)

        assert max_num_questions <= max_num_elements - 1, ' need to have at least 1 fewer questions than num element '  # to ensure that there will always be non-questions

        array = partial(np.array, dtype=np.int32)
        while True:
            if self.data_que.full():
                print('Queue full...Sleeping Again... {} seconds'.format(self.global_sleep_clock))
                sleep(self.global_sleep_clock)
                continue

            num_questions = max_num_questions
            if randomize_num_questions:
                # randint's upper bound is exclusive: with 3 elements this
                # yields 1 or 2 questions, always leaving one non-question.
                num_questions = np.random.randint(1, max_num_elements)

            input_sequence, target_sequence, num_questions, start, stop = self._generate_sequence_(
                filtered_questions,
                non_questions,
                max_seq_length=max_seq_length,
                num_questions=num_questions,
                max_num_elements=max_num_elements)

            encoder_inputs, encoder_input_lengths = _prepare_encoder_input_(input_sequence, max_seq_length, word2index)
            decoder_inputs, decoder_input_lengths = _prepare_decoder_input_(target_sequence, max_seq_length, word2index)
            target_sequences, target_seq_lengths = _prepare_target_sequences_(target_sequence, max_seq_length, word2index)

            features = array(encoder_inputs), array(encoder_input_lengths), array(decoder_inputs), array(decoder_input_lengths)
            labels = array(target_sequences), array(target_seq_lengths), array(num_questions), array(start), array(stop)

            self.data_que.put((features, labels))

    def from_queue_generator(self):
        """
        Yield ``(features, labels)`` tuples from the queue, forever.

        The generator never terminates by itself; call ``close()`` to stop
        the workers and release the queue.
        """
        while True:
            # Blocking get wakes up as soon as a worker delivers a sample.
            features, labels = self.data_que.get()
            yield features, labels


In [None]:
# Smoke-test setup: load the pickled data container, derive the parameter
# dictionaries and start a GeneratorInputV3 on top of them.
batch_size = 16
minimode = True

# NOTE: unpickling can execute arbitrary code; only load trusted files.
datafile = 'data_containerV2_30msl.generator.pkl'
with open(datafile, 'rb') as pick:
    data = pickle.load(pick)

# params & configs
graph_params = load_graph_params(batch_size=batch_size, **data.return_config())

# In minimode the input-function batch size is replaced by int(1e6).
input_fn_params = load_input_fn_params(
    batch_size=int(1e6) if minimode else batch_size,
    repeat=-1
)

" Main input function -- use this one "
# Constructor arguments for the generator, collected in one place.
kwargs = {
    'questions': data.questions,
    'non_questions': data.non_questions,
    'Q_size': 400,
    'num_proc': 2,
    'word2index': graph_params['word2index'],
    'max_seq_length': graph_params['input_max_length'],
    'max_num_questions': input_fn_params['max_num_questions'],
    'max_num_elements': input_fn_params['max_num_elements'],
    'randomize_num_questions': input_fn_params['randomize_num_questions']
}
# The constructor returns only once the worker queue reports full.
gen_obj = GeneratorInputV3(**kwargs)

# samples = list()
# for i in range(10):
#     samples.append(
#         gen_obj._generate_sequence_(data.questions, 
#                                     data.non_questions, 
#                                     graph_params.get('input_max_length'), 
#                                     input_fn_params.get('max_num_questions'), 
#                                     input_fn_params.get('max_num_elements'))
#     )
#     break
# for x in samples:
#     print(x)
#     print
# generator = gen_obj.from_queue_generator()

In [None]:
# Hand-made sample of (length, sentence) pairs for trying out make_breaks.
# Each length equals len(sentence) in characters (the '<QQQ>' marker
# included); the marker flags the sentence that is a question.
test = [
    (35, 'uninteresting moral skepticism hehe'),
    (78, 'criglcragl qualia are not empirical phenomena empirical refers to sensory data'),
    (31, 'what else does the op need<QQQ>'),
]

In [None]:
# Display the sample list.
test

In [None]:
def make_breaks(input_list):
    """
    Find where every question starts and stops in a run of sentences.

    Args:
        input_list: iterable of ``(length, sentence)`` pairs in sequence
            order; question sentences end with '<QQQ>'.

    Returns:
        ``(starts, stops)``: two parallel lists with, for each question, the
        offset at which it begins and the offset just past its end
        (``start + length``, i.e. an exclusive stop).
    """
    starts = []
    stops = []

    start = 0
    # Iterate the argument, not the notebook-global sample list.
    for length, sentence in input_list:
        stop = start + length

        if sentence.endswith('<QQQ>'):
            starts.append(start)
            stops.append(stop)

        start = stop

    return starts, stops

In [None]:
# Try the helper on the sample list; the cell shows the returned
# (starts, stops) pair.
make_breaks(test)

In [None]:
# Path of the pickled data container (same file as in the generator setup cell).
datafile = 'data_containerV2_30msl.generator.pkl'

In [None]:
# Load the data container; later cells read .questions, .non_questions,
# .word2index and .return_config() from it.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(datafile, 'rb') as pick:
    data = pickle.load(pick)

In [None]:
# Scratch settings for a throw-away run.
model_dir = 'temp_fake_temp'
args = None  # no command-line arguments inside the notebook
minimode = True

# output dir
# The original expression was guarded by `if True else args.model_dir`,
# so the joined name was always the one used.
_name = model_dir + '_' + 'minitest1'
model_dir = make_model_dir(name=_name, overwrite=True)

In [None]:
# params & configs
batch_size=10
graph_params = load_graph_params(batch_size=batch_size, **data.return_config())
input_fn_params = load_input_fn_params(
    batch_size=10,
    repeat=-1
)
estimator_config = load_estimator_config(save_every=100, log_every=10)

In [None]:
# Display the input-function parameters.
input_fn_params

In [None]:
from models.transformer.run_model import model_fn
# NOTE(review): this rebinds GeneratorInputV3 to the backend.preprocess
# version, replacing the class defined earlier in this notebook for every
# cell that runs after this one.
from backend.preprocess import _generate_sequence_, GeneratorInputV3

In [None]:
# Positional arguments, in the order of the notebook's own
# GeneratorInputV3.__init__: Q_size=400, num_proc=6, questions,
# non_questions, word2index, max_seq_length=30, max_num_questions=1,
# max_num_elements=3.
# NOTE(review): the class in scope may be the one imported from
# backend.preprocess; confirm its signature uses the same order.
gen = GeneratorInputV3(400, 6, data.questions, data.non_questions, data.word2index, 30, 1, 3)

In [None]:
# Create a generator over the queued samples.
gener = gen.from_queue_generator()

In [None]:
# Pull one sample from the generator to inspect it.
next(gener)

In [None]:
# load estimator
classifier = tf.estimator.Estimator(model_fn=model_fn, params=graph_params, config=estimator_config, model_dir='fake2')

In [None]:
def train_input_fn(graph_params, input_fn_params):
    """
    Main input function -- use this one.

    Wraps the notebook-global ``gen`` queue generator in a tf.data pipeline
    (batch size 2, prefetch 6) and returns ``(features, labels)`` dicts of
    tensors. ``input_fn_params`` is accepted but not read.
    """
    sample_stream = gen.from_queue_generator()

    seq_shape = tf.TensorShape([graph_params['input_max_length']])
    scalar_shape = tf.TensorShape([])
    batch = 2

    feature_names = ('encoder_inputs',
                     'encoder_input_lengths',
                     'decoder_inputs',
                     'decoder_input_lengths')
    label_names = ('target_sequences',
                   'target_seq_lengths',
                   'num_questions',
                   'starts',  # question start position
                   'stops')  # question stop position

    # Every element is int32; per-sample sequences have length
    # input_max_length, everything else is a scalar.
    pipeline = tf.data.Dataset.from_generator(
        lambda: sample_stream,
        output_types=((tf.int32,) * len(feature_names),
                      (tf.int32,) * len(label_names)),
        output_shapes=(
            (seq_shape, scalar_shape, seq_shape, scalar_shape),
            (seq_shape, scalar_shape, scalar_shape, scalar_shape, scalar_shape),
        ),
    )
    pipeline = pipeline.batch(batch)
    pipeline = pipeline.prefetch(int(batch * 3))
    feature_tensors, label_tensors = pipeline.make_one_shot_iterator().get_next()

    # Pair each tensor with its name, in generator order.
    features = dict(zip(feature_names, feature_tensors))
    labels = dict(zip(label_names, label_tensors))
    return features, labels
    

In [None]:
# Train for 1000 steps; the lambda defers building the input pipeline until
# the Estimator calls it.
classifier.train(input_fn=lambda: train_input_fn(graph_params, input_fn_params), steps=1000)

In [1]:
import tensorflow as tf
tf.enable_eager_execution()

In [2]:
import numpy as np

In [5]:
# Truncated-normal kernel initializer; the positional arguments are
# (mean, stddev) in the TF 1.x signature, i.e. mean 0.0 and stddev 0.01.
init = tf.initializers.truncated_normal(0.0, 0.01)

In [20]:
# Random N(0, 1) float32 tensor of shape (2, 20, 50, 1).
# NOTE(review): despite the name, the last dimension has size 1.
three_channel = tf.constant(np.random.normal(0, 1, size=(2, 20, 50, 1)), tf.float32)

In [33]:
# Scratch experiment: conv features -> key/query/value projections ->
# scaled dot-product attention -> a sigmoid "start" prediction.

# 126 filters of size 5x5 with ReLU. With conv2d's default padding/stride the
# output should be (2, 16, 46, 126) -- TODO confirm.
conv1 = tf.layers.conv2d(three_channel, 126, (5, 5), activation=tf.nn.relu,use_bias=True, kernel_initializer=init, name='conv1')
# Collapse everything but the leading (batch) dimension.
conv1_flat = tf.layers.flatten(conv1, name='flatten')

# Three independent 512-unit ReLU projections of the same flattened features.
key = tf.layers.dense(conv1_flat, units=512, activation=tf.nn.relu)
query = tf.layers.dense(conv1_flat, units=512, activation=tf.nn.relu)
value = tf.layers.dense(conv1_flat, units=512, activation=tf.nn.relu)

# Scale by sqrt(512), the projection width.
scale = tf.sqrt(tf.constant(512.0, tf.float32))
# key @ query^T over 2-D (batch, 512) inputs gives a (batch, batch) matrix.
# NOTE(review): the scores therefore compare examples across the batch, not
# positions within one example -- confirm this is intended.
raw_scores = tf.divide(tf.cast(tf.matmul(key, query, transpose_b=True), tf.float32), scale)

# Normalise each row of the score matrix.
attention_scores = tf.nn.softmax(raw_scores, axis=1)



# Weighted sum of the value vectors.
attended = tf.matmul(attention_scores, value)

# One sigmoid output per row, i.e. a value in (0, 1).
start_predictions = tf.layers.dense(attended, units=1, activation=tf.nn.sigmoid)
# NOTE(review): `transform_to_range` and `params` are not defined anywhere in
# the visible notebook; this line fails unless they exist in the session.
start_predictions_transformed = transform_to_range(start_predictions, min_value=0, max_value=params['input_max_length'])