数据准备（data/corpora_processed、data/conditions_index）

构建单词索引

In [1]:
import json
import nltk
from collections import Counter
# NOTE(review): RegexpTokenizer is purely regex-based, so 'punkt' is presumably only
# needed by other cells; the download is kept so existing behavior does not change.
nltk.download('punkt')
# Raw string: \w and \s are regex escapes, not Python string escapes.
# A token is either a run of word characters or a single punctuation character.
_tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+|[^\w\s]')

VOCABULARY_MAX_SIZE = 50000  # vocabulary size, special tokens included
MAX_CONDITIONS_NUM = 5       # keep only the most frequent conditions

dialogs = []                      # one list of {'text', 'condition'} dicts per dialog
tokens_counter = Counter()        # token -> frequency over the training corpus
conditions_counter = Counter()    # condition -> number of utterances carrying it
tokenized_training_lines = []     # every utterance as a list of tokens (word2vec input)

# Each line of the file is a JSON list of utterances belonging to one dialog.
with open('data/corpora_processed/train_processed_dialogs.txt', 'r', encoding='utf-8') as dialogs_fh:
    for line in dialogs_fh:
        line = line.strip()
        if not line:
            # A blank line would make json.loads raise; skip it instead.
            continue
        line_json = json.loads(line)
        dias = []
        for entry in line_json:
            tokens = _tokenizer.tokenize(entry['text'])
            tokenized_training_lines.append(tokens)
            dias.append({'text': ' '.join(tokens), 'condition': entry['condition']})
            tokens_counter.update(tokens)
            conditions_counter[entry['condition']] += 1
        dialogs.append(dias)

# Build the vocabulary list: special tokens first, then tokens by descending frequency.
special_tokens = ['_pad_', '_unk_', '_start_', '_end_']
vocab = special_tokens + [token for token, _ in tokens_counter.most_common(VOCABULARY_MAX_SIZE - len(special_tokens))]

# Build the condition list from the most frequent conditions.
conditions = [condition for condition, _ in conditions_counter.most_common(MAX_CONDITIONS_NUM)]

index_to_token = dict(enumerate(vocab))
index_to_condition = dict(enumerate(conditions))

# Persist both indices; JSON turns the integer keys into strings on disk.
with open('data/id2vocab', 'w', encoding='utf-8') as fh:
    json.dump(index_to_token, fh, ensure_ascii=False)

with open('data/id2condition', 'w', encoding='utf-8') as fh:
    json.dump(index_to_condition, fh, ensure_ascii=False)

print(list(index_to_token.items())[:10])
print(list(index_to_condition.items())[:5])

[(0, '_pad_'), (1, '_unk_'), (2, '_start_'), (3, '_end_'), (4, '.'), (5, ','), (6, 'Hello'), (7, 'Oh'), (8, 'hi'), (9, '!')]
[(0, 'neutral'), (1, 'joy'), (2, 'sadness')]


[nltk_data] Downloading package punkt to /Users/majing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


预训练词向量

In [2]:
from gensim.models import Word2Vec
import multiprocessing

WORD_EMBEDDING_DIMENSION = 128
W2V_WINDOW_SIZE = 10
USE_SKIP_GRAM = True  # when False, CBOW is used instead
MIN_WORD_FREQ = 1
W2V_EPOCHS_NUM = 10
_WORKERS_NUM = multiprocessing.cpu_count()

word2vec_path = 'data/word2vec.bin'
word2vec_model = Word2Vec(
    window=W2V_WINDOW_SIZE,
    size=WORD_EMBEDDING_DIMENSION,
    max_vocab_size=VOCABULARY_MAX_SIZE,
    min_count=MIN_WORD_FREQ,
    workers=_WORKERS_NUM,
    sg=int(USE_SKIP_GRAM))

word2vec_model.build_vocab(tokenized_training_lines)
# Tell train() the real corpus size as recorded by build_vocab(), rather than a
# hard-coded word count that has nothing to do with the actual data.
word2vec_model.train(
    tokenized_training_lines,
    total_examples=word2vec_model.corpus_count,
    epochs=W2V_EPOCHS_NUM)
# Force unit normalization, destructively in place (the raw vectors are discarded); saves storage.
word2vec_model.init_sims(replace=True)
word2vec_model.save(word2vec_path, separately=[])

# Reload the saved vectors memory-mapped read-only; the same call can be used at inference time.
word2vec_model = Word2Vec.load(word2vec_path, mmap='r')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


初始化随机数种子

In [3]:
import random
import numpy as np
import tensorflow as tf

# Seed the Python, NumPy and TensorFlow generators with one shared value.
RANDOM_SEED = 42

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)

导入vocab和condition索引

In [4]:
# The files store id -> item with string keys (JSON); invert them into
# item -> integer id lookups.
with open('data/id2vocab', 'r', encoding='utf-8') as item_index_fh:
    token_to_index = {
        token: int(token_id)
        for token_id, token in json.load(item_index_fh).items()
    }

with open('data/id2condition', 'r', encoding='utf-8') as item_index_fh:
    condition_to_index = {
        condition: int(condition_id)
        for condition_id, condition in json.load(item_index_fh).items()
    }

print(token_to_index)
print(condition_to_index)

{'_pad_': 0, '_unk_': 1, '_start_': 2, '_end_': 3, '.': 4, ',': 5, 'Hello': 6, 'Oh': 7, 'hi': 8, '!': 9, ':': 10, ')': 11, 'How': 12, 'are': 13, 'you': 14, 'my': 15, 'friend': 16, '?': 17, 'Doing': 18, 'good': 19, 'Justin': 20, 'Bieber': 21, 'is': 22, 'the': 23, 'best': 24, 'Ok': 25}
{'neutral': 0, 'joy': 1, 'sadness': 2}


定义Dataset和ModelParam

In [9]:
from collections import namedtuple
# Bundles the three per-sample arrays of one corpus: x, y and condition ids.
Dataset = namedtuple('Dataset', 'x y condition_ids')
# Pairs a value with the identifier it is referred to by.
ModelParam = namedtuple('ModelParam', 'value id')

生成训练数据

In [5]:
from itertools import islice


INPUT_CONTEXT_SIZE = 3        # max number of utterances kept in one input context
INPUT_SEQUENCE_LENGTH = 30    # max number of tokens kept per input utterance
OUTPUT_SEQUENCE_LENGTH = 32   # max number of tokens per target line, _start_/_end_ included
INTX = 'uint16'               # dtype of every id matrix built below

# Flatten the dialogs into alternating (line, reply) entries: every pair of
# consecutive utterances contributes the first line at an even position and
# its reply at the following odd position, together with their conditions.
train_conditions = []
tokenized_alternated_train_lines = []
for dialog in dialogs:
    for first_dialog_line, second_dialog_line in zip(dialog, dialog[1:]):
        tokenized_alternated_train_lines.append(_tokenizer.tokenize(first_dialog_line['text']))
        tokenized_alternated_train_lines.append(_tokenizer.tokenize(second_dialog_line['text']))
        train_conditions.append(first_dialog_line['condition'])
        train_conditions.append(second_dialog_line['condition'])

# Split into X (even positions) and Y (odd positions).
n_dialogs = len(tokenized_alternated_train_lines) // 2
x_data_iterator_seq2seq = islice(tokenized_alternated_train_lines, 0, None, 2)
y_data_iterator_seq2seq = islice(tokenized_alternated_train_lines, 1, None, 2)
context = []
x_data_iterator = []  # training inputs: one context (list of tokenized lines) per sample
y_data_iterator_for_context = []  # training targets: one tokenized line per sample
last_y_line = None
for x_line, y_line in zip(x_data_iterator_seq2seq, y_data_iterator_seq2seq):
    if x_line != last_y_line:
        context = []  # clear context if last response != current dialog context (new dialog)
    context.append(x_line)
    x_data_iterator.append(context[-INPUT_CONTEXT_SIZE:])
    y_data_iterator_for_context.append(y_line)
    last_y_line = y_line

pad_id = token_to_index['_pad_']
unk_id = token_to_index['_unk_']

# Convert X to ids. These lower-case names are kept because later cells read them.
max_contexts_num = n_dialogs
max_context_len = INPUT_CONTEXT_SIZE
max_line_len = INPUT_SEQUENCE_LENGTH
X = np.full((max_contexts_num, max_context_len, max_line_len), pad_id, dtype=INTX)
for context_idx, context in enumerate(x_data_iterator):
    if context_idx >= max_contexts_num:
        break

    # take last max_context_len utterances
    context = context[-max_context_len:]

    # fill utterances to the end of context, keep first empty utterances padded.
    utterance_offset = max_context_len - len(context)
    for utterance_idx, utterance in enumerate(context):
        for token_idx, token in enumerate(utterance[:max_line_len]):
            # Unknown tokens map to the '_unk_' id (the original referenced an
            # undefined name `_unk_` here and raised NameError).
            X[context_idx, utterance_offset + utterance_idx, token_idx] = token_to_index.get(token, unk_id)

# Convert Y to ids.
max_lines_num = n_dialogs
Y = np.full((max_lines_num, OUTPUT_SEQUENCE_LENGTH), pad_id, dtype=INTX)
for line_idx, line in enumerate(y_data_iterator_for_context):
    if line_idx >= max_lines_num:
        break
    line = ['_start_'] + line + ['_end_']
    # Clip to the width of Y (OUTPUT_SEQUENCE_LENGTH), not to the input line length.
    for token_idx, token in enumerate(line[:OUTPUT_SEQUENCE_LENGTH]):
        Y[line_idx, token_idx] = token_to_index.get(token, unk_id)

# Convert the condition of every target line (odd positions) to its id;
# conditions missing from the index fall back to 'neutral'.
neutral_condition_id = condition_to_index['neutral']
y_conditions_iterator = islice(train_conditions, 1, None, 2)
condition_ids = np.full(n_dialogs, neutral_condition_id, dtype=INTX)
for sample_idx, condition in enumerate(y_conditions_iterator):
    condition_ids[sample_idx] = condition_to_index.get(condition, neutral_condition_id)

x_train = X
y_train = Y
condition_ids_train = condition_ids

# Preview only the first three samples (slice before converting to a list).
print(list(x_train[:3]))
print(list(y_train[:3]))
print(list(condition_ids_train[:3]))

[array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint16), array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  5,  8,  9, 10, 11, 12, 13, 14,  5, 15, 16, 17,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=uint16), array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 

生成上下文无关的验证集数据

In [6]:
MAX_VAL_LINES_NUM = 10000  # cap on the number of validation lines that are used

# Read the non-empty lines of the context-free validation set.
with open('data/corpora_processed/context_free_validation_set.txt', 'r', encoding='utf-8') as fh:
    test_lines = [line.strip() for line in fh]
    test_lines = list(filter(None, test_lines))

# Tokenize and replace every out-of-vocabulary token with '_unk_'.
tokenized_validation_lines = []
tokens_voc = set(token_to_index.keys())
for line in test_lines:
    tokenized_line = _tokenizer.tokenize(line)
    tokenized_line = [t if t in tokens_voc else '_unk_' for t in tokenized_line]
    tokenized_validation_lines.append(tokenized_line)
tokenized_validation_lines = tokenized_validation_lines[:MAX_VAL_LINES_NUM]

# Split into X (even positions) and Y (odd positions).
n_dialogs = len(tokenized_validation_lines) // 2
x_data_iterator_seq2seq = islice(tokenized_validation_lines, 0, None, 2)
y_data_iterator_seq2seq = islice(tokenized_validation_lines, 1, None, 2)
context = []
x_data_iterator = []  # validation inputs: one context (list of tokenized lines) per sample
y_data_iterator_for_context = []  # validation targets: one tokenized line per sample
last_y_line = None
for x_line, y_line in zip(x_data_iterator_seq2seq, y_data_iterator_seq2seq):
    if x_line != last_y_line:
        context = []  # clear context if last response != current dialog context (new dialog)
    context.append(x_line)
    x_data_iterator.append(context[-INPUT_CONTEXT_SIZE:])
    y_data_iterator_for_context.append(y_line)
    last_y_line = y_line

pad_id = token_to_index['_pad_']
unk_id = token_to_index['_unk_']

# Convert X to ids. The shape constants are used directly so this cell does not
# depend on lower-case helper variables left behind by the training cell.
max_contexts_num = n_dialogs
X = np.full((max_contexts_num, INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH), pad_id, dtype=INTX)
for context_idx, context in enumerate(x_data_iterator):
    if context_idx >= max_contexts_num:
        break
    # take last INPUT_CONTEXT_SIZE utterances
    context = context[-INPUT_CONTEXT_SIZE:]
    # fill utterances to the end of context, keep first empty utterances padded.
    utterance_offset = INPUT_CONTEXT_SIZE - len(context)
    for utterance_idx, utterance in enumerate(context):
        for token_idx, token in enumerate(utterance[:INPUT_SEQUENCE_LENGTH]):
            X[context_idx, utterance_offset + utterance_idx, token_idx] = token_to_index.get(token, unk_id)

# Convert Y to ids.
max_lines_num = n_dialogs
Y = np.full((max_lines_num, OUTPUT_SEQUENCE_LENGTH), pad_id, dtype=INTX)
for line_idx, line in enumerate(y_data_iterator_for_context):
    if line_idx >= max_lines_num:
        break
    line = ['_start_'] + line + ['_end_']
    # Clip to the width of Y (OUTPUT_SEQUENCE_LENGTH), not to the input line length.
    for token_idx, token in enumerate(line[:OUTPUT_SEQUENCE_LENGTH]):
        Y[line_idx, token_idx] = token_to_index.get(token, unk_id)

x_validation_free = X
y_validation_free = Y
condition_ids_validation_free = None  # this data set carries no condition labels

# Preview only the first three samples (slice before converting to a list).
print(list(x_validation_free[:3]))
print(list(y_validation_free[:3]))

[array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint16), array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [12, 13, 14, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=uint16), array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 

生成上下文相关的验证集数据

In [22]:
# Load the validation dialogs: each line of the file is a JSON list of
# {'text', 'condition'} entries belonging to one dialog.
context_sensitive_val_dialogs = []
tokenized_valid_lines = []
with open('data/corpora_processed/val_processed_dialogs.txt', 'r', encoding='utf-8') as val_dialogs_fh:
    for line in val_dialogs_fh:
        line_json = json.loads(line.strip())
        dias = []
        for entry in line_json:
            tokens = _tokenizer.tokenize(entry['text'])
            tokenized_valid_lines.append(tokens)
            dias.append({'text': ' '.join(tokens), 'condition': entry['condition']})
        context_sensitive_val_dialogs.append(dias)
# A list slice (instead of a one-shot islice iterator) keeps the name reusable.
context_sensitive_val_dialogs = context_sensitive_val_dialogs[:MAX_VAL_LINES_NUM]

# Collect alternating (line, reply) entries and, in lock-step, their conditions.
tokenized_alternated_context_sensitive_val_lines = []
alternated_context_sensitive_val_conditions = []
for dialog in context_sensitive_val_dialogs:
    for first_dialog_line, second_dialog_line in zip(dialog, dialog[1:]):
        tokenized_alternated_context_sensitive_val_lines.append(_tokenizer.tokenize(first_dialog_line['text']))
        tokenized_alternated_context_sensitive_val_lines.append(_tokenizer.tokenize(second_dialog_line['text']))
        alternated_context_sensitive_val_conditions.append(first_dialog_line['condition'])
        # Must sit inside the pair loop: one condition per appended line. The
        # original appended it once per dialog, which shifted every later condition.
        alternated_context_sensitive_val_conditions.append(second_dialog_line['condition'])

# Split into X (even positions) and Y (odd positions).
n_dialogs = len(tokenized_alternated_context_sensitive_val_lines) // 2
x_data_iterator_seq2seq = islice(tokenized_alternated_context_sensitive_val_lines, 0, None, 2)
y_data_iterator_seq2seq = islice(tokenized_alternated_context_sensitive_val_lines, 1, None, 2)
context = []
x_data_iterator = []  # validation inputs: one context (list of tokenized lines) per sample
y_data_iterator_for_context = []  # validation targets: one tokenized line per sample
last_y_line = None
for x_line, y_line in zip(x_data_iterator_seq2seq, y_data_iterator_seq2seq):
    if x_line != last_y_line:
        context = []  # clear context if last response != current dialog context (new dialog)
    context.append(x_line)
    x_data_iterator.append(context[-INPUT_CONTEXT_SIZE:])
    y_data_iterator_for_context.append(y_line)
    last_y_line = y_line

pad_id = token_to_index['_pad_']
unk_id = token_to_index['_unk_']

# Convert X to ids.
max_contexts_num = n_dialogs
X = np.full((max_contexts_num, INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH), pad_id, dtype=INTX)
for context_idx, context in enumerate(x_data_iterator):
    if context_idx >= max_contexts_num:
        break
    # take last INPUT_CONTEXT_SIZE utterances
    context = context[-INPUT_CONTEXT_SIZE:]
    # fill utterances to the end of context, keep first empty utterances padded.
    utterance_offset = INPUT_CONTEXT_SIZE - len(context)
    for utterance_idx, utterance in enumerate(context):
        for token_idx, token in enumerate(utterance[:INPUT_SEQUENCE_LENGTH]):
            X[context_idx, utterance_offset + utterance_idx, token_idx] = token_to_index.get(token, unk_id)

# Convert Y to ids.
max_lines_num = n_dialogs
Y = np.full((max_lines_num, OUTPUT_SEQUENCE_LENGTH), pad_id, dtype=INTX)
for line_idx, line in enumerate(y_data_iterator_for_context):
    if line_idx >= max_lines_num:
        break
    line = ['_start_'] + line + ['_end_']
    # Clip to the width of Y (OUTPUT_SEQUENCE_LENGTH), not to the input line length.
    for token_idx, token in enumerate(line[:OUTPUT_SEQUENCE_LENGTH]):
        Y[line_idx, token_idx] = token_to_index.get(token, unk_id)

# Convert the condition of every target line (odd positions) to its id;
# conditions missing from the index fall back to 'neutral'.
neutral_condition_id = condition_to_index['neutral']
y_conditions_iterator = islice(alternated_context_sensitive_val_conditions, 1, None, 2)
condition_ids = np.full(n_dialogs, neutral_condition_id, dtype=INTX)
for sample_idx, condition in enumerate(y_conditions_iterator):
    condition_ids[sample_idx] = condition_to_index.get(condition, neutral_condition_id)

x_validation_sensitive = X
y_validation_sensitive = Y
# NOTE(review): the original computed condition_ids above and then stored None here,
# which looks like a copy-paste from the context-free cell; confirm with consumers.
condition_ids_validation_sensitive = condition_ids

# Preview only the first three samples (slice before converting to a list).
print(list(x_validation_sensitive[:3]))
print(list(y_validation_sensitive[:3]))

[array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  5,  1,  5,  1,  1,  1,  1, 17,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=uint16), array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  5,  1,  5,  1,  1,  1,  1, 17,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  1,  5,  1,  1, 14, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=uint16), array([[ 1,  5,  1,  5,  1,  1,  1,  1, 17,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  

把训练数据和验证数据放入Dataset和ModelParam中

In [10]:
TRAIN_CORPUS_NAME = 'train_processed_dialogs'

# Wrap the three training arrays in a Dataset (fields: x, y, condition_ids) and
# tag the result with the corpus name (ModelParam fields: value, id).
train_dataset = Dataset(x_train, y_train, condition_ids_train)
training_data_param = ModelParam(train_dataset, TRAIN_CORPUS_NAME)



模型定义

初始化

In [8]:
import math
from functools import partial

from keras.layers import GRU

EPOCHS_NUM = 2
BATCH_SIZE = 196
MODEL_NAME = 'hred_eng_gru_v1'
TRAIN_WORD_EMBEDDINGS_LAYER = True  # allow the embeddings to be fine-tuned during training
CONDITION_EMBEDDING_DIMENSION = 128

# Number of batches to run in every epoch.
batches_num_per_epoch = math.ceil(x_train.shape[0] / BATCH_SIZE)

_model_name = MODEL_NAME
_rnn_class = partial(GRU, reset_after=True)  # on a GPU, _rnn_class=CuDNNGRU can be used instead
_index_to_token = index_to_token
_token_to_index = {v: k for k, v in index_to_token.items()}
_vocab_size = len(_index_to_token)
_skip_token_id = _token_to_index['_pad_']
_token_embedding_dim = WORD_EMBEDDING_DIMENSION
_train_token_embedding = TRAIN_WORD_EMBEDDINGS_LAYER

# Initialize the embedding matrix: one row per vocabulary token.
_W_init_embedding = np.zeros((len(_token_to_index), _token_embedding_dim))
for token, index in _token_to_index.items():
    if token in word2vec_model.wv:
        # Token has a pre-trained vector: copy it.
        _W_init_embedding[index] = np.array(word2vec_model.wv[token])
    elif token != '_pad_':
        # Token without a pre-trained vector: small random vector; '_pad_' stays all-zero.
        # NOTE(review): the original call was invalid, so its intended range is unknown;
        # [-0.05, 0.05) is a placeholder choice - confirm.
        _W_init_embedding[index] = np.random.uniform(-0.05, 0.05, size=_token_embedding_dim)

_index_to_condition = index_to_condition
_condition_to_index = {v: k for k, v in index_to_condition.items()}
_condition_embedding_dim = CONDITION_EMBEDDING_DIMENSION
# NOTE(review): the original assignment had no right-hand side; the Dataset wrapped
# by training_data_param is assumed to be what was intended - confirm.
_training_data = training_data_param.value

1


In [None]:
import os
import pickle
import tempfile

# Pickle protocol version handed to pickle.Pickler in _pickle_iterable.
_PICKLE_PROTOCOL = 2
# Default for transform_lines_to_nn_input's autoencoder_mode; when False the input
# lines are split into alternating x (even) / y (odd) lines.
AUTOENCODER_MODE = False

def _pickle_iterable(filename, iterable):
    """Write every item of `iterable` to `filename` as consecutive pickle records.

    :param filename: path of the file to (over)write in binary mode
    :param iterable: items to serialize, consumed once in order
    """
    with open(filename, 'wb') as out_fh:
        writer = pickle.Pickler(out_fh, _PICKLE_PROTOCOL)
        for item in iterable:
            writer.dump(item)
            # Reset the pickler's memo after each record.
            writer.clear_memo()

def _open_pickle(filename):
    return open(filename, 'rb')

def _unpickle_iterable(pickle_fh):
    with pickle_fh:
        unpklr = pickle.Unpickler(pickle_fh)
        try:
            while True:
                yield unpklr.load()
        except EOFError:
            pass

def file_buffered_tee(iterable, n=2):
    """Duplicate `iterable` into `n` independent lazy iterators, buffered on disk.

    The items are pickled once into a temporary file; each returned iterator
    reads that file through its own handle.

    :param iterable: items to duplicate (consumed completely by this call)
    :param n: number of iterators to return
    :return: tuple of `n` generators yielding the items in their original order
    """
    # mkstemp() returns an already-open OS-level descriptor; close it right away,
    # otherwise every call leaks one descriptor (the file is re-opened by name below).
    fd, filename = tempfile.mkstemp()
    os.close(fd)
    try:
        _pickle_iterable(filename, iterable)
        # _open_pickle() runs eagerly here (it is an argument of the generator
        # call), so all n read handles exist before the file is removed.
        return tuple(_unpickle_iterable(_open_pickle(filename)) for _ in range(n))
    finally:
        # NOTE(review): removing a file that still has open handles relies on
        # Unix semantics; on Windows os.remove raises for a file that is in use.
        os.remove(filename)

def _get_x_data_iterator_with_context(x_data_iterator, y_data_iterator, context_size=INPUT_CONTEXT_SIZE):
    """Yield, for every input line, the list of up to `context_size` most recent input lines.

    The running history is reset whenever an input line differs from the previous
    response line, which is treated as the start of a new dialog.

    :param x_data_iterator: iterable of tokenized input lines
    :param y_data_iterator: iterable of tokenized response lines, aligned with the inputs
    :param context_size: maximum number of lines in each yielded context
    """
    history = []
    previous_response = None

    for utterance, response in zip(x_data_iterator, y_data_iterator):
        is_continuation = (utterance == previous_response)
        if not is_continuation:
            history = []  # new dialog: drop everything accumulated so far

        history.append(utterance)
        yield history[-context_size:]  # list of tokenized lines, oldest first
        previous_response = response
def transform_contexts_to_token_ids(tokenized_contexts,
                                    token_to_index,
                                    max_line_len,
                                    max_context_len=1,
                                    max_contexts_num=None,
                                    add_start_end=False):
    """
    Encode contexts (lists of tokenized utterances) as a 3-D matrix of token ids.

    Only the first max_contexts_num contexts are used. Each context keeps its last
    max_context_len utterances, right-aligned along the context axis; each utterance
    is clipped to max_line_len tokens. Every unfilled cell holds token_to_index['_pad_'],
    and tokens missing from token_to_index are written as token_to_index['_unk_'].

    :param tokenized_contexts: iterable of contexts; each context is a list of token lists
    :param token_to_index: dict mapping a token to its id
    :param max_line_len: number of token slots per utterance
    :param max_context_len: number of utterance slots per context
    :param max_contexts_num: number of contexts to encode; may be None only for a list input
    :param add_start_end: wrap every utterance in '_start_' / '_end_' before clipping
    :return: numpy array, dtype=INTX, shape = (max_contexts_num, max_context_len, max_line_len)
    :raises TypeError: if max_contexts_num is None and tokenized_contexts is not a list
    """
    if max_contexts_num is None and not isinstance(tokenized_contexts, list):
        raise TypeError('tokenized_lines should has list type if max_lines_num is not specified')
    if max_contexts_num is None:
        max_contexts_num = len(tokenized_contexts)

    ids = np.full((max_contexts_num, max_context_len, max_line_len), token_to_index['_pad_'], dtype=INTX)

    for context_idx, context in enumerate(tokenized_contexts):
        if context_idx >= max_contexts_num:
            break

        # Keep the most recent utterances and push them to the end of the
        # context axis, so the leading slots remain padding.
        recent_utterances = context[-max_context_len:]
        first_slot = max_context_len - len(recent_utterances)

        for slot, utterance in enumerate(recent_utterances, start=first_slot):
            tokens = ['_start_'] + utterance + ['_end_'] if add_start_end else utterance

            for position, token in enumerate(tokens[:max_line_len]):
                if token in token_to_index:
                    token_id = token_to_index[token]
                else:
                    token_id = token_to_index['_unk_']
                ids[context_idx, slot, position] = token_id

    return ids

def transform_lines_to_token_ids(tokenized_lines, token_to_index, max_line_len, max_lines_num=None,
                                 add_start_end=False):
    """
    Encode tokenized lines as a 2-D matrix of token ids.

    Only the first max_lines_num lines are used and each line is clipped to
    max_line_len tokens. Every unfilled cell holds token_to_index['_pad_'], and
    tokens missing from token_to_index are written as token_to_index['_unk_'].

    :param tokenized_lines: iterable of token lists (one per utterance)
    :param token_to_index: dict mapping a token to its id
    :param max_line_len: number of token slots per line
    :param max_lines_num: number of lines to encode; may be None only for a list input
    :param add_start_end: wrap every line in '_start_' / '_end_' before clipping
    :return: numpy array, dtype=INTX, shape = (max_lines_num, max_line_len)
    :raises TypeError: if max_lines_num is None and tokenized_lines is not a list
    """
    if max_lines_num is None and not isinstance(tokenized_lines, list):
        raise TypeError('tokenized_lines should has list type if max_lines_num is not specified')
    if max_lines_num is None:
        max_lines_num = len(tokenized_lines)

    ids = np.full((max_lines_num, max_line_len), token_to_index['_pad_'], dtype=INTX)

    for row, tokens in enumerate(tokenized_lines):
        if row >= max_lines_num:
            break

        if add_start_end:
            tokens = ['_start_'] + tokens + ['_end_']

        for column, token in enumerate(tokens[:max_line_len]):
            if token in token_to_index:
                token_id = token_to_index[token]
            else:
                token_id = token_to_index['_unk_']
            ids[row, column] = token_id

    return ids

def transform_lines_to_nn_input(tokenized_dialog_lines, token_to_index, autoencoder_mode=AUTOENCODER_MODE):
    """
    Turn an iterable of tokenized dialog lines into input/output id matrices.

    The lines are duplicated through file_buffered_tee (a temporary file) instead
    of being materialized as lists. Unless autoencoder_mode is set, even-positioned
    lines become inputs and odd-positioned lines become targets.

    :param tokenized_dialog_lines: iterable of token lists
    :param token_to_index: dict mapping a token to its id
    :param autoencoder_mode: when true, every line is used as both input and target
    :return: (x_ids, y_ids, n_dialogs) -- context matrix, target matrix, sample count
    """
    x_lines, y_lines, lines_for_counting = file_buffered_tee(tokenized_dialog_lines, 3)
    print('Iterating through lines to get number of elements in the dataset')
    samples_num = 0
    for _ in lines_for_counting:
        samples_num += 1
    print(samples_num)

    if not autoencoder_mode:
        # seq2seq mode: inputs at even positions, targets at odd positions
        x_lines = islice(x_lines, 0, None, 2)
        y_lines = islice(y_lines, 1, None, 2)
        samples_num //= 2

    # The targets are needed twice: once to detect dialog boundaries while the
    # contexts are built, once to build the output matrix.
    y_lines, y_lines_for_context = file_buffered_tee(y_lines)
    x_contexts = _get_x_data_iterator_with_context(x_lines, y_lines_for_context)

    print('Iterating through lines to get input matrix')
    x_ids = transform_contexts_to_token_ids(
        x_contexts,
        token_to_index,
        INPUT_SEQUENCE_LENGTH,
        INPUT_CONTEXT_SIZE,
        max_contexts_num=samples_num)

    print('Iterating through lines to get output matrix')
    y_ids = transform_lines_to_token_ids(
        y_lines,
        token_to_index,
        OUTPUT_SEQUENCE_LENGTH,
        samples_num,
        add_start_end=True)
    print(y_ids)
    return x_ids, y_ids, samples_num

# x_train, y_train, _ = transform_lines_to_nn_input(tokenized_alternated_train_lines, token_to_index)
# Build the validation matrices from tokenized_validation_lines through the
# function-based pipeline; the third return value (sample count) is discarded.
x_validation, y_validation, _ = transform_lines_to_nn_input(tokenized_validation_lines, token_to_index)
# print (list(x_validation))