数据准备（data/corpora_processed、data/conditions_index）

构建单词索引

In [2]:
import json
import nltk
from collections import Counter
nltk.download('punkt')

VOCABULARY_MAX_SIZE = 50000
MAX_CONDITIONS_NUM = 5

dialogs = []
tokens_counter = Counter()
conditions_counter = Counter()
tokenized_training_lines = []
for line in open('data/corpora_processed/train_processed_dialogs.txt', 'r', encoding='utf-8'):
    line_json = json.loads(line.strip())
    dias = []
    for entry in line_json:
        tokens = nltk.word_tokenize(entry['text'])
        tokenized_training_lines.append(tokens)
        dias.append({'text': ' '.join(tokens), 'condition': entry['condition']})
        tokens_counter.update(tokens)
        conditions_counter[entry['condition']] += 1
        
# 构建vocab list
special_tokens = ['_pad_', '_unk_', '_start_', '_end_']
vocab = special_tokens + [token for token, _ in tokens_counter.most_common(VOCABULARY_MAX_SIZE - len(special_tokens))]

# 构建condition list
conditions = [condition for condition, _ in conditions_counter.most_common(MAX_CONDITIONS_NUM)]

index_to_token = dict(enumerate(vocab))
index_to_condition = dict(enumerate(conditions))

with open('data/id2vocab', 'w', encoding='utf-8') as fh:
        json.dump(index_to_token, fh, ensure_ascii=False)

with open('data/id2condition', 'w', encoding='utf-8') as fh:
        json.dump(index_to_condition, fh, ensure_ascii=False)
        
print(list(index_to_token.items())[:10])
print(list(index_to_condition.items())[:5])

[(0, '_pad_'), (1, '_unk_'), (2, '_start_'), (3, '_end_'), (4, ','), (5, 'Hello'), (6, 'Oh'), (7, 'hi'), (8, '!'), (9, ':')]
[(0, 'neutral'), (1, 'joy'), (2, 'sadness')]


[nltk_data] Downloading package punkt to /Users/majing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


预训练词向量

In [3]:
from gensim.models import Word2Vec
import multiprocessing

WORD_EMBEDDING_DIMENSION = 128
W2V_WINDOW_SIZE = 10
USE_SKIP_GRAM = True # 为False时用CBOW
MIN_WORD_FREQ = 1
_WORKERS_NUM = multiprocessing.cpu_count()

word2vec_path = 'data/word2vec.bin'
word2vec_model = Word2Vec(
        window=W2V_WINDOW_SIZE,
        size=WORD_EMBEDDING_DIMENSION,
        max_vocab_size=VOCABULARY_MAX_SIZE,
        min_count=MIN_WORD_FREQ,
        workers=_WORKERS_NUM,
        sg=USE_SKIP_GRAM)

word2vec_model.build_vocab(tokenized_training_lines)
word2vec_model.train(tokenized_training_lines, total_words=50000, epochs=10)
word2vec_model.init_sims(replace=True) # 强制单位归一化，破坏性就地(打击非归一化向量), 更节省存储空间
word2vec_model.save(word2vec_path, separately=[])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


初始化随机数种子

In [5]:
import random
import numpy
import tensorflow as tf

random.seed(42)
numpy.random.seed(42)
tf.set_random_seed(42)

导入vocab和condition索引

In [6]:
with open('data/id2vocab', 'r', encoding='utf-8') as item_index_fh:
        token_to_index = json.load(item_index_fh)
        token_to_index = {v: int(k) for k, v in token_to_index.items()}


with open('data/id2condition', 'r', encoding='utf-8') as item_index_fh:
        condition_to_index = json.load(item_index_fh)
        condition_to_index = {v: int(k) for k, v in condition_to_index.items()}
        
print (token_to_index)
print (condition_to_index)        

{'_pad_': 0, '_unk_': 1, '_start_': 2, '_end_': 3, ',': 4, 'Hello': 5, 'Oh': 6, 'hi': 7, '!': 8, ':': 9, ')': 10, 'How': 11, 'are': 12, 'you': 13, 'my': 14, 'friend': 15, '?': 16, 'Doing': 17, 'good': 18, 'Justin': 19, 'Bieber': 20, 'is': 21, 'the': 22, 'best': 23, 'Ok': 24, '...': 25}
{'neutral': 0, 'joy': 1, 'sadness': 2}


In [48]:
import collections
from collections import namedtuple
import numpy as np
from itertools import islice
import tempfile
import os
import pickle

def create_namedtuple_instance(name, **kwargs):
    return collections.namedtuple(name, kwargs.keys())(**kwargs)
            
INPUT_SEQUENCE_LENGTH = 30
AUTOENCODER_MODE = False
INPUT_CONTEXT_SIZE = 3
_PICKLE_PROTOCOL = 2
INTX = 'uint16'
OUTPUT_SEQUENCE_LENGTH = 32
EMOTIONS_TYPES = create_namedtuple_instance(
    'EMOTIONS_TYPES', neutral='neutral', anger='anger', joy='joy', fear='fear', sadness='sadness')
DEFAULT_CONDITION = EMOTIONS_TYPES.neutral
Dataset = namedtuple('Dataset', ['x', 'y', 'condition_ids'])

SPECIAL_TOKENS = create_namedtuple_instance(
    'SPECIAL_TOKENS', PAD_TOKEN='_pad_', UNKNOWN_TOKEN='_unk_', START_TOKEN='_start_', EOS_TOKEN='_end_')
_tokenizer = nltk.tokenize.RegexpTokenizer(pattern='\w+|[^\w\s]')

def _pickle_iterable(filename, iterable):
    with open(filename, 'wb') as pickle_fh:
        pklr = pickle.Pickler(pickle_fh, _PICKLE_PROTOCOL)
        for entry in iterable:
            pklr.dump(entry)
            pklr.clear_memo()
          
def _unpickle_iterable(pickle_fh):
    with pickle_fh:
        unpklr = pickle.Unpickler(pickle_fh)
        try:
            while True:
                yield unpklr.load()
        except EOFError:
            pass

def _open_pickle(filename):
    return open(filename, 'rb')
        
class FileTextLinesIterator(object):
    def __init__(self, filename):
        self._filename = filename

    def __iter__(self):
        for line in open(self._filename, 'r', encoding='utf-8'):
            yield line.strip()

    def __copy__(self):
        return FileTextLinesIterator(self._filename)
    
class ProcessedLinesIterator(object):
    def __init__(self, lines_iter, processing_callbacks=None):
        self._lines_iter = lines_iter
        self._processing_callbacks = processing_callbacks if processing_callbacks else []

    def __iter__(self):
        for line in self._lines_iter:
            for callback in self._processing_callbacks:
                line = callback(line)
            yield line

    def __copy__(self):
        return ProcessedLinesIterator(copy(self._lines_iter), self._processing_callbacks)

class JsonTextLinesIterator(object):
    def __init__(self, text_lines_iter):
        self._text_lines_iter = text_lines_iter

    def __iter__(self):
        for line in self._text_lines_iter:
            try:
                yield json.loads(line.strip())
            except ValueError:
                print ('Skipped invalid json object: "{}"'.format(line.strip()))
                continue

class itemgetter:
    """
    Return a callable object that fetches the given item(s) from its operand.
    After f = itemgetter(2), the call f(r) returns r[2].
    After g = itemgetter(2, 5, 3), the call g(r) returns (r[2], r[5], r[3])
    """
    __slots__ = ('_items', '_call')

    def __init__(self, item, *items):
        if not items:
            self._items = (item,)
            def func(obj):
                return obj[item]
            self._call = func
        else:
            self._items = items = (item,) + items
            def func(obj):
                return tuple(obj[i] for i in items)
            self._call = func

    def __call__(self, obj):
        return self._call(obj)

    def __repr__(self):
        return '%s.%s(%s)' % (self.__class__.__module__,
                              self.__class__.__name__,
                              ', '.join(map(repr, self._items)))

    def __reduce__(self):
        return self.__class__, self._items
    
def _get_x_data_iterator_with_context(x_data_iterator, y_data_iterator, context_size=INPUT_CONTEXT_SIZE):
    context = []

    last_y_line = None
    for x_line, y_line in zip(x_data_iterator, y_data_iterator):
        if x_line != last_y_line:
            context = []  # clear context if last response != current dialog context (new dialog)

        context.append(x_line)
        yield context[-context_size:]  # yield list of tokenized lines
        last_y_line = y_line
        
def transform_lines_to_nn_input(tokenized_dialog_lines, token_to_index, autoencoder_mode=AUTOENCODER_MODE):
    """
    Splits lines (IterableSentences) and generates numpy arrays of token ids suitable for training.
    Doesn't store all lines in memory.
    """
    x_data_iterator, y_data_iterator, iterator_for_len_calc = file_buffered_tee(tokenized_dialog_lines, 3)

#     print ('==================')
#     print (list(y_data_iterator))
    
    print ('Iterating through lines to get number of elements in the dataset')
    n_dialogs = sum(1 for _ in iterator_for_len_calc)

    if not autoencoder_mode:
        # seq2seq mode
        x_data_iterator = islice(x_data_iterator, 0, None, 2)
        y_data_iterator = islice(y_data_iterator, 1, None, 2)
        n_dialogs //= 2

    y_data_iterator, y_data_iterator_for_context = file_buffered_tee(y_data_iterator)
#     print ('==================')
#     print (list(y_data_iterator_for_context))
    x_data_iterator = _get_x_data_iterator_with_context(x_data_iterator, y_data_iterator_for_context)
#     print ('================')
#     print (list(x_data_iterator))
    
    print ('Iterating through lines to get input matrix')
    x_ids = transform_contexts_to_token_ids(
        x_data_iterator, token_to_index, INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE, max_contexts_num=n_dialogs)

    print ('Iterating through lines to get output matrix')
    y_ids = transform_lines_to_token_ids(
        y_data_iterator, token_to_index, OUTPUT_SEQUENCE_LENGTH, n_dialogs, add_start_end=True)
    return x_ids, y_ids, n_dialogs

def get_tokens_sequence(text, lower=True, check_unicode=True):
    if check_unicode and not isinstance(text, str):
        raise TypeError('Text object should be unicode type. Got instead "{}" of type {}'.format(text, type(text)))

    if not text.strip():
        return []

    if lower:
        text = text.lower()

    tokens = _tokenizer.tokenize(text)

    return tokens

def transform_lines_to_token_ids(tokenized_lines, token_to_index, max_line_len, max_lines_num=None,
                                 add_start_end=False):
    """
    Transforms lines of text to matrix of indices of tokens to be used in training/predicting.
    Uses only first max_lines_num lines of tokenized_lines. Also clips each line to max_line_len tokens.
    if length of a line is less that max_line_len, it's padded with token_to_index[PAD_TOKEN].

    :param tokenized_lines: iterable of lists (utterances) of tokens to transform to ids
    :param token_to_index: dict that maps each token to its id
    :param max_line_len: maximum number of tokens in a lineh
    :param max_lines_num: maximum number of lines
    :param add_start_end: add start/end tokens to sequence
    :return: X -- numpy array, dtype=INTX, shape = (max_lines_num, max_line_len).
    """

    if max_lines_num is None:
        if not isinstance(tokenized_lines, list):
            raise TypeError('tokenized_lines should has list type if max_lines_num is not specified')
        max_lines_num = len(tokenized_lines)

    X = np.full((max_lines_num, max_line_len), token_to_index[SPECIAL_TOKENS.PAD_TOKEN], dtype=INTX)

    for line_idx, line in enumerate(tokenized_lines):
        if line_idx >= max_lines_num:
            break

        if add_start_end:
            line = [SPECIAL_TOKENS.START_TOKEN] + line + [SPECIAL_TOKENS.EOS_TOKEN]

        for token_idx, token in enumerate(line[:max_line_len]):
            X[line_idx, token_idx] = token_to_index[token] \
                if token in token_to_index else token_to_index[SPECIAL_TOKENS.UNKNOWN_TOKEN]

    return X

def transform_contexts_to_token_ids(tokenized_contexts,
                                    token_to_index,
                                    max_line_len,
                                    max_context_len=1,
                                    max_contexts_num=None,
                                    add_start_end=False):
    """
    Transforms contexts of lines of text to matrix of indices of tokens to be used in training/predicting.
    Uses only first max_lines_num lines of tokenized_lines. Also clips each line to max_line_len tokens.
    if length of a line is less that max_line_len, it's padded with token_to_index[PAD_TOKEN].

    :param tokenized_contexts: iterable of lists (contexts) of lists (utterances) of tokens to transform to ids
    :param token_to_index: dict that maps each token to its id
    :param max_line_len: maximum number of tokens in a line
    :param max_context_len: maximum context length
    :param max_contexts_num: maximum number of contexts
    :param add_start_end: add start/end tokens to sequence
    :return: X -- numpy array, dtype=INTX, shape = (max_lines_num, max_context_len, max_line_len).
    """

    if max_contexts_num is None:
        if not isinstance(tokenized_contexts, list):
            raise TypeError('tokenized_lines should has list type if max_lines_num is not specified')
        max_contexts_num = len(tokenized_contexts)

    X = np.full((max_contexts_num, max_context_len, max_line_len), token_to_index[SPECIAL_TOKENS.PAD_TOKEN], dtype=INTX)
    
    for context_idx, context in enumerate(tokenized_contexts):
        if context_idx >= max_contexts_num:
            break

        # take last max_content_len utterances
        context = context[-max_context_len:]

        # fill utterances to the end of context, keep first empty utterances padded.
        utterance_offset = max_context_len - len(context)
        for utterance_idx, utterance in enumerate(context):
            if add_start_end:
                utterance = [SPECIAL_TOKENS.START_TOKEN] + utterance + [SPECIAL_TOKENS.EOS_TOKEN]

            for token_idx, token in enumerate(utterance[:max_line_len]):
                X[context_idx, utterance_offset + utterance_idx, token_idx] = token_to_index[token] \
                    if token in token_to_index else token_to_index[SPECIAL_TOKENS.UNKNOWN_TOKEN]

    return X

def get_processed_corpus_path():
    return 'data/corpora_processed/train_processed_dialogs.txt'

def load_processed_dialogs_from_json(lines, text_field_name, condition_field_name):
    for line_json in JsonTextLinesIterator(lines):
        yield [{
            text_field_name: entry['text'],
            condition_field_name: entry['condition']
        } for entry in line_json]
        
def file_buffered_tee(iterable, n=2):
    _, filename = tempfile.mkstemp()
    try:
        _pickle_iterable(filename, iterable)
        return tuple(_unpickle_iterable(_open_pickle(filename)) for _ in range(n))
    finally:
        os.remove(filename)
        
def get_dialog_lines_and_conditions(dialog_lines, text_field_name, condition_field_name):
    """
    Splits one dialog_lines generator into two generators - one for conditions and one for dialog lines
    """
    conditions_iter, dialog_lines_iter = file_buffered_tee(
        map(lambda line: [line[condition_field_name], line[text_field_name]], dialog_lines))
    conditions_iter = map(itemgetter(0), conditions_iter)
    dialog_lines_iter = map(itemgetter(1), dialog_lines_iter)
    return dialog_lines_iter, conditions_iter

def transform_conditions_to_nn_input(dialog_conditions, condition_to_index, num_dialogs):
    y_conditions_iterator = islice(dialog_conditions, 1, None, 2)

    print ('Iterating through conditions of output list')
    return transform_conditions_to_ids(y_conditions_iterator, condition_to_index, num_dialogs)

def get_alternated_dialogs_lines(dialogs):
    for dialog in dialogs:
        for first_dialog_line, second_dialog_line in zip(dialog, dialog[1:]):
            yield first_dialog_line
            yield second_dialog_line

def transform_conditions_to_ids(conditions, condition_to_index, n_dialogs):
    condition_ids_iterator = map(
        lambda condition: condition_to_index.get(condition, condition_to_index[DEFAULT_CONDITION]), conditions)
    condition_ids = np.full(n_dialogs, condition_to_index[DEFAULT_CONDITION], dtype=INTX)
    # shape == (n_dialogs, )
    for sample_idx, condition_id in enumerate(condition_ids_iterator):
        condition_ids[sample_idx] = condition_id

    # shape == (n_dialogs, 1)
    return condition_ids

def load_conditioned_dataset(token_to_index, condition_to_index, subset_size=None):
    processed_corpus_path = get_processed_corpus_path()
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path), text_field_name='text', condition_field_name='condition')
    train_lines, train_conditions = get_dialog_lines_and_conditions(
        get_alternated_dialogs_lines(dialogs), text_field_name='text', condition_field_name='condition')
#     print (list(train_lines))
    
#     for ky in train_lines:
#         print (ky)
    
    tokenized_alternated_train_lines = ProcessedLinesIterator(train_lines, processing_callbacks=[get_tokens_sequence])

#     print (list(tokenized_alternated_train_lines))
    # prepare train set
    x_train, y_train, n_dialogs = transform_lines_to_nn_input(tokenized_alternated_train_lines, token_to_index)

#     print (x_train[0][2])
    
    condition_ids_train = transform_conditions_to_nn_input(train_conditions, condition_to_index, n_dialogs)
    return Dataset(x=x_train, y=y_train, condition_ids=condition_ids_train)

def get_training_dataset(token_to_index,
                         condition_to_index,
                         is_reverse_model,
                         train_subset_size=None):
    train_dataset = load_conditioned_dataset(token_to_index, condition_to_index, train_subset_size)

    return train_dataset

get_training_dataset(token_to_index, condition_to_index, False)

Iterating through lines to get number of elements in the dataset
[['oh', ',', 'hi', '!', ':', ')', 'how', 'are', 'you', ',', 'my', 'friend', '?'], ['doing', 'good'], ['ok', '.', '.', '.'], ['oh', ',', 'hi', '!', ':', ')', 'how', 'are', 'you', ',', 'my', 'friend', '?'], ['doing', 'good'], ['ok', '.', '.', '.'], ['oh', ',', 'hi', '!', ':', ')', 'how', 'are', 'you', ',', 'my', 'friend', '?'], ['doing', 'good'], ['ok', '.', '.', '.'], ['oh', ',', 'hi', '!', ':', ')', 'how', 'are', 'you', ',', 'my', 'friend', '?'], ['doing', 'good'], ['ok', '.', '.', '.'], ['oh', ',', 'hi', '!', ':', ')', 'how', 'are', 'you', ',', 'my', 'friend', '?'], ['doing', 'good'], ['ok', '.', '.', '.'], ['oh', ',', 'hi', '!', ':', ')', 'how', 'are', 'you', ',', 'my', 'friend', '?'], ['doing', 'good'], ['ok', '.', '.', '.'], ['oh', ',', 'hi', '!', ':', ')', 'how', 'are', 'you', ',', 'my', 'friend', '?'], ['doing', 'good'], ['ok', '.', '.', '.'], ['oh', ',', 'hi', '!', ':', ')', 'how', 'are', 'you', ',', 'my', 'friend'

Dataset(x=array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]], dtype=uint16), y=array([[ 2,  1,  4, ...,  0,  0,  0],
       [ 2,  1, 18, ...,  0,  0,  0],
       [ 2,  1,  1, ...,  0,  0,  0],
       ...,
       [ 2,  1,  4, ...,  0,  0,  0],
       [ 2,  1, 18, ...,  0,  0,  0],
       [ 2,  1,  1, ...,  0,  0,  0]], dtype=uint16), condition_ids=array([1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1,
       0, 2