数据准备（data/corpora_processed、data/conditions_index）

构建单词索引

In [1]:
import json
import nltk
from collections import Counter
# Fetch the punkt resources (a no-op when they are already installed).
nltk.download('punkt')
# Tokens are either runs of word characters or single punctuation marks.
# Raw string: '\w' and '\s' are invalid escape sequences in a plain literal.
_tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+|[^\w\s]')

VOCABULARY_MAX_SIZE = 50000
MAX_CONDITIONS_NUM = 5

# Read the training dialogs: one JSON list of {'text', 'condition'} entries per line.
dialogs = []                   # per dialog: list of {'text': re-joined tokens, 'condition': ...}
tokens_counter = Counter()     # token -> frequency over the whole training set
conditions_counter = Counter() # condition label -> number of utterances carrying it
tokenized_training_lines = []  # flat list of token lists (reused for word2vec training)
with open('data/corpora_processed/train_processed_dialogs.txt', 'r', encoding='utf-8') as dialogs_fh:
    for line in dialogs_fh:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        line_json = json.loads(line)
        dias = []
        for entry in line_json:
            tokens = _tokenizer.tokenize(entry['text'])
            tokenized_training_lines.append(tokens)
            dias.append({'text': ' '.join(tokens), 'condition': entry['condition']})
            tokens_counter.update(tokens)
            conditions_counter[entry['condition']] += 1
        dialogs.append(dias)

# Build the vocab list: special tokens first, then the most frequent corpus tokens,
# capped so that the total size never exceeds VOCABULARY_MAX_SIZE.
special_tokens = ['_pad_', '_unk_', '_start_', '_end_']
vocab = special_tokens + [token for token, _ in tokens_counter.most_common(VOCABULARY_MAX_SIZE - len(special_tokens))]

# Build the condition list: the MAX_CONDITIONS_NUM most frequent conditions.
conditions = [condition for condition, _ in conditions_counter.most_common(MAX_CONDITIONS_NUM)]

index_to_token = dict(enumerate(vocab))
index_to_condition = dict(enumerate(conditions))

# Persist both indexes as JSON ({id: item}); JSON turns the integer ids into string keys.
with open('data/id2vocab', 'w', encoding='utf-8') as fh:
    json.dump(index_to_token, fh, ensure_ascii=False)

with open('data/id2condition', 'w', encoding='utf-8') as fh:
    json.dump(index_to_condition, fh, ensure_ascii=False)

print(list(index_to_token.items())[:10])
print(list(index_to_condition.items())[:5])

[(0, '_pad_'), (1, '_unk_'), (2, '_start_'), (3, '_end_'), (4, '.'), (5, ','), (6, 'Hello'), (7, 'Oh'), (8, 'hi'), (9, '!')]
[(0, 'neutral'), (1, 'joy'), (2, 'sadness')]


[nltk_data] Downloading package punkt to /Users/majing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


预训练词向量

In [2]:
from gensim.models import Word2Vec
import multiprocessing

WORD_EMBEDDING_DIMENSION = 128
W2V_WINDOW_SIZE = 10
USE_SKIP_GRAM = True  # CBOW is used when this is False
MIN_WORD_FREQ = 1
W2V_EPOCHS_NUM = 10
_WORKERS_NUM = multiprocessing.cpu_count()

word2vec_path = 'data/word2vec.bin'
word2vec_model = Word2Vec(
    window=W2V_WINDOW_SIZE,
    size=WORD_EMBEDDING_DIMENSION,
    max_vocab_size=VOCABULARY_MAX_SIZE,
    min_count=MIN_WORD_FREQ,
    workers=_WORKERS_NUM,
    sg=int(USE_SKIP_GRAM))  # gensim documents `sg` as 0 (CBOW) or 1 (skip-gram)

word2vec_model.build_vocab(tokenized_training_lines)
# Declare the real corpus size (sentence count recorded by build_vocab) rather than a
# hard-coded word count: gensim uses this figure for its linear learning-rate decay
# and for progress logging, so a wrong value distorts the training schedule.
word2vec_model.train(
    tokenized_training_lines,
    total_examples=word2vec_model.corpus_count,
    epochs=W2V_EPOCHS_NUM)
# Force unit normalization, destructively in place (the non-normalized vectors are
# discarded); this saves storage.
word2vec_model.init_sims(replace=True)
word2vec_model.save(word2vec_path, separately=[])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


初始化随机数种子

In [3]:
import random
import numpy as np
import tensorflow as tf

# Seed Python's `random`, NumPy and TensorFlow with the same fixed value, in that order.
for _seed_fn in (random.seed, np.random.seed, tf.set_random_seed):
    _seed_fn(42)

导入vocab和condition索引

In [4]:
def _load_inverted_index(index_path):
    """Read a JSON {id: item} mapping from `index_path` and return it as {item: int(id)}."""
    with open(index_path, 'r', encoding='utf-8') as index_fh:
        id_to_item = json.load(index_fh)
    # JSON object keys are strings, so the ids are converted back to int while inverting.
    return {item: int(item_id) for item_id, item in id_to_item.items()}


token_to_index = _load_inverted_index('data/id2vocab')
condition_to_index = _load_inverted_index('data/id2condition')

print(token_to_index)
print(condition_to_index)

{'_pad_': 0, '_unk_': 1, '_start_': 2, '_end_': 3, '.': 4, ',': 5, 'Hello': 6, 'Oh': 7, 'hi': 8, '!': 9, ':': 10, ')': 11, 'How': 12, 'are': 13, 'you': 14, 'my': 15, 'friend': 16, '?': 17, 'Doing': 18, 'good': 19, 'Justin': 20, 'Bieber': 21, 'is': 22, 'the': 23, 'best': 24, 'Ok': 25}
{'neutral': 0, 'joy': 1, 'sadness': 2}


生成训练数据

In [5]:
from itertools import islice


INPUT_CONTEXT_SIZE = 3        # max number of utterances kept in one input context
INPUT_SEQUENCE_LENGTH = 30    # max tokens per input utterance
OUTPUT_SEQUENCE_LENGTH = 32   # max tokens per target line, including _start_ / _end_
INTX = 'uint16'

# Tokenize the data: every pair of consecutive utterances in a dialog becomes one
# (input line, reply line) sample. Lines and conditions are stored alternated:
# even positions hold the input side, odd positions hold the reply side.
train_conditions = []
tokenized_alternated_train_lines = []
y_data_iterator_for_context = []
for dialog in dialogs:
    for first_dialog_line, second_dialog_line in zip(dialog, dialog[1:]):
        first_tokens = _tokenizer.tokenize(first_dialog_line['text'])
        second_tokens = _tokenizer.tokenize(second_dialog_line['text'])
        tokenized_alternated_train_lines.append(first_tokens)
        tokenized_alternated_train_lines.append(second_tokens)
        y_data_iterator_for_context.append(second_tokens)
        train_conditions.append(first_dialog_line['condition'])
        train_conditions.append(second_dialog_line['condition'])

# Number of (input, reply) samples: two alternated lines per sample.
n_dialogs = len(tokenized_alternated_train_lines) // 2

# Split into X and Y: build the input context of every sample.
x_data_iterator_seq2seq = islice(tokenized_alternated_train_lines, 0, None, 2)
context = []
x_data_iterator = []
last_y_line = None
for x_line, y_line in zip(x_data_iterator_seq2seq, y_data_iterator_for_context):
    if x_line != last_y_line:
        context = []  # previous reply != current input line, so a new dialog starts here
    context.append(x_line)
    x_data_iterator.append(context[-INPUT_CONTEXT_SIZE:])  # copy of the most recent lines
    last_y_line = y_line

assert len(x_data_iterator) == n_dialogs, 'one context is expected per (input, reply) sample'

pad_id = token_to_index['_pad_']
unk_id = token_to_index['_unk_']

# Convert X to ids; shape is (samples, context utterances, tokens), padded with _pad_.
max_contexts_num = n_dialogs
max_context_len = INPUT_CONTEXT_SIZE
max_line_len = INPUT_SEQUENCE_LENGTH
X = np.full((max_contexts_num, max_context_len, max_line_len), pad_id, dtype=INTX)
for context_idx, context in enumerate(x_data_iterator):
    # Take the last max_context_len utterances.
    context = context[-max_context_len:]

    # Right-align the utterances inside the context; leading slots stay padded.
    utterance_offset = max_context_len - len(context)
    for utterance_idx, utterance in enumerate(context):
        for token_idx, token in enumerate(utterance[:max_line_len]):
            # Out-of-vocabulary tokens map to _unk_. The key must be the string
            # '_unk_'; a bare `_unk_` name raises NameError.
            X[context_idx, utterance_offset + utterance_idx, token_idx] = token_to_index.get(token, unk_id)

# Convert Y to ids; shape is (samples, OUTPUT_SEQUENCE_LENGTH), padded with _pad_.
max_lines_num = n_dialogs
Y = np.full((max_lines_num, OUTPUT_SEQUENCE_LENGTH), pad_id, dtype=INTX)
for line_idx, line in enumerate(y_data_iterator_for_context):
    line = ['_start_'] + line + ['_end_']

    # Truncate to the width of Y (the output length), not to the input line length,
    # so the two extra columns reserved for _start_ / _end_ are actually usable.
    for token_idx, token in enumerate(line[:OUTPUT_SEQUENCE_LENGTH]):
        Y[line_idx, token_idx] = token_to_index.get(token, unk_id)

# Convert conditions to ids: the condition of each reply line (odd positions);
# labels missing from the index fall back to 'neutral'.
neutral_id = condition_to_index['neutral']
y_conditions_iterator = islice(train_conditions, 1, None, 2)
condition_ids = np.full(n_dialogs, neutral_id, dtype=INTX)
for sample_idx, condition in enumerate(y_conditions_iterator):
    condition_ids[sample_idx] = condition_to_index.get(condition, neutral_id)

x_train = X
y_train = Y
# NOTE(review): the per-sample ids get their own name. Assigning them to
# `condition_to_index` would destroy the label -> id dict and make this cell fail
# when it is run a second time. Use `condition_ids_train` wherever the array is needed.
condition_ids_train = condition_ids

[[[ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 6  0  0 ...  0  0  0]]

 [[ 0  0  0 ...  0  0  0]
  [ 6  0  0 ...  0  0  0]
  [ 7  5  8 ...  0  0  0]]

 [[ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [20 21 22 ...  0  0  0]]

 ...

 [[ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 6  0  0 ...  0  0  0]]

 [[ 0  0  0 ...  0  0  0]
  [ 6  0  0 ...  0  0  0]
  [ 7  5  8 ...  0  0  0]]

 [[ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [20 21 22 ...  0  0  0]]]
