# Text Preprocessing

In [33]:
import numpy as np
import tensorflow as tf
import os
# load lines dictionary 
lines = open('dataset/chatbot/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')

# load conversations
convs = open('dataset/chatbot/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

print('\n'.join(lines[:3]))
print()
print('\n'.join(convs[:3]))

lines_dict = {}
for line in lines:
    l = line.split(" +++$+++ ")
    lines_dict[l[0]] = l[-1]
    
convs_list = []
for conv in convs:
    convs_list.append(conv.split(" +++$+++ ")[-1][1:-1].replace("'","").replace(" ","").split(','))


L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']


過濾掉沒用的符號，切出每個conversation

In [34]:
questions = []
answers = []

for conv in convs_list:
    for i in range(len(conv)-1):
        questions.append(lines_dict[conv[i]])
        answers.append(lines_dict[conv[i+1]])

print(len(questions))
print(len(answers))

221616
221616


用前後兩句組成questions和answers

In [35]:
import re

def clean_text(text):

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
    
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))

把一些縮寫或是口語用法替換成正式用法，並過濾掉沒有必要的符號

In [37]:
min_line_length = 2
max_line_length = 16

def isNormal(l):
    return len(l.split()) >= min_line_length and len(l.split()) <= max_line_length

normal_questions = []
normal_answers = []

for question, answer in zip(clean_questions, clean_answers):
    if isNormal(question) and isNormal(answer):
        normal_questions.append(question)
        normal_answers.append(answer)

只取一個長度範圍內的句子，把太長或太短的字丟掉

In [38]:
vocab = {}
for question in normal_questions:
    for word in question.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1
            
for answer in normal_answers:
    for word in answer.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1
            
threshold = 10
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1
print("Size of total vocab:", len(vocab))
print("Size of vocab we will use:", count)

Size of total vocab: 39915
Size of vocab we will use: 6662


計算每個字出現的次數

In [39]:
vocab_question = {}

word_num = 0
for word, count in vocab.items():
    if count >= threshold:
        vocab_question[word] = word_num
        word_num += 1
        
vocab_answer = {}

word_num = 0
for word, count in vocab.items():
    if count >= threshold:
        vocab_answer[word] = word_num
        word_num += 1

codes = ['<PAD>','<END>','<UNK>','<BEG>']

for code in codes:
    vocab_question[code] = len(vocab_question)+1
    
for code in codes:
    vocab_answer[code] = len(vocab_answer)+1

vocab_question_int = {v_i: v for v, v_i in vocab_question.items()}
vocab_answer_int = {v_i: v for v, v_i in vocab_answer.items()}

只把出現次數超過threshold的字加入vocabulary dictionary，避免train model時出現ResourceExausted。除此之外，在dictionary中加入PAD等後續會用到的符號

In [40]:
for i in range(len(normal_answers)):
    normal_answers[i] = '<BEG> ' + normal_answers[i] + ' <END>'

因為answer在後面會當成decoder input 和 decoder output，所以在頭尾分別加入BEG和END

In [41]:
questions_int = []
for question in normal_questions:
    ints = []
    for word in question.split():
        if word not in vocab_question:
            ints.append(vocab_question['<UNK>'])
        else:
            ints.append(vocab_question[word])
    questions_int.append(ints)
    
answers_int = []
for answer in normal_answers:
    ints = []
    for word in answer.split():
        if word not in vocab_answer:
            ints.append(vocab_answer['<UNK>'])
        else:
            ints.append(vocab_answer[word])
    answers_int.append(ints)

如果句子中出現dic中沒有的字，就用UNK代替

In [42]:
questions_int_clean = []
answers_int_clean = []

for i in range(len(questions_int)):
    if not(vocab_question['<UNK>'] in questions_int[i] or vocab_answer['<UNK>'] in answers_int[i]): # remove '<UNK>' sentence
        questions_int_clean.append(questions_int[i])
        answers_int_clean.append(answers_int[i])

questions_int = questions_int_clean
answers_int = answers_int_clean

只使用沒有UNK的句子

In [44]:
max_line_length += 2

因為answer的頭尾多加了BEG和END，所以最大長度要+2

# Prepare Batch

In [45]:
class BatchGenerator:
    def __init__(self, questions, answers, batch_size):
        assert len(questions) == len(answers)
        
        batch_num = len(questions)//batch_size
        n = batch_num*batch_size
        
        self.xs = [np.zeros(n, dtype=np.int32) for _ in range(max_line_length)] # encoder inputs
        self.ys = [np.zeros(n, dtype=np.int32) for _ in range(max_line_length)] # decoder inputs
        self.gs = [np.zeros(n, dtype=np.int32) for _ in range(max_line_length)] # decoder outputs
        self.ws = [np.zeros(n, dtype=np.float32) for _ in range(max_line_length)] # decoder weight for loss caculation
        
        self.batch_size = batch_size
        
        for b in range(batch_num):
            for i in range(b*batch_size, (b+1)*batch_size):
                for j in range(len(questions[i])):
                    self.xs[j][i] = questions[i][j]
                for j in range(len(questions[i]), max_line_length):
                    self.xs[j][i] = vocab_question['<PAD>']

                for j in range(len(answers[i]) - 1):
                    self.gs[j][i] = answers_int[i][j + 1]
                    self.ys[j][i] = answers_int[i][j]
                    self.ws[j][i] = 1.0
                for j in range(len(answers[i]) - 1, max_line_length):
                    self.gs[j][i] = vocab_answer['<PAD>']
                    self.ys[j][i] = vocab_answer['<PAD>']
                    self.ws[j][i] = 0.0
    
    def get(self, batch_id):
        x = [self.xs[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(max_line_length)]
        y = [self.ys[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(max_line_length)]
        g = [self.gs[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(max_line_length)]
        w = [self.ws[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(max_line_length)]
        
        return x, y, g, w

    
batch = BatchGenerator(questions_int, answers_int, 4)
x, y, g, w = batch.get(7)
print("Encoder input")
print("Decoder input")
print("Decoder output")
print()
for i in range(4):
    print(' '.join([vocab_question_int[x[j][i]] for j in range(max_line_length)]))
    print(' '.join([vocab_answer_int[y[j][i]] for j in range(max_line_length)]))
    print(' '.join([vocab_answer_int[g[j][i]] for j in range(max_line_length)]))
    print('')

Encoder input
Decoder input
Decoder output

not all experiences are good bianca you cannot always trust the people you want to <PAD> <PAD> <PAD>
<BEG> i guess i will never know will i <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
i guess i will never know will i <END> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

you looked beautiful last night you know <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<BEG> so did you <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
so did you <END> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

let go <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<BEG> you set me up <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
you set me up <END> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

you set me up <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <P

notebook的BatchGenerator

# Build Seq2Seq Graph

In [53]:
class MachineTranslationSeq2Seq:
    def __init__(self, en_max_len, ch_max_len, en_size, ch_size):
        self.en_max_len = en_max_len
        self.ch_max_len = ch_max_len
        
        with tf.variable_scope('seq2seq_intput/output'):
            self.enc_inputs = [tf.placeholder(tf.int32, [None]) for i in range(en_max_len)] # time mojor feed
            self.dec_inputs = [tf.placeholder(tf.int32, [None]) for i in range(ch_max_len)]
            self.groundtruths = [tf.placeholder(tf.int32, [None]) for i in range(ch_max_len)]
            self.weights = [tf.placeholder(tf.float32, [None]) for i in range(ch_max_len)]
            
        with tf.variable_scope('seq2seq_rnn'): # training by teacher forcing
            self.out_cell = tf.contrib.rnn.LSTMCell(512)
            self.outputs, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(self.enc_inputs, self.dec_inputs, 
                                                                                    self.out_cell, 
                                                                                    en_size, ch_size, 300)
        with tf.variable_scope('seq2seq_rnn', reuse=True): # predict by feeding previous
            self.pred_cell = tf.contrib.rnn.LSTMCell(512, reuse=True) # reuse cell for train and test
            self.predictions, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(self.enc_inputs, self.dec_inputs, 
                                                                                        self.pred_cell, 
                                                                                        en_size, ch_size, 300, 
                                                                                        feed_previous=True)
        
        with tf.variable_scope('loss'):
            # caculate weighted loss
            self.loss = tf.reduce_mean(tf.contrib.legacy_seq2seq.sequence_loss_by_example(self.outputs, 
                                                                                          self.groundtruths, 
                                                                                          self.weights))
            self.optimizer = tf.train.AdamOptimizer(0.002).minimize(self.loss)
        
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
    
    def train(self, x, y, g, w):
        fd = {}
        for i in range(self.en_max_len):
            fd[self.enc_inputs[i]] = x[i] # show how to feed a list
        
        for i in range(self.ch_max_len):
            fd[self.dec_inputs[i]] = y[i]
            fd[self.groundtruths[i]] = g[i]
            fd[self.weights[i]] = w[i]
        
        loss, _ = self.sess.run([self.loss, self.optimizer], fd)
        
        return loss

    def output(self, x, y):
        fd = {}
        for i in range(self.en_max_len):
            fd[self.enc_inputs[i]] = x[i]
        
        for i in range(self.ch_max_len):
            fd[self.dec_inputs[i]] = y[i]
        
        out = self.sess.run(self.outputs, fd)
        
        return out
    
    def predict(self, x, ch_beg):
        fd = {}
        for i in range(self.en_max_len):
            fd[self.enc_inputs[i]] = x[i]
        
        for i in range(self.ch_max_len): # when feed previous, the fist token should be '<BEG>', and others are useless
            if i==0:
                fd[self.dec_inputs[i]] = np.ones(y[i].shape, dtype=np.int32)*ch_beg
            else:
                fd[self.dec_inputs[i]] = np.zeros(y[i].shape, dtype=np.int32)
        
        pd = self.sess.run(self.predictions, fd)
        
        return pd
    
    def save(self, e):
        self.saver.save(self.sess, 'model/seq2seq/seq2seq_%d.ckpt'%(e+1))
    
    def restore(self, e):
        self.saver.restore(self.sess, 'model/seq2seq/seq2seq_%d.ckpt'%(e))

notebook的model

In [54]:
EPOCHS = 100
BATCH_SIZE = 256
batch_num = len(questions_int)//BATCH_SIZE

batch = BatchGenerator(questions_int, answers_int, BATCH_SIZE)

In [92]:
tf.reset_default_graph()
model = MachineTranslationSeq2Seq(max_line_length, max_line_length, len(vocab_question), len(vocab_answer))

# Train and Inference

In [127]:
rec_loss = []
for e in range(EPOCHS):
    train_loss = 0
    
    for b in range(batch_num):
        x, y, g, w = batch.get(b)
        batch_loss = model.train(x, y, g, w)
        train_loss += batch_loss
    
    train_loss /= batch_num
    rec_loss.append(train_loss)
    print("epoch %d loss: %f" % (e, train_loss))
    if (e+1)%10 == 0:
        model.save(e)
    
np.save('./model/seq2seq/rec_loss.npy', rec_loss)

epoch 0 loss: 4.889085
epoch 1 loss: 4.218606
epoch 2 loss: 3.951801
epoch 3 loss: 3.694142
epoch 4 loss: 3.420163
epoch 5 loss: 3.158150
epoch 6 loss: 2.917308
epoch 7 loss: 2.696449
epoch 8 loss: 2.488859
epoch 9 loss: 2.314511
epoch 10 loss: 2.161244
epoch 11 loss: 2.022103
epoch 12 loss: 1.906468
epoch 13 loss: 1.793270
epoch 14 loss: 1.684878
epoch 15 loss: 1.593122
epoch 16 loss: 1.512825
epoch 17 loss: 1.449281
epoch 18 loss: 1.383913
epoch 19 loss: 1.337903
epoch 20 loss: 1.294274
epoch 21 loss: 1.266396
epoch 22 loss: 1.216491
epoch 23 loss: 1.177595
epoch 24 loss: 1.139740
epoch 25 loss: 1.104571
epoch 26 loss: 1.083835
epoch 27 loss: 1.061483
epoch 28 loss: 1.036232
epoch 29 loss: 1.006418
epoch 30 loss: 0.990028
epoch 31 loss: 0.966246
epoch 32 loss: 0.951821
epoch 33 loss: 0.930293
epoch 34 loss: 0.899788
epoch 35 loss: 0.877102
epoch 36 loss: 0.872086
epoch 37 loss: 0.873781
epoch 38 loss: 0.865976
epoch 39 loss: 0.839594
epoch 40 loss: 0.804912
epoch 41 loss: 0.793262
ep

In [78]:
import nltk

def cherry_pick(records, n, upper_bound=1.0):
    bleus = []
    
    for en, ch_gr, ch_pd in records:
        # caculate BLEU by nltk
        bleu = nltk.translate.bleu_score.sentence_bleu([ch_gr], ch_pd)
        bleus.append(bleu)
    
    lst = [i for i in range(len(records)) if bleus[i]<=upper_bound]
    lst = sorted(lst, key=lambda i: bleus[i], reverse=True) # sort by BLEU score
    
    return [records[lst[i]] for i in range(n)]

In [79]:
import random as rd

records = []

for i in range(10):
    i = rd.randint(0, batch_num-1) # random pick one to translate
    
    x, y, g, w = batch.get(i)
    out = model.output(x, y)
    pd = model.predict(x, vocab_answer['<BEG>'])

    for j in range(10):
        j = rd.randint(0, BATCH_SIZE-1)
        
        en = [vocab_question_int[x[i][j]] for i in range(max_line_length)]
        en = en[:en.index('<PAD>')]
        ch_gr = [vocab_answer_int[g[i][j]] for i in range(max_line_length)]
        if '<END>' in ch_gr:
            ch_gr = ch_gr[:ch_gr.index('<END>')]
        ch_pd = [vocab_answer_int[np.argmax(pd[i][j, :])] for i in range(max_line_length)]
        if '<END>' in ch_pd:
            ch_pd = ch_pd[:ch_pd.index('<END>')]
        
        records.append([en, ch_gr, ch_pd])

n = 12 # how many result we show
rec_cherry = cherry_pick(records, n)

print("Encoder input")
print("Ground truth")
print("Decoder output")
print()

for i in range(n):
    for j in range(3):
        print(' '.join(rec_cherry[i][j]))
    
    print('')

Encoder input
Ground truth
Decoder output

goddamn it what kind of bullshit is that
listen jake i gotta go
listen jake i gotta go

details baby details
so they do know
so they do know

an excellent quality
maybe he wants something
maybe he wants something

it depends on what way you want to go
well wait you know what i know
well wait you know what i know

we have got some weather
i noticed starck anybody home
i noticed starck anybody home

what did you expect lieutenant
he is very human
he is very human

oh not at all i let them have twentyfive sure are not there four of them
how fascinating do go on john
how fascinating do go on john

you think she is a saint
she is been touched by god yes
she is been touched by god yes

josie you look rufus
who are you guys
who are you guys

we have still got to find the mainframe
no we do not
no we do not

will you bring it in to me
i would rather not
i would rather not

can we go home soon rachel
real soon jamie now shh
soon jamie now shh



In [90]:
#Hello.
#How are you?
#Where are you going?
#You look great.
#Good night.

model.restore(90)
q_test = ['hello', 'how are you', 'where are you going', 'you look great', 'good night']

inf_x = [np.zeros(len(q_test), dtype=np.int32) for _ in range(max_line_length)]

for i in range(len(q_test)):
    q_list = q_test[i].split(' ')
    last_num = 0
    for j in range(len(q_list)):
        inf_x[j][i] = vocab_question[q_list[j]]
        last_num = j
    for j in range(last_num + 1, max_line_length):
        inf_x[j][i] = vocab_question['<PAD>']

pd = model.predict(x, vocab_answer['<BEG>'])

INFO:tensorflow:Restoring parameters from model/seq2seq/seq2seq_90.ckpt


In [91]:
'''
for j in range(10):        
        en = [en_rev[x[i][j]] for i in range(en_max_len)]
        en = en[:en.index('<PAD>')]
        ch_gr = [ch_rev[g[i][j]] for i in range(ch_max_len)]
        if '<END>' in ch_gr:
            ch_gr = ch_gr[:ch_gr.index('<END>')]
        ch_pd = [ch_rev[np.argmax(pd[i][j, :])] for i in range(ch_max_len)]
        if '<END>' in ch_pd:
            ch_pd = ch_pd[:ch_pd.index('<END>')]
        
        records.append([en, ch_gr, ch_pd])
'''        

for j in range(len(q_test)):
    qes = [vocab_question_int[inf_x[i][j]] for i in range(max_line_length)]
    ans = [vocab_answer_int[np.argmax(pd[i][j, :])] for i in range(max_line_length)]
    qes_str = ''
    ans_str = ''
    for q, a in zip(qes, ans):
        if not q == '<PAD>':
            qes_str += q + ' '
        if not a == '<END>':
            ans_str += a + ' '
    print(qes_str)
    print(ans_str)
    print()

hello 
you will spend the night here 

how are you 
all right peter good night 

where are you going 
but it is my favorite state 

you look great 
god it is ugly it is ugly it is ugly it is ugly 

good night 
where is he 



model在training data的表現是好的，但是五句簡單的句子的回應卻不如日常對話，個人猜測有2個原因。首先，因為餵進model的training data是電影中的對話，很多對話並不像一般所認知的制式回答，而是跟特定場景或情節息息相關的，以這個角度來看的話，只要把5句話的回應套在特定場景的上下文中，也是能解釋得通的，例如：you look great這句的回應雖然並不是一般所認知的禮貌回答(如：thanks之類)，但是如果套進一個場景裡，這個場景是B剛剪完頭髮，B自己覺得很醜，而A看見B之後說you look great，而B說god it is ugly it is ugly it is ugly it is ugly就顯得合理了。其他的句子也能仿照這種方式解釋。第二，可能是餵進model的training data在處理時的問題，因為我們只單純的把前後兩句當成question和answer，並沒有考慮其他較複雜的狀況，例如：前面一句是整個話題的最後一句，而後面一句是另一個話題的第一句，沒有考慮到這些狀況會導致model可能把前後根本不相關的兩個句子當成是正確的question和answer。