<a href="https://colab.research.google.com/github/mhuckvale/pals0039/blob/master/Answers_9_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![PALS0039 Logo](https://www.phon.ucl.ac.uk/courses/pals0039/images/pals0039logo.png)](https://www.phon.ucl.ac.uk/courses/pals0039/)

# Exercise 9.1 Answers

Exercise developed from [https://github.com/sekharvth/simple-chatbot-keras](https://github.com/sekharvth/simple-chatbot-keras)


(a) Setup

In [0]:
import numpy as np
import pandas as pd
%tensorflow_version 2.x
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed
from tensorflow.keras.models import Model, load_model


(b) import movie dialogues

In [0]:
# Read the movie-dialogue table straight from the course server.
# keep_default_na=False stops pandas from converting dialogue text such as
# "NA" or "null" into missing values.
movie_lines_url="https://www.phon.ucl.ac.uk/courses/pals0039/data/movie_lines.csv"
df=pd.read_csv(movie_lines_url,keep_default_na=False)
print(df.describe())
df.head()

(c) Tokenize the dialogues

In [0]:
max_words=5000

# Each row of the dialogue table pairs a CONTEXT line with its TARGET reply.
# Replies are wrapped in BOS/EOS marker words so the decoder can learn where
# a reply starts and stops.
contexts=df.CONTEXT.tolist()
targets=[ "BOS "+reply+" EOS" for reply in df.TARGET.tolist()]
print("Contexts:",contexts[:5])
print("Targets:",targets[:5])

# One vocabulary shared by contexts and replies; words outside the
# max_words limit are encoded as the "UNK" token.
tokenizer = Tokenizer(num_words=max_words,oov_token="UNK")
tokenizer.fit_on_texts(contexts+targets)
word_index=tokenizer.word_index
print("Found",len(word_index),"different words.")

# show the first and last few vocabulary entries
vocab_items=list(word_index.items())
print(vocab_items[:10])
print(vocab_items[-10:])

# convert every context and reply into a list of word ids
ctxt=tokenizer.texts_to_sequences(contexts)
targ=tokenizer.texts_to_sequences(targets)
print("Context",ctxt[:5])
print("Target",targ[:5])

# build a reverse index (word id -> word); id 0 is not in word_index
# (Keras reserves it for padding), so it is displayed as '.'
index_to_word=dict(zip(word_index.values(),word_index.keys()))
index_to_word[0]='.'

(d) filter out all dialogues containing fewer than 2 words or more than 12 words, or containing unknown words

In [0]:
min_seq=2
max_seq=12
unk_id=1   # 1 is code for UNK

# Keep a dialogue only when both the context and the reply (not counting its
# BOS/EOS markers) are between min_seq and max_seq words long and neither
# contains the unknown-word code.
print("Unfiltered count",len(ctxt),len(targ))
ctxt_filt=[]
targ_filt=[]
for ctxt_ids,targ_ids in zip(ctxt,targ):
  ctxt_len=len(ctxt_ids)
  targ_len=len(targ_ids)-2   # -2 for BOS/EOS
  if not (min_seq<=ctxt_len<=max_seq and min_seq<=targ_len<=max_seq):
    continue
  if unk_id in ctxt_ids or unk_id in targ_ids:
    continue
  ctxt_filt.append(ctxt_ids)
  targ_filt.append(targ_ids)
print("Filtered count",len(ctxt_filt),len(targ_filt))

#ucount=0;
#for i in range(len(ctxt_filt)):
#  if ((1 in ctxt_filt[i])or(1 in targ_filt[i])):
#    ucount+=1
#print("Dialogues with UNK:",ucount)


(e) prepare data for training

In [0]:
seq_len=max_seq

# Zero-pad at the end: contexts to seq_len ids, replies to seq_len+2 ids
# (the two extra slots hold the BOS/EOS markers).
ctxt_pad=pad_sequences(ctxt_filt, maxlen=seq_len, padding='post')
targ_pad=pad_sequences(targ_filt, maxlen=seq_len+2, padding='post')

# The expected decoder output is the reply shifted one step to the left
# (so each position predicts the next word), with 0 in the final column.
outs_pad=np.zeros_like(targ_pad)
outs_pad[:,:-1]=targ_pad[:,1:]

print("Context",ctxt_pad[:5])
print("Target",targ_pad[:5])
print("Outputs",outs_pad[:5])


(f) load GloVe embeddings

In [0]:
# Load the pre-trained GloVe table (no header row): column 0 is the word and
# the remaining columns are its embedding values.
# keep_default_na=False (as used for the movie lines) keeps vocabulary entries
# such as "nan", "null" or "n/a" as ordinary strings; with the pandas defaults
# they would be parsed as missing values and could not be looked up by word.
# NOTE: this rebinds `df`, replacing the movie-dialogue table loaded earlier.
df=pd.read_csv('https://www.phon.ucl.ac.uk/courses/pals0039/data/glove.6B.100d.zip',header=None,keep_default_na=False).rename(columns={0:"word"})
print("Read %d word embeddings of length %d" % (len(df),len(df.columns)-1))
df.head()

In [0]:
# build an index into the embeddings (word -> row number in the GloVe table)
glove_index={ word:row for row,word in enumerate(df.word) }

# build an embedding matrix for words in movie dialogues:
# row i of word_embed holds the GloVe vector of the word with tokenizer id i
embed_dim=100
word_embed=np.zeros((max_words,embed_dim))
fallback_row=glove_index["."]   # words missing from GloVe borrow the "." vector
oov_count=0
token_ids=[]    # tokenizer ids that own a word
glove_rows=[]   # matching row numbers in the GloVe table
for i in range(max_words):
  w=index_to_word.get(i)
  if w is None:
    # the corpus has fewer than max_words distinct tokens, so no word owns
    # this id; its row stays all-zero instead of raising KeyError
    continue
  idx=glove_index.get(w)
  if idx is None:
    idx=fallback_row
    oov_count+=1
  token_ids.append(i)
  glove_rows.append(idx)

# copy all the selected vectors in one gather rather than one DataFrame row
# lookup per word
word_embed[token_ids,:]=df.iloc[glove_rows,1:].to_numpy(dtype=float)

print("OOV rate = %.1f%%" % (100*oov_count/max_words))


(g) Build model


In [0]:

# Sequence-to-sequence training model: an encoder LSTM summarises the context
# into its final states, and a decoder LSTM started from those states predicts
# a word distribution at every position of the reply.
latent_dim=300                  # size of the LSTM hidden and cell states
num_encoder_tokens=max_words    # not referenced again in this cell
num_decoder_tokens=max_words    # width of the softmax output layer

# model inputs: integer word ids; the target is 2 longer than the context
# to leave room for the BOS/EOS markers
input_context = Input(shape = (seq_len, ), dtype = 'int32', name = 'input_context')
input_target = Input(shape = (seq_len+2, ), dtype = 'int32', name = 'input_target')

# initial embedding: frozen (trainable = False) and loaded with the
# pre-built word_embed matrix rather than learned
embed_layer = Embedding(input_dim = max_words, output_dim = embed_dim, trainable = False )
embed_layer.build((None,))
embed_layer.set_weights([word_embed])

# same embedding layer used for both inputs
input_ctx_embed = embed_layer(input_context)
input_tar_embed = embed_layer(input_target)

# encoder LSTM takes input embedding and just returns final state
LSTM_encoder = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = LSTM_encoder(input_ctx_embed)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
LSTM_decoder = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = LSTM_decoder(input_tar_embed,initial_state=encoder_states)
# softmax over the vocabulary, applied to the decoder output at every position
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `input_context` & `input_target` into per-position word probabilities
train_model = Model([input_context, input_target], decoder_outputs)

# sparse_categorical_crossentropy lets the expected outputs be given as
# integer word ids rather than one-hot vectors
train_model.compile(optimizer = 'rmsprop', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
train_model.summary()



(h) train model, then build and save the inference encoder and decoder models

In [0]:
# Train on (context, BOS-prefixed reply) pairs; the expected output is the
# reply shifted one step ahead (outs_pad).
train_model.fit([ctxt_pad, targ_pad], outs_pad, epochs = 40, batch_size = 128)

# Inference encoder: maps a context sequence to the encoder LSTM's final
# [h, c] states.
encoder_model = Model(input_context, encoder_states)

# Inference decoder: re-applies the trained decoder LSTM and dense layer, but
# takes its initial state from two explicit inputs and returns the resulting
# LSTM states alongside the word probabilities.
# It still consumes the full-length input_target sequence (seq_len+2 ids).
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# NOTE: the next lines rebind decoder_outputs, state_h and state_c, which the
# model-building cell also assigned.
decoder_outputs, state_h, state_c = LSTM_decoder(input_tar_embed, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([input_target] + decoder_states_inputs,[decoder_outputs] + decoder_states)

# Save both inference models to .h5 files in the current working directory.
encoder_model.save('ex9_1_encoder.h5')
decoder_model.save('ex9_1_decoder.h5')

(i) load the saved encoder and decoder models and define the decoding function

In [0]:
# Reload the saved inference models. They were saved without being compiled
# and are only used for predict(), so compile=False skips the attempt to
# restore a training configuration (and the warning that none was found).
encoder_model=load_model('ex9_1_encoder.h5',compile=False)
decoder_model=load_model('ex9_1_decoder.h5',compile=False)


def decode_sequence(input_seq):
  """Greedily generate a reply for one encoded question.

  input_seq: array of word ids with shape (1, seq_len), e.g. the output of
    pad_sequences on a single tokenized question.
  Returns the generated words joined by single spaces, without the BOS/EOS
  markers. At most seq_len+1 words are produced.

  The decoder model takes a fixed-length input of seq_len+2 ids and the LSTM
  states it returns are those reached after the *whole* input, padding
  included. Feeding those states back in would therefore continue from a
  state polluted by the padding steps. Instead, every step restarts from the
  encoder states, feeds the words generated so far, and reads the prediction
  at the position of the most recent word.
  """
  # Encode the input as state vectors [h, c].
  states_value = list(encoder_model.predict(input_seq, verbose=0))

  bos_id = tokenizer.word_index['bos']
  eos_id = tokenizer.word_index['eos']

  # Decoder input buffer of the full fixed length, starting with the BOS
  # marker; slots not yet generated stay 0 (the padding id).
  target_seq = np.zeros((1, seq_len+2))
  target_seq[0, 0] = bos_id

  decoded_sentence = []
  # The output at position `pos` is the prediction for the word that follows
  # target_seq[0, pos]; later (still empty) positions cannot influence it
  # because the LSTM only looks backwards.
  for pos in range(seq_len+1):
    output_tokens, _, _ = decoder_model.predict([target_seq] + states_value, verbose=0)

    # Pick the most probable next word.
    sampled_token_index = int(np.argmax(output_tokens[0, pos, :]))

    # Exit condition: the model produced the stop marker.
    if sampled_token_index == eos_id:
      break

    # save word
    decoded_sentence.append(index_to_word[sampled_token_index])

    # Extend the prefix fed to the decoder on the next step.
    target_seq[0, pos+1] = sampled_token_index

  return " ".join(decoded_sentence)

# get a question
#question=input("Ask something: ")
#ques_list=tokenizer.texts_to_sequences([question])
#ques_pad=pad_sequences(ques_list, maxlen=seq_len, padding='post')
#print(ques_pad)

# print the answer generated for the given question
#print("Q:",question)
#print("A:",decode_sequence(ques_pad))


(j) chat with chatbot

In [0]:
# Interactive chat: read questions until the user types 'stop', encoding each
# one the same way as the training contexts before asking the model to reply.
print("Type 'stop' to stop.")
while True:
  question=input("Q: ")
  if question == "stop":
    break
  ques_list=tokenizer.texts_to_sequences([question])
  ques_pad=pad_sequences(ques_list, maxlen=seq_len, padding='post')
  print("A:",decode_sequence(ques_pad))
