# Download Cornell Corpus and Glove Dataset.

### set enable_twitter_glove_download to true to fetch dataset from internet. ~1.4Gb

In [1]:
import numpy as np
import os
import sys
import zipfile
import urllib.request
from collections import Counter

import nltk
nltk.download('punkt')

# Enable only if want to download entire set else use github
enable_twitter_glove_download = False

GLOVE_EMBEDDING_SIZE = 25
GLOVE_VECTOR = "/content/data_corpus/glove.twitter.27B." + str(GLOVE_EMBEDDING_SIZE) + "d.txt"
CORNELL_CORPUS = "/content/data_corpus/cornell movie-dialogs corpus/movie_conversations.txt"
MAX_SEQUENCE_LENGTH=200
MAX_NB_WORDS=4000

BATCH_SIZE = 64  ##increasing batchsize causing crash on colab
# with last minute changes can't get time to complete more epocs in so logs are for 1 epoch .ipynb
NUM_EPOCHS = 1  
HIDDEN_UNITS = 256
MAX_INPUT_SEQ_LENGTH = 40
MAX_TARGET_SEQ_LENGTH = 40
MAX_VOCAB_SIZE = 10000

WEIGHT_FILE_PATH = 'models/'+ '/word-glove-weights.h5'

if not os.path.exists('./models'):
  os.makedirs('./models')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import requests

def download_file_from_google_drive(id, destination):
    def get_confirm_token(response):
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value

        return None

    def save_response_content(response, destination):
        CHUNK_SIZE = 32768

        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)

    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)
    
    if token:
        params = { 'id' : id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)
    print(response)
    save_response_content(response, destination)    

#download_file_from_google_drive("1cv-2rNKLZPttBrliwPrtQO9dwR0qSOw8","/content/data_corpus/glove.twitter.27B.25d.txt")

In [3]:
def download_corpus(CORPUS, path, zip_file, url):
  if not os.path.exists(CORPUS):
    if not os.path.exists(path):
        os.makedirs(path)

    if not os.path.exists(zip_file):
        print('corpus file not exist, downloading from net')
        urllib.request.urlretrieve(url=url, filename=zip_file)

    zip_ref = zipfile.ZipFile(zip_file, 'r')
    zip_ref.extractall(path)
    zip_ref.close()

download_corpus(CORPUS=CORNELL_CORPUS,  path='/content/data_corpus',
      zip_file='/content/data_corpus/cornell_movie_dialogs_corpus.zip',
      url='http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip')

if not os.path.exists(CORNELL_CORPUS):
  print("Download cornell corpus failed...!!")
else:
  print("cornell corpus downloaded.")

if enable_twitter_glove_download:
  download_corpus(CORPUS=GLOVE_VECTOR,  path='/content/data_corpus',
        zip_file='/content/data_corpus/glove.twitter.27B.zip',
        url='http://nlp.stanford.edu/data/glove.twitter.27B.zip')
else:
  download_file_from_google_drive("1cv-2rNKLZPttBrliwPrtQO9dwR0qSOw8","/content/data_corpus/glove.twitter.27B.25d.txt")
  #print("Using github for glove dataset")
  #from google.colab import drive
  #drive.mount('/content/gdrive')
  #GLOVE_VECTOR = '/content/gdrive/My Drive/Colab Notebooks/glove.twitter.27B.25d.txt'


if not os.path.exists(GLOVE_VECTOR):
  print("Download glove corpus failed...!!")
else:
  print("glove corpus downloaded.")



cornell corpus downloaded.
<Response [200]>
glove corpus downloaded.


# Process Cornell Movie Corpus Data

In [4]:
# Load the data
CORNELL_LINE_CORPUS = "/content/data_corpus/cornell movie-dialogs corpus/movie_lines.txt"
CORNELL_CONV_CORPUS =  CORNELL_CORPUS
lines = open(CORNELL_LINE_CORPUS, encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open(CORNELL_CONV_CORPUS, encoding='utf-8', errors='ignore').read().split('\n')

#LINE: ['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!']
#CONV: ["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']"]

#Build VOCAB from movie_lines
#Build  Question & Answer set from conv to train as X,Y



###### Based on many online chatbot, many used this to filter the text, its really good

In [5]:
import re
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text


In [6]:
# Create a dictionary to map each line's id with its text
texts = []
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        texts.append(clean_text(_line[4]))

with open('clean_movie_file.txt', 'w') as fp:
    for stmt in texts:
        fp.write('%s\n' % stmt)
lines = open('clean_movie_file.txt', 'r').read().split('\n')

####1. Load the glove word embedding into a dictionary. [10 Marks]

In [7]:
def load_glove():
  word2em = {}
  with open(GLOVE_VECTOR) as fp:
    for line in fp:
      words = line.strip().split()
      word = words[0]
      vecs = np.asarray(words[1:], dtype='float32')
      word2em[word] = vecs

  print('Found %s word vectors.' % len(word2em))
  return word2em

word2em = load_glove()

Found 1193514 word vectors.


####Data Preparation - Filter the conversations till max word length and convert the dialogues pairs into input text and target texts. Mark start and end token to recognize the beginning and end of the sentence token

In [8]:
target_counter = Counter()
input_texts = []
target_texts = []
prev_words = []

for line in lines:
    next_words = [w.lower() for w in nltk.word_tokenize(line)]
    if len(next_words) > MAX_TARGET_SEQ_LENGTH:
        next_words = next_words[0:MAX_TARGET_SEQ_LENGTH]

    if len(prev_words) > 0:
        input_texts.append(prev_words)
        target_words = next_words[:]
        target_words.insert(0, 'start')
        target_words.append('end')

        for w in target_words:
            target_counter[w] += 1
        target_texts.append(target_words)

    prev_words = next_words


####2. Create a target word to id dictionary called target_word2idx. [10 Marks]

In [9]:
target_word2idx = dict()
for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)):
    target_word2idx[word[0]] = idx + 1

if 'unknown' not in target_word2idx:
    target_word2idx['unknown'] = 0

np.save('models/' + '/word-glove-target-word2idx.npy', target_word2idx)


####3. Create a target to id dictionary called target_idx2word. [10 Marks]

In [10]:
target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])
np.save('models/' + '/word-glove-target-idx2word.npy', target_idx2word)

#### Number of unique decoder tokens

In [11]:
num_decoder_tokens = len(target_idx2word)+1

####4. Prepare the input data with embedding. [15 Marks]

In [12]:
input_texts_word2em = []

encoder_max_seq_length = 0
decoder_max_seq_length = 0

for input_words, target_words in zip(input_texts, target_texts):
    encoder_input_wids = []
    for w in input_words:
        emb = np.zeros(shape=GLOVE_EMBEDDING_SIZE)
        if w in word2em:
            emb = word2em[w]
        encoder_input_wids.append(emb)

    input_texts_word2em.append(encoder_input_wids)
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_words), decoder_max_seq_length)

context = dict()
context['num_decoder_tokens'] = num_decoder_tokens
context['encoder_max_seq_length'] = encoder_max_seq_length
context['decoder_max_seq_length'] = decoder_max_seq_length

print(context)
np.save('models/' + '/word-glove-context.npy', context)

{'num_decoder_tokens': 10001, 'encoder_max_seq_length': 40, 'decoder_max_seq_length': 42}


####5. Generate training data per batch. [15 Marks]

In [13]:
def generate_batch(input_word2em_data, output_text_data):
    num_batches = len(input_word2em_data) // BATCH_SIZE
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * BATCH_SIZE
            end = (batchIdx + 1) * BATCH_SIZE
            encoder_input_data_batch = pad_sequences(input_word2em_data[start:end], encoder_max_seq_length)
            decoder_target_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            decoder_input_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, GLOVE_EMBEDDING_SIZE))
            for lineIdx, target_words in enumerate(output_text_data[start:end]):
                for idx, w in enumerate(target_words):
                    w2idx = target_word2idx['unknown']  
                    if w in target_word2idx:
                        w2idx = target_word2idx[w]
                    if w in word2em:
                        decoder_input_data_batch[lineIdx, idx, :] = word2em[w]
                    if idx > 0:
                        decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch



In [14]:
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers import Dense, Input, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from collections import Counter
import nltk
import numpy as np
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


####6. Defining correct model architecture. [10 Marks]

In [15]:
encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs')
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                 initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

json = model.to_json()
open('models/'+ '/word-glove-architecture.json', 'w').write(json)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(input_texts_word2em, target_texts, test_size=0.2, random_state=42)

print(len(Xtrain))
print(len(Xtest))


243552
60888


In [16]:
train_gen = generate_batch(Xtrain, Ytrain)
test_gen = generate_batch(Xtest, Ytest)

train_num_batches = len(Xtrain) // BATCH_SIZE
test_num_batches = len(Xtest) // BATCH_SIZE

checkpoint = ModelCheckpoint(filepath=WEIGHT_FILE_PATH, save_best_only=True)
model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                    epochs=NUM_EPOCHS,
                    verbose=1, validation_data=test_gen, validation_steps=test_num_batches, callbacks=[checkpoint])

model.save_weights(WEIGHT_FILE_PATH)

Epoch 1/1


## Test Chatbot

#### To filter noise in input text for prediction

In [17]:
WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,'

def in_white_list(_word):
    for char in _word:
        if char in WHITELIST:
            return True

    return False

In [18]:
# glbal variables
model = None
encoder_model = None
decoder_model = None
target_word2idx = None
target_idx2word = None
max_decoder_seq_length = None
max_encoder_seq_length = None
num_decoder_tokens = None

In [29]:
# Initialize the chatbot for prediction

target_word2idx = np.load('./models/' + '/word-glove-target-word2idx.npy', allow_pickle=True).item()
target_idx2word = np.load('./models/' + '/word-glove-target-idx2word.npy', allow_pickle=True).item()
context = np.load('./models/' + '/word-glove-context.npy', allow_pickle=True).item()
max_encoder_seq_length = context['encoder_max_seq_length']
max_decoder_seq_length = context['decoder_max_seq_length']
num_decoder_tokens = context['num_decoder_tokens']

encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs')
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm")
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.load_weights('./models/' + '/word-glove-weights.h5')
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)




In [30]:
def testme(input_text):
    input_seq = []
    input_emb = []

    # create embedding vector for input text
    for word in nltk.word_tokenize(input_text.lower()):
        if not in_white_list(word):
            continue
        emb = np.zeros(shape=GLOVE_EMBEDDING_SIZE)
        if word in word2em:
            emb = word2em[word]
        input_emb.append(emb)
    
    input_seq.append(input_emb)
    input_seq = pad_sequences(input_seq, max_encoder_seq_length)

    # use padded input sequence with encoder
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, GLOVE_EMBEDDING_SIZE))
    target_seq[0, 0, :] = word2em['start']
    target_text = ''
    target_text_len = 0
    terminated = False
    
    ## predict the target text/ or repsonse
    while not terminated:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sample_token_idx = np.argmax(output_tokens[0, -1, :])
        sample_word = target_idx2word[sample_token_idx]
        target_text_len += 1

        if sample_word != 'start' and sample_word != 'end':
            target_text += ' ' + sample_word

        if sample_word == 'end' or target_text_len >= max_decoder_seq_length:
            terminated = True

        target_seq = np.zeros((1, 1, GLOVE_EMBEDDING_SIZE))
        if sample_word in word2em:
            target_seq[0, 0, :] = word2em[sample_word]

        states_value = [h, c]
    return target_text.strip()


In [31]:
# unit test 
print(testme('can we talk?'))

i am not going to be a little unknown


####7. Final prediction. [20 Marks]

In [32]:
#Test 10 times
print("***** LOOP 10 times *******")
for _ in range(10):
    print(testme(input('Enter question : ')))


***** LOOP 10 times *******
Enter question : Hello Computer
i am not going to be a unknown
Enter question : How are you?
i am not going to be a little unknown
Enter question : who are you?
i am not going to be a little unknown
Enter question : party?
i am not going to be a unknown
Enter question : hello
i am not going to be a unknown
Enter question : hi
i am not going to be a unknown
Enter question : unknown
i am not going to be a unknown
Enter question :  bye
i am not going to be a unknown
Enter question : by bye
i am not going to be a unknown
Enter question : bye
i am not going to be a unknown


### Synopsis
The chatbot data preprocessing an modelling is learned. However, model is performing poorly on cornell dataset. Overall its good learning.
Needs optimization in dataset.