In [20]:
# deep learning library
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Input, Flatten, GRU, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
from numpy.random import seed
seed(4222)

# general libraries
import pandas as pd
import numpy as np
import string, os, io
import random

import warnings
warnings.filterwarnings("ignore")

In [2]:
aolQueries = []

for i in range(2, 10):
  if i < 10:
    fileName = "user-ct-test-collection-0" + str(i) + ".txt"
  else:
    fileName = "user-ct-test-collection-" + str(i) + ".txt"

  lines = []
  with open(fileName) as f:
    lines = f.readlines()

  count = 0
  for line in lines:
    if count > 0:
      query = line.split("\t")[1]
      aolQueries.append(query)

    count += 1

In [3]:
corpus = aolQueries[:20000]

In [4]:
# Encoding - convert from text to sequences (numbers)
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    '''
    Convert each sentence to a list of ngram sequences
    '''
    # Tokenization
    tokenizer.fit_on_texts(corpus) # Fit on our text sentences
    total_words = len(tokenizer.word_index) + 1 # Total number of unique words in our vocabulary
    
    # Convert data to sequence of tokens 
    input_sequences = [] # House our final sequences
    for line in corpus: # For every sentence
        token_list = tokenizer.texts_to_sequences([line])[0] # Convert a line of text to a line of sequence
        for i in range(1, len(token_list)): # Generate ngrams
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10] # The first 10 sequences

[[1201, 3421],
 [4801, 4802],
 [4801, 4802, 4803],
 [1201, 3421],
 [1201, 3421, 234],
 [1201, 3421, 234, 4804],
 [257, 3422],
 [257, 3422, 1569],
 [257, 3422, 1569, 3],
 [257, 3422, 1569, 3, 350]]

In [11]:
from keras.utils import np_utils

def generate_padded_sequences(input_sequences):
    '''
    Pad every sentence to the longest sentence in the corpus
    '''
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = np_utils.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [12]:
print("The maximum sentence length is:", max_sequence_len)
print(predictors[0]) # Padded sequence

The maximum sentence length is: 43
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 1201]


In [29]:
def create_model(max_sequence_len, total_words):

    # Initialise model
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
    
    model.add(Bidirectional(GRU(100)))
    
    # Output Layer - softmax activation
    model.add(Dense(total_words, activation='softmax'))

    # Compile model - crossentropy loss
    model.compile(loss='categorical_crossentropy', optimizer="adam")
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 42, 100)           826900    
                                                                 
 bidirectional_8 (Bidirectio  (None, 200)              121200    
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 8269)              1662069   
                                                                 
Total params: 2,610,169
Trainable params: 2,610,169
Non-trainable params: 0
_________________________________________________________________


In [30]:
history = model.fit(predictors, label, epochs=11, verbose=1)

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


In [31]:
loss_history = history.history["loss"][-1]
print("The crossentropy loss is:", loss_history)

The crossentropy loss is: 1.0136362314224243


In [35]:
def generate_text(seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0] # Tokenize seed text
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre') # Pad seed text
        predicted = model.predict(token_list, verbose=0) # Predict next word given seeded text
        
        output_word = ""
        
        for word,index in tokenizer.word_index.items(): # Convert from sequence to string
            if  index == predicted[0,_]:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()

In [33]:
search_query = "getting organized at work"

In [36]:
generate_text(search_query,1,max_sequence_len)

'Getting Organized At Work '