In [1]:
import numpy as np
import pandas as pd
#from keras.utils import to_categorical
#from keras.preprocessing.sequence import pad_sequences
#from keras.models import Sequential
#from keras.layers import LSTM, Dense, GRU, Embedding
#from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# Open the raw query-log text file for reading. NOTE: despite its name, `df` is a
# plain file handle (not a DataFrame); its contents are read in the next cell.
df = open("user-ct-test-collection-02.txt", "r")

code adapted from https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/

In [3]:
# Load the whole file into memory as one string, then release the file handle.
# Closing it here avoids leaking the descriptor for the lifetime of the kernel and
# makes an accidental re-run of this cell fail loudly instead of silently
# returning '' (a second read() on the exhausted handle yields an empty string).
try:
    data_source = df.read()
finally:
    df.close()

Data Preprocessing - lowercasing, removal of punctuation (":" and "-"), numbers, URLs and words shorter than three characters, creating a single cleaned string of query logs

In [4]:
import re

def text_cleaner(text):
    """Normalise raw text into a single space-separated string of kept words.

    Steps, applied in this order:
      1. lower-case everything;
      2. strip possessive "'s" endings;
      3. delete ':' and '-' characters;
      4. delete runs of ASCII digits;
      5. split on whitespace and drop every token that contains 'http';
      6. drop tokens shorter than three characters.

    Args:
        text: the raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = text.lower()
    # Order matters: possessives first, then punctuation, then digits.
    for pattern in (r"'s\b", ":", "-", r'[0-9]+'):
        cleaned = re.sub(pattern, "", cleaned)
    # URLs are recognised only by the substring 'http' (the ':' is already gone here).
    kept = [
        token
        for token in cleaned.split()
        if 'http' not in token and len(token) >= 3
    ]
    return " ".join(kept).strip()
    

# Clean the entire raw text in one pass; the result is a single string of the kept words
data_new = text_cleaner(data_source)

Data reduction to 10% of its original size, to be more manageable

In [5]:
# Length of the cleaned text in characters (data_new is one long string)
l = len(data_new)
print(l)

63267097


In [7]:
import random
# Target size of the reduced dataset: 10% of the cleaned text's character count,
# truncated to an integer
n = int(0.1*l)
print(n)

6326709


In [8]:
# Keep a contiguous window of n characters from the cleaned text.
# `random.sample` on a string draws individual characters in random order, which
# destroys the character ordering the language model is supposed to learn, so a
# slice starting at a random offset is used instead. A dedicated, seeded
# generator keeps the chosen window reproducible across kernel restarts without
# touching the global `random` state.
start = random.Random(42).randrange(len(data_new) - n + 1)
data_short = data_new[start:start + n]
print(len(data_short))

6326709


Creating sequences

In [9]:
def create_seq(text, length=30):
    """Slice `text` into overlapping windows of `length + 1` consecutive items.

    Each window holds `length` input items followed by the one item that comes
    next, so the last element of every window can serve as the prediction target.
    Consecutive windows are shifted by one position.

    Args:
        text: any sliceable sequence (a string or a list of characters).
        length: number of context items per window; defaults to 30, the value
            that used to be hard-coded.

    Returns:
        A list of `len(text) - length` windows (empty when `text` has at most
        `length` items). Also prints the number of windows produced.
    """
    sequences = [text[i - length:i + 1] for i in range(length, len(text))]
    print('Total Sequences: %d' % len(sequences))
    return sequences

# Build the overlapping training windows from the reduced text
sequences = create_seq(data_short)

Total Sequences: 6326679


Encoding Sequences

In [10]:
# Character vocabulary: every distinct character of the cleaned text, in sorted
# order, each assigned its position in that order as an integer id.
chars = sorted(set(data_new))
mapping = {char: index for index, char in enumerate(chars)}

def encode_seq(seq):
    """Integer-encode every sequence in `seq` using the module-level `mapping`.

    Args:
        seq: an iterable of character sequences (strings or lists of characters).

    Returns:
        A list with one list of integer ids per input sequence, in the same order.

    Raises:
        KeyError: if a character is not present in `mapping`.
    """
    return [[mapping[char] for char in line] for line in seq]

# Replace the character sequences with their integer encodings.
# NOTE: this rebinds `sequences`, so the cell cannot be re-run on its own; the
# "Creating sequences" cell has to be re-run first to restore the character windows.
sequences = encode_seq(sequences)

Creating Train and Test sets

In [11]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Number of distinct characters = number of output classes
vocab = len(mapping)

# Stack the encoded windows into a 2-D integer array (one row per window)
sequences = np.array(sequences)

# Inputs are all columns but the last; the last column is the character to predict,
# turned into a one-hot vector over the vocabulary
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab)

# Hold out 10% of the windows for validation (fixed seed for a repeatable split)
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

print('Train shape:', X_tr.shape, 'Val shape:', X_val.shape)

Train shape: (5694011, 30) Val shape: (632668, 30)


Build the model

In [12]:
# Character-level language model: embedding -> single GRU layer -> softmax over the vocabulary.
model = Sequential()
# The input length is taken from the training data so it always matches the window size.
model.add(Embedding(vocab, 50, input_length=X_tr.shape[1], trainable=True))
# NOTE(review): per the Keras GRU docs a non-zero recurrent_dropout rules out the
# fast cuDNN kernel; consider setting it to 0 when training on a GPU.
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

# compile the model
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

# Stop once the validation loss has not improved for a few epochs and keep the best
# weights, instead of always running all 100 epochs over the full training set.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# fit the model
model.fit(
    X_tr,
    y_tr,
    epochs=100,
    verbose=2,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 50)            3500      
                                                                 
 gru (GRU)                   (None, 150)               90900     
                                                                 
 dense (Dense)               (None, 70)                10570     
                                                                 
Total params: 104,970
Trainable params: 104,970
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100


Results

In [None]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend `seed_text` by `n_chars` characters predicted greedily by `model`.

    At every step the text generated so far is integer-encoded, padded or
    truncated at the front to `seq_length`, and fed to the model; the most
    probable next character is appended.

    Args:
        model: a trained model whose `predict` returns one probability
            distribution over the vocabulary per input row.
        mapping: dict from character to integer id (the one used for training).
        seq_length: number of time steps the model expects.
        seed_text: initial text; every character must be a key of `mapping`.
        n_chars: how many characters to generate.

    Returns:
        `seed_text` followed by the generated characters.

    Raises:
        KeyError: if `seed_text` contains a character missing from `mapping`.
    """
    # Invert the mapping once instead of scanning it for every generated character.
    index_to_char = {index: char for char, index in mapping.items()}
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # `Sequential.predict_classes` no longer exists in current Keras releases;
        # taking the argmax of the predicted distribution gives the same class id.
        probabilities = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(probabilities, axis=-1)[0])
        # Append the predicted character (the original appended the loop variable
        # `char`, which is the wrong character whenever the lookup finds no match).
        in_text += index_to_char.get(yhat, '')
    return in_text