## Preprocess

In [1]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open for the
# lifetime of the kernel.
with open("./data/war&peace.txt", "r", encoding="utf-8") as file:
    data = file.read()

In [2]:
# Treat every punctuation mark as a word of its own: letters and spaces are
# kept, listed punctuation is surrounded by spaces, everything else is dropped.
import string
# "\\-" spells out the literal backslash that the unescaped "\-" used to
# produce, without relying on an invalid escape sequence.
punct = ".,!?/’:\";\\-–“”'"
data = data.lower()
data = data.replace("\n", " ")
# Collect the pieces in a list and join once at the end; growing a string with
# repeated concatenation copies it over and over on a corpus of this size.
pieces = []
for ch in data:
    if ch.isalpha() or ch == " ":
        pieces.append(ch)
    elif ch in punct:
        pieces.append(" " + ch + " ")
data = "".join(pieces)

In [3]:
# Hyper-parameters used by the tokenizer, the windowing step and the model.
max_features = 50_000  # value passed to the Tokenizer as num_words
n_word = 5             # number of preceding words fed to the model per sample
embedding_dim = 50     # length of each word embedding vector

## Tokenization

In [4]:
from keras.preprocessing.text import Tokenizer
# Lower-casing and punctuation handling were already done in the preprocessing
# cell, so the Tokenizer's own lower-casing and character filtering are off.
tokenizer = Tokenizer(
    num_words=max_features,
    lower=False,
    filters="",
)

In [5]:
# Build the tokenizer's vocabulary from the single preprocessed corpus string.
tokenizer.fit_on_texts([data])

In [6]:
# Encode the corpus as one flat sequence of word ids; [0] unwraps the
# single-document batch that was passed in.
data_encoded = tokenizer.texts_to_sequences([data])[0]

In [7]:
# Number of ids the embedding and softmax layers must cover: one per word in
# word_index plus one extra slot for id 0.
# Capped at max_features because the Tokenizer was created with
# num_words=max_features, so texts_to_sequences never emits ids at or above
# that value even when word_index itself is larger.
vocab_size = min(max_features, len(tokenizer.word_index)+1)
print(f"Vocabluary size: {vocab_size}")

Vocabluary size: 19263


## Model Input&Output

In [8]:
# Slide a window of n_word ids over the encoded corpus: each window is one
# input sample and the id immediately after it is the label.
window_starts = range(len(data_encoded) - n_word)
X = [data_encoded[start:start + n_word] for start in window_starts]
y = [data_encoded[start + n_word] for start in window_starts]

In [9]:
import numpy as np
from keras.utils import to_categorical
# Labels stay as integer class ids rather than one-hot vectors
# (to_categorical(y, num_classes=vocab_size) is deliberately not applied):
# the model is compiled with a sparse categorical loss.
X, y = np.array(X), np.array(y)

## Embedding

In [10]:
# pretrained GloVe embedding 6B tokens, 400K vocab size  50d vectors
# source https://nlp.stanford.edu/projects/glove/
# The handle is not closed here; the following cell iterates over it line by line.
embedding = open("embeddings/glove.6B.50d.txt",encoding="utf-8")

In [11]:
# Build a {word: vector} dictionary from the GloVe text file. Each line is
# "<word> <v1> <v2> ..."; the vector components are kept as a list of strings
# and converted to floats by the cells that consume them.
embedding_dict = {}
try:
    for row in embedding:
        fields = row.rstrip('\n').split(' ')
        embedding_dict[fields[0]] = fields[1:]
finally:
    # The file is fully consumed at this point, so release the handle that was
    # opened in the earlier cell instead of leaking it.
    embedding.close()

In [12]:
# Number of words loaded from the embedding file.
len(embedding_dict)

400000

In [13]:
# Fallback vector for words that have no pretrained embedding: the
# element-wise mean of all pretrained vectors.
keys = embedding_dict.keys()
unk_token = np.zeros(embedding_dim,)
for vector in embedding_dict.values():
    unk_token += np.array(vector, float)
unk_token /= len(keys)

In [14]:
# Assemble the (vocab_size, embedding_dim) weight matrix for the Embedding
# layer: row i holds the vector of the word whose tokenizer id is i. Rows that
# are never assigned (such as row 0, unless word_index uses id 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
counter = 0  # number of vocabulary words that fell back to unk_token
for word, row in tokenizer.word_index.items():
    if row == vocab_size:
        break
    if word in embedding_dict:
        embedding_matrix[row] = embedding_dict[word]
    else:
        # no pretrained vector for this word: use the unk token instead
        embedding_matrix[row] = unk_token.tolist()
        counter += 1

In [15]:
# Sanity check: the matrix was allocated as (vocab_size, embedding_dim).
embedding_matrix.shape

(19263, 50)

In [16]:
# Report how many vocabulary words had no pretrained vector and received
# unk_token instead.
print(f"Number of unk token usage: {counter}")

Number of unk token usage: 3339


## Model

In [17]:
from keras.models import Sequential
from keras.layers import LSTM,RNN,Dense,Embedding,Input,Flatten
# Next-word model: frozen pretrained embeddings -> two stacked LSTMs -> two
# dense layers -> softmax over the vocabulary.
model = Sequential()
# The window length comes from n_word rather than a literal, so the input
# shape always matches the training windows and the Embedding's input_length.
model.add(Input(shape=(n_word,)))
# trainable=False keeps the pretrained vectors fixed during training.
model.add(Embedding(input_dim = vocab_size , weights =[embedding_matrix], output_dim = embedding_dim, input_length = n_word, trainable=False))
# return_sequences=True hands the whole sequence to the second LSTM.
model.add(LSTM(128,return_sequences = True))
model.add(LSTM(64))
model.add(Dense(128,activation = "relu"))
model.add(Dense(64,activation = "relu"))
# Flatten leaves the (None, 64) shape unchanged here, as model.summary() shows.
model.add(Flatten())
model.add(Dense(vocab_size, activation="softmax"))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 50)             963150    
_________________________________________________________________
lstm (LSTM)                  (None, 5, 128)            91648     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 128)               8320      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
flatten (Flatten)            (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 19263)             1

## Compile&Train

In [18]:
# Callback that saves the weights periodically and stops training once a
# given accuracy threshold is reached.
import os
from keras.callbacks import Callback,ModelCheckpoint
class My_Callback(Callback):
    """Periodic weight checkpointing plus early stop on training accuracy.

    Parameters
    ----------
    threshold : float
        Value of the ``sparse_categorical_accuracy`` metric at or above which
        training is stopped.
    model_save_cp : int
        Save the weights every ``model_save_cp`` epochs.
    """

    def __init__(self, threshold,model_save_cp):
        super(My_Callback, self).__init__()
        self.threshold = threshold
        self.model_save_cp = model_save_cp

    def on_epoch_end(self, epoch, logs=None):
        """Save weights and/or stop training after an epoch has finished.

        ``epoch`` is used 0-based (the checkpoint test adds 1 to it), so file
        names always carry ``epoch + 1``. Both save paths therefore agree on
        the epoch number written into the file name.

        Raises
        ------
        KeyError
            If ``sparse_categorical_accuracy`` is not among the logged metrics.
        """
        # Tolerate logs=None (the declared default) instead of failing with a
        # TypeError; a missing metric still fails loudly with a KeyError.
        logs = logs or {}
        acc = logs["sparse_categorical_accuracy"]
        epoch_number = epoch + 1
        reached_checkpoint = epoch_number % self.model_save_cp == 0
        reached_threshold = acc >= self.threshold

        if reached_checkpoint:
            print("\nModel Checkpoint reached saving model weights...\n")
        if reached_threshold:
            print(f"Accuracy reach over {self.threshold}% terminating train process.")
        if reached_checkpoint or reached_threshold:
            # Make sure the target directory exists so a long run does not
            # fail at save time; a single save covers both conditions.
            os.makedirs("model", exist_ok=True)
            self.model.save_weights(f"model/en-lang-model-{epoch_number}ep.h5")
        if reached_threshold:
            self.model.stop_training = True
            

In [None]:
# Train for up to 1000 epochs; the callback saves weights every 50 epochs and
# stops the run once the accuracy metric reaches 0.95.
cb = My_Callback(threshold=0.95, model_save_cp=50)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)
history = model.fit(
    X,
    y,
    batch_size=4096,
    epochs=1000,
    callbacks=[cb],
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000

Model Checkpoint reached saving model weights...

Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000


Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000

Model Checkpoint reached saving model weights...

Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000

In [None]:
#model.save_weights("model/en-lang-model.h5")

## Evaluation 

In [None]:
# X and y are the same arrays the model was fitted on, so this reports
# training-set loss and accuracy, not held-out performance.
model.evaluate(X,y)

In [None]:
import matplotlib.pyplot as plt
# Use the 'ggplot' style sheet for the figures drawn below.
plt.style.use('ggplot')
def plot_history(history):
    """Plot the training accuracy and loss curves side by side.

    Parameters
    ----------
    history : object with a ``history`` dict attribute
        Maps metric names to per-epoch value lists (as returned by
        ``model.fit``). Must contain ``'loss'`` and one accuracy metric.

    Raises
    ------
    KeyError
        If no known accuracy metric, or no ``'loss'`` entry, is present.
    """
    records = history.history
    # The model in this notebook is compiled with 'sparse_categorical_accuracy',
    # so looking up only 'categorical_accuracy' fails. The other spellings are
    # accepted so histories from differently compiled models still plot.
    accuracy_keys = ('categorical_accuracy', 'sparse_categorical_accuracy', 'accuracy', 'acc')
    acc_key = next((key for key in accuracy_keys if key in records), None)
    if acc_key is None:
        raise KeyError(f"no accuracy metric in history; expected one of {accuracy_keys}")
    acc = records[acc_key]
    loss = records['loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.title('Training  accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'r', label='Training loss')
    plt.title('Training loss')
    plt.legend()

In [None]:
plot_history(history)

To generate a sequence, take the seed text and keep only its last 5 words, ignoring anything before them; if the seed has fewer than 5 words, it is pre-padded on the left. The model then predicts one word from these 5 words. After each prediction, the next input consists of the last 4 words of the previous input followed by the newly predicted word as the 5th word, and so on.

In [None]:
# for dynamic printing
from IPython.display import display, clear_output
# padding for fixed input size
from keras.preprocessing.sequence import pad_sequences
# generating sequence from language model
def generate_sequence(tokenizer, max_output_seq_len, n_input_words, seed_text, model):
    """Greedily extend ``seed_text`` one word at a time with the language model.

    On every step the text generated so far is tokenized, pre-padded (or cut)
    to ``n_input_words`` ids, passed to the model, and the most probable word
    is appended to the text.

    Parameters
    ----------
    tokenizer : object exposing ``texts_to_sequences`` and ``word_index``
    max_output_seq_len : int
        Number of words to generate.
    n_input_words : int
        Length of the model's input window.
    seed_text : str
        Starting text. NOTE(review): words the tokenizer does not know are
        presumably dropped by ``texts_to_sequences`` - confirm the seed matches
        the casing of the training text.
    model : object exposing ``predict``

    Returns
    -------
    str
        The seed text followed by the generated words, separated by spaces.
    """
    # Invert word -> index once, outside the loop. Looking words up by their
    # real index is cheaper than rebuilding the vocabulary list on every step
    # and does not depend on the iteration order of word_index.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    model_input = seed_text

    # generate given max amount of word
    for _ in range(max_output_seq_len):
        # tokenize input text
        model_input_encoded = tokenizer.texts_to_sequences([model_input])[0]
        # pre-padding for fixed length(n_input_words)
        model_input_encoded = pad_sequences([model_input_encoded], maxlen=n_input_words, padding='pre')
        # predict probabilities for each word in vocab.
        probabilities = model.predict(model_input_encoded)[0]
        # Index 0 is not a word (word lookup is offset by one from the model
        # output), so it is left out of the argmax; +1 restores the real index.
        predicted_index = int(np.argmax(probabilities[1:])) + 1
        # finding predicted word
        predicted_word = index_to_word[predicted_index]
        model_input = model_input + " " + predicted_word
        display(model_input)
        clear_output(wait=True)
    return model_input

In [None]:
# for dynamic printing
from IPython.display import display, clear_output
# padding for fixed input size
from keras.preprocessing.sequence import pad_sequences
# generating sequence from language model
def generate_sequence_with_random_choice(tokenizer, max_output_seq_len, n_input_words, seed_text, model, acceptance_threshold):
    """Extend ``seed_text`` by sampling among sufficiently probable words.

    On every step the text generated so far is tokenized, pre-padded (or cut)
    to ``n_input_words`` ids and passed to the model. Every word whose
    probability reaches ``acceptance_threshold`` is a candidate; one candidate
    is picked uniformly at random with ``np.random.randint``. If no word
    reaches the threshold, the threshold is lowered to the best probability
    for that step.

    Parameters
    ----------
    tokenizer : object exposing ``texts_to_sequences`` and ``word_index``
    max_output_seq_len : int
        Number of words to generate.
    n_input_words : int
        Length of the model's input window.
    seed_text : str
        Starting text. NOTE(review): words the tokenizer does not know are
        presumably dropped by ``texts_to_sequences`` - confirm the seed matches
        the casing of the training text.
    model : object exposing ``predict``
    acceptance_threshold : float
        Minimum probability for a word to become a candidate.

    Returns
    -------
    str
        The seed text followed by the generated words, separated by spaces.
    """
    # Invert word -> index once, outside the loop. Looking words up by their
    # real index is cheaper than rebuilding the vocabulary list on every step
    # and does not depend on the iteration order of word_index.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    model_input = seed_text

    # generate given max amount of word
    for _ in range(max_output_seq_len):
        # tokenize input text
        model_input_encoded = tokenizer.texts_to_sequences([model_input])[0]
        # pre-padding for fixed length(n_input_words)
        model_input_encoded = pad_sequences([model_input_encoded], maxlen=n_input_words, padding='pre')
        # Probabilities of real words only: index 0 is not a word (word lookup
        # is offset by one from the model output), so it is left out.
        word_probabilities = model.predict(model_input_encoded)[0][1:]
        # countermeasure if threshold too high for prediction
        at = min(acceptance_threshold, np.max(word_probabilities))
        # indices of the words at or above the threshold (+1 restores the real index)
        possible_words = np.where(word_probabilities >= at)[0] + 1
        # choosing one of possible word
        choosen_index = int(possible_words[np.random.randint(0, len(possible_words))])
        # getting corresponding word for the chosen index
        predicted_word = index_to_word[choosen_index]
        model_input = model_input + " " + predicted_word
        display(model_input)
        clear_output(wait=True)

    return model_input

In [None]:
# The seed is written in lower case: the tokenizer was fit on lower-cased text
# with lower=False, so a capitalised word such as "Nicholas" is not in its
# vocabulary and would not reach the model.
generate_sequence(tokenizer, 25, n_word, "nicholas looked the sky and", model)

In [None]:
# The seed is written in lower case: the tokenizer was fit on lower-cased text
# with lower=False, so a capitalised word such as "Nicholas" is not in its
# vocabulary and would not reach the model.
generate_sequence_with_random_choice(tokenizer, 25, n_word, "nicholas looked the sky and", model,acceptance_threshold = 0.3)

In [None]:
# The seed is written in lower case: the tokenizer was fit on lower-cased text
# with lower=False, so a capitalised word such as "She" is not in its
# vocabulary and would not reach the model.
generate_sequence(tokenizer, 150, n_word, "she suddenly", model)

In [None]:
# The seed is written in lower case: the tokenizer was fit on lower-cased text
# with lower=False, so a capitalised word such as "She" is not in its
# vocabulary and would not reach the model. The trailing space of the seed is
# removed as well, since the function already inserts a space before each
# generated word.
generate_sequence_with_random_choice(tokenizer, 150, n_word, "she suddenly", model,acceptance_threshold = 0.5)

In [None]:
# The seed is written in lower case: the tokenizer was fit on lower-cased text
# with lower=False, so capitalised words such as "The" and "Italian" are not
# in its vocabulary and would not reach the model.
generate_sequence(tokenizer, 50, n_word, "the italian seemed happy", model)