In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from numpy.random import randint, choice
from keras import backend as K
import math

Using TensorFlow backend.


In [None]:
# Read the corpus: every line of the file becomes one training text.
with open('processed_texts.csv', 'r', encoding='UTF-8') as file:
    train_data = [
        line.strip('\n')
        for line in file
    ]

print('Number of training sentences: ', len(train_data))

# Max size of the dictionary handed to the tokenizer
max_words = 50000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data)
sequences = tokenizer.texts_to_sequences(train_data)

# Concatenate the per-line token lists into a single one-dimensional stream,
# so a sliding window can later be moved across it to build the
# next-word prediction samples.
text = []
for token_ids in sequences:
    text.extend(token_ids)

# Number of entries in the tokenizer's word index
vocab_size = len(tokenizer.word_index)

Number of training sentences:  2457


In [None]:
# Training on 19 words to predict the 20th
sentence_len = 20
pred_len = 1
train_len = sentence_len - pred_len

# Sliding window to generate train data. The "+ 1" keeps the final window,
# which ends exactly on the last token of the stream; if the stream is shorter
# than sentence_len the range is empty and no samples are produced.
seq = [text[start:start + sentence_len]
       for start in range(len(text) - sentence_len + 1)]

# Reverse dictionary to decode tokenized sequences back to words
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

# Each row in seq is a 20-word window. The first 19 words (train_len) are the
# model input and the last word of the window is the target to predict.
trainX = [window[:train_len] for window in seq]
trainy = [window[-1] for window in seq]

5117293

In [None]:
def build_model(embdding_size=50, LSTM_sizes=[100,100], hidden_layer={'size': 100, 'act':'relu'}, dropout_rate=0.1):
    """Assemble the (uncompiled) next-word prediction network.

    The stack is: embedding -> LSTM (returning the full sequence) -> LSTM ->
    dense hidden layer -> dropout -> softmax output layer.

    Reads the module-level ``vocab_size`` and ``train_len``.

    Args:
        embdding_size: output dimension of the embedding layer.
        LSTM_sizes: indexable pair holding the unit counts of the first and
            second LSTM layer.
        hidden_layer: mapping with key ``'size'`` (units of the hidden dense
            layer) and key ``'act'`` (its activation).
        dropout_rate: rate passed to the dropout layer.

    Returns:
        The ``Sequential`` model.
    """
    # NOTE(review): the embedding accepts vocab_size + 1 ids while the softmax
    # has vocab_size units, so a target id equal to vocab_size would be out of
    # range - confirm whether such ids can occur.
    first_lstm_units, second_lstm_units = LSTM_sizes[0], LSTM_sizes[1]

    model = Sequential()
    model.add(Embedding(vocab_size+1, embdding_size, input_length=train_len))
    model.add(LSTM(first_lstm_units, return_sequences=True))
    model.add(LSTM(second_lstm_units))
    model.add(Dense(hidden_layer['size'], activation=hidden_layer['act']))
    model.add(Dropout(dropout_rate))
    model.add(Dense(vocab_size, activation='softmax'))
    return model

In [None]:
def run_model(x,y,setting, EPOCHS, validation_data=None, callbacks=None):
    """Build, compile and train one model for a given hyperparameter setting.

    Args:
        x: training inputs passed to ``model.fit``.
        y: training targets passed to ``model.fit``.
        setting: mapping with the keys ``'embedding_size'``, ``'LSTM_sizes'``,
            ``'hidden_layer'``, ``'dropout_rate'``, ``'optimizer'``,
            ``'learning_rate'`` and ``'BATCH_SIZE'``.
        EPOCHS: number of epochs to train.
        validation_data: optional validation data forwarded to ``model.fit``.
        callbacks: optional list of callbacks forwarded to ``model.fit``.

    Returns:
        Tuple ``(final_loss, model)``. ``final_loss`` is the training loss of
        the last epoch that ran, or ``nan`` when no epoch ran at all.
    """
    # build model
    model = build_model(
        embdding_size=setting['embedding_size'],
        LSTM_sizes=setting['LSTM_sizes'],
        hidden_layer=setting['hidden_layer'],
        dropout_rate=setting['dropout_rate'],
    )
    model.compile(optimizer=setting['optimizer'], loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # The optimizer is selected by name, so its learning rate is overridden
    # after compilation.
    K.set_value(model.optimizer.learning_rate, setting['learning_rate'])

    # train model
    history = model.fit(
        x, y,
        epochs=EPOCHS,
        batch_size=int(setting['BATCH_SIZE']),
        validation_data=validation_data,
        callbacks=callbacks,
    )

    # Take the last recorded loss rather than indexing with EPOCHS-1: the
    # history has no 'loss' entry when EPOCHS is 0, and may be shorter than
    # EPOCHS if a callback stops training early.
    epoch_losses = history.history.get('loss', [])
    final_loss = epoch_losses[-1] if epoch_losses else float('nan')
    return final_loss, model


In [None]:
def random_search_hyperparameters(num_tries=10):
    """Train ``num_tries`` models with randomly drawn hyperparameters.

    Each try draws layer sizes, batch size, activation, optimizer, dropout
    rate and learning rate at random, trains for one epoch on the
    module-level ``trainX`` / ``trainy`` and records the final training loss.

    Args:
        num_tries: number of random settings to evaluate.

    Returns:
        Tuple ``(losses, settings)``: a float array with one loss per try and
        the list of setting dicts in the same order.
    """
    EPOCHS = 1
    losses = np.zeros(num_tries)
    settings = []
    x = np.asarray(trainX)
    y = np.asarray(trainy)

    for i in range(num_tries):
        LSTM_sizes = randint(50, 200, 2)
        embedding_size = randint(50, 200)
        BATCH_SIZE = randint(300,2000)
        hidden_layer_size = randint(50,200)
        hidden_layer_activation = np.random.choice(['relu', 'elu'])
        optimizer = np.random.choice(['adam', 'nadam', 'rmsprop'])
        hidden_layer = {'size': hidden_layer_size, 'act': hidden_layer_activation}
        dropout_rate = np.random.uniform(0.1, 0.5)
        # uniform() expects (low, high); the bounds were previously reversed.
        learning_rate = np.random.uniform(0.0001, 0.01)

        setting = {'LSTM_sizes': LSTM_sizes, 'embedding_size': embedding_size, 'BATCH_SIZE': BATCH_SIZE, 'hidden_layer': hidden_layer, 'optimizer': optimizer, 'learning_rate': learning_rate, 'dropout_rate': dropout_rate}
        print(setting)
        settings.append(setting)

        # run_model returns (loss, model); only the loss is kept. No
        # validation data or callbacks are used during the search.
        final_loss, _ = run_model(x, y, setting, EPOCHS, None, None)
        losses[i] = final_loss

    return losses, settings

losses, settings = random_search_hyperparameters()

In [None]:
# Validating the 4 best model settings regarding validation loss and accuracy
l = len(trainy)
c = math.floor(l*0.75)  # the first 75% of the shuffled samples train, the rest validate
np.random.seed(42)
perm = np.random.permutation(l)
# NOTE(review): trainX / trainy are overwritten with their shuffled versions,
# so re-running this cell shuffles the already shuffled arrays again.
trainX = np.asarray(trainX)[perm]
trainy = np.asarray(trainy)[perm]
x = trainX[:c]
y = trainy[:c]
val_x = trainX[c:]
val_y = trainy[c:]
settings = [{'LSTM_sizes': np.array([151, 177]), 'embedding_size': 143, 'BATCH_SIZE': 972, 'hidden_layer': {'size': 144, 'act': 'elu'}, 'optimizer': 'nadam', 'learning_rate': 0.004159842835701155, 'dropout_rate': 0.1},
            {'LSTM_sizes': np.array([154, 193]), 'embedding_size': 66, 'BATCH_SIZE': 770, 'hidden_layer': {'size': 129, 'act': 'relu'}, 'optimizer': 'adam', 'learning_rate': 0.006717827078582077, 'dropout_rate': 0.26103763276759384},
            {'LSTM_sizes': np.array([196, 180]), 'embedding_size': 192, 'BATCH_SIZE': 1247, 'hidden_layer': {'size': 187, 'act': 'elu'}, 'optimizer': 'adam', 'learning_rate': 0.0028276114501194676, 'dropout_rate': 0.3828645817685946},
            {'LSTM_sizes': np.array([196, 110]), 'embedding_size': 130, 'BATCH_SIZE': 1408, 'hidden_layer': {'size': 193, 'act': 'sigmoid'}, 'optimizer': 'rmsprop', 'learning_rate': 0.00617748089357869, 'dropout_rate': 0.1}]

# Checkpoints for storing the best models
checkpoint1 = ModelCheckpoint('model_best_val_loss_weights.hdf5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
checkpoint2 = ModelCheckpoint('model_best_val_acc_weights.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint1, checkpoint2]

# At least one epoch has to run: with 0 epochs nothing is trained, no loss is
# recorded for run_model to return, and the checkpoint files are never written.
# NOTE(review): the epoch count originally used is not recorded here - 10 is
# a placeholder, adjust as needed.
VALIDATION_EPOCHS = 10

# The best model is using the first setting
loss, best_model = run_model(x, y, settings[0], VALIDATION_EPOCHS, (val_x, val_y), callbacks_list)


In [None]:
# Load the weights from the file written by the val_loss checkpoint into best_model
best_model.load_weights('model_best_val_loss_weights.hdf5')

In [None]:
# First text generation method
# Keeps track of the k most likely sentences (beam search)

def gen(model,seq,max_len = 10, k=5, context_len=19):
    """Continue a seed text with beam search over the model's predictions.

    Uses the module-level ``tokenizer`` to encode the seed and
    ``reverse_word_map`` to decode the result.

    Args:
        model: model whose ``predict`` returns one next-word probability
            vector per input row.
        seq: seed text to continue.
        max_len: number of words to append to the seed.
        k: beam width - number of candidate words expanded per sentence and
            number of sentences kept after every step.
        context_len: number of trailing tokens fed to the model; must match
            the input length the model was built with.

    Returns:
        List of decoded sentences, ordered from least to most likely. It has
        ``k`` entries after at least one generation step and a single entry
        (the tokenized seed) when ``max_len`` is 0 or less.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    # Tokenize the input string
    seed_tokens = tokenizer.texts_to_sequences([seq])[0]
    target_len = max_len + len(seed_tokens)

    # Each beam is (token ids, cumulative log-probability of the generated
    # words). Log-probabilities are summed instead of multiplying raw
    # probabilities so long sentences do not underflow to zero.
    beams = [(list(seed_tokens), 0.0)]

    # All beams grow by one token per step, so the first one is representative.
    while len(beams[0][0]) < target_len:
        # Score every live beam in a single forward pass.
        batch = tf.keras.preprocessing.sequence.pad_sequences(
            [tokens[-context_len:] for tokens, _ in beams], maxlen=context_len)
        batch_probs = np.asarray(model.predict(batch), dtype=np.float64)

        candidates = []
        for (tokens, log_prob), probs in zip(beams, batch_probs):
            probs = probs.reshape(-1).copy()
            # Index 0 is the padding id and has no word in reverse_word_map,
            # so it must never be chosen as a continuation.
            probs[0] = 0.0
            # get k best options for the next word
            for word_id in probs.argsort()[-k:]:
                with np.errstate(divide='ignore'):
                    step_log_prob = np.log(probs[word_id])
                # A fresh list and a fresh score per candidate: candidates
                # must not share state with each other or with their parent.
                candidates.append((tokens + [int(word_id)], log_prob + step_log_prob))

        # select k best sentences
        scores = np.array([score for _, score in candidates])
        beams = [candidates[j] for j in scores.argsort()[-k:]]

    return [" ".join(reverse_word_map[token] for token in tokens) for tokens, _ in beams]

In [None]:
# Print the beam-search continuations for a handful of seed phrases,
# one generated sentence per line.
for prompt in (
    'Good evening everyone. My name is',
    'Donald Trump',
    'The climate change',
    'My research about',
):
    print(*gen(best_model, prompt), sep='\n')


In [None]:
# Second method for text generation
# generate n texts using the output probabilities of the model
# t is "anti-temperature" : high value -> use most likely words
#                           low value -> use also some unlikely words

def generate(model,seq,max_len = 30, n=5, t=2, context_len=19):
    """Continue a seed text ``n`` times by sampling from the model's output.

    Uses the module-level ``tokenizer`` to encode the seed and
    ``reverse_word_map`` to decode the result.

    Args:
        model: model whose ``predict`` returns one next-word probability
            vector per input row.
        seq: seed text to continue.
        max_len: number of words to append to the seed.
        n: number of independent texts to generate.
        t: exponent applied to the probabilities before renormalizing;
            higher values favour the most likely words.
        context_len: number of trailing tokens fed to the model; must match
            the input length the model was built with.

    Returns:
        List of ``n`` decoded texts (empty when ``n`` is 0 or less).
    """
    if n <= 0:
        return []

    # Tokenize the input string
    seed_tokens = tokenizer.texts_to_sequences([seq])[0]
    # Every sample gets its own copy of the seed so they can diverge.
    samples = [list(seed_tokens) for _ in range(n)]

    for _ in range(max_len):
        # Predict the next-word distribution of all samples in one pass.
        batch = tf.keras.preprocessing.sequence.pad_sequences(
            [tokens[-context_len:] for tokens in samples], maxlen=context_len)
        batch_probs = np.asarray(model.predict(batch), dtype=np.float64)

        for tokens, probs in zip(samples, batch_probs):
            weights = np.power(probs.reshape(-1), t)
            # Index 0 is the padding id and has no word in reverse_word_map,
            # so it must never be sampled.
            weights[0] = 0.0
            weights = weights / weights.sum()
            # Sample over the model's actual output width rather than the
            # module-level vocab_size, so the two cannot get out of sync.
            tokens.append(int(np.random.choice(len(weights), p=weights)))

    return [" ".join(reverse_word_map[token] for token in tokens) for tokens in samples]

In [None]:
# Print the sampled continuations for a handful of seed phrases,
# one generated text per line.
for prompt in (
    'Good evening everyone. My name is',
    'Donald Trump',
    'The climate change',
    'My research about',
):
    print(*generate(best_model, prompt), sep='\n')

good evening everyone my name is a punk and i am an expert in my life and i have to be a little bit of a sort of feeling of reality and i couldn't tell you
good evening everyone my name is i am a blogger who was a young woman who was the most famous woman and i was a girl and she was a little girl and she was a
good evening everyone my name is an english composer i am a boy i can imagine that i had to be a professor i was a teenager i was a musician and i was an activist
good evening everyone my name is a woman who is a friend who is the father of the world who and the guy who is in a way and so what does this mean is that
good evening everyone my name is so wonderful i don't know what to look like i can say that i can look at the world and i can show you it is that the way i
donald trump refers to his phone and his hair and he was a professor he was a painter he was a little girl he was an actor he had a little bit
donald trump i love a beautiful bicycle and i grew up in a restaurant 