In [None]:
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tweeterator
from loader import Loader
from data_generator import DataGenerator

Set up the training parameters

In [None]:
# --- Data loading: these values are passed to loader.load() ---
# NOTE(review): `input` shadows the Python builtin of the same name; the empty
# string is presumably a placeholder for the dataset path - set it before running.
input = ''
file_type = 'csv'
text_column = 'text'
regex_to_remove = ['^rt ']
# Also passed to tweeterator.train() and DataGenerator, and used as the number
# of seed words when generating text.
window = 10

# --- Model / optimisation: these values are passed to tweeterator.train() ---
net_type = 'LSTM'
latent_dim = 64
n_units = 256
n_hidden_layers = 1
dropout = 0.2
learning_rate = 0.001
batch_size = 64
epochs = 30
# presumably the fraction of data held out for validation - confirm in tweeterator.train
perc_val = 0.2
shuffle = True

Initialise the loader and load the data

In [None]:
# Build the loader with hashtag and mention flattening switched off, then load
# the sentences from `input`.
loader = Loader(flatten_hashtags=False, flatten_mentions=False)
# NOTE(review): assumes loader.load() yields one list of words per sentence
# (later cells call sentence.remove() on the elements) - confirm in loader.py.
loaded_sentences = list(
    loader.load(
        input,
        file_type=file_type,
        text_column=text_column,
        window=window,
        regex_to_remove=regex_to_remove,
    )
)

# Store the sentences in a 1-D object array, one list per slot.
# np.array(..., dtype=object) would silently build a 2-D array of words when
# every sentence happens to have the same length, which breaks the later cells
# that iterate over sentences and use data.size as the sentence count.
data = np.empty(len(loaded_sentences), dtype=object)
for position, loaded_sentence in enumerate(loaded_sentences):
    data[position] = loaded_sentence

Look at loaded sentences

In [None]:
# Corpus size: total number of words, then the number of distinct words.
print(f"Number of words: {sum(map(len, data))}")
print(f"Number of unique words: {len(set(itertools.chain.from_iterable(data)))}")

Remove words that appear only once (probably typos, errors, etc.)

In [None]:
# Flatten every sentence into a single list of words.
flattened_text = list(itertools.chain.from_iterable(data))
# Occurrence count per distinct word. Series.value_counts() replaces the
# top-level pd.value_counts(), which is deprecated in recent pandas releases.
vc = pd.Series(flattened_text).value_counts()
# Index of the words that occur exactly once in the whole corpus.
words_to_remove = vc[vc == 1].index

In [None]:
# Display the index of words that occur exactly once in the corpus.
words_to_remove

In [None]:
# Drop the single-occurrence words from every sentence, mutating each sentence
# list in place, and record how many words each sentence lost.
counts = []
for sentence in data:
    kept_words = [word for word in sentence if word not in words_to_remove]
    counts.append(len(sentence) - len(kept_words))
    # Slice assignment keeps the same list object that `data` references.
    sentence[:] = kept_words

In [None]:
# Count the sentences from which at least one word was removed.
print(f'Number of affected sentences: {np.count_nonzero(np.asarray(counts) > 0)}')

In [None]:
# `counts` holds one entry per sentence, so its length is the sentence total.
print(f"Total number of sentences: {len(counts)}")

In [None]:
# Preview the first five sentences, one per line, each prefixed with a dash.
for sentence in data[:5]:
    print(f"-{' '.join(sentence)}")

Train and get the trained model, the history and the word dictionaries

In [None]:
# Train on the cleaned sentences. All arguments are positional, so their order
# must match the signature of tweeterator.train(). The fourth return value is
# deliberately discarded.
model, history, dicts, _ = tweeterator.train(
    data,
    net_type,
    latent_dim,
    n_units,
    window,
    dropout,
    batch_size,
    epochs,
    learning_rate,
    perc_val,
    n_hidden_layers,
    shuffle,
)

Visualise training results

In [None]:
def plot_training_metric(history_dict, metric):
    """Plot one metric's training and validation curves on a new figure.

    Parameters
    ----------
    history_dict : mapping
        Per-epoch values keyed by metric name; must contain both ``metric``
        and ``'val_' + metric``.
    metric : str
        Name of the training metric to plot (e.g. ``'loss'``).

    Returns
    -------
    matplotlib.axes.Axes
        The axes the curves were drawn on.
    """
    fig, ax = plt.subplots(dpi=150)
    for series_name in (metric, f'val_{metric}'):
        ax.plot(history_dict[series_name], label=series_name)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric)
    ax.set_title(f'Training vs. validation {metric}')
    ax.legend()
    return ax


# One figure per metric; ending the cell on a loop avoids echoing a Legend repr.
for metric_name in ('loss', 'categorical_accuracy'):
    plot_training_metric(history.history, metric_name)

Take a random sentence and start generating text from its first words

In [None]:
# Pull the two lookup tables out of the training result. `i2w` is indexed by
# integer ids to recover words; `w2i` is presumably the inverse mapping
# (word -> id) - confirm in tweeterator.
w2i, i2w = dicts['word2int'], dicts['int2word']

In [None]:
# Draw one random position. The index is a length-1 array, so `sentence` is a
# one-element object array wrapping the chosen sentence, not the bare list.
random_index = np.random.choice(data.size, 1)
sentence = data[random_index]

# Wrap that single sentence in a generator (unshuffled; the positional 1 is
# presumably the batch size - confirm in data_generator) and take its first item.
gen = DataGenerator(sentence, w2i, window, 1, shuffle=False)
test_example = next(gen)

In [None]:
# Total length of the generated sequence, in words (seed words included).
output_size = 40
# False: always pick the most likely next word (deterministic).
# True:  sample the next word from the predicted probability distribution.
sample_next_word = False

# The first `window` slots hold the seed, so the output cannot be shorter.
if output_size < window:
    raise ValueError(
        f"output_size ({output_size}) must be at least window ({window})"
    )

output_int = np.empty(output_size, dtype=int)

# Set the first window elements to the start of the phrase
output_int[:window] = test_example[0]

# Predict the next word from the preceding ones (using the words already predicted)
for start in range(output_size - window):
    # Add leading and trailing axes around the current window of word ids.
    input_int = output_int[np.newaxis, start:start + window, np.newaxis]
    prediction = model(input_int).numpy()[0]
    if sample_next_word:
        # np.random.choice rejects probabilities that do not sum to 1 closely
        # enough, which lower-precision model output can violate; re-normalise
        # a float64 copy before sampling.
        probabilities = np.array(prediction, dtype=np.float64)
        probabilities /= probabilities.sum()
        word_int = np.random.choice(probabilities.size, p=probabilities)
    else:
        word_int = np.argmax(prediction)
    output_int[start + window] = word_int

# Convert integers to words
output = [i2w[word_int] for word_int in output_int]

Visualise the produced output

In [None]:
# Join the generated words into one space-separated string for display.
' '.join(output)