In [None]:
import importlib
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
from tensorflow.config import list_physical_devices, experimental

import pos_tagging
import sentence_generation as sg
import nets
from loader import Loader
from generators import SingleDataGenerator, DoubleDataGenerator

In [None]:
# Report whether TensorFlow can see a GPU and, if so, switch every visible GPU
# to on-demand memory allocation.
gpus = list_physical_devices('GPU')

if gpus:
    print('GPU found')
    try:
        # set_memory_growth is the call that can fail (e.g. when the runtime has
        # already been initialised), so it is the one that must be guarded.
        for gpu in gpus:
            experimental.set_memory_growth(gpu, True)
    except (RuntimeError, ValueError) as e:
        # Keep the notebook running on default memory settings, but say why.
        print(e)
else:
    print("No GPU found")

# Setup

Setup training parameters

In [None]:
# --- Input data -------------------------------------------------------------
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the loading cell reads it.
input = "data/trump.csv"
text_column = "text"
file_type = "csv"

# --- Word network hyper-parameters -------------------------------------------
w_net_type = "LSTM"
w_latent_dim = 60
w_n_units = 256
w_dropout = 0.2
w_n_hidden_layers = 1

# --- POS network hyper-parameters --------------------------------------------
pos_net_type = "LSTM"
pos_latent_dim = 8
pos_n_units = 64
pos_dropout = 0
pos_n_hidden_layers = 1

# --- Training set-up ----------------------------------------------------------
window = 5
batch_size = 64
epochs = 30
learning_rate = 0.001
perc_val = 0.2  # fraction of sentences held out for validation
# Pattern -> replacement pairs handed to the loader.
regex_replace = {
    "^rt ": "",
    "&amp;": "and",
    "’": "'",
}
shuffle = True
train_two_nets = False  # not referenced by any of the visible cells

Initialise loader

In [None]:
# Read the configured text column from the input file.
# NOTE(review): the structure returned by Loader.load is not visible here; the
# cells below treat every element as a mutable list of word strings.
loader = Loader(flatten_hashtags=False, flatten_mentions=False)
loaded_sentences = loader.load(
    input,
    file_type=file_type,
    text_column=text_column,
    window=window,
    regex_replace=regex_replace,
)
# dtype=object lets numpy hold sentences of different lengths and allows the
# integer-array indexing used for the train/test split.
data = np.array(loaded_sentences, dtype=object)

# Inspect

Count the total and unique words in the loaded sentences

In [None]:
# Corpus size: total number of words and number of distinct words.
all_words = list(itertools.chain.from_iterable(data))
print(f"Number of words: {len(all_words)}")
print(f"Number of unique words: {len(set(all_words))}")

Remove words that appear only once (probably typos, errors, etc.)

In [None]:
# Count how often each word occurs across the whole corpus.
flattened_text = list(itertools.chain.from_iterable(data))
# The top-level pd.value_counts() is deprecated in recent pandas releases; the
# Series method gives the same counts. dtype=object keeps an empty corpus from
# being inferred as a numeric Series.
vc = pd.Series(flattened_text, dtype=object).value_counts()
# Words that occur exactly once; kept as a pandas Index so the membership test
# in the removal loop stays a hash lookup.
words_to_remove = vc[vc == 1].index

In [None]:
# Show the words that occur exactly once in the corpus.
words_to_remove

In [None]:
# Strip the single-occurrence words out of every sentence (in place) and record
# how many words each sentence lost.
counts = []
for sentence in data:
    kept_words = [word for word in sentence if word not in words_to_remove]
    counts.append(len(sentence) - len(kept_words))
    # Slice assignment rewrites the existing list object, so `data` keeps
    # pointing at the same (now shorter) sentences.
    sentence[:] = kept_words

In [None]:
# A sentence is "affected" when at least one of its words was removed.
n_affected = np.sum(np.array(counts) > 0)
print(f'Number of affected sentences: {n_affected}')

In [None]:
# `counts` has one entry per sentence, so its length is the sentence total.
n_sentences = len(counts)
print(f"Total number of sentences: {n_sentences}")

In [None]:
# Find the sentences left with no words after the filtering step and drop them.
empty_sentences = [idx for idx, sentence in enumerate(data) if len(sentence) == 0]

data = np.delete(data, empty_sentences)

In [None]:
# Preview the first five cleaned sentences, one per line with a leading dash.
for words in data[:5]:
    print('-', ' '.join(words), sep='')

# POS tagging

In [None]:
# Tag the cleaned sentences with part-of-speech labels.
# NOTE(review): get_tags is project code not shown here. Later cells index the
# result with integer arrays and read tok[0] (word) and tok[1] (tag) from each
# token, so it is used as an array of sequences of (word, tag) pairs.
data_plus_pos = pos_tagging.get_tags(data)
# NOTE(review): get_frequency is project code not shown here. dict_pos_freq and
# min_freq are later passed to sentence generation as pos_freq / min_freq;
# dict_pos_count is not used in the visible cells.
dict_pos_freq, dict_pos_count, min_freq = pos_tagging.get_frequency(data, window)

# Separate train-test

In [None]:
# Hold out a random `perc_val` fraction of the sentences for validation.
# The split is drawn from a seeded generator so that a fresh Restart & Run All
# assigns the same sentences to the same side every time.
split_seed = 42
rng = np.random.default_rng(split_seed)

n_phrases = len(data)
n_test = int(n_phrases * perc_val)
test_idx = rng.choice(n_phrases, size=n_test, replace=False)
# Everything not drawn for the test set is training data (sorted indices).
train_idx = np.setdiff1d(np.arange(n_phrases), test_idx)

train_data = data[train_idx]
test_data = data[test_idx]

# The two subsets must partition the corpus exactly.
assert len(train_data) + len(test_data) == n_phrases

In [None]:
def _tags_only(tagged_sentences):
    """Return, for each tagged sentence, the list of element 1 (the tag) of its tokens."""
    return [[token[1] for token in tagged] for tagged in tagged_sentences]


# Same train/test split as the word data, but keeping only the POS tags.
pos_train_data = _tags_only(data_plus_pos[train_idx])
pos_test_data = _tags_only(data_plus_pos[test_idx])

# Get conversion dictionaries

In [None]:
# Build forward and reverse lookup tables for words and POS tags. Ids are
# handed out in order of first appearance, starting at 0.
word2int, int2word = {}, {}
pos2int, int2pos = {}, {}

for tagged_sentence in data_plus_pos:
    for token in tagged_sentence:
        word, tag = token[0], token[1]

        if word not in word2int:
            word_id = len(word2int)
            word2int[word], int2word[word_id] = word_id, word

        if tag not in pos2int:
            tag_id = len(pos2int)
            pos2int[tag], int2pos[tag_id] = tag_id, tag

# Initialise generators

In [None]:
# Batch generators over the word sequences, one for training and one for
# validation.
# NOTE(review): SingleDataGenerator is project code not shown here; the meaning
# of the positional arguments (data, token->id map, window, batch size, shuffle
# flag) is inferred from the variable names — confirm against generators.py.
train_data_generator = SingleDataGenerator(train_data, word2int, window, batch_size, shuffle)
test_data_generator = SingleDataGenerator(test_data, word2int, window, batch_size, shuffle)

In [None]:
# Same generator type as for the words, fed with the POS-tag sequences and the
# POS lookup table instead.
# NOTE(review): argument meaning is inferred from the variable names — confirm
# against generators.py.
pos_train_data_generator = SingleDataGenerator(pos_train_data, pos2int, window, batch_size, shuffle)
pos_test_data_generator = SingleDataGenerator(pos_test_data, pos2int, window, batch_size, shuffle)

In [None]:
# Generators built from the (word, tag) data with both lookup tables; they feed
# the two-input models and the seed-sentence cell.
# NOTE(review): DoubleDataGenerator is project code not shown here; the layout
# of the batches it yields should be confirmed against generators.py.
double_train_data_generator = DoubleDataGenerator(data_plus_pos[train_idx], word2int, pos2int, window, batch_size, shuffle)
double_test_data_generator = DoubleDataGenerator(data_plus_pos[test_idx], word2int, pos2int, window, batch_size, shuffle)

# Train

Take the opening words (and POS tags) of one test-set sample to use as the seed for the generation examples below

In [None]:
# Skip a fixed number of validation batches, then keep element 0 of the next
# batch (presumably the model inputs — confirm against DoubleDataGenerator).
n_iter = 100
for _ in range(n_iter):
    next(double_test_data_generator)

rnd_sentence = next(double_test_data_generator)[0]
word_ids, pos_ids = rnd_sentence[0], rnd_sentence[1]

# Decode the first row of each array back into words / POS tags.
# start_of_sentence_pos is not used by the visible generation cells.
start_of_sentence_word = [int2word[word_id] for word_id in word_ids[0, :]]
start_of_sentence_pos = [int2pos[pos_id] for pos_id in pos_ids[0, :]]

Each of the following sections will train and test one of the possible network models and generation modalities.


## Single model

In [None]:
# Build and compile the word-only network.
vocab_size = len(word2int)
model1 = nets.one_input_one_output(
    window, w_net_type, vocab_size, w_latent_dim, w_n_units, w_dropout, w_n_hidden_layers
)
optim_adam = Adam(learning_rate=learning_rate)
model1.compile(
    optimizer=optim_adam,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Train model1 on the word generator, validating on the held-out generator.
train_steps = train_data_generator.get_n_steps_in_epoch()
val_steps = test_data_generator.get_n_steps_in_epoch()
history1 = model1.fit(
    train_data_generator,
    steps_per_epoch=train_steps,
    validation_data=test_data_generator,
    validation_steps=val_steps,
    epochs=epochs,
)

Visualise training results

In [None]:
# Training vs. validation curves for the single-input model, one figure per
# metric. Each figure gets a title and a y-label so it can be read on its own.
for metric in ('loss', 'categorical_accuracy'):
    plt.figure(dpi=150)
    plt.plot(history1.history[metric], label=metric)
    plt.plot(history1.history[f'val_{metric}'], label=f'val_{metric}')
    plt.xlabel('Epoch')
    plt.ylabel(metric)
    plt.title(f'Single model: {metric}')
    plt.legend()
# Render the figures without echoing the repr of the last Legend object.
plt.show()

### Without POS information

In [None]:
# Generate from model1 with the POS-frequency information switched off.
generation_options = dict(
    use_pos_info=False,
    w2i=word2int,
    i2w=int2word,
    pos_freq=None,
    min_freq=None,
    deterministic=False,
    output_length=40,
)
res1_1 = sg.one_model_one_input_one_output(
    model1, start_of_sentence_word, window, **generation_options
)
' '.join(res1_1)

### With POS information

In [None]:
# Generate from model1 again, this time passing the POS frequency table and
# its minimum frequency to the generator.
generation_options = dict(
    use_pos_info=True,
    w2i=word2int,
    i2w=int2word,
    pos_freq=dict_pos_freq,
    min_freq=min_freq,
    deterministic=False,
    output_length=40,
)
res1_2 = sg.one_model_one_input_one_output(
    model1, start_of_sentence_word, window, **generation_options
)
' '.join(res1_2)

## Single model two inputs one output

In [None]:
# Build and compile the network that takes words and POS tags as two inputs
# and produces a single output.
word_branch = (w_net_type, len(word2int), w_latent_dim, w_n_units, w_dropout, w_n_hidden_layers)
pos_branch = (pos_net_type, len(pos2int), pos_latent_dim, pos_n_units, pos_dropout, pos_n_hidden_layers)
model2 = nets.two_inputs_one_output(window, *word_branch, *pos_branch)

optim_adam = Adam(learning_rate=learning_rate)
model2.compile(
    optimizer=optim_adam,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Train model2 on the word+POS generator, validating on the held-out generator.
# NOTE: this history is stored as `history` (not `history2`); the plotting cell
# for this model reads it under that name.
train_steps = double_train_data_generator.get_n_steps_in_epoch()
val_steps = double_test_data_generator.get_n_steps_in_epoch()
history = model2.fit(
    double_train_data_generator,
    steps_per_epoch=train_steps,
    validation_data=double_test_data_generator,
    validation_steps=val_steps,
    epochs=epochs,
)

In [None]:
# Generate from model2, starting from the seed words.
res2 = sg.one_model_two_inputs_one_output(
    model2,
    start_of_sentence_word,
    window,
    word2int,
    int2word,
    pos2int,
    output_length=40,
    deterministic=False,
)
" ".join(res2)

Visualise training results

In [None]:
# Training vs. validation curves for the two-inputs/one-output model, one
# figure per metric. Each figure gets a title and a y-label so it can be read
# on its own.
for metric in ('loss', 'categorical_accuracy'):
    plt.figure(dpi=150)
    plt.plot(history.history[metric], label=metric)
    plt.plot(history.history[f'val_{metric}'], label=f'val_{metric}')
    plt.xlabel('Epoch')
    plt.ylabel(metric)
    plt.title(f'Two inputs, one output: {metric}')
    plt.legend()
# Render the figures without echoing the repr of the last Legend object.
plt.show()

## Single model two inputs two outputs

In [None]:
# Build and compile the network with two inputs (words, POS tags) and two
# outputs; the same loss and metric are given for the whole model.
word_branch = (w_net_type, len(word2int), w_latent_dim, w_n_units, w_dropout, w_n_hidden_layers)
pos_branch = (pos_net_type, len(pos2int), pos_latent_dim, pos_n_units, pos_dropout, pos_n_hidden_layers)
model3 = nets.two_inputs_two_outputs(window, *word_branch, *pos_branch)

optim_adam = Adam(learning_rate=learning_rate)
model3.compile(
    optimizer=optim_adam,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Train model3 on the word+POS generator, validating on the held-out generator.
train_steps = double_train_data_generator.get_n_steps_in_epoch()
val_steps = double_test_data_generator.get_n_steps_in_epoch()
history3 = model3.fit(
    double_train_data_generator,
    steps_per_epoch=train_steps,
    validation_data=double_test_data_generator,
    validation_steps=val_steps,
    epochs=epochs,
)

In [None]:
# Generate from model3, starting from the seed words.
res3 = sg.one_model_two_inputs_two_outputs(
    model3,
    start_of_sentence_word,
    window,
    word2int,
    int2word,
    pos2int,
    output_length=40,
    deterministic=False,
)
" ".join(res3)

Visualise training results

In [None]:
# List the metric names recorded for model3; the plotting cell for this model
# indexes history3.history with these keys.
history3.history.keys()

In [None]:
# Curves for the two-output model. Each entry is
# (history key, matplotlib format string, alpha); the legend label is always the
# history key itself, so a label can never drift away from the series it names.
# Red = training, blue = validation; 'o' = word output, 'x' = POS output.
loss_curves = [
    ('loss', 'r', None),
    ('w_output_loss', 'ro-', 0.5),
    ('pos_output_loss', 'rx-', 0.5),
    ('val_loss', 'b', None),
    ('val_w_output_loss', 'bo-', 0.5),
    ('val_pos_output_loss', 'bx-', 0.5),
]
accuracy_curves = [
    ('w_output_categorical_accuracy', 'ro-', 0.5),
    ('pos_output_categorical_accuracy', 'rx-', 0.5),
    ('val_w_output_categorical_accuracy', 'bo-', 0.5),
    ('val_pos_output_categorical_accuracy', 'bx-', 0.5),
]

for y_label, curves in (('loss', loss_curves), ('categorical_accuracy', accuracy_curves)):
    plt.figure(dpi=150)
    for key, fmt, alpha in curves:
        # alpha=None leaves the line fully opaque (matplotlib's default).
        plt.plot(history3.history[key], fmt, alpha=alpha, label=key)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(f'Two inputs, two outputs: {y_label}')
    plt.legend()
# Render the figures without echoing the repr of the last Legend object.
plt.show()

## Two models

In [None]:
# Two independent single-input networks: one over words, one over POS tags.
w_model = nets.one_input_one_output(
    window, w_net_type, len(word2int), w_latent_dim, w_n_units, w_dropout, w_n_hidden_layers
)
pos_model = nets.one_input_one_output(
    window, pos_net_type, len(pos2int), pos_latent_dim, pos_n_units, pos_dropout, pos_n_hidden_layers
)

# Each model gets its own optimizer instance.
for net in (w_model, pos_model):
    net.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )

In [None]:
# Train the word model first, then the POS model, each on its own generators.
w_train_steps = train_data_generator.get_n_steps_in_epoch()
w_val_steps = test_data_generator.get_n_steps_in_epoch()
w_history = w_model.fit(
    train_data_generator,
    steps_per_epoch=w_train_steps,
    validation_data=test_data_generator,
    validation_steps=w_val_steps,
    epochs=epochs,
)

pos_train_steps = pos_train_data_generator.get_n_steps_in_epoch()
pos_val_steps = pos_test_data_generator.get_n_steps_in_epoch()
pos_history = pos_model.fit(
    pos_train_data_generator,
    steps_per_epoch=pos_train_steps,
    validation_data=pos_test_data_generator,
    validation_steps=pos_val_steps,
    epochs=epochs,
)

In [None]:
# Re-execute the sentence_generation module so that edits made to it on disk
# are picked up without restarting the kernel. `importlib` is imported in the
# notebook's import cell together with every other dependency.
importlib.reload(sg)

In [None]:
# Generate using the separately trained word and POS models together.
model_pair = [w_model, pos_model]
res4 = sg.two_models(
    model_pair,
    start_of_sentence_word,
    window,
    word2int,
    int2word,
    pos2int,
    output_length=40,
    deterministic=False,
)
" ".join(res4)

Visualise training results

In [None]:
# Training vs. validation curves for the two separately trained models: one
# figure per (metric, model) pair, in the order word-loss, pos-loss,
# word-accuracy, pos-accuracy. Each figure gets a title and a y-label so it can
# be read on its own.
model_histories = ((w_history, 'word model'), (pos_history, 'pos model'))

for metric in ('loss', 'categorical_accuracy'):
    for model_history, model_name in model_histories:
        plt.figure(dpi=150)
        plt.plot(model_history.history[metric], label=f'{metric} {model_name}')
        plt.plot(model_history.history[f'val_{metric}'], label=f'val_{metric} {model_name}')
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.title(f'Two models - {model_name}: {metric}')
        plt.legend()
# Render the figures without echoing the repr of the last Legend object.
plt.show()