In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can read files from it.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Drive folder that holds the input corpus (FOMC2021.txt is read from here).
root_folder = '/content/drive/My Drive/WordGeneration'

In [None]:
# Load the corpus with one text line per row.
# pandas >= 1.3 rejects sep="\n" ("Specified \n as separator or delimiter"), so
# the file is read directly instead of through read_csv.  To stay compatible
# with the rest of the notebook the read_csv conventions are kept: blank lines
# are skipped and the first line becomes the column name.  Unlike read_csv, no
# quote or NA parsing is applied, so every line is kept verbatim.
with open(root_folder + '/FOMC2021.txt', encoding='utf-8') as corpus_file:
    corpus_lines = [line.rstrip('\n') for line in corpus_file]
corpus_lines = [line for line in corpus_lines if line]
if not corpus_lines:
    raise ValueError('FOMC2021.txt contains no text lines')
data = pd.DataFrame({corpus_lines[0]: corpus_lines[1:]})

In [None]:
# The file's first line ended up as the name of the only column; call that
# column "text" whatever the header was.  Re-running the cell is harmless
# because the first column is then already named "text".
data = data.rename(columns={data.columns[0]: "text"})

# regex=False makes both replacements literal.  Without it, pandas < 2.0 treats
# "U.S." as a regular expression whose dots match any character (so "USSR"
# would also be rewritten), while pandas >= 2.0 treats it literally.
data["text"] = (
    data["text"]
    .str.replace("United States", "US", regex=False)
    .str.replace("U.S.", "US", regex=False)
)
data.info()

In [None]:
# Folder that is appended to sys.path in the next cell so local helper modules can be imported.
func_folder = '/content/drive/My Drive/Colab Notebooks'

In [None]:
import sys

# Make modules stored in func_folder importable.  The membership check keeps
# repeated runs of this cell from appending duplicate entries to sys.path.
if func_folder not in sys.path:
    sys.path.append(func_folder)

In [None]:
# Local helper module (not a published package).
# NOTE(review): the wildcard import hides which names are actually used. clean()
# below reads `contraction_mapping`, which presumably comes from this module —
# confirm and consider importing it explicitly.
import Contractions
from Contractions import *

In [None]:
def clean(text, contractions=None):
    """Normalise one piece of raw text before tokenisation.

    The steps run in this order:

    1. remove "<digits>.<tab>" paragraph numbers;
    2. rewrite the literal string "U.S." as "USA";
    3. expand contractions token by token (tokens are split on single spaces);
    4. flatten newlines, drop "'s", turn hyphens into spaces, drop "— " and
       double quotes, strip the dot from "Mr." / "Mrs.";
    5. remove parenthesised or bracketed spans;
    6. insert a space before ``. , ; ?`` when the preceding character is
       neither whitespace nor a digit, so punctuation becomes its own token;
    7. collapse every run of two or more whitespace characters to one space.

    Args:
        text: value to clean; it is converted with ``str()`` first.
        contractions: optional mapping from a contraction to its expansion.
            When omitted, the module-level ``contraction_mapping`` is used.

    Returns:
        The cleaned string.
    """
    if contractions is None:
        contractions = contraction_mapping

    text = str(text)
    # Dots are escaped so they only match a literal "."; unescaped, "U.S."
    # also matched strings such as "USSR".
    text = re.sub(r'[0-9]+\.\t', '', text)
    text = re.sub(r'U\.S\.', 'USA', text)
    text = ' '.join(contractions.get(token, token) for token in text.split(' '))
    text = re.sub('\n ', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub("'s", '', text)
    text = re.sub('-', ' ', text)
    text = re.sub('— ', '', text)
    text = re.sub('"', '', text)
    text = re.sub(r'Mr\.', 'Mr', text)
    text = re.sub(r'Mrs\.', 'Mrs', text)
    text = re.sub(r'[(\[].*?[)\]]', '', text)
    text = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', ' ', text)
    # {2,} collapses a whole run at once; replacing pairs only halved long runs.
    text = re.sub(r'\s{2,}', ' ', text)

    return text

In [None]:
# Run clean() on every row and keep the result in a separate column, leaving the raw text untouched.
data['text_clean'] = data['text'].apply(clean)

In [None]:
def sequence_generator(texts,
                       training_length,
                       result_length,
                       max_train=100000,
                       start_end_tokens=False,
                       lower=True,
                       tokenizer=None):
    """Turn texts into sliding-window (context, continuation) training examples.

    Every text is converted to a list of word indices. For each text longer
    than ``training_length + result_length + 3`` tokens, a window of
    ``training_length`` context tokens followed by ``result_length`` target
    tokens is slid over the text one token at a time, including the window
    that ends on the last token.

    Args:
        texts: indexable collection (e.g. list) of strings.
        training_length: number of context tokens per example.
        result_length: number of target tokens per example.
        max_train: soft cap. No further text is processed once at least this
            many examples exist; the text that crosses the cap is still
            processed completely.
        start_end_tokens: when True, each label is wrapped in the indices of
            the vocabulary entries "start_token" and "end_token".
        lower: forwarded to ``Tokenizer`` when a new tokenizer is created.
        tokenizer: optional already-fitted tokenizer exposing ``word_index``,
            ``index_word``, ``word_counts`` and ``texts_to_sequences``. When
            given it is used as is (``lower`` is ignored); otherwise a new
            ``Tokenizer`` is fitted on ``texts``.

    Returns:
        Tuple ``(word_idx, idx_word, num_words, word_counts, new_texts,
        new_sequences, training_seq, labels, training_seq_words,
        labels_words)``. ``num_words`` is the vocabulary size plus one,
        ``new_texts`` / ``new_sequences`` are the texts that were long enough,
        and the ``*_words`` lists hold the same windows as words instead of
        indices (without start/end markers).

    Raises:
        ValueError: if ``start_end_tokens`` is True but the vocabulary has no
            "start_token" / "end_token" entry.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(lower=lower)
        tokenizer.fit_on_texts(texts)

    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts

    print(f'There are {num_words} unique words.')

    sequences = tokenizer.texts_to_sequences(texts)

    # Resolve the marker indices up front so a missing marker fails with a
    # clear message instead of a NameError deep inside the loop.
    start_id = end_id = None
    if start_end_tokens:
        try:
            start_id = word_idx["start_token"]
            end_id = word_idx["end_token"]
        except KeyError as missing:
            raise ValueError(
                'start_end_tokens=True requires "start_token" and "end_token" '
                'to be part of the tokenizer vocabulary') from missing

    # Only keep texts that are long enough to yield several windows.
    min_length = training_length + result_length + 3
    kept = [i for i, seq in enumerate(sequences) if len(seq) > min_length]
    new_texts = [texts[i] for i in kept]
    new_sequences = [sequences[i] for i in kept]

    training_seq = []
    labels = []
    training_seq_words = []
    labels_words = []

    for seq in new_sequences:
        if len(training_seq) >= max_train:
            break

        # "+ 1" so that the final window, whose target ends on the last token
        # of the text, is produced as well.
        for i in range(training_length, len(seq) - result_length + 1):
            context = seq[i - training_length:i]
            target = seq[i:i + result_length]

            training_seq.append(context)
            if start_end_tokens:
                labels.append([start_id] + target + [end_id])
            else:
                labels.append(target)

            training_seq_words.append([idx_word[j] for j in context])
            labels_words.append([idx_word[j] for j in target])

    print(f'There are {len(training_seq)} training sequences.')

    return word_idx, idx_word, num_words, word_counts, new_texts, new_sequences, training_seq, labels, \
           training_seq_words, labels_words

In [None]:
# Window sizes handed to sequence_generator: len_text context tokens are used
# to predict the len_result tokens that follow. len_text is also the encoder
# input length.
len_text = 30
len_result = 15

In [None]:
# Build the vocabulary and the sliding-window examples from the cleaned text.
(
    word_idx,
    idx_word,
    num_words,
    word_counts,
    new_texts,
    sequences,
    features,
    labels,
    training_seq_words,
    labels_words,
) = sequence_generator(
    data['text_clean'].tolist(),
    training_length=len_text,
    result_length=len_result,
    lower=True,
)

In [None]:
# Share of examples that goes to the training set (the rest is validation), and
# the seed used for the shuffle in create_train_valid.
TRAIN_FRACTION = 0.7
RANDOM_STATE = 50

In [None]:
from sklearn.utils import shuffle

In [None]:
def create_train_valid(features,
                       labels,
                       num_words,
                       train_fraction=TRAIN_FRACTION):
    """Shuffle the examples and split them into training and validation arrays.

    Args:
        features: list of equally long index sequences (model inputs).
        labels: one entry per feature row, either a single word index or an
            equally long sequence of word indices.
        num_words: vocabulary size; width of the one-hot encoding.
        train_fraction: share of the shuffled examples used for training.

    Returns:
        ``(X_train, X_valid, y_train, y_valid)`` as NumPy arrays.

        * Scalar labels are one-hot encoded into int8 arrays of shape
          ``(n_examples, num_words)``.
        * Sequence labels are returned as integer arrays of shape
          ``(n_examples, label_length)`` with the token order intact, which is
          what a decoder trained with a sparse categorical loss needs. One-hot
          assignment would instead merge all tokens of a label into a single
          unordered vocabulary-sized 0/1 vector.
    """
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)
    train_end = int(train_fraction * len(labels))

    X_train = np.array(features[:train_end])
    X_valid = np.array(features[train_end:])

    label_array = np.array(labels)

    # Sequence labels: keep them as ordered index sequences.
    if label_array.ndim > 1:
        return X_train, X_valid, label_array[:train_end], label_array[train_end:]

    train_labels = label_array[:train_end]
    valid_labels = label_array[train_end:]

    y_train = np.zeros((len(train_labels), num_words), dtype=np.int8)
    y_valid = np.zeros((len(valid_labels), num_words), dtype=np.int8)

    # One row per example with a single 1 in the column of the word that
    # follows the features.
    for example_index, word_index in enumerate(train_labels):
        y_train[example_index, word_index] = 1

    for example_index, word_index in enumerate(valid_labels):
        y_valid[example_index, word_index] = 1

    return X_train, X_valid, y_train, y_valid

In [None]:
# Shuffle the windows and split them into training and validation arrays.
x_tr, x_val, y_tr, y_val = create_train_valid(features, labels, num_words)

In [None]:
# Load the pre-trained GloVe file: column 0 is the token, the remaining columns
# are its vector. The path is relative to the current working directory.
glove = np.loadtxt('glove.6B.100d.txt', dtype='str', comments=None, encoding="utf8")
print(glove.shape)
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]
del glove  # the string array is large and no longer needed

word_lookup = {word: vector for word, vector in zip(words, vectors)}

# Row r holds the vector of the word whose tokenizer index is r; row 0 and the
# rows of words missing from GloVe stay all-zero.
embedding_matrix = np.zeros((num_words, vectors.shape[1]))
not_found = 0

# Place each vector at the word's own index rather than at its position in the
# dict, so the result does not depend on the iteration order of word_idx.
for word, index in word_idx.items():
    vector = word_lookup.get(word, None)

    if vector is not None:
        embedding_matrix[index, :] = vector
    else:
        not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')

# Scale every row to unit length. All-zero rows get a divisor of 1 so they stay
# zero without going through a 0/0 division.
row_norms = np.linalg.norm(embedding_matrix, axis=1).reshape((-1, 1))
row_norms[row_norms == 0] = 1.0
embedding_matrix = embedding_matrix / row_norms
embedding_matrix = np.nan_to_num(embedding_matrix)

In [None]:
# Imports for the model section; numpy, pandas, re, Tokenizer and pad_sequences
# repeat imports made in earlier cells.
from keras import backend as K
import gensim
# NOTE(review): wildcard import — numpy functions such as sum/min/max/any/all
# replace the Python builtins of the same name for the rest of the notebook.
from numpy import *
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping
import warnings

In [None]:
# Show up to 200 characters per cell when DataFrames are displayed.
pd.set_option("display.max_colwidth", 200)
# NOTE(review): this silences every warning from here on, deprecation notices included.
warnings.filterwarnings("ignore")

# Reset Keras' global state before the model is built.
K.clear_session()

# Number of units used by every LSTM layer defined below.
latent_dim = 200
# embedding_dim = 200

In [None]:
# Encoder input: sequences of len_text word indices (fed to an Embedding layer next).
encoder_inputs = Input(shape=(len_text,))

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
enc_emb = Embedding(num_words, embedding_matrix.shape[1], embeddings_initializer=Constant(embedding_matrix), trainable=False)(encoder_inputs)

In [None]:
# First encoder LSTM. return_sequences=True so the next LSTM receives one output per time step.
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

In [None]:
# Second and third encoder LSTMs, stacked on the first one.
encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
encoder_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
# state_h / state_c of this last layer initialise the decoder; encoder_outputs
# is not used by the cells visible in this notebook section.
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

In [None]:
# Decoder input: sequences of word indices; shape=(None,) leaves the sequence length open.
decoder_inputs = Input(shape=(None,))

In [None]:
# The decoder gets its own frozen Embedding layer, initialised from the same embedding_matrix.
dec_emb_layer = Embedding(num_words, embedding_matrix.shape[1], embeddings_initializer=Constant(embedding_matrix), trainable=False)
dec_emb = dec_emb_layer(decoder_inputs)

In [None]:
# Decoder LSTM, started from the final states of the last encoder layer.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
# NOTE(review): with return_state=True an LSTM returns (outputs, hidden state,
# cell state); the "fwd"/"back" names below do not denote directions.
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Softmax over the vocabulary, applied independently at every decoder time step.
decoder_dense = TimeDistributed(Dense(num_words, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (encoder input, decoder input) -> per-step vocabulary probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Print the layers, output shapes and parameter counts of the model.
model.summary()

In [None]:
# Sparse categorical cross-entropy: targets are integer class ids, not one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Stop training once val_loss has not improved for 2 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

# Teacher forcing: the decoder is fed y[:, :-1] and trained to predict y[:, 1:],
# reshaped so that the targets carry a trailing axis of size 1.
# NOTE(review): this slicing assumes every row of y_tr / y_val is an ordered
# sequence of integer word indices — confirm against what create_train_valid
# returns; a vocabulary-sized 0/1 matrix would not be valid here.
history = model.fit([x_tr, y_tr[:, :-1]], y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:], epochs=50,
                    callbacks=[es], batch_size=128,
                    validation_data=([x_val, y_val[:, :-1]], y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:]))

In [None]:
from matplotlib import pyplot

# Training and validation loss per epoch, drawn on the same axes.
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    pyplot.plot(history.history[history_key], label=curve_label)
pyplot.legend()
pyplot.show()

In [None]:
# Persist the trained model under the name "fomc", relative to the current
# working directory (not the mounted Drive folder).
# NOTE(review): a path without an extension is only accepted by older Keras
# releases; newer ones require ".keras" or ".h5" — confirm the target version.
model.save("fomc")
print("Saved model to disk")