# Project 5: Text Generation with Recurrent Neural Networks, LSTM, and Hyperas

## Hyperas

Hyperas is used for automated hyperparameter tuning in Keras. It's built on the hyperopt library, with a focus on simplicity and on Keras.

The concepts here are going to be pretty simple. The main differences you're going to see between this and our previous notebooks are:

1. We have to use actual data creation and model creation functions.
    - The data function ensures that we only have to load our data once. We have to return the features and labels in a particular order.
    - The model function defines our model and the hyperparameter tunings that we want to try.
2. We'll plug the data and model functions into a hyperas function that loads the data and tunes the model.

There aren't any new machine learning concepts in this notebook, but this tool will be invaluable for finding the best model for any future project.

The only real Hyperas notes I have are:

- tpe - This is the optimization algorithm we'll be using. You can use any algorithm that hyperopt supports. TPE stands for Tree-structured Parzen Estimator; it's more than just a random search, but most importantly, it's what the docs use.
- Trials - this is a hyperopt trials object that has to be passed to hyperas.

### Imports

In [None]:
import gc
import re

import numpy as np
import pandas as pd
from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe
from tensorflow.keras.layers import LSTM, Dense, Embedding, Masking, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

### Data

In [None]:
def clean_tweets(tweets):
    """Strip URLs from every tweet.

    Every run of non-whitespace characters beginning at ``http`` (which also
    covers ``https``) is removed, together with at most one whitespace
    character that follows it. All other text is left untouched.

    Args:
        tweets: Iterable of tweet strings.

    Returns:
        A new list with the cleaned tweets, in the original order.
    """
    # Match only the URL token itself. The previous greedy `http.*\s` pattern
    # swallowed everything up to the LAST whitespace in the tweet, deleting
    # ordinary words that came after a link.
    url_pattern = re.compile(r'http\S*\s?')
    return [url_pattern.sub('', tweet) for tweet in tweets]

def data(training_length=3):
    """Load the tweet corpus and build next-word training examples.

    Reads ``trump_tweets.csv``, strips URLs, tokenizes the text into word
    indices and slides a window over every tweet: each run of
    ``training_length`` consecutive word indices is a feature row and the
    word index that follows it is the label (one-hot encoded).

    Args:
        training_length: Number of preceding words used to predict the next
            one. Defaults to 3 so that the argument-less ``data()`` calls
            made elsewhere in this notebook no longer raise ``TypeError``.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` where the first 90% of the
        examples form the training split and the remainder the test split.
    """
    tweet_data = pd.read_csv('trump_tweets.csv')

    # str() turns missing values (NaN) into the text 'nan', as before.
    entire_corpus = clean_tweets([str(text) for text in tweet_data['text']])

    # The backslash is written as an escaped `\\` so the literal no longer
    # contains the invalid escape sequence `\]`; the runtime string is the same.
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                          lower=True,
                          split=' ',
                          char_level=False)
    tokenizer.fit_on_texts(entire_corpus)

    # NOTE(review): hyperas appears to re-execute this function's body from
    # source, so these local names may be relied on by the model function or
    # other cells -- confirm before removing any of them.
    word_index = tokenizer.word_index
    reverse_index_word = tokenizer.index_word
    # +1 leaves room for index 0, which the Embedding layer masks
    # (mask_zero=True in the model).
    number_of_words = len(word_index) + 1
    word_counts = tokenizer.word_counts

    tokenized = tokenizer.texts_to_sequences(entire_corpus)

    features = []
    labels = []
    for sequence in tokenized:
        # Tweets with `training_length` words or fewer yield no examples.
        for end in range(training_length, len(sequence)):
            features.append(sequence[end - training_length:end])
            labels.append(sequence[end])

    features = np.array(features)

    # One-hot encode all labels in a single vectorised assignment.
    label_ids = np.asarray(labels, dtype=np.intp)
    labels = np.zeros((len(features), number_of_words), dtype=np.int8)
    labels[np.arange(len(label_ids)), label_ids] = 1

    train_percent = int(round(features.shape[0] * 0.9))

    x_train = features[:train_percent]
    y_train = labels[:train_percent]
    x_test = features[train_percent:]
    y_test = labels[train_percent:]

    return x_train, y_train, x_test, y_test

### Embeddings

In [None]:
def embeddings():
    """Build a row-normalised GloVe embedding matrix for the vocabulary.

    Reads the 100-dimensional GloVe vectors from
    ``glove.6B/glove.6B.100d.txt`` and copies the vector of every vocabulary
    word that GloVe knows into the row given by that word's index. Rows for
    unknown words (and row 0) stay all-zero.

    Relies on the module-level names ``number_of_words`` and ``word_index``.
    NOTE(review): ``word_index`` is assumed to map each word to an integer in
    ``[1, number_of_words)`` -- confirm against how it is populated.

    Returns:
        A ``(number_of_words, vector_size)`` float array whose non-zero rows
        have unit Euclidean norm.
    """
    glove_vectors = 'glove.6B/glove.6B.100d.txt'
    glove = np.loadtxt(glove_vectors, dtype='str', comments=None, encoding='utf8')
    words = glove[:, 0]
    vectors = glove[:, 1:].astype('float')
    del glove

    word_lookup = dict(zip(words, vectors))
    embedding_matrix = np.zeros((number_of_words, vectors.shape[1]))

    # Place each vector at the word's own index instead of relying on the
    # dict's iteration order matching the index values.
    for word, token_id in word_index.items():
        vector = word_lookup.get(word)
        if vector is not None:
            embedding_matrix[token_id, :] = vector

    # Drop every reference to the large intermediates before collecting;
    # `word_lookup` also holds the vectors, so it must go as well.
    del word_lookup, words, vectors
    gc.collect()

    # Normalise only rows with a non-zero norm. This gives the same result
    # as dividing everything and replacing NaN with 0, without triggering
    # divide-by-zero warnings.
    norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
    return np.divide(embedding_matrix, norms,
                     out=np.zeros_like(embedding_matrix),
                     where=norms > 0)

### Model

In [None]:
def model(x_train, y_train, x_test, y_test):
    """Build, train and score one candidate network for the hyperas search.

    The double-brace expressions are hyperas search-space placeholders; they
    are substituted with concrete values for each trial, so this function is
    a template and is not meant to be called directly.

    Relies on the module-level names ``number_of_words``, ``training_length``
    and ``embeddings`` (the pre-computed embedding matrix).

    Args:
        x_train: Training feature sequences.
        y_train: One-hot training labels.
        x_test: Validation feature sequences.
        y_test: One-hot validation labels.

    Returns:
        A dict with ``loss`` (the negated best validation accuracy over all
        epochs, because the search minimises), ``status`` and the trained
        ``model``.
    """
    network = Sequential()

    network.add(Embedding(input_dim=number_of_words,
                          input_length=training_length,
                          output_dim=100,
                          weights=[embeddings],
                          # Real booleans: the strings 'False' and 'True' are
                          # both truthy, so the embedding was always trainable.
                          trainable={{choice([False, True])}},
                          mask_zero=True))

    network.add(Masking(mask_value=0.0))

    # Optional first LSTM layer; it must return the full sequence so the
    # second LSTM has a sequence to consume.
    if {{choice(['one_lstm', 'two_lstm'])}} == 'two_lstm':
        network.add(LSTM({{choice([32, 64, 128, 256])}}, return_sequences=True))

    network.add(LSTM({{choice([32, 64, 128, 256])}}, return_sequences=False))

    network.add(Dense({{choice([32, 64, 128, 256])}}, activation='relu'))

    network.add(Dropout({{uniform(0, 1)}}))

    # Optional second dense + dropout stage.
    if {{choice(['one_dense', 'two_dense'])}} == 'two_dense':
        network.add(Dense({{choice([32, 64, 128, 256])}}, activation='relu'))

        network.add(Dropout({{uniform(0, 1)}}))

    network.add(Dense(number_of_words, activation='softmax'))

    # NOTE(review): `lr` and `decay` are the keyword names this notebook was
    # written against -- confirm they are accepted by the installed Keras.
    optimizer = Adam(lr={{choice([0.001, 0.01, 0.1])}},
                     decay={{choice([0.0, 0.001, 0.005, 0.01])}})

    network.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    result = network.fit(x_train, y_train,
                         batch_size={{choice([512, 1024, 2048, 4096])}},
                         epochs=100,
                         validation_data=(x_test, y_test))

    # The history key is 'val_acc' or 'val_accuracy' depending on the Keras
    # version; accept either instead of raising KeyError.
    history = result.history
    accuracy_key = 'val_accuracy' if 'val_accuracy' in history else 'val_acc'

    # Highest validation accuracy reached in any training epoch.
    validation_acc = np.amax(history[accuracy_key])

    print('Best validation acc of epoch:', validation_acc)
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': network}

In [None]:
# Driver cell: prepare the globals, run the hyperas search and report the
# best model.

# NOTE(review): embeddings() reads the globals `number_of_words` and
# `word_index`, but `number_of_words` is only assigned below (to 0) and
# `word_index` is not assigned at module level in this section. The
# vocabulary built inside data() probably needs to be exposed before this
# call -- confirm and fix alongside data().
# The function name is deliberately rebound to its result: the model function
# reads the matrix through the global name `embeddings`.
embeddings = embeddings()
training_length = 3
number_of_words = 0

# data() requires the window length; calling it without an argument raised
# TypeError.
X_train, Y_train, X_test, Y_test = data(training_length)

# NOTE(review): hyperas re-creates the data from the function's source, so it
# may need `training_length` supplied explicitly (e.g. through a `data_args`
# option of optim.minimize, if the installed version has one) -- confirm.
best_run, best_model = optim.minimize(model=model,
                                      data=data,
                                      algo=tpe.suggest,
                                      max_evals=50,
                                      trials=Trials())

# X_test / Y_test from the load above are still valid, so the corpus is not
# read and tokenized a second time.
print("Evaluation of best performing model:")
print(best_model.evaluate(X_test, Y_test))
print("Best performing model chosen hyper-parameters:")
print(best_run)