In [None]:
import os
import re
import json
import matplotlib. pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from RnnModel import RnnModel
from Callbacks import NBatchLogger

# Configuration: model location and data paths

In [None]:
# Path of a previously trained model to load. Leave it empty ('') to train a
# new model instead; a typical value is 'models/model_final.h5'.
model_to_load = ''
# Location for storing model outputs
model_dir = 'models'
# exist_ok=True removes the check-then-create race of isdir() + mkdir() and
# also creates any missing parent directories.
os.makedirs(model_dir, exist_ok=True)

# If True the raw data is cleaned and the clean files are (re)written
need_to_clean_data = True
# Paths to the raw inputs
raw_tweets_path = 'data/raw_tweets.json'
raw_speeches_path = 'data/raw_speech.txt'
# Paths to the cleaned outputs (tweets, speeches, and both concatenated)
clean_path_tweets_path = 'data/tweets.txt'
clean_path_speeches_path = 'data/speech.txt'
clean_path_final = 'data/tweets_speech.txt'

In [None]:
# Single RnnModel helper reused by every later cell: it is used to load and
# clean the text, prepare features/labels, build and fit the model, and
# generate sentences.
analyze_tweets = RnnModel()

# Data cleaning

In [None]:
if need_to_clean_data:
    # Load the tweets and perform a basic cleaning
    analyze_tweets.load_raw_text(raw_tweets_path, text_col='text', cols_for_vetos=[('is_retweet', 0)])
    # Further cleaning
    analyze_tweets.remove_links()
    analyze_tweets.remove_emojis()
    analyze_tweets.remove_ellipsis()
    analyze_tweets.remove_parens()
    analyze_tweets.remove_chars(chars_to_remove=['#', '@'])
    analyze_tweets.remove_spaces()
    analyze_tweets.veto_sentences(veto_string='RT @', where='start', veto_len=4)
    # Tidy up the '|' characters (presumably the sentence separator -- confirm
    # against RnnModel). Raw strings are used so that '\|' is a regex escape
    # and not an invalid Python string escape.
    analyze_tweets.dataset = re.sub(r' \| ', '|', analyze_tweets.dataset)   # ' | ' -> '|'
    analyze_tweets.dataset = re.sub(r'\| ', '|', analyze_tweets.dataset)    # '| '  -> '|'
    analyze_tweets.dataset = re.sub(r'\|\|*', '|', analyze_tweets.dataset)  # runs of '|' -> one '|'
    # Save the file; the context manager closes it even if the write fails
    with open(clean_path_tweets_path, "w", encoding="utf8") as clean_file:
        clean_file.write(analyze_tweets.dataset)

In [None]:
if need_to_clean_data:
    # Load the speech
    analyze_tweets.load_raw_text(raw_speeches_path)
    # NOTE(review): veto_len=4 is shorter than 'speech ' (7 characters), whereas
    # the tweets cell pairs the 4-character 'RT @' with veto_len=4 -- confirm
    # that this value is intended and not a copy-paste leftover.
    analyze_tweets.veto_sentences(veto_string='speech ', where='start', veto_len=4)
    analyze_tweets.remove_links()
    analyze_tweets.remove_emojis()
    analyze_tweets.remove_ellipsis()
    analyze_tweets.remove_parens()
    analyze_tweets.remove_spaces()
    analyze_tweets.remove_chars(chars_to_remove=['#', '@'])
    # Tidy up the '|' characters (presumably the sentence separator -- confirm
    # against RnnModel). Raw strings are used so that '\|' is a regex escape
    # and not an invalid Python string escape.
    analyze_tweets.dataset = re.sub(r' \| ', '|', analyze_tweets.dataset)   # ' | ' -> '|'
    analyze_tweets.dataset = re.sub(r'\| ', '|', analyze_tweets.dataset)    # '| '  -> '|'
    analyze_tweets.dataset = re.sub(r'\|\|*', '|', analyze_tweets.dataset)  # runs of '|' -> one '|'
    # Save the file; the context manager closes it even if the write fails
    with open(clean_path_speeches_path, "w", encoding="utf8") as clean_file:
        clean_file.write(analyze_tweets.dataset)

    # Concat the input files (clean tweets + clean speeches -> final file)
    analyze_tweets.concat(clean_path_final, clean_path_tweets_path, clean_path_speeches_path)

# Create the model

In [None]:
# Read the final (concatenated) clean dataset into the helper object
analyze_tweets.load_clean_text(clean_path_final)

# Keep only the first 10000 items so that this notebook runs quickly while testing
analyze_tweets.dataset = analyze_tweets.dataset[:10000]
print(f'Dataset type: {type(analyze_tweets.dataset)}. Corpus length: {len(analyze_tweets.dataset)}')


In [None]:
# Build the features X and the labels Y from the loaded dataset
analyze_tweets.prepare_feature_labels(
    validation_size=0.05,
    sequence_length=40,
    step_size=4,
)

In [None]:
# Build a fresh model only when no saved model was requested
if model_to_load == "":
    # Layer sizes are expressed as multiples of the alphabet size
    alphabet_size = len(analyze_tweets.alphabet)
    analyze_tweets.build_model(
        lstm_out_size=alphabet_size * 5,
        next_layers=[alphabet_size * 2, alphabet_size * 2],
    )

In [None]:
# Compile and fit your model. If a model already has been trained, you can load it
if model_to_load == "":
    # Create the callbacks to use during training.
    # NOTE(review): with epochs=1 below, an EarlyStopping patience of 2 epochs
    # cannot trigger -- it only matters once the epoch count is raised.
    earlyStopping = EarlyStopping(monitor='val_loss', patience=2,
                                  verbose=0, mode='min')
    # Keeps the weights with the lowest validation loss seen so far
    mcp_save = ModelCheckpoint(os.path.join(model_dir, '.mdl_wts.hdf5'), save_best_only=True,
                               monitor='val_loss', mode='min')
    # Log written to training.log; the same file is passed as `history` when a
    # saved model is loaded in the else branch
    csv_logger = CSVLogger(os.path.join(model_dir, 'training.log'), separator=',', append=False)
    # Use this callback only if you have a validation dataset
    NBatchLogger_obj = NBatchLogger(validation_data=(analyze_tweets.x_val, analyze_tweets.y_val),
                                    freq=4, val=True)
    callbacks = [earlyStopping, mcp_save, csv_logger, NBatchLogger_obj]
    # Compile and fit the model
    analyze_tweets.compile_and_fit(epochs=1, callbacks=callbacks)

    # Update the history with the more precise one collected by NBatchLogger
    analyze_tweets.history = NBatchLogger_obj.my_metrics
    # NOTE(review): str(...) means the file holds one JSON string containing the
    # Python repr of the metrics, not a JSON object. Kept as-is so the on-disk
    # format does not change; consider dumping the metrics directly.
    with open(os.path.join(model_dir, 'detailed_history.json'), 'w') as fp:
        json.dump(str(NBatchLogger_obj.my_metrics), fp)

    # Save the final model
    model_name = 'model_final.h5'
    model_fpath = os.path.join(model_dir, model_name)
    analyze_tweets.model.save(model_fpath)
else:
    # Reuse the previously trained model together with its logged history
    analyze_tweets.load_model(model_to_load=model_to_load,
                              history=os.path.join(model_dir, 'training.log'))

# Analyze your history

In [None]:
# One figure per metric: training curve and validation curve drawn together.
# Each row is (training key, validation key, figure title, x-axis label).
history_panels = [
    ('accuracy', 'val_accuracy', 'model accuracy', 'Mini-batch'),
    ('loss', 'val_loss', 'model loss', 'Minibatch'),
]
for train_key, val_key, panel_title, x_label in history_panels:
    plt.plot(analyze_tweets.history[train_key])
    plt.plot(analyze_tweets.history[val_key])
    plt.title(panel_title)
    # The y-axis label is the name of the metric itself
    plt.ylabel(train_key)
    plt.xlabel(x_label)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

# Make some predictions

In [None]:
# Ask the model to continue a seed sentence and show the result
seed_text = "Tonight I will be going"
complete_sentence = analyze_tweets.gen_sentence(
    seed_text,
    next_words=None,
    next_letters=10,
    multinomial_thresh=1.3,
)
print(complete_sentence)