In [1]:
%load_ext autoreload
%autoreload 2

import os
import char_max_likelihood
import metrics
import pandas as pd
import LSTMGenerator
import utils
import ngram_model
utils.setup_nltk()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Valentin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Valentin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Valentin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Generate Speeches

In [2]:
# Presidents the notebook iterates over when generating speeches.
presidents = [
    "obama",
    "bush",
    "reagan",
    "trump",
]

# Number of speeches generated per president by each generator cell below.
n = 10

## LSTM

In [3]:
# For every president: build an LSTMGenerator, sample `n` speeches from it
# (length=5000 is forwarded unchanged to generate_n) and hand the result to
# the generator's own persist() method.
# NOTE(review): persist() presumably writes below ../data/lstm/ (that is where
# the evaluation cells read from) -- confirm in LSTMGenerator.
for president in presidents:
    generator = LSTMGenerator.LSTMGenerator(president)
    generated = generator.generate_n(n, length=5000)
    generator.persist(generated)

## CharPred

In [48]:
# `order` is passed to both train_char_lm and generate_text.
# NOTE(review): presumably the character-context length of the model -- confirm.
order = 10
# Third argument of generate_text.
# NOTE(review): presumably ~5000 words at ~5 characters each -- confirm.
letters = 5000*5
# Output template: first slot is the president, second the speech index.
filepath = "../data/charpred/{}_generated/{}.txt"

for pres in presidents:
    # One model per president, reused for all `n` speeches.
    lm = char_max_likelihood.train_char_lm(pres, order)

    # Derive the target directory from the template so the path is spelled
    # only once, and create it once per president; exist_ok makes re-running
    # the cell safe without a separate existence check.
    out_dir = os.path.dirname(filepath).format(pres)
    os.makedirs(out_dir, exist_ok=True)

    for i in range(n):
        s = char_max_likelihood.generate_text(lm, order, letters)
        # Explicit UTF-8 (as in the NGram cell) so the written bytes do not
        # depend on the platform's default encoding.
        with open(filepath.format(pres, i), "w", encoding="utf8") as f:
            f.write(s)

## NGram

In [3]:
for pres in presidents:
    # Named `out_dir` rather than `dir`: at notebook top level `dir` would
    # replace the builtin dir() for the rest of the kernel session.
    out_dir = "../data/ngram/{}_generated/".format(pres)
    # Create the directory once per president; exist_ok makes re-runs safe.
    os.makedirs(out_dir, exist_ok=True)

    for i in range(n):
        filepath = out_dir + "{}.txt".format(i)
        print(pres, i)
        # NOTE(review): the models are rebuilt for every speech although the
        # arguments do not depend on `i`. Kept as-is because it is not visible
        # here whether generate_speech mutates them; hoist if it does not.
        token_model, pos_model, tokens_per_pos = ngram_model.create_ngram(
            pres, n=2, pos_n=5, use_lower=True, pos_tagging=True
        )
        s = ngram_model.generate_speech(
            i,
            token_model,
            pos_model,
            tokens_per_pos,
            max_length=5000,
            top_token=int(len(token_model.VOCAB) / 10),
            top_pos=5,
        )

        with open(filepath, "w", encoding="utf8") as f:
            f.write(s)

trump 0
100%|██████████| 5000/5000 [01:39<00:00, 50.10it/s]
trump 1
100%|██████████| 5000/5000 [01:33<00:00, 53.20it/s]
trump 2
100%|██████████| 5000/5000 [01:28<00:00, 56.38it/s]
trump 3
100%|██████████| 5000/5000 [01:23<00:00, 59.57it/s]
trump 4
100%|██████████| 5000/5000 [01:39<00:00, 50.43it/s]
trump 5
100%|██████████| 5000/5000 [01:48<00:00, 45.90it/s]
trump 6
100%|██████████| 5000/5000 [01:46<00:00, 46.76it/s]
trump 7
100%|██████████| 5000/5000 [01:31<00:00, 54.40it/s]
trump 8
100%|██████████| 5000/5000 [01:32<00:00, 53.88it/s]
trump 9
100%|██████████| 5000/5000 [01:22<00:00, 60.62it/s]


# Qualitative Evaluation

In [5]:
# Sub-directories of ../data holding each generator's output; the same names
# are used as row labels of the metric tables.
generated_speech_locations = [
    "lstm",
    "charpred",
    "ngram",
]

In [6]:
# Preview: first 700 characters of speech 0 for presidents[1], once per generator.
for loc in generated_speech_locations:
    speech_path = "../data/"+loc+"/"+presidents[1]+"_generated/0.txt"
    # Context manager so the file handle is closed instead of being left to
    # the garbage collector.
    # NOTE(review): the file is read with the platform default encoding while
    # the NGram cell writes UTF-8 explicitly -- align once the encoding used by
    # the other generators is confirmed.
    with open(speech_path) as speech_file:
        speech = speech_file.read()
    print("Generated by " + loc+":")
    print(speech[:700])
    print("\n-------------------------------------\n")

Generated by lstm:
good evening ladies and gentlemen a lot of a better time . the president yes to the iraqi government will be able to the united states . and the united states of the world is to do the world . the world is to be a new challenge . and this will be a new nation . we must continue to work . and the last year that we have been a way to live in the world . and i ask congress to do you to be a lot of people . and so i 'm not a new man . we will not be accomplished with the united states . i ask you to work . i have brought a lot of freedom . the president i have seen the world . i ask congress to join me in the american people . we will make sure that the united states . and we will work alongside

-------------------------------------

Generated by charpred:
Madam Speaker, Vice President -- unless you're riding mountain bikes as hard as you possibly get the full force and might of the United States made military you will know if we seized this moment pass.

My call tonigh

# Quantitative Evaluation

In [7]:
# Column names (and column order) of the tables built by generate_metrics.
metric_list = [
    "tfidf_cosine",
    "tfidf_distance",
    "rouge",
    "mean_sentence_len_ratio",
    "mean_word_len_ratio",
]

In [14]:
def generate_metrics(president):
    """Build the metric table for one president.

    For every entry of ``generated_speech_locations`` the functions of the
    ``metrics`` module are called with ``president`` and the location
    ``"<loc>/<president>_generated"``; the results are collected into one row.

    Parameters
    ----------
    president : str
        President name as understood by the ``metrics`` module
        (e.g. ``"obama"``).

    Returns
    -------
    pandas.DataFrame
        One row per generator (index = ``generated_speech_locations``), columns
        in ``metric_list`` order, values rounded to 3 decimals. The two
        ``*_ratio`` columns are the generated value divided by the value
        computed for ``president`` itself.
    """
    # The reference values depend only on the president, not on the generator,
    # so compute them once instead of once per loop iteration.
    mean_sentence_l = metrics.calculate_mean_sentence_length(president)
    mean_word_l = metrics.calculate_mean_word_length(president)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for loc in generated_speech_locations:
        generated = loc + "/" + president + "_generated"

        mean_cosine, _, _ = metrics.get_cosine_sim_tfidf(president, generated, None, print_results=False)
        mean_distance = metrics.get_top_n_rank_distance(president, generated, None, 15)
        mean_rouge, _, _ = metrics.get_rouge_score(president, generated, None, print_results=False)
        mean_sentence_l_g = metrics.calculate_mean_sentence_length(generated)
        mean_word_l_g = metrics.calculate_mean_word_length(generated)

        rows.append({
            "tfidf_cosine": mean_cosine,
            "tfidf_distance": mean_distance,
            "rouge": mean_rouge,
            "mean_sentence_len_ratio": mean_sentence_l_g / mean_sentence_l,
            "mean_word_len_ratio": mean_word_l_g / mean_word_l,
        })

    results = pd.DataFrame(rows, columns=metric_list, index=generated_speech_locations)
    return results.round(3)

## Obama

In [19]:
# Build the metric table for Obama (one row per generator) and display it
# as the cell output.
results = generate_metrics("obama")
results

Unnamed: 0,tfidf_cosine,tfidf_distance,rouge,mean_sentence_len_ratio,mean_word_len_ratio
lstm,0.046,601.267,0.408,0.643,0.716
charpred,0.092,3003.545,0.573,0.938,1.002
ngram,0.089,5234.547,0.433,0.891,1.033


In [None]:
# Print the current `results` table as LaTeX source; in a top-to-bottom run
# this is the Obama table computed in the cell above.
print(results.to_latex())

## Bush

In [20]:
# Build the metric table for Bush (one row per generator) and display it
# as the cell output. Rebinds the global `results`.
results = generate_metrics("bush")
results

Unnamed: 0,tfidf_cosine,tfidf_distance,rouge,mean_sentence_len_ratio,mean_word_len_ratio
lstm,0.057,363.996,0.353,0.633,0.801
charpred,0.092,3021.826,0.488,0.999,0.996
ngram,0.099,4201.354,0.371,0.959,1.053


## Reagan

In [21]:
# Build the metric table for Reagan (one row per generator) and display it
# as the cell output. Rebinds the global `results`.
results = generate_metrics("reagan")
results

Unnamed: 0,tfidf_cosine,tfidf_distance,rouge,mean_sentence_len_ratio,mean_word_len_ratio
lstm,0.041,326.732,0.347,0.529,0.787
charpred,0.089,3334.681,0.521,0.954,0.998
ngram,0.07,4888.885,0.336,0.862,1.114


## Trump

In [22]:
# Build the metric table for Trump (one row per generator) and display it
# as the cell output. Rebinds the global `results`.
results = generate_metrics("trump")
results

Unnamed: 0,tfidf_cosine,tfidf_distance,rouge,mean_sentence_len_ratio,mean_word_len_ratio
lstm,0.035,208.642,0.285,0.867,0.618
charpred,0.083,3048.084,0.478,0.975,1.0
ngram,0.079,5700.179,0.361,0.964,1.011
