In [1]:
%load_ext autoreload
%autoreload 2

import os
import char_max_likelihood
import metrics
import pandas as pd
import LSTMGenerator
import utils
import ngram_model
# Make sure the NLTK data is available before any model runs; the log below
# shows stopwords, punkt and averaged_perceptron_tagger being checked.
utils.setup_nltk()

Importing Jupyter notebook from lstm_words.ipynb


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/gabriel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Generate Speeches

In [2]:
# Presidents whose speeches are generated and evaluated in this notebook.
presidents = ["obama", "bush", "reagan", "trump"]
# Number of speeches to generate per president for every model.
n=10

## LSTM

In [3]:
# Generate n speeches per president with the LSTM generator and store them.
for pres in presidents:
    lstm = LSTMGenerator.LSTMGenerator(pres)
    # length=5000 is the requested speech length (unit not visible here —
    # presumably tokens; confirm in LSTMGenerator).
    speeches = lstm.generate_n(n, length=5000)
    # NOTE(review): the evaluation cells read ../data/lstm/<president>_generated/,
    # so persist() presumably writes there — confirm in LSTMGenerator.
    lstm.persist(speeches)

## CharPred

In [4]:
# Settings for the character-level model.
# `order` is passed to both training and generation (presumably the context
# length in characters — confirm in char_max_likelihood).
order = 10
# Length handed to generate_text (presumably a character count — confirm).
letters = 5000*5
# Output template: first slot is the president, second the speech index.
filepath = "../data/charpred/{}_generated/{}.txt"

for pres in presidents:
    # One model per president, reused for all n speeches.
    lm = char_max_likelihood.train_char_lm(pres, order)
    # The output directory depends only on the president, so create it once
    # per president; exist_ok replaces the separate exists() check.
    os.makedirs(os.path.dirname(filepath.format(pres, 0)), exist_ok=True)
    for i in range(n):
        s = char_max_likelihood.generate_text(lm, order, letters, early_stopping=False)
        # Explicit encoding, matching how the NGram speeches are written, so
        # the files do not depend on the platform's default encoding.
        with open(filepath.format(pres, i), "w", encoding="utf8") as f:
            f.write(s)

## NGram

In [5]:
# Generate n speeches per president with the token/POS n-gram model.
for pres in presidents:
    # Named `out_dir` rather than `dir`: a top-level `dir` in a notebook
    # replaces the builtin dir() for the rest of the kernel session.
    out_dir = "../data/ngram/{}_generated/".format(pres)
    # The directory only depends on the president, so create it once here.
    os.makedirs(out_dir, exist_ok=True)
    for i in range(n):
        filepath = out_dir + "{}.txt".format(i)
        print(pres, i)
        # NOTE(review): the arguments of create_ngram depend only on `pres`, so
        # the models could be built once per president — but only if
        # generate_speech does not mutate them; confirm before hoisting.
        token_model, pos_model, tokens_per_pos = ngram_model.create_ngram(pres, n=2, pos_n=5, use_lower=True, pos_tagging=True)
        # top_token is a tenth of the token model's vocabulary size.
        s = ngram_model.generate_speech(i, token_model, pos_model, tokens_per_pos, max_length=5000, top_token=int(len(token_model.VOCAB) / 10), top_pos=5)

        with open(filepath, "w", encoding="utf8") as f:
            f.write(s)

obama 0


100%|██████████| 5000/5000 [00:38<00:00, 129.59it/s]


obama 1


100%|██████████| 5000/5000 [00:37<00:00, 133.17it/s]


obama 2


100%|██████████| 5000/5000 [00:40<00:00, 124.40it/s]


obama 3


100%|██████████| 5000/5000 [00:52<00:00, 95.60it/s] 


obama 4


100%|██████████| 5000/5000 [00:54<00:00, 91.79it/s] 


obama 5


100%|██████████| 5000/5000 [00:52<00:00, 94.45it/s] 


obama 6


100%|██████████| 5000/5000 [00:51<00:00, 96.92it/s] 


obama 7


100%|██████████| 5000/5000 [00:55<00:00, 89.82it/s] 


obama 8


100%|██████████| 5000/5000 [00:55<00:00, 90.71it/s] 


obama 9


100%|██████████| 5000/5000 [00:53<00:00, 93.19it/s] 


bush 0


100%|██████████| 5000/5000 [00:39<00:00, 127.75it/s]


bush 1


 36%|███▌      | 1787/5000 [00:14<00:25, 126.43it/s]


bush 2


100%|██████████| 5000/5000 [00:40<00:00, 124.03it/s]


bush 3


100%|██████████| 5000/5000 [00:40<00:00, 122.23it/s]


bush 4


100%|██████████| 5000/5000 [00:41<00:00, 119.74it/s]


bush 5


100%|██████████| 5000/5000 [00:43<00:00, 116.25it/s]


bush 6


100%|██████████| 5000/5000 [00:43<00:00, 114.39it/s]


bush 7


100%|██████████| 5000/5000 [00:42<00:00, 117.82it/s]


bush 8


 31%|███▏      | 1572/5000 [00:13<00:29, 115.22it/s]


bush 9


 57%|█████▋    | 2851/5000 [00:23<00:17, 120.75it/s]


reagan 0


100%|██████████| 5000/5000 [01:07<00:00, 73.83it/s] 


reagan 1


100%|██████████| 5000/5000 [01:09<00:00, 72.15it/s]


reagan 2


100%|██████████| 5000/5000 [01:06<00:00, 75.23it/s] 


reagan 3


 18%|█▊        | 892/5000 [00:12<00:57, 71.38it/s]


reagan 4


100%|██████████| 5000/5000 [01:07<00:00, 74.34it/s] 


reagan 5


100%|██████████| 5000/5000 [01:07<00:00, 73.80it/s]


reagan 6


100%|██████████| 5000/5000 [01:06<00:00, 75.01it/s] 


reagan 7


  0%|          | 8/5000 [00:00<01:31, 54.75it/s]


reagan 8


100%|██████████| 5000/5000 [01:08<00:00, 72.50it/s] 


reagan 9


100%|██████████| 5000/5000 [01:06<00:00, 74.92it/s] 


trump 0


100%|██████████| 5000/5000 [01:00<00:00, 82.66it/s] 


trump 1


100%|██████████| 5000/5000 [01:00<00:00, 82.86it/s] 


trump 2


100%|██████████| 5000/5000 [01:02<00:00, 79.69it/s] 


trump 3


100%|██████████| 5000/5000 [01:00<00:00, 82.31it/s] 


trump 4


100%|██████████| 5000/5000 [01:03<00:00, 78.35it/s] 


trump 5


100%|██████████| 5000/5000 [01:01<00:00, 80.81it/s] 


trump 6


100%|██████████| 5000/5000 [01:02<00:00, 79.78it/s] 


trump 7


100%|██████████| 5000/5000 [01:01<00:00, 81.84it/s] 


trump 8


100%|██████████| 5000/5000 [01:01<00:00, 80.94it/s] 


trump 9


100%|██████████| 5000/5000 [00:58<00:00, 84.87it/s] 


# Qualitative Evaluation

In [6]:
# Sub-directories of ../data that hold each model's generated speeches.
generated_speech_locations = ["lstm", "charpred", "ngram"]

In [7]:
# Print the first 700 characters of generated speech 0 for presidents[1]
# ("bush"), once per model, so the outputs can be compared side by side.
for loc in generated_speech_locations:
    # Context manager so the file handle is closed right after reading
    # instead of being left to the garbage collector.
    with open("../data/"+loc+"/"+presidents[1]+"_generated/0.txt") as f:
        speech = f.read()
    print("Generated by " + loc+":")
    print(speech[:700])
    print("\n-------------------------------------\n")

Generated by lstm:
good evening ladies and gentlemen the iraqi government is to go forward . we will make the world to be the security of our country . and so we 're working in the most industry . i appreciate you giving . i have found the moment of our country . and we must continue to make sure the most u.n. inspectors . we have seen the path of the middle east . i appreciate the hard work of the middle east . i appreciate you the american people . and our enemies have a new strategy that are not going to help the middle east and the best of our country . applause . i 'm a lot of the most of the middle east . we will not be a lot of consequence and the iraqi people in the world . and i appreciate the preside

-------------------------------------

Generated by charpred:
Good evening. Tonight in this struggle of our troops have engaged these enemies, it would not have imagined. We faced hard decisions of tyranny in our world.

As President Cheney, Mr. Chief Justice, President's counci

# Quantitative Evaluation

In [8]:
# Column names (and column order) of the results table built by generate_metrics.
metric_list = ["tfidf_cosine", "tfidf_distance", "rouge", "mean_sentence_len_ratio", "mean_word_len_ratio"]

In [9]:
def generate_metrics(president):
    """Build the quantitative evaluation table for one president.

    For every entry of the notebook-level ``generated_speech_locations`` the
    speeches under ``<location>/<president>_generated`` are scored against the
    president's real speeches using the ``metrics`` module.

    Parameters
    ----------
    president : str
        Name of the president, e.g. ``"obama"``; also used to build the
        generated-speech location ``<location>/<president>_generated``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``generated_speech_locations`` (used as index),
        columns in the order given by ``metric_list``, values rounded to three
        decimals. The two ``*_ratio`` columns are the generated mean divided by
        the mean of the real speeches.
    """
    # The reference statistics depend only on the president, so compute them
    # once instead of once per model.
    mean_sentence_l = metrics.calculate_mean_sentence_length(president)
    mean_word_l = metrics.calculate_mean_word_length(president)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for loc in generated_speech_locations:
        generated = loc+"/"+president+"_generated"
        mean_cosine, _, _ = metrics.get_cosine_sim_tfidf(president, generated, None, print_results=False)
        # 15 is forwarded unchanged (presumably the "top n" of the rank
        # distance — confirm in metrics.get_top_n_rank_distance).
        mean_distance = metrics.get_top_n_rank_distance(president, generated, None, 15)
        mean_rouge, _, _ = metrics.get_rouge_score(president, generated, None, print_results=False)
        mean_sentence_l_g = metrics.calculate_mean_sentence_length(generated)
        mean_word_l_g = metrics.calculate_mean_word_length(generated)
        rows.append({"tfidf_cosine":mean_cosine,
                     "tfidf_distance":mean_distance,
                     "rouge":mean_rouge,
                     "mean_sentence_len_ratio":mean_sentence_l_g/mean_sentence_l,
                     "mean_word_len_ratio":mean_word_l_g/mean_word_l,
                     })

    results = pd.DataFrame(rows, columns=metric_list, index=generated_speech_locations)
    return results.round(3)

## Obama

In [10]:
# Evaluation table for the generated Obama speeches, one row per model.
results = generate_metrics("obama")
results

Unnamed: 0,tfidf_cosine,tfidf_distance,rouge,mean_sentence_len_ratio,mean_word_len_ratio
lstm,0.049,559.239,0.401,0.682,0.711
charpred,0.094,4681.692,0.571,0.99,0.997
ngram,0.09,5286.658,0.433,0.889,1.033


In [11]:
# Emit the current `results` table as LaTeX source (print keeps it copy-pasteable).
print(results.to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &  tfidf\_cosine &  tfidf\_distance &  rouge &  mean\_sentence\_len\_ratio &  mean\_word\_len\_ratio \\
\midrule
lstm     &         0.049 &         559.239 &  0.401 &                    0.682 &                0.711 \\
charpred &         0.094 &        4681.692 &  0.571 &                    0.990 &                0.997 \\
ngram    &         0.090 &        5286.658 &  0.433 &                    0.889 &                1.033 \\
\bottomrule
\end{tabular}



## Bush

In [12]:
# Evaluation table for the generated Bush speeches, one row per model.
results = generate_metrics("bush")
results

Unnamed: 0,tfidf_cosine,tfidf_distance,rouge,mean_sentence_len_ratio,mean_word_len_ratio
lstm,0.059,706.923,0.332,0.727,0.828
charpred,0.096,4646.39,0.488,0.994,0.998
ngram,0.099,4129.551,0.371,0.967,1.053


## Reagan

In [13]:
# Evaluation table for the generated Reagan speeches, one row per model.
results = generate_metrics("reagan")
results

Unnamed: 0,tfidf_cosine,tfidf_distance,rouge,mean_sentence_len_ratio,mean_word_len_ratio
lstm,0.043,441.374,0.331,0.598,0.76
charpred,0.085,4886.388,0.522,1.008,1.0
ngram,0.071,4863.456,0.336,0.866,1.113


## Trump

In [14]:
# Evaluation table for the generated Trump speeches, one row per model.
results = generate_metrics("trump")
results

Unnamed: 0,tfidf_cosine,tfidf_distance,rouge,mean_sentence_len_ratio,mean_word_len_ratio
lstm,0.043,192.938,0.279,0.858,0.622
charpred,0.084,4608.881,0.48,1.03,1.003
ngram,0.079,5652.61,0.361,0.953,1.01
