# <u>LSTM ENCODER DECODER</u>

# Data

In [1]:
import utils
import numpy as np

# One-time setup call from the project-local `utils` module, left disabled here.
# NOTE(review): presumably fetches the NLTK resources the helpers need -- confirm in utils.
#utils.setup_nltk()
# Corpus selector; also reused further down to build the output file paths.
PRESIDENT = 'obama'
# NOTE(review): presumably a list of speeches, each a list of word tokens -- the
# preprocessing cell iterates speech by speech and calls .lower() on every element.
speeches = utils.read_all_text_files(PRESIDENT)

### Preprocessing

In [2]:
# Punctuation tokens that are removed from every speech before n-gram extraction.
filter_list = [':', '(', ')', ',', '-',]

# One token list per speech: drop the filtered tokens and lower-case the rest.
filtered_speeches = [
    [token.lower() for token in speech if token not in filter_list]
    for speech in speeches
]

### Create N-Grams

In [3]:
from nltk import ngrams

# Number of context words per sample; every n-gram holds WINDOW words plus the word after them.
WINDOW = 5
grams = [ngrams(s, WINDOW+1) for s in filtered_speeches]

# Flatten the per-speech n-grams into a single list. N-grams are built speech by
# speech, so no sample spans the boundary between two speeches.
flat_grams = []
for speech_grams in grams:
    flat_grams.extend(speech_grams)

### Split Data

In [4]:
import pandas as pd

# Input: the first WINDOW words of each n-gram, joined by single spaces.
# Target: the word that follows them (the last element of the n-gram).
X, Y = [], []
for gram in flat_grams:
    X.append(' '.join(gram[:WINDOW]))
    Y.append(gram[-1])
df = pd.DataFrame.from_dict({'x': X, 'y': Y})

# persist
csv_name = '../data/lstm/preproc/{}_encdec_{}grams.csv'.format(PRESIDENT, WINDOW)
df.to_csv(csv_name, index=False)

In [5]:
import torchtext

# One field per CSV column: 'x' holds the context words, 'y' the word to predict.
XFIELD = torchtext.data.Field(sequential=True)
YFIELD = torchtext.data.Field(sequential=True)

# Read the CSV persisted above; the header row written by pandas is skipped.
DATA = torchtext.data.TabularDataset(
    path=csv_name,
    format='csv',
    fields=[('x', XFIELD), ('y', YFIELD)],
    skip_header=True,
)

# NOTE(review): each build_vocab call is expected to count only the column bound to
# that field, so XFIELD and YFIELD indices would not be interchangeable -- confirm
# against the installed torchtext version.
XFIELD.build_vocab(DATA)
YFIELD.build_vocab(DATA)



In [6]:
from torchtext.data import BucketIterator, Iterator
import torch

BATCH_SIZE = 64

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Batches of BATCH_SIZE samples, with their tensors created on `device`.
train_iterator = Iterator(dataset=DATA, batch_size=BATCH_SIZE, device=device, train=True)



In [7]:
# Fail fast when no GPU was found: the device selection falls back to 'cpu'
# silently, and this guard stops the notebook before the training cells run.
assert device.type == 'cuda'

# Neural Networks

### Encoder

In [8]:
import torch.nn as nn

class Encoder(nn.Module):
    """Embeds one time step of token indices and advances a bidirectional LSTM by it.

    Args:
        vocab_size: number of rows in the embedding table.
        hidden_size: LSTM hidden size per direction.
        embedding_dim: size of each embedding vector (the LSTM input size).
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, embedding_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, x, h0, c0):
        """Run a single step through the LSTM.

        Args:
            x: token indices for one time step; a leading length-1 sequence axis
                is added after the embedding lookup.
            h0: hidden state handed to the LSTM.
            c0: cell state handed to the LSTM.

        Returns:
            ``(out, (h, c))``: the LSTM output for this step and the updated state.
        """
        step = self.embedding(x)[None]  # same as unsqueeze(0): prepend the sequence axis
        out, state = self.lstm(step, (h0, c0))
        return out, state

### Decoder

In [9]:
class Decoder(nn.Module):
    """Bidirectional LSTM decoder that maps token indices to log-probabilities.

    Args:
        vocab_size: size of BOTH the embedding table and the output layer, so it
            must cover every index fed in and every index used as a target.
        hidden_size: LSTM hidden size per direction.
        embedding_dim: size of each embedding vector (the LSTM input size).
        num_layers: number of stacked LSTM layers (dropout of 0.5 is applied
            between layers while the module is in training mode).
    """

    def __init__(self, vocab_size, hidden_size, embedding_dim, num_layers):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, dropout=0.5, bidirectional=True)
        # Both directions are concatenated, hence hidden_size*2 input features.
        self.dense = nn.Linear(hidden_size*2, vocab_size)
        # Normalise over the vocabulary (last) axis. The previous dim=1 was only
        # right for a single time step; with longer inputs squeeze(0) is a no-op
        # and dim=1 would have normalised across the batch instead.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x, h0, c0):
        """Decode `x` starting from the state ``(h0, c0)``.

        Args:
            x: token indices with the sequence axis first, i.e. (seq_len, batch).
            h0: initial hidden state for the LSTM.
            c0: initial cell state for the LSTM.

        Returns:
            ``(log_probs, (h, c))``. For a single time step ``log_probs`` is
            (batch, vocab_size); for longer inputs the sequence axis is kept.
        """
        x = self.embedding(x)
        x, (h0, c0) = self.lstm(x, (h0, c0))
        # squeeze(0) drops the sequence axis only when it has length 1.
        x = self.dense(x.squeeze(0))
        x = self.softmax(x)
        return x, (h0, c0)

### Training

In [10]:
# Model / optimisation hyperparameters.
HIDDEN_SIZE = 20        # LSTM hidden size per direction
EMBEDDING_SIZE = 50     # embedding vector size
NUM_LAYERS = 2          # stacked LSTM layers in encoder and decoder
LR = 0.01
ENC_LEARNING_RATE = LR
DEC_LEARNING_RATE = LR
# The decoder ends in LogSoftmax, so negative log-likelihood is the matching loss.
criterion = nn.NLLLoss()

# The decoder embeds XFIELD indices (its input is the last context word) but is
# trained against, and decoded with, YFIELD indices. Size it for the larger of
# the two vocabularies so neither an input nor a target index can fall outside
# the embedding table / output layer.
DEC_VOCAB_SIZE = max(len(XFIELD.vocab), len(YFIELD.vocab))

encoder = Encoder(len(XFIELD.vocab), HIDDEN_SIZE, EMBEDDING_SIZE, NUM_LAYERS).to(device)
decoder = Decoder(DEC_VOCAB_SIZE, HIDDEN_SIZE, EMBEDDING_SIZE, NUM_LAYERS).to(device)
enc_optimizer = torch.optim.Adam(encoder.parameters(), lr = ENC_LEARNING_RATE)
dec_optimizer = torch.optim.Adam(decoder.parameters(), lr = DEC_LEARNING_RATE)

In [11]:
from tqdm import tqdm

EPOCHS = 20
for ep in range(EPOCHS):
    # Make sure dropout is active even if another cell left the models in eval mode.
    encoder.train()
    decoder.train()

    # Accumulate plain Python floats: summing the loss tensors themselves would
    # keep autograd history alive for the whole epoch.
    ep_loss = 0.0
    n_batches = 0

    for batch in tqdm(train_iterator):
        # The initial states below are sized for full batches, so a short batch is
        # skipped; `continue` keeps the epoch going wherever that batch appears.
        if len(batch) != BATCH_SIZE:
            continue
        inp = batch.x
        target = batch.y

        # init: fresh zero state for every batch (2 directions per layer)
        h0 = torch.zeros(NUM_LAYERS*2, BATCH_SIZE, HIDDEN_SIZE, device=device)
        c0 = torch.zeros(NUM_LAYERS*2, BATCH_SIZE, HIDDEN_SIZE, device=device)
        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()

        # encode: feed the context one word at a time, carrying the state forward
        for w in range(inp.size(0)):
            enc_out, (h0, c0) = encoder(inp[w], h0, c0)

        # decode: a single step, seeded with the last context word and the encoder state
        cur = inp[WINDOW-1].unsqueeze(0)
        dec_out, _ = decoder(cur, h0, c0)

        # loss: decoder log-probabilities against the target word indices
        loss = criterion(dec_out, target.squeeze())

        # optimize
        loss.backward()
        enc_optimizer.step()
        dec_optimizer.step()

        ep_loss += loss.item()
        n_batches += 1

    # Average over the batches that were actually trained on.
    avg_loss = ep_loss / max(n_batches, 1)
    print('AVG_LOSS={}, (ABS={})'.format(round(avg_loss, 4), round(ep_loss, 2)))

100%|█████████▉| 3592/3593 [00:49<00:00, 71.84it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=5.8158, (ABS=20892.94)


100%|█████████▉| 3592/3593 [00:55<00:00, 64.87it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=5.2816, (ABS=18973.67)


100%|█████████▉| 3592/3593 [00:57<00:00, 62.13it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=5.0834, (ABS=18261.69)


100%|█████████▉| 3592/3593 [00:59<00:00, 60.58it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.9568, (ABS=17807.0)


100%|█████████▉| 3592/3593 [01:00<00:00, 59.21it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.8751, (ABS=17513.51)


100%|█████████▉| 3592/3593 [01:01<00:00, 58.35it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.8204, (ABS=17316.87)


100%|█████████▉| 3592/3593 [01:02<00:00, 57.88it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.7694, (ABS=17133.91)


100%|█████████▉| 3592/3593 [01:03<00:00, 56.74it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.7362, (ABS=17014.48)


100%|█████████▉| 3592/3593 [01:03<00:00, 56.78it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.7087, (ABS=16915.77)


100%|█████████▉| 3592/3593 [01:04<00:00, 55.58it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.697, (ABS=16873.63)


100%|█████████▉| 3592/3593 [01:03<00:00, 56.59it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6808, (ABS=16815.44)


100%|█████████▉| 3592/3593 [01:04<00:00, 55.87it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6725, (ABS=16785.66)


100%|█████████▉| 3592/3593 [01:03<00:00, 56.29it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6595, (ABS=16739.1)


100%|█████████▉| 3592/3593 [01:04<00:00, 55.88it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6435, (ABS=16681.43)


100%|█████████▉| 3592/3593 [01:05<00:00, 55.19it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.635, (ABS=16650.88)


100%|█████████▉| 3592/3593 [01:04<00:00, 55.75it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6261, (ABS=16618.99)


100%|█████████▉| 3592/3593 [01:04<00:00, 55.65it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6113, (ABS=16565.95)


100%|█████████▉| 3592/3593 [01:05<00:00, 54.81it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6173, (ABS=16587.34)


100%|█████████▉| 3592/3593 [01:04<00:00, 55.89it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6136, (ABS=16574.12)


100%|█████████▉| 3592/3593 [01:05<00:00, 55.07it/s]

AVG_LOSS=4.6115, (ABS=16566.47)





# Generate Text!

### functions

In [12]:
from torch import torch

def voc_index(words):
    """Look up every word in the XFIELD vocabulary and return the indices as a tensor on `device`."""
    indices = []
    for word in words:
        indices.append(XFIELD.vocab.stoi[word])
    return torch.tensor(indices).to(device)

def predict(inp, RND_FACTOR=0, multiply=False, h0=None, c0=None):
    """Predict the word that follows a window of context words.

    Args:
        inp: context word indices with the sequence axis first; every column is
            one sample. Only the prediction for the first column is returned.
        RND_FACTOR: strength of the uniform noise applied to the decoder scores
            before the argmax; 0 gives a plain argmax.
        multiply: if true, the scores are multiplied by ``1 + noise``; otherwise
            the noise is added. The scores are log-probabilities (<= 0), so a
            larger multiplier pushes a candidate down.
        h0, c0: encoder start state; zeros matching the width of `inp` when omitted.

    Returns:
        ``(word, (h, c))``: the predicted word looked up in the YFIELD vocabulary
        and the state returned by the decoder.
    """
    # Run in eval mode so the decoder's dropout does not perturb generation, then
    # put both models back into whatever mode they were in.
    enc_was_training, dec_was_training = encoder.training, decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            # `is None` rather than `== None`: comparing a tensor with == is not
            # an identity check.
            batch_width = inp.size(1)
            if h0 is None:
                h0 = torch.zeros(2*NUM_LAYERS, batch_width, HIDDEN_SIZE).to(device)
            if c0 is None:
                c0 = torch.zeros(2*NUM_LAYERS, batch_width, HIDDEN_SIZE).to(device)

            # encode the window word by word, carrying the state forward
            for w in range(inp.size(0)):
                enc_out, (h0, c0) = encoder(inp[w], h0, c0)

            # decode one step from the last context word
            cur = inp[WINDOW-1].unsqueeze(0)
            dec_out, (h0, c0) = decoder(cur, h0, c0)

            # Only indices that exist in the target vocabulary can be turned back
            # into a word; ignore any extra output columns so the lookup below
            # cannot run past the end of the list.
            scores = dec_out[:, :len(YFIELD.vocab)]

            # randomize
            noise = torch.rand(scores.shape).to(device) * RND_FACTOR
            if multiply:
                cur = torch.argmax(scores * (noise + 1), dim=1)
            else:
                cur = torch.argmax(scores.add(noise), dim=1)

            return YFIELD.vocab.itos[cur[0].item()], (h0, c0)
    finally:
        encoder.train(enc_was_training)
        decoder.train(dec_was_training)

def generate(intro=['good', 'evening', 'ladies', 'and', 'gentlemen'], multiply=False, rnd_factor=10, length=100, decay=None):
    """Generate text by repeatedly predicting the word after the last WINDOW words.

    Args:
        intro: seed words; must contain at least WINDOW words. The list is not modified.
        multiply: forwarded to `predict` (multiplicative instead of additive noise).
        rnd_factor: forwarded to `predict` as the noise strength.
        length: number of words to append to the intro.
        decay: if truthy, the state returned by one prediction is fed into the
            next one; otherwise every prediction starts from a zero state.

    Returns:
        The intro followed by the generated words, joined by single spaces.
    """
    # Work on a copy: appending to `intro` itself would grow the shared default
    # list (and any list passed by a caller) from one call to the next.
    text = list(intro)
    h0 = torch.zeros(2*NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE).to(device)
    c0 = torch.zeros(2*NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE).to(device)

    for i in range(length):
        cur_window = text[-WINDOW:]
        # Replicate the window across BATCH_SIZE columns so it matches the state width.
        vecs = voc_index(cur_window).view(WINDOW,1).repeat(1,BATCH_SIZE)

        if decay:
            prediction, (h0, c0) = predict(vecs, rnd_factor, multiply, h0, c0)
        else:
            prediction, _ = predict(vecs, rnd_factor, multiply)

        text.append(prediction)

    return ' '.join(text)

### create N speeches

In [13]:
%%time

GENERATE_N = 10
# Generated speeches get as many new words as the average original speech has tokens.
MEAN_OG_SPEECH_LEN = round(np.mean([len(s) for s in speeches]))

# Every speech starts from the same five-word intro.
generated = [
    generate(intro=['good', 'evening', 'ladies', 'and', 'gentlemen'],
             multiply=True,
             rnd_factor=1.2,
             length=MEAN_OG_SPEECH_LEN,
             decay=True)
    for _ in range(GENERATE_N)
]

generated[0]

CPU times: user 7min 37s, sys: 6.96 s, total: 7min 44s
Wall time: 7min 44s


"good evening ladies and gentlemen . and that 's why i 'm going to be clear that we ’ re going to be a nation that are here in the world . and i ’ m here in the world . we must be met in the world . and that 's why we ’ ve been the promise of the world . but i think that we have a more prosperous . and we will not be easy . and that ’ s why we ’ re not the ability to take us to the house . and we must be solved . and we ’ re going to be the same prescriptions . we ’ re not a more perfect . we know that it is not just a whole bunch of leaders of the great law -- and the answer is not a lot of people in the world . and i ’ m not naïve to the world . and we are the united to the united states of america . and that ’ s why i ’ ve been done . and the united states of america . applause . we will be a different for the money . and we ’ re not clear to the world and the taliban have come through a new of law . and that ’ s why i ’ m announcing a lot of . we will be vigilant in the world . it 

# Evaluate

In [14]:
%%time

import metrics

# persist
# Write every generated speech to its own numbered text file.
for idx, speech_text in enumerate(generated):
    out_path = "../data/lstm/{}_generated/{}.txt".format(PRESIDENT, str(idx))
    with open(out_path, "w") as text_file:
        text_file.write(speech_text)

# scores
# Location of the generated speeches as passed to the project-local metrics helpers.
gen_loc = "lstm/{}_generated".format(PRESIDENT)

mean_cos, std_cos, cos_sim = metrics.get_cosine_sim_tfidf(PRESIDENT, gen_loc, print_results=True)
rouge = metrics.get_rouge_score(PRESIDENT, gen_loc, print_results=True)

# Mean sentence length: original vs generated.
og_sen_len = metrics.calculate_mean_sentence_length(PRESIDENT)
gen_sen_len = metrics.calculate_mean_sentence_length(gen_loc)
print('mean sentence len (OG vs GEN) {}'.format((og_sen_len, gen_sen_len)))
print('mean sentence len diff {}'.format(og_sen_len-gen_sen_len))

# Mean word length: original vs generated.
og_w_len = metrics.calculate_mean_word_length(PRESIDENT)
gen_w_len = metrics.calculate_mean_word_length(gen_loc)
print('mean word len (OG vs GEN) {}'.format((og_w_len, gen_w_len)))
print('mean word len diff {}'.format(og_w_len-gen_w_len))

# Rank distance over the `top` highest-ranked entries of both corpora.
top = 15
rank_dist = metrics.get_top_n_rank_distance(orig_speeches_loc=PRESIDENT, gen_speeches_loc=gen_loc, n=top)
print('top{} rank distance {}'.format(top, rank_dist))

mean cosine similarity over all generated speeches: 0.046213566006767126
standard deviation of cosine similarity over all generated speeches: 0.012049643472512502
mean rouge score for all generated speeches: 0.4079794917516292
standard deviation of rouge score for all generated speeches: 0.0027243518794677277
mean sentence len (OG vs GEN) (18.303574290751722, 11.772115596735054)
mean sentence len diff 6.531458694016669
mean word len (OG vs GEN) (4.624454002549181, 3.3120036101083032)
mean word len diff 1.3124503924408781
top15 rank distance 601.2669977824696
CPU times: user 49.1 s, sys: 86.7 ms, total: 49.2 s
Wall time: 49.2 s


# Save Models