# <u>LSTM ENCODER DECODER</u>

# Data

In [1]:
import utils
import numpy as np

#utils.setup_nltk()
# President whose speeches are used; the name is also interpolated into the
# output file paths built later in this notebook.
PRESIDENT = 'obama'
# presumably one list of word tokens per speech (later cells iterate over the
# words of each speech) — TODO confirm against utils.read_all_text_files
speeches = utils.read_all_text_files(PRESIDENT)

### Preprocessing

In [2]:
# tokens that are removed from every speech
filter_list = [':', '(', ')', ',', '-']

# keep all remaining tokens, lower-cased, as one list per speech
filtered_speeches = [
    [token.lower() for token in speech_tokens if token not in filter_list]
    for speech_tokens in speeches
]

### Create N-Grams

In [3]:
from nltk import ngrams

# Each sample is WINDOW context words plus the word that follows them,
# hence n-grams of length WINDOW + 1.
WINDOW = 5
grams = [ngrams(tokens, WINDOW + 1) for tokens in filtered_speeches]

# flatten the per-speech n-grams into one list
flat_grams = []
for speech_grams in grams:
    flat_grams.extend(speech_grams)

### Split Data

In [4]:
import pandas as pd

# x: the first WINDOW words of every n-gram joined by spaces; y: its last word
X = [' '.join(gram[:WINDOW]) for gram in flat_grams]
Y = [gram[-1] for gram in flat_grams]
df = pd.DataFrame({'x': X, 'y': Y})

# persist (without the index column) so the dataset loader can read it back
csv_name = '../data/lstm/preproc/{}_encdec_{}grams.csv'.format(PRESIDENT, WINDOW)
df.to_csv(csv_name, index=False)

In [5]:
import torchtext

# One torchtext field per CSV column; each field gets its own vocabulary below.
# NOTE(review): `torchtext.data.Field` belongs to the legacy torchtext API —
# confirm that the installed torchtext version still provides it.
XFIELD = torchtext.data.Field(sequential=True)
YFIELD = torchtext.data.Field(sequential=True)
# Read the CSV written to `csv_name`; skip_header drops its column-name row.
DATA = torchtext.data.TabularDataset(csv_name,'csv', 
                                     [('x', XFIELD),('y', YFIELD)], skip_header=True)

# Build the string <-> index vocabularies from the dataset.
XFIELD.build_vocab(DATA)  
YFIELD.build_vocab(DATA)



In [6]:
from torchtext.data import BucketIterator, Iterator
import torch

# Number of examples per batch; the training loop and the generation helpers
# size their initial LSTM states with this value.
BATCH_SIZE = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Iterator over DATA that yields batches placed on `device`.
train_iterator = Iterator(DATA, BATCH_SIZE, device=device, train=True)



In [7]:
# Guard: stop the notebook here unless a CUDA device was selected.
# Remove this assertion to allow runs on the CPU fallback.
assert device.type == 'cuda'

# Neural Networks

### Encoder

In [8]:
import torch.nn as nn

class Encoder(nn.Module):
    """Embeds one time step of word indices and feeds it through a bidirectional LSTM.

    Args:
        vocab_size: number of rows of the embedding table.
        hidden_size: LSTM hidden size per direction.
        embedding_dim: size of each embedding vector.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, embedding_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, x, h0, c0):
        """Run a single time step.

        Args:
            x: tensor of word indices; after embedding, a leading sequence
               dimension of length 1 is added before the LSTM call.
            h0, c0: LSTM hidden and cell state to start from.

        Returns:
            ``(out, (h, c))`` exactly as produced by the LSTM.
        """
        step = self.embedding(x).unsqueeze(0)
        return self.lstm(step, (h0, c0))

### Decoder

In [9]:
class Decoder(nn.Module):
    """Bidirectional LSTM followed by a linear layer and log-softmax over the vocabulary.

    Args:
        vocab_size: size of the embedding table and of the output layer.
        hidden_size: LSTM hidden size per direction.
        embedding_dim: size of each embedding vector.
        num_layers: number of stacked LSTM layers (dropout 0.5 is configured on the LSTM).
    """

    def __init__(self, vocab_size, hidden_size, embedding_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=0.5,
            bidirectional=True,
        )
        # both directions are concatenated, hence 2 * hidden_size input features
        self.dense = nn.Linear(2 * hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, h0, c0):
        """Decode from the given state.

        Args:
            x: tensor of word indices passed to the embedding layer.
            h0, c0: LSTM hidden and cell state to start from.

        Returns:
            ``(log_probs, (h, c))``: the log-softmax (over dim 1) of the dense
            layer applied to the LSTM output with dim 0 squeezed, and the
            LSTM's final state.
        """
        embedded = self.embedding(x)
        lstm_out, state = self.lstm(embedded, (h0, c0))
        log_probs = self.softmax(self.dense(lstm_out.squeeze(0)))
        return log_probs, state

### Training

In [10]:
# Model and optimisation hyper-parameters.
# NOTE: LSTMGenerator.__init__ in this notebook hard-codes hidden size 20,
# embedding size 50 and 2 layers, so saved models only load if these match.
HIDDEN_SIZE = 20
EMBEDDING_SIZE = 50
NUM_LAYERS = 2
LR = 0.01
ENC_LEARNING_RATE = LR
DEC_LEARNING_RATE = LR
# Negative log-likelihood loss; the decoder ends in a LogSoftmax layer.
criterion = nn.NLLLoss()

# NOTE(review): the decoder is sized with len(XFIELD.vocab) although the
# training targets are YFIELD indices — confirm len(YFIELD.vocab) does not
# exceed len(XFIELD.vocab).
encoder = Encoder(len(XFIELD.vocab), HIDDEN_SIZE, EMBEDDING_SIZE, NUM_LAYERS).to(device)
decoder = Decoder(len(XFIELD.vocab), HIDDEN_SIZE, EMBEDDING_SIZE, NUM_LAYERS).to(device)
# One Adam optimizer per network.
enc_optimizer = torch.optim.Adam(encoder.parameters(), lr = ENC_LEARNING_RATE)
dec_optimizer = torch.optim.Adam(decoder.parameters(), lr = DEC_LEARNING_RATE)

In [11]:
from tqdm import tqdm

EPOCHS = 10
for ep in range(EPOCHS):
    # Make sure both networks are in training mode (enables the decoder's
    # LSTM dropout), even if an inference helper switched them to eval mode.
    encoder.train()
    decoder.train()

    # Accumulate plain Python floats: summing the loss tensors themselves would
    # keep every batch's autograd graph referenced for the whole epoch.
    ep_loss = 0.0
    n_batches = 0

    for batch in tqdm(train_iterator):
        # The zero states below are sized for full batches, so a partial batch
        # is skipped.
        if len(batch) != BATCH_SIZE:
            continue
        inp = batch.x
        target = batch.y

        # init: fresh zero states (2 directions * NUM_LAYERS) and cleared gradients
        h0 = torch.zeros(NUM_LAYERS*2, BATCH_SIZE, HIDDEN_SIZE).to(device)
        c0 = torch.zeros(NUM_LAYERS*2, BATCH_SIZE, HIDDEN_SIZE).to(device)
        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()

        # encode: feed the input one time step at a time, carrying the state
        for w in range(inp.size(0)):
            _, (h0, c0) = encoder(inp[w], h0, c0)

        # decode: start from the last word of the window and the encoder state
        cur = inp[WINDOW-1].unsqueeze(0)
        dec_out, _ = decoder(cur, h0, c0)

        # loss
        loss = criterion(dec_out, target.squeeze())

        # optimize
        loss.backward()
        enc_optimizer.step()
        dec_optimizer.step()

        ep_loss += loss.item()
        n_batches += 1

    # Average over the batches that were actually processed.
    avg_loss = ep_loss / max(n_batches, 1)
    print('AVG_LOSS={}, (ABS={})'.format(round(avg_loss, 4), round(ep_loss, 2)))

100%|█████████▉| 3592/3593 [00:50<00:00, 71.34it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=5.7409, (ABS=20623.89)


100%|█████████▉| 3592/3593 [00:52<00:00, 68.68it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=5.2141, (ABS=18731.35)


100%|█████████▉| 3592/3593 [00:55<00:00, 65.05it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=5.0267, (ABS=18057.97)


100%|█████████▉| 3592/3593 [00:55<00:00, 64.57it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.9058, (ABS=17623.86)


100%|█████████▉| 3592/3593 [00:54<00:00, 65.85it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.8318, (ABS=17358.05)


100%|█████████▉| 3592/3593 [00:55<00:00, 65.11it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.7772, (ABS=17161.66)


100%|█████████▉| 3592/3593 [00:57<00:00, 62.30it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.741, (ABS=17031.57)


100%|█████████▉| 3592/3593 [00:58<00:00, 61.51it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.7101, (ABS=16920.72)


100%|█████████▉| 3592/3593 [00:57<00:00, 62.46it/s]
  0%|          | 0/3593 [00:00<?, ?it/s]

AVG_LOSS=4.6817, (ABS=16818.62)


100%|█████████▉| 3592/3593 [00:59<00:00, 60.56it/s]

AVG_LOSS=4.6577, (ABS=16732.61)





# Generate Text!

### functions

In [12]:
from torch import torch

def voc_index(words):
    """Look up every word in the XFIELD vocabulary and return the indices as a tensor on `device`."""
    indices = [XFIELD.vocab.stoi[token] for token in words]
    return torch.tensor(indices).to(device)

def predict(inp, RND_FACTOR=0, multiply=False, h0=None, c0=None):
    """Predict the word that follows a window of word indices.

    Args:
        inp: index tensor whose first dimension is the position in the window;
            `generate` passes a (WINDOW, BATCH_SIZE) tensor.
        RND_FACTOR: scale of the uniform noise applied to the decoder output
            before taking the argmax (0 disables the noise).
        multiply: if True the output is multiplied by ``1 + noise``, otherwise
            the noise is added.
        h0, c0: optional initial LSTM states; zeros are used when omitted.

    Returns:
        ``(word, (h, c))``: the predicted word (looked up in YFIELD's
        vocabulary, taken from the first batch column) and the decoder's
        final LSTM state.
    """
    # `is None` rather than `== None`: the arguments may be tensors, for which
    # `==` is not an identity check.
    if h0 is None:
        h0 = torch.zeros(2*NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE).to(device)
    if c0 is None:
        c0 = torch.zeros(2*NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE).to(device)

    # no_grad() does not disable dropout, so switch to eval mode for inference
    # and restore the previous mode afterwards so training is unaffected.
    enc_was_training, dec_was_training = encoder.training, decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            # encode the window one position at a time, carrying the state
            for w in range(inp.size(0)):
                _, (h0, c0) = encoder(inp[w], h0, c0)

            # decode from the last word of the window
            cur = inp[WINDOW-1].unsqueeze(0)
            dec_out, (h0, c0) = decoder(cur, h0, c0)

            # randomize
            if multiply:
                rnd = torch.rand(dec_out.shape).to(device) * RND_FACTOR + 1
                cur = torch.argmax(dec_out * rnd,dim=1)
            else:
                rnd = torch.rand(dec_out.shape).to(device) * RND_FACTOR
                cur = torch.argmax(dec_out.add(rnd),dim=1)

            return YFIELD.vocab.itos[cur[0].item()], (h0, c0)
    finally:
        encoder.train(enc_was_training)
        decoder.train(dec_was_training)

def generate(intro=['good', 'evening', 'ladies', 'and', 'gentlemen'], multiply=False, rnd_factor=10, length=100, decay=None):
    """Generate text by repeatedly predicting the word after the last WINDOW words.

    Args:
        intro: seed words; must contain at least WINDOW words. The list is
            copied and never modified.
        multiply: forwarded to `predict` (multiplicative instead of additive noise).
        rnd_factor: forwarded to `predict` as the noise scale.
        length: number of words to generate.
        decay: if truthy, the LSTM state returned by one prediction is passed
            into the next one; otherwise every prediction starts from zeros.

    Returns:
        The intro followed by the generated words, joined by single spaces.

    Raises:
        ValueError: if `intro` has fewer than WINDOW words.
    """
    # Copy: appending to `intro` itself would alter the caller's list and the
    # shared default argument, so a second call would start from a longer intro.
    text = list(intro)
    if len(text) < WINDOW:
        raise ValueError('intro must contain at least {} words'.format(WINDOW))

    h0 = torch.zeros(2*NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE).to(device)
    c0 = torch.zeros(2*NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE).to(device)

    for i in range(length):
        cur_window = text[-WINDOW:]
        # the same window is repeated across the batch dimension
        vecs = voc_index(cur_window).view(WINDOW,1).repeat(1,BATCH_SIZE)

        if decay:
            prediction, (h0, c0) = predict(vecs, rnd_factor, multiply, h0, c0)
        else:
            prediction, _ = predict(vecs, rnd_factor, multiply)

        text.append(prediction)

    return ' '.join(text)

### create N speeches

In [None]:
%%time

generated = []
GENERATE_N = 10
# Generated speeches get the mean length of the original speeches.
# int(...) is needed because, depending on the NumPy version, round() applied
# to a NumPy float may return a NumPy float, which range() does not accept.
MEAN_OG_SPEECH_LEN = int(round(np.mean([len(s) for s in speeches])))
for i in range(GENERATE_N):
    generated.append(
        generate(intro=['good', 'evening', 'ladies', 'and', 'gentlemen'], 
                 multiply=True, 
                 rnd_factor=1.2, 
                 length=MEAN_OG_SPEECH_LEN, 
                 decay=True)
    )

# show the first generated speech as the cell output
generated[0]

# Evaluate

In [None]:
%%time

import metrics

# persist: one text file per generated speech, named by its index
for i, speech_text in enumerate(generated):
    with open("../data/lstm/{}_generated/{}.txt".format(PRESIDENT, str(i)), "w") as text_file:
        text_file.write(speech_text)

# location of the generated speeches as passed to the metrics helpers
gen_loc = "lstm/{}_generated".format(PRESIDENT)

# scores: cosine similarity and ROUGE
mean_cos, std_cos, cos_sim = metrics.get_cosine_sim_tfidf(PRESIDENT, gen_loc, print_results=True)
rouge = metrics.get_rouge_score(PRESIDENT, gen_loc, print_results=True)

# mean sentence length, original vs generated
og_sen_len = metrics.calculate_mean_sentence_length(PRESIDENT)
gen_sen_len = metrics.calculate_mean_sentence_length(gen_loc)
print('mean sentence len (OG vs GEN) {}'.format((og_sen_len, gen_sen_len)))
print('mean sentence len diff {}'.format(og_sen_len-gen_sen_len))

# mean word length, original vs generated
og_w_len = metrics.calculate_mean_word_length(PRESIDENT)
gen_w_len = metrics.calculate_mean_word_length(gen_loc)
print('mean word len (OG vs GEN) {}'.format((og_w_len, gen_w_len)))
print('mean word len diff {}'.format(og_w_len-gen_w_len))

# rank distance over the top-n entries
top = 15
rank_dist = metrics.get_top_n_rank_distance(orig_speeches_loc=PRESIDENT, gen_speeches_loc=gen_loc, n=top)
print('top{} rank distance {}'.format(top, rank_dist))

# Save Models

In [15]:
import pickle

# Output locations: <MODEL_PATH><president>_{enc,dec,xvoc,yvoc}.pt
MODEL_PATH = '../data/lstm/models/'
ENC_PATH = "{}{}_enc.pt".format(MODEL_PATH, PRESIDENT)
DEC_PATH = "{}{}_dec.pt".format(MODEL_PATH, PRESIDENT)
XVOC_PATH = "{}{}_xvoc.pt".format(MODEL_PATH, PRESIDENT)
YVOC_PATH = "{}{}_yvoc.pt".format(MODEL_PATH, PRESIDENT)

# SAVE MODELS (weights only; the architecture is rebuilt on load)
torch.save(encoder.state_dict(), ENC_PATH)
torch.save(decoder.state_dict(), DEC_PATH)

# SAVE FIELDS
# `with` closes (and thereby flushes) the files; passing a bare open(...) to
# pickle.dump leaves the handle open until it is garbage collected.
with open(XVOC_PATH, 'wb') as xvoc_file:
    pickle.dump(XFIELD.vocab, xvoc_file)
with open(YVOC_PATH, 'wb') as yvoc_file:
    pickle.dump(YFIELD.vocab, yvoc_file)

In [63]:
class LSTMGenerator():
    """Loads a saved encoder/decoder pair plus vocabularies and generates text.

    All files are read from '../data/lstm/models/' and are named after the
    president: '<president>_xvoc.pt', '<president>_yvoc.pt',
    '<president>_enc.pt' and '<president>_dec.pt'.
    """

    def __init__(self, president):
        """Load vocabularies and model weights for `president`.

        Args:
            president: name used to build the file names of the saved artefacts.
        """
        # le president
        self.president = president
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # load vocabularies
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open('../data/lstm/models/{}_xvoc.pt'.format(president), 'rb') as xvoc_file:
            self.xvoc = pickle.load(xvoc_file)
        with open('../data/lstm/models/{}_yvoc.pt'.format(president), 'rb') as yvoc_file:
            self.yvoc = pickle.load(yvoc_file)

        # model parameters (fixed); they must match the values used for training
        self.hidden_size = 20
        self.embedding_size = 50
        self.num_layers = 2
        self.batch_size = 64
        self.window = 5

        # load models
        # map_location lets weights that were saved on a GPU load on a CPU-only machine.
        self.encoder = Encoder(len(self.xvoc), self.hidden_size, self.embedding_size, self.num_layers).to(self.device)
        self.encoder.load_state_dict(
            torch.load('../data/lstm/models/{}_enc.pt'.format(president), map_location=self.device))
        self.decoder = Decoder(len(self.xvoc), self.hidden_size, self.embedding_size, self.num_layers).to(self.device)
        self.decoder.load_state_dict(
            torch.load('../data/lstm/models/{}_dec.pt'.format(president), map_location=self.device))

        # This class only does inference: eval mode switches off the decoder's
        # LSTM dropout, which torch.no_grad() alone does not do.
        self.encoder.eval()
        self.decoder.eval()

    def voc_index(self, words):
        """Return the input-vocabulary indices of `words` as a tensor on the model device."""
        return torch.tensor([self.xvoc.stoi[x] for x in words]).to(self.device)

    def predict(self, inp, RND_FACTOR=0, multiply=False, h0=None, c0=None):
        """Predict the word that follows a window of word indices.

        Args:
            inp: index tensor whose first dimension is the position in the
                window; `generate` passes a (window, batch_size) tensor.
            RND_FACTOR: scale of the uniform noise applied to the decoder
                output before taking the argmax (0 disables the noise).
            multiply: if True the output is multiplied by ``1 + noise``,
                otherwise the noise is added.
            h0, c0: optional initial LSTM states; zeros are used when omitted.

        Returns:
            ``(word, (h, c))``: the predicted word (looked up in the output
            vocabulary, taken from the first batch column) and the decoder's
            final LSTM state.
        """
        with torch.no_grad():
            # `is None` rather than `== None`: the arguments may be tensors.
            if h0 is None:
                h0 = torch.zeros(2*self.num_layers, self.batch_size, self.hidden_size).to(self.device)
            if c0 is None:
                c0 = torch.zeros(2*self.num_layers, self.batch_size, self.hidden_size).to(self.device)

            # encode the window one position at a time, carrying the state
            for w in range(inp.size(0)):
                _, (h0, c0) = self.encoder(inp[w], h0, c0)

            # decode from the last word of the window
            cur = inp[self.window-1].unsqueeze(0)
            dec_out, (h0, c0) = self.decoder(cur, h0, c0)

            # randomize
            if multiply:
                rnd = torch.rand(dec_out.shape).to(self.device) * RND_FACTOR + 1
                cur = torch.argmax(dec_out * rnd,dim=1)
            else:
                rnd = torch.rand(dec_out.shape).to(self.device) * RND_FACTOR
                cur = torch.argmax(dec_out.add(rnd),dim=1)

            return self.yvoc.itos[cur[0].item()], (h0, c0)

    def generate(self, intro=['good', 'evening', 'ladies', 'and', 'gentlemen'], multiply=True, rnd_factor=1.2, length=4000, carry=True):
        """Generate one text of `length` words following `intro`.

        Args:
            intro: seed words; must contain at least `self.window` words. The
                list is copied and never modified.
            multiply: forwarded to `predict`.
            rnd_factor: forwarded to `predict` as the noise scale.
            length: number of words to generate.
            carry: if True, the LSTM state returned by one prediction is passed
                into the next one; otherwise every prediction starts from zeros.

        Returns:
            The intro followed by the generated words, joined by single spaces.

        Raises:
            ValueError: if `intro` has fewer than `self.window` words.
        """
        # Copy: appending to `intro` itself would alter the caller's list and
        # the shared default argument.
        text = list(intro)
        if len(text) < self.window:
            raise ValueError('intro must contain at least {} words'.format(self.window))

        h0 = torch.zeros(2*self.num_layers, self.batch_size, self.hidden_size).to(self.device)
        c0 = torch.zeros(2*self.num_layers, self.batch_size, self.hidden_size).to(self.device)

        for i in range(length):
            cur_window = text[-self.window:]
            # the same window is repeated across the batch dimension
            vecs = self.voc_index(cur_window).view(self.window,1).repeat(1,self.batch_size)

            if carry:
                prediction, (h0, c0) = self.predict(vecs, rnd_factor, multiply, h0, c0)
            else:
                prediction, _ = self.predict(vecs, rnd_factor, multiply)

            text.append(prediction)

        return ' '.join(text)

    def generate_n(self, N=10, intro=['good', 'evening', 'ladies', 'and', 'gentlemen'], multiply=True, rnd_factor=1.2, length=4000, carry=True):
        """Generate `N` texts with identical settings.

        Args:
            N: number of texts to generate.
            intro, multiply, rnd_factor, length, carry: forwarded to `generate`.

        Returns:
            A list with the `N` generated texts.
        """
        # Forward the caller's intro instead of a hard-coded one.
        return [
            self.generate(intro=intro,
                          multiply=multiply,
                          rnd_factor=rnd_factor,
                          length=length,
                          carry=carry)
            for _ in range(N)
        ]

    def persist(self, generated, path=None):
        """Write every text in `generated` to '<path><index>.txt'.

        Args:
            generated: list of texts.
            path: path prefix for the files; defaults to
                '../data/lstm/<president>_generated/'.
        """
        if path is None:
            path = "../data/lstm/{}_generated/".format(self.president)

        # persist
        for i, speech_text in enumerate(generated):
            with open("{}{}.txt".format(path, str(i)), "w") as text_file:
                text_file.write(speech_text)

In [64]:
# Smoke test of the class: load the saved 'obama' artefacts, generate a single
# speech with the default settings and write it to the default output folder.
# NOTE: this rebinds `generated` and writes '0.txt' into
# '../data/lstm/obama_generated/', replacing any file of that name.
gen = LSTMGenerator('obama')
generated = gen.generate_n(1)
gen.persist(generated)