# <u>LSTM Encoder / Decoder on Character Basis</u>

## Data

In [1]:
import utils
import numpy as np

# NOTE(review): presumably a one-time NLTK setup step from the local `utils`
# module; it is left disabled here -- confirm whether a fresh environment needs it.
#utils.setup_nltk()
# Name of the president whose speeches are loaded; also reused later when
# building the output file name.
PRESIDENT = 'obama'
# NOTE(review): `read_all_text_files` is defined in the local `utils` module and
# is not visible here; the cells below iterate over each speech item by item, so
# it presumably returns one token sequence per speech -- TODO confirm.
speeches = utils.read_all_text_files(PRESIDENT)

In [2]:
# Sanity check that the expected corpus was loaded: the mean number of items
# per speech must match the known value.
# NOTE(review): this is an exact floating-point comparison and will fail as soon
# as the corpus changes -- consider a tolerance-based check.
assert np.mean([len(x) for x in speeches]) == 4878.9

### Preprocessing

In [3]:
# Items that are dropped entirely from every speech.
filter_list = [':', '(', ')', ',', '-',]

# For every speech keep the items that are not in `filter_list`, lower-cased.
# NOTE(review): each speech is assumed to be a sequence of string tokens -- confirm
# against utils.read_all_text_files.
filtered_speeches = [
    [token.lower() for token in speech_tokens if token not in filter_list]
    for speech_tokens in speeches
]

### Create Character N-Grams

In [81]:
%%time

# Number of characters per sliding window: WINDOW - 1 input characters plus
# one target character.
WINDOW = 5
cgrams = []
# Each speech is joined into one space-separated string; the one-element list
# wrapper is kept so `joined_speeches` has the same shape as before.
joined_speeches = [[' '.join(s)] for s in filtered_speeches]

for speech in joined_speeches:
    text = speech[0]
    # Slide the window by index instead of re-slicing the remaining string on
    # every step (which copies the tail each time and is quadratic per speech).
    # Texts shorter than WINDOW yield an empty range and contribute nothing.
    for start in range(len(text) - WINDOW + 1):
        window = text[start:start + WINDOW]
        # X: the first WINDOW-1 characters separated by spaces, Y: the last character.
        # NOTE(review): a space character inside X or as Y cannot be told apart
        # from the separator once the pairs are whitespace-tokenized downstream;
        # this may explain <pad> tokens in the generated text -- TODO confirm.
        cgrams.append((' '.join(window[:-1]), window[-1]))

CPU times: user 2.01 s, sys: 6.97 ms, total: 2.02 s
Wall time: 2.01 s


### Persist Data

In [82]:
import pandas as pd

# Target file: ../data/lstm/<PRESIDENT>_preproc/encdec_<WINDOW>CHAR_grams.csv
csv_name = ''.join([
    '../data/lstm/',
    PRESIDENT,
    '_preproc/encdec_',
    str(WINDOW),
    'CHAR_grams.csv',
])

# One row per character n-gram: input characters in 'X', target character in 'Y'.
df = pd.DataFrame(cgrams, columns=['X', 'Y'])

# Persist without the index column so the file contains only X and Y.
df.to_csv(csv_name, index=False)

### Batch Data

In [83]:
import torchtext
from torchtext.data import BucketIterator, Iterator
import torch


# Batch size and compute device (GPU when available, otherwise CPU).
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One sequential field per CSV column.
XFIELD = torchtext.data.Field(sequential=True)
YFIELD = torchtext.data.Field(sequential=True)

# Read the persisted n-grams back; the header row written by pandas is skipped.
DATA = torchtext.data.TabularDataset(
    path=csv_name,
    format='csv',
    fields=[('x', XFIELD), ('y', YFIELD)],
    skip_header=True,
)

# Build a separate vocabulary for inputs and targets from the same dataset.
for field in (XFIELD, YFIELD):
    field.build_vocab(DATA)

train_iterator = Iterator(DATA, batch_size=BATCH_SIZE, device=device, train=True)

In [84]:
# Fail fast unless a CUDA device was selected.
# NOTE(review): `device` is chosen with a CPU fallback, but this assertion makes a
# GPU mandatory for the rest of the notebook -- confirm that this is intended.
assert device.type == 'cuda'

# Neural Networks

### Encoder

In [87]:
import torch.nn as nn

class Encoder(nn.Module):
    """Embedding layer followed by a bidirectional multi-layer LSTM.

    The encoder is driven one time step at a time: `forward` embeds a single
    step of token indices and advances the recurrent state by that step.
    """

    def __init__(self, vocab_size, hidden_size, embedding_dim, num_layers):
        """Create the layers.

        Args:
            vocab_size: number of entries in the embedding table.
            hidden_size: hidden size of the LSTM (per direction).
            embedding_dim: size of each embedding vector.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, x, h0, c0):
        """Advance the LSTM by one time step.

        Args:
            x: token indices for a single step (one index per batch element).
            h0: current hidden state of the LSTM.
            c0: current cell state of the LSTM.

        Returns:
            A tuple `(out, (h, c))` with the LSTM output for this step and the
            updated hidden and cell states.
        """
        embedded = self.embedding(x)
        # Add a leading sequence axis of length 1 in front of the batch axis.
        step_input = embedded.unsqueeze(0)
        out, state = self.lstm(step_input, (h0, c0))
        h_n, c_n = state
        return out, (h_n, c_n)

### Decoder

In [88]:
class Decoder(nn.Module):
    """Embedding, bidirectional LSTM with dropout, and a log-softmax output layer.

    `forward` consumes a single time step and returns log-probabilities over
    the vocabulary for every batch element.
    """

    def __init__(self, vocab_size, hidden_size, embedding_dim, num_layers):
        """Create the layers.

        Args:
            vocab_size: size of the embedding table and of the output layer.
            hidden_size: hidden size of the LSTM (per direction).
            embedding_dim: size of each embedding vector.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=0.5,
            bidirectional=True,
        )
        # Both directions are concatenated, hence twice the hidden size.
        self.dense = nn.Linear(in_features=hidden_size * 2, out_features=vocab_size)
        # Despite the attribute name this is a *log*-softmax (pairs with NLLLoss).
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, h0, c0):
        """Run one decoding step.

        Args:
            x: token indices with a leading sequence axis of length 1.
            h0: hidden state to start from.
            c0: cell state to start from.

        Returns:
            A tuple `(log_probs, (h, c))`: per-batch-element log-probabilities
            over the vocabulary and the updated hidden and cell states.
        """
        embedded = self.embedding(x)
        lstm_out, (h_n, c_n) = self.lstm(embedded, (h0, c0))
        # Drop the length-1 sequence axis before projecting onto the vocabulary.
        logits = self.dense(lstm_out.squeeze(0))
        log_probs = self.softmax(logits)
        return log_probs, (h_n, c_n)

### Training

In [111]:
# Model hyper-parameters.
HIDDEN_SIZE = 100      # LSTM hidden size (per direction)
EMBEDDING_SIZE = 100   # size of each embedding vector
NUM_LAYERS = 2         # stacked LSTM layers in both encoder and decoder
# Learning rates for the two separate optimizers below.
ENC_LEARNING_RATE = 0.01
DEC_LEARNING_RATE = 0.01
# Negative log-likelihood loss; the decoder already outputs log-probabilities.
criterion = nn.NLLLoss()

# The encoder is sized by the input vocabulary, the decoder by the target vocabulary.
encoder = Encoder(len(XFIELD.vocab), HIDDEN_SIZE, EMBEDDING_SIZE, NUM_LAYERS).to(device)
decoder = Decoder(len(YFIELD.vocab), HIDDEN_SIZE, EMBEDDING_SIZE, NUM_LAYERS).to(device)
# Encoder and decoder are optimized independently, each with its own Adam instance.
enc_optimizer = torch.optim.Adam(encoder.parameters(), lr = ENC_LEARNING_RATE)
dec_optimizer = torch.optim.Adam(decoder.parameters(), lr = DEC_LEARNING_RATE)

In [112]:
from tqdm import tqdm

# Train encoder and decoder jointly: the encoder reads the input characters one
# step at a time, the decoder predicts the next character from the final state.
EPOCHS = 5
for ep in range(EPOCHS):
    # Make sure dropout is active even if the models were put in eval mode before.
    encoder.train()
    decoder.train()

    # Accumulate plain Python floats: summing the loss tensors themselves would
    # keep autograd history alive for the whole epoch.
    ep_loss = 0.0
    n_batches = 0

    for batch in tqdm(train_iterator):
        # The initial states below are sized for full batches, so incomplete
        # batches are skipped. `continue` (rather than `break`) keeps the epoch
        # going should a short batch ever appear before the end.
        if len(batch) != BATCH_SIZE:
            continue
        inp = batch.x
        target = batch.y

        # init: fresh zero states (2 directions * NUM_LAYERS) and cleared gradients
        h0 = torch.zeros(NUM_LAYERS*2, BATCH_SIZE, HIDDEN_SIZE, device=device)
        c0 = torch.zeros(NUM_LAYERS*2, BATCH_SIZE, HIDDEN_SIZE, device=device)
        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()

        # encode: feed the input one time step at a time, carrying the state along
        for w in range(inp.size(0)):
            enc_out, (h0, c0) = encoder(inp[w], h0, c0)

        # decode: a single step, fed with the last input step and the encoder state.
        # `inp[-1]` addresses the last step without assuming a fixed input length.
        dec_out, (h0, c0) = decoder(inp[-1].unsqueeze(0), h0, c0)

        # loss: NLL between the predicted log-probabilities and the target characters
        loss = criterion(dec_out, target.squeeze())

        # optimize
        loss.backward()
        enc_optimizer.step()
        dec_optimizer.step()

        ep_loss += loss.item()
        n_batches += 1

    # Average over the batches that were actually trained on.
    avg_loss = ep_loss / n_batches if n_batches else float('nan')
    print('AVG_LOSS={}, (ABS={})'.format(round(avg_loss, 4),
                                         round(ep_loss, 2)))

100%|█████████▉| 18710/18711 [04:12<00:00, 73.97it/s]
  0%|          | 0/18711 [00:00<?, ?it/s]

AVG_LOSS=1.9225, (ABS=35970.41)


100%|█████████▉| 18710/18711 [04:41<00:00, 66.56it/s]
  0%|          | 0/18711 [00:00<?, ?it/s]

AVG_LOSS=1.8448, (ABS=34516.05)


100%|█████████▉| 18710/18711 [04:48<00:00, 64.78it/s]
  0%|          | 0/18711 [00:00<?, ?it/s]

AVG_LOSS=1.8113, (ABS=33890.69)


100%|█████████▉| 18710/18711 [04:54<00:00, 63.54it/s]
  0%|          | 0/18711 [00:00<?, ?it/s]

AVG_LOSS=1.8029, (ABS=33732.77)


100%|█████████▉| 18710/18711 [04:56<00:00, 63.16it/s]

AVG_LOSS=1.7885, (ABS=33462.97)





# Generate Text!

### Reload Models

In [116]:
# Placeholder cell: no model reloading is performed here.
# TODO: load persisted encoder/decoder weights if generation should work
# without re-running the training cells.
pass

### Functions

In [117]:
from torch import torch

def voc_index(words):
    """Look up every token in the input vocabulary.

    Args:
        words: iterable of tokens.

    Returns:
        A 1-D tensor on `device` holding the `XFIELD` vocabulary index of each token.
    """
    lookup = XFIELD.vocab.stoi
    indices = [lookup[token] for token in words]
    return torch.tensor(indices).to(device)

def predict(inp, RND_FACTOR=0, multiply=False):
    """Predict the next token for a window of input token indices.

    The models are switched to evaluation mode for the duration of the call
    (so the decoder's LSTM dropout does not perturb the prediction) and their
    previous training/eval mode is restored afterwards.

    Args:
        inp: 2-D tensor of input-vocabulary indices, one row per time step and
            one column per batch element.
        RND_FACTOR: scale of the uniform noise in [0, RND_FACTOR) that is
            combined with the decoder output before taking the argmax.
        multiply: when true the noise is multiplied with the decoder output,
            otherwise it is added.

    Returns:
        The target-vocabulary token predicted for the first batch element.
    """
    # Size the initial states from the input instead of assuming BATCH_SIZE columns.
    batch_size = inp.size(1)

    enc_was_training = encoder.training
    dec_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            h0 = torch.zeros(2*NUM_LAYERS, batch_size, HIDDEN_SIZE, device=device)
            c0 = torch.zeros(2*NUM_LAYERS, batch_size, HIDDEN_SIZE, device=device)

            # Encode the window step by step, carrying the state along.
            for w in range(inp.size(0)):
                enc_out, (h0, c0) = encoder(inp[w], h0, c0)

            # Decode one step from the last input step and the encoder state.
            dec_out, (h0, c0) = decoder(inp[-1].unsqueeze(0), h0, c0)

            # randomize
            rnd = torch.rand(dec_out.shape).to(device) * RND_FACTOR
            if multiply:
                # NOTE(review): the decoder outputs log-probabilities (<= 0), so
                # multiplying by non-negative noise favours entries pushed towards
                # zero -- confirm that this is the intended sampling scheme.
                scores = dec_out * rnd
            else:
                scores = dec_out.add(rnd)
            cur = torch.argmax(scores, dim=1)

            return YFIELD.vocab.itos[cur[0].item()]
    finally:
        # Restore whatever mode the models were in before the call.
        encoder.train(enc_was_training)
        decoder.train(dec_was_training)

def generate(intro=('good', 'evening', 'ladies', 'and', 'gentlemen'), multiply=False, rnd_factor=10, length=100):
    """Generate text by repeatedly predicting the next token from the last WINDOW tokens.

    Args:
        intro: seed tokens; at least WINDOW of them are needed when `length > 0`.
            The sequence is copied, so neither the caller's list nor the default
            is modified.
            NOTE(review): the default seed consists of whole words while the
            notebook trains on single characters -- confirm the intended default.
        multiply: forwarded to `predict` (multiply instead of add the noise).
        rnd_factor: forwarded to `predict` as the noise scale.
        length: number of tokens to generate.

    Returns:
        The seed tokens followed by the generated tokens, joined by single spaces.

    Raises:
        ValueError: if tokens are to be generated but the seed has fewer than
            WINDOW tokens.
    """
    # Work on a copy: appending to `intro` itself would mutate the caller's list
    # (and a shared default argument) across calls.
    text = list(intro)
    if length > 0 and len(text) < WINDOW:
        raise ValueError(
            'intro must contain at least {} tokens, got {}'.format(WINDOW, len(text)))

    for _ in range(length):
        cur_window = text[-WINDOW:]
        # One column per batch element; the same window is replicated across the batch.
        vecs = voc_index(cur_window).view(WINDOW, 1).repeat(1, BATCH_SIZE)
        text.append(predict(vecs, rnd_factor, multiply))

    return ' '.join(text)

In [129]:
# Seed the generator with individual characters rather than whole words, then
# generate 100 further tokens with additive noise of scale 10.
intro = ['h', 'e', 'l', 'l', 'o']
generate(intro, multiply=False, rnd_factor=10, length=100)

'h e l l o p <pad> <pad> a y <pad> w e r <pad> i v e s s i o n i t e c k e d g a <pad> n c i m i d e <pad> t h <pad> e r e <pad> w e d <pad> e r <pad> a c h <pad> o m g i t i c s <pad> s <pad> h i s <pad> w i n g s h o o p l e n <pad> w o l <pad> o m <pad> e d — f e d l e g e n'