# Generate obama speeches using stacked RNNs but not batched

With truncated back propagation, add embedding layer instead of one-hot encoding going into RNN.

Use one big long record and then every `bptt` update weights. No breaking into chunks so that `h` is just a vector.  Simple version of [chunked obama](RNN-generate-stacked-obama.ipynb).

In [1]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [2]:
import codecs
def get_text(filename:str):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().
    """
    with codecs.open(filename, mode='r') as f:
        s = f.read()
    return s

In [3]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [4]:
def softmax(y):
    expy = torch.exp(y)
    if len(y.shape)==1: # 1D case can't use axis arg
        return expy / torch.sum(expy)
    return expy / torch.sum(expy, axis=1).reshape(-1,1)

## Load and split into chunks

In [5]:
text = get_text("data/obama-speeches.txt").lower() # generated from obama-sentences.py
len(text)

4224143

In [6]:
text = text[0:20_000] # testing
n = len(text)

bptt = 8                  # only look back this many time steps for gradients
nhidden = 100
#batch_size = 32
char_embed_sz = 20        # there are 50+ chars, squeeze down into fewer dimensions for embedding prior to input into RNN 

In [7]:
vocab, ctoi = getvocab(text)

In [8]:
X = [ctoi[c] for c in text[0:-1]]
y = [ctoi[c] for c in text[1:]]
n = len(X)

In [9]:
nclasses = len(ctoi)
print(f"{n:,d} char, vocab size {len(ctoi)}, char_embed_sz {char_embed_sz}, state is {nhidden}-vector")

19,999 char, vocab size 46, char_embed_sz 20, state is 100-vector


In [24]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
E = torch.randn(char_embed_sz, len(ctoi),     device=device, dtype=torch.float64, requires_grad=True) # embedding
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,       char_embed_sz, device=device, dtype=torch.float64, requires_grad=True) # input converter
V = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target
bx = torch.zeros(nhidden,       1,             device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target
bo = torch.zeros(nclasses,       1,             device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target

W2 = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U2 = torch.randn(nhidden,       nclasses,      device=device, dtype=torch.float64, requires_grad=True) # input converter
V2 = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target
bx2 = torch.zeros(nhidden,       1,             device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target
bo2 = torch.zeros(nclasses,       1,             device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target

sd = 0.1   # weight stddev init for tanh (default is N(0,1) for torch.randn())
sd = 0.01   # weight stddev init for tanh (default is N(0,1) for torch.randn())
# sd = 1.0
with torch.no_grad():
    E *= sd
    U *= sd
    V *= sd
    U2 *= sd
    V2 *= sd
    
# parameters = [E,W,U,B,V,W2,U2,V2]
parameters = [E,W,U,V,bx,bo,W2,U2,V2,bx2,bo2]
optimizer = torch.optim.Adam(parameters, lr=0.001, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=1)

history = []
epochs = 20
for epoch in range(1, epochs+1):
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    h = torch.zeros(nhidden, 1, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
    h2 = torch.zeros(nhidden, 1, dtype=torch.float64, requires_grad=False)
    loss = 0
    for t in range(n):
        embedding_step_t = E[:,X[t]]
        embedding_step_t = embedding_step_t.reshape(char_embed_sz,1)
        h = W @ h + U @ embedding_step_t + bx
        h = torch.tanh(h)
        o = V @ h + bo

        h2 = W2 @ h2 + U2 @ o + bx2
        h2 = torch.tanh(h2)
        o = V2 @ h2 + bo2

        o = o.reshape(1,nclasses)
        loss += F.cross_entropy(o, torch.tensor([y[t]]))

        p = softmax(o)
        correct = torch.argmax(p[0])==y[t]
        epoch_training_accur += correct
        
        if t % bptt == 0 and t > 0:
#             print(f"gradient at {t:4d}, loss {loss.item():7.4f}")
            optimizer.zero_grad()
            loss.backward() # autograd computes U.grad, M.grad, ...
#             torch.nn.utils.clip_grad_value_(parameters, 1)
            optimizer.step()
            epoch_training_loss += loss.detach().item()
            loss = 0
            # no longer consider previous computations
            h = h.detach()
            h2 = h2.detach()

    epoch_training_accur /= n
    epoch_training_loss /= bptt # average loss computed
    scheduler.step()
    
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.2f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")

Epoch   1 training loss  7493.58   accur  0.1816   LR 0.001000
Epoch   2 training loss  7496.71   accur  0.1812   LR 0.001000
Epoch   3 training loss  7487.25   accur  0.1814   LR 0.001000
Epoch   4 training loss  7487.55   accur  0.1814   LR 0.001000
Epoch   5 training loss  7487.27   accur  0.1814   LR 0.001000
Epoch   6 training loss  7486.93   accur  0.1814   LR 0.001000
Epoch   7 training loss  7486.80   accur  0.1814   LR 0.001000
Epoch   8 training loss  7269.65   accur  0.2033   LR 0.001000
Epoch   9 training loss  6236.05   accur  0.2924   LR 0.001000
Epoch  10 training loss  5907.63   accur  0.3203   LR 0.001000
Epoch  11 training loss  5713.53   accur  0.3396   LR 0.001000
Epoch  12 training loss  5568.66   accur  0.3531   LR 0.001000
Epoch  13 training loss  5437.24   accur  0.3653   LR 0.001000
Epoch  14 training loss  5361.12   accur  0.3750   LR 0.001000


KeyboardInterrupt: 