# RNN from scratch in PyTorch to generate word (token) sequences

## Support code

In [55]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence

np.set_printoptions(precision=3)

dtype = torch.float

In [56]:
def randn(n1, n2, dtype=torch.float32, mean=0.0, std=0.01, requires_grad=True):
    """
    Return an (n1, n2) tensor of Gaussian noise rescaled to the given mean and
    std.  The result is a leaf tensor whose requires_grad flag is set as
    requested, so it can be handed straight to an optimizer as a parameter.
    """
    # Standard normal draw, then shift/scale: N(0,1)*std + mean
    scaled = torch.randn(n1, n2, dtype=dtype) * std + mean
    return scaled.requires_grad_(requires_grad)

## Use fastai human numbers data

The data comes from [chapter 12 of the fastai book](https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb). It looks like this:

```
one 
two 
three 
...
two hundred seven 
two hundred eight 
...
```


In [57]:
from fastai2.text.all import untar_data, URLs
# Presumably fetches/extracts the HUMAN_NUMBERS dataset and returns its local
# directory (it is used below as path/'train.txt') — TODO confirm against fastai docs
path = untar_data(URLs.HUMAN_NUMBERS)

## Support

In [58]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from typing import Sequence
from sklearn.model_selection import train_test_split

import tensorflow_addons as tfa
from keras.datasets import mnist
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import models, layers, callbacks, optimizers, Sequential, losses
import tqdm
from tqdm.keras import TqdmCallback

def get_text(filename:str) -> str:
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().

    The file is opened inside a with-block so the handle is closed even if
    reading raises.
    """
    with codecs.open(filename, encoding='latin-1', mode='r') as f:
        return f.read()

## Load corpus and numericalize tokens

In [59]:
# Read the entire training file into a single string
text = get_text(path/'train.txt')
# Uncomment to work on a short prefix while testing:
#text = text[:5000]
text[:30]

'one \ntwo \nthree \nfour \nfive \ns'

In [60]:
# Replace every run of spaces/newlines with ' . ' so that a later split on ' '
# yields '.' as its own separator token.
# NOTE(review): the character class also matches the single spaces inside
# multi-word numbers ("two hundred seven"), so '.' ends up after every word,
# not only between numbers — confirm this is intended.
text = re.sub(r'[ \n]+', ' . ', text)
text[:20]

'one . two . three . '

In [61]:
# Split on single spaces into word and '.' tokens; the final piece is a blank ''
# and is dropped.
tokens = text.split(' ')
del tokens[-1]
tokens[:5]

['one', '.', 'two', '.', 'three']

In [62]:
# Vocabulary = the distinct tokens, in sorted order so ids are reproducible
vocab = list(set(tokens))
vocab.sort()
vocab[:10]

['.',
 'eight',
 'eighteen',
 'eighty',
 'eleven',
 'fifteen',
 'fifty',
 'five',
 'forty',
 'four']

In [63]:
# Token -> integer id (its position in vocab), then numericalize the corpus
index = dict(zip(vocab, range(len(vocab))))
tokens = [index[tok] for tok in tokens]
tokens[:10]

[15, 0, 29, 0, 26, 0, 9, 0, 7, 0]

In [64]:
# Next-token prediction: the input is every token but the last, the target is
# the same stream shifted left by one.
X, y = tokens[:-1], torch.tensor(tokens[1:])
len(X), len(y), len(tokens)

(84159, 84159, 84160)

## Split out validation set

In [65]:
# 80/20 split by slicing (first 80% train, remainder validation) rather than a
# shuffled train_test_split, so the token order is kept intact.
ntrain = int(len(X)*.80)
X_train, X_valid = X[:ntrain], X[ntrain:]
y_train, y_valid = y[:ntrain], y[ntrain:]

## Get vocab

In [66]:
# Token -> id lookup built from vocab positions (same mapping as `index`)
ctoi = dict(zip(vocab, range(len(vocab))))
ctoi

{'.': 0,
 'eight': 1,
 'eighteen': 2,
 'eighty': 3,
 'eleven': 4,
 'fifteen': 5,
 'fifty': 6,
 'five': 7,
 'forty': 8,
 'four': 9,
 'fourteen': 10,
 'hundred': 11,
 'nine': 12,
 'nineteen': 13,
 'ninety': 14,
 'one': 15,
 'seven': 16,
 'seventeen': 17,
 'seventy': 18,
 'six': 19,
 'sixteen': 20,
 'sixty': 21,
 'ten': 22,
 'thirteen': 23,
 'thirty': 24,
 'thousand': 25,
 'three': 26,
 'twelve': 27,
 'twenty': 28,
 'two': 29}

In [67]:
def onehot(ci:int, vocab):
    """
    Return a (len(vocab), 1) float32 column vector that is all zeros except
    for a 1 in row ci.
    """
    column = torch.zeros(len(vocab), 1, dtype=torch.float32)
    column[ci, 0] = 1.0
    return column

In [68]:
# Quick check: only row 2 of the resulting column vector should be 1
onehot(2, vocab)

tensor([[0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])

## Train

In [71]:
def sample(h0, ci, n):
    """
    Generate n tokens from the RNN, seeded with hidden state h0 and token ci.

    Derived from Karpathy: https://gist.github.com/karpathy/d4dee566867f8291f086

    h0 is the initial hidden state (a column that W.mm() accepts), ci is the
    vocab index of the seed token, and n is how many tokens to sample.  Uses the
    module-level weights W, U, V and the vocab list.  Returns a list of n+1
    vocab entries: the seed token followed by the n sampled tokens.
    """
    h = h0
    chars = [vocab[ci]]
    with torch.no_grad():
        for _ in range(n):
            # Feed the previously emitted token back in (not the training data),
            # so each step is conditioned on what was actually generated.
            x = onehot(ci, vocab)
            h = W.mm(h) + U.mm(x)
            h = torch.relu(h)  # clamp negatives to 0
            o = V.mm(h).reshape(1,-1) # unnormalized log probabilities for next token
            # Softmax in float64 with an explicit dim: np.random.choice checks
            # that p sums to 1 within a tight tolerance that float32 can miss.
            p = F.softmax(o.double(), dim=1).numpy().ravel()
            ci = int(np.random.choice(len(vocab), p=p))
            chars.append(vocab[ci])
    return chars

In [72]:
# Model size and truncated-BPTT window length
nhidden = 64
nfeatures = len(vocab)
nclasses = len(vocab) # predicting the next token: one class per vocab entry
seqlen = 16
# Vanilla RNN parameters: h_t = relu(W h_{t-1} + U x_t), o_t = V h_t
W = randn(nhidden, nhidden)
U = randn(nhidden, nfeatures)
V = randn(nclasses, nhidden)

n = (len(X_train) // seqlen) * seqlen # make it a multiple of seqlen
learning_rate = 0.01
weight_decay = 0.0
optimizer = torch.optim.Adam([W,U,V], lr=learning_rate, weight_decay=weight_decay)
nepochs=20
for epoch in range(nepochs+1): # NOTE: epochs 0..nepochs inclusive, i.e. nepochs+1 passes
    print(f"EPOCH {epoch}")
    h = randn(nhidden, 1, requires_grad=False) # reset hidden state at start of epoch
    for p in range(0,n,seqlen): # do one epoch
        loss = 0
        optimizer.zero_grad()
        for i in range(p,p+seqlen,1):    # do one subsequence of entire X_train
            x = onehot(X_train[i], vocab)  # one-hot over the vocabulary (not the weight matrix V)
            h = W.mm(h) + U.mm(x)
            h = torch.relu(h)  # clamp negatives to 0
            o = V.mm(h).reshape(1,-1)      # (1, nclasses) logits for the next token
            # y_train[i:i+1] is already a 1-element tensor; no need to re-wrap it
            loss = loss + F.cross_entropy(o, y_train[i:i+1])
        loss.backward() # autograd computes W.grad, U.grad and V.grad
        optimizer.step()
        h = h.detach() # truncated BPTT; tell pytorch to forget prev h computations

    print(sample(h0=h, ci=np.random.randint(0,len(vocab)), n=40))
#     with torch.no_grad():
#         loss = F.cross_entropy(model(train_data.tensors[0]), train_data.tensors[1])
#     print(f"loss={loss.item():.4f} ------------")


EPOCH 0


KeyboardInterrupt: 