# RNN from scratch in PyTorch to generate char sequences

## Support code

In [3]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Print NumPy arrays compactly: 2 decimal places, fixed-point instead of
# scientific notation, very wide lines, and no "..." summarization until an
# array has more than 20000 elements.
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence

# torch.float is the 32-bit floating point dtype.
# NOTE(review): `dtype` is not referenced in the cells visible here; presumably
# it is consumed by later tensor-creation code -- confirm.
dtype = torch.float

## Use fastai human numbers data

The data comes from [fastai book chapter 12](https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb). Each line of the file spells out one number in English words, and it looks like this:

```
one 
two 
three 
...
two hundred seven 
two hundred eight 
...
```


In [11]:
# `untar_data` has to be imported together with `URLs`; importing only `URLs`
# leaves the call below failing with NameError in a fresh kernel.
from fastai2.text.all import URLs, untar_data

# Download and extract the HUMAN_NUMBERS dataset; `path` is the location of the
# extracted data and is joined with file names (e.g. 'train.txt') below.
path = untar_data(URLs.HUMAN_NUMBERS)

## Support

In [12]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from typing import Sequence
from sklearn.model_selection import train_test_split

import tensorflow_addons as tfa
from keras.datasets import mnist
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import models, layers, callbacks, optimizers, Sequential, losses
import tqdm
from tqdm.keras import TqdmCallback

def get_text(filename:str) -> str:
    """
    Load and return the entire contents of a text file as a str.

    The file is decoded as latin-1, as that is what the BBC corpus uses.
    codecs.open() is used rather than open(), so the bytes are decoded as-is
    with no newline translation.

    :param filename: path of the file to read; path-like objects such as
                     pathlib.Path are accepted as well as str.
    :return: the decoded file contents.
    :raises OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises; a bare
    # close() after read() is skipped on error and leaks the handle.
    with codecs.open(filename, encoding='latin-1', mode='r') as f:
        return f.read()

## Load corpus and numericalize tokens

In [13]:
# Read the whole training file into a single string, then peek at the first 30
# characters to see the "<number words> \n" line layout.
text = get_text(path/'train.txt')
text[:30]

'one \ntwo \nthree \nfour \nfive \ns'

In [14]:
# Collapse every run of spaces/newlines into ' . ', so each line break turns
# into a standalone '.' token once the text is split on single spaces.
text = re.sub(r'[ \n]+', ' . ', text) # use '.' as separator token
text[:20]

'one . two . three . '

In [15]:
# Split on single spaces and drop empty strings.  The text ends with ' . ',
# which produces one empty string at the end of the split; filtering on
# truthiness removes it without blindly discarding the final element (slicing
# off the last item would silently lose a real token if the text did not end
# in whitespace).
tokens = [tok for tok in text.split(' ') if tok]
tokens[:5]

['one', '.', 'two', '.', 'three']

In [16]:
# Vocabulary: the distinct tokens in sorted order, so a token's position in V
# can serve as its integer id.
V = sorted(set(tokens))
V[:10]

['.',
 'eight',
 'eighteen',
 'eighty',
 'eleven',
 'fifteen',
 'fifty',
 'five',
 'forty',
 'four']

In [17]:
# Map each vocabulary token to its position in V, then replace the token
# strings with those integer ids.
index = {word: pos for pos, word in enumerate(V)}
tokens = list(map(index.__getitem__, tokens))
tokens[:10]

[15, 0, 29, 0, 26, 0, 9, 0, 7, 0]

In [20]:
# Next-token pairs: X[i] is a token id and y[i] is the id that follows it, so
# both are one element shorter than tokens.
X, y = tokens[:-1], tokens[1:]
len(X), len(y), len(tokens)

(84159, 84159, 84160)

## Split out validation set

In [21]:
# Hold out 20% of the (token, next-token) pairs for validation.
# NOTE(review): train_test_split shuffles by default and no random_state is
# given, so the split differs between runs and the original token order is not
# preserved -- confirm that is what the model trained later expects.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20)

## Get vocab

In [24]:
# Token -> integer id lookup over the vocabulary; each id is the token's
# position in V.
ctoi = dict(zip(V, range(len(V))))
ctoi

{'.': 0,
 'eight': 1,
 'eighteen': 2,
 'eighty': 3,
 'eleven': 4,
 'fifteen': 5,
 'fifty': 6,
 'five': 7,
 'forty': 8,
 'four': 9,
 'fourteen': 10,
 'hundred': 11,
 'nine': 12,
 'nineteen': 13,
 'ninety': 14,
 'one': 15,
 'seven': 16,
 'seventeen': 17,
 'seventy': 18,
 'six': 19,
 'sixteen': 20,
 'sixty': 21,
 'ten': 22,
 'thirteen': 23,
 'thirty': 24,
 'thousand': 25,
 'three': 26,
 'twelve': 27,
 'twenty': 28,
 'two': 29}

In [34]:
def onehot(c:int, V):
    """
    Build the one-hot encoding of index c over vocabulary V.

    :param c: position to mark with a 1.
    :param V: the vocabulary; only its length is used.
    :return: 1-D tensor of len(V) zeros (torch's default dtype) with a 1 at c.
    """
    size = len(V)
    encoded = torch.zeros(size)
    encoded[c] = 1
    return encoded

In [36]:
# Quick check: index 2 ('eighteen' in the vocabulary listing) should give a
# length-30 vector with a single 1 in the third slot.
onehot(2, V)

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])