<a href="https://colab.research.google.com/github/lkarjun/fastai-workouts/blob/master/Lesson_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package

In [None]:
!pip install fastai==2.5.3

In [1]:
from fastai.text.all import *

# Loading texts



In [3]:
# Fetch the Human Numbers dataset through fastai's `untar_data` and list the files it contains.
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('train.txt'),Path('valid.txt')]

In [5]:
# Collect every line of the training file, then of the validation file, into one list.
lines = L()
for fname in ('train.txt', 'valid.txt'):
  with open(path/fname) as f:
    lines += L(*f.readlines())

lines[:50]

(#50) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [11]:
# Strip each line and glue them into one stream, with ' . ' marking where one number ends.
text = ' . '.join(map(str.strip, lines))
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [12]:
# Split on single spaces, so the ' . ' separators become standalone '.' tokens.
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [13]:
# De-duplicate the tokens to build the vocabulary.
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [14]:
# Map each vocabulary word to its position in `vocab`, then encode the token stream as ids.
word2idx = {word: idx for idx, word in enumerate(vocab)}
nums = L(word2idx[tok] for tok in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

# Our First Language Model from Scratch

In [None]:
# Preview the samples as words: each consecutive, non-overlapping group of three tokens
# is paired with the token that follows it.
L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [16]:
# Same pairing on the integer ids: a tensor of three token ids as input, the next id as target.
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [17]:
bs = 64
# Hold out the last 20% of the sequences for validation; the order is kept (no shuffling).
cut = int(len(seqs) * .8)
# Pass `bs` rather than repeating the literal so the batch size is defined in one place.
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

# Our Language Model in PyTorch

In [41]:
class LMModel1(Module):
  """Language model that scores the next token from three input tokens.

  The same input->hidden (`i_h`) and hidden->hidden (`h_h`) layers are reused
  for every token position; `h_o` maps the final activations to one score per
  vocabulary entry.
  """

  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output

  def forward(self, x):
    # `x` is indexed as x[:, position] for positions 0, 1 and 2.
    # First token: embed, then linear layer + ReLU.
    h = F.relu(self.h_h(self.i_h(x[:, 0])))
    # Second and third tokens: add the new embedding to the running activations first.
    h = F.relu(self.h_h(h + self.i_h(x[:, 1])))
    h = F.relu(self.h_h(h + self.i_h(x[:, 2])))
    return self.h_o(h)

In [39]:
# Train LMModel1 (hidden size 64) with cross-entropy; report accuracy and perplexity as metrics.
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy,
                metrics = [accuracy, perplexity])

In [40]:
# Four epochs with fastai's one-cycle schedule; 1e-3 is the learning rate passed to it.
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,1.76277,1.921423,0.460898,6.830675,00:02
1,1.366518,1.79424,0.466128,6.014903,00:02
2,1.403189,1.653472,0.49275,5.225092,00:02
3,1.373706,1.609752,0.495127,5.00157,00:02


In [49]:
# Count token frequencies to get an "always predict the most common token" baseline.
# NOTE(review): `cut` was computed as 80% of len(seqs), i.e. it indexes the 3-token
# sequences, not `tokens`. tokens[cut:] therefore does not line up with the validation
# split. Consider counting the validation targets instead, e.g.
# Counter(vocab[y] for _, y in seqs[cut:]), and update the ratio cell to match.
# TODO confirm.
c = Counter(tokens[cut: ])

mc = c.most_common(5)
mc

[('thousand', 7104),
 ('.', 7103),
 ('hundred', 6405),
 ('nine', 2440),
 ('eight', 2344)]

**The most common token is `thousand`. Always predicting it would give us an accuracy of roughly 15\%, so our model is faring way better!**

In [51]:
# Share of the counted tokens taken by the most common one; the denominator comes from
# the same counter, so both sides of the ratio always describe the same slice.
mc[0][1]/sum(c.values())


0.15353028894988222

# First RNN

In [54]:
class LMModel2(Module):
  """Loop-based version of the next-token model (a simple RNN).

  Each token position goes through the same three steps: add its embedding to
  the running activations, apply the hidden->hidden layer, apply ReLU. The
  final activations are mapped to one score per vocabulary entry.
  """

  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output

  def forward(self, x):
    # Start from scalar 0; broadcasting turns it into a tensor on the first addition.
    h = 0
    # Loop over however many token positions `x` has (dimension 1) instead of a fixed 3,
    # so the model also accepts other sequence lengths.
    for i in range(x.shape[1]):
      h = h + self.i_h(x[:, i])
      h = F.relu(self.h_h(h))
    return self.h_o(h)

In [55]:
# Same training setup as the first model, now with the loop-based LMModel2.
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, 
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.731641,1.963726,0.466366,00:02
1,1.369604,1.807862,0.467792,00:02
2,1.392134,1.63404,0.496078,00:02
3,1.343208,1.697748,0.379605,00:02


**A neural network that is defined using a loop like this is called a recurrent neural network (RNN). It is important to realize that an RNN is not a complicated new architecture, but simply a refactoring of a multilayer neural network using a for loop.**

# Improving The RNN

## Maintain State

In [56]:
class LMModel3(Module):
    """RNN language model that carries its hidden state from one batch to the next.

    The hidden state lives in `self.h`. After every forward pass it is detached,
    so its values are kept but gradients do not flow back into earlier batches.
    `reset()` clears the state; the notebook passes fastai's `ModelResetter`
    callback to the Learner for that purpose.
    """

    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output
        self.h = 0  # scalar 0 until the first forward pass turns it into a tensor

    def forward(self, x):
        # Loop over however many token positions `x` has (dimension 1) instead of a fixed 3,
        # so the model also accepts other sequence lengths.
        for i in range(x.shape[1]):
            self.h = self.h + self.i_h(x[:, i])
            self.h = F.relu(self.h_h(self.h))
        out = self.h_o(self.h)
        # Keep the state's values but cut it from the autograd graph.
        self.h = self.h.detach()
        return out

    def reset(self):
        """Clear the hidden state."""
        self.h = 0

In [57]:
# Small check of what `detach()` does to gradient tracking.
a = torch.rand(4, requires_grad=True)
print(a.requires_grad)
b = a.detach()
b.requires_grad

# `detach()` throws away the gradient history: `a` requires grad, the detached `b` does not.

True


False

In [58]:
# Number of full batches of size `bs` that fit in `seqs` (integer division drops the remainder).
m = len(seqs)//bs
m, bs, len(seqs)

(328, 64, 21031)

In [59]:
def group_chunks(ds, bs):
  """Reorder `ds` so that consecutive batches of size `bs` continue one another.

  With m = len(ds) // bs, the result lists ds[i + m*j] for j in range(bs), for
  each i in range(m) in turn. Read in batches of `bs` without shuffling, row j
  of batch i+1 is then the item that follows row j of batch i in `ds`. Items
  beyond m * bs are left out.
  """
  n_batches = len(ds) // bs
  return L(ds[batch + n_batches * row]
           for batch in range(n_batches)
           for row in range(bs))

In [60]:
# Rebuild the DataLoaders from the regrouped splits. shuffle=False keeps the order that
# group_chunks produced, and drop_last=True keeps every batch at exactly `bs` rows.
cut = int(len(seqs) * .8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs = bs, drop_last = True, shuffle = False
)

In [69]:
ModelResetter??

In [68]:
# ModelResetter is passed as a callback; LMModel3 exposes `reset()` to clear its hidden state.
# NOTE(review): presumably the callback calls `reset()` around training/validation — confirm
# against the fastai source.
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func = F.cross_entropy,
                 metrics = accuracy, cbs = ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.691886,1.86836,0.469952,00:02
1,1.277486,1.748191,0.451442,00:02
2,1.111267,1.48423,0.526683,00:02
3,1.044472,1.655347,0.470913,00:02
4,0.977525,1.709594,0.549279,00:02
5,0.943801,1.840906,0.532452,00:02
6,0.877928,1.723314,0.580048,00:02
7,0.831641,1.696125,0.598798,00:02
8,0.785702,1.802927,0.600962,00:02
9,0.767383,1.780044,0.595673,00:02


## Creating More Signal

In [70]:
# Longer samples: each input is a window of `sl` token ids and the target is the same
# window shifted one token to the right, so there is a target for every position.
sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))
# Same 80/20 split and regrouping as before, now on the new `seqs`.
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [74]:
# Decode the input and target of one sample back to words to check the one-token shift.
[L(vocab[o] for o in s) for s in seqs[2]]

[(#16) ['seventeen','.','eighteen','.','nineteen','.','twenty','.','twenty','one'...],
 (#16) ['.','eighteen','.','nineteen','.','twenty','.','twenty','one','.'...]]

In [118]:
class LMModel4(Module):
  """Stateful RNN language model that emits a prediction after every input token.

  `forward` returns the per-position outputs stacked along dimension 1. The
  hidden state `self.h` is carried between calls and detached after each one;
  `reset()` clears it.
  """

  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output
    self.h = 0  # scalar 0 until the first forward pass turns it into a tensor

  def forward(self, x):
    outs = []
    # Take the sequence length from the input (dimension 1) rather than from the
    # notebook-level `sl`, so the model does not depend on a global variable.
    for i in range(x.shape[1]):
      self.h = self.h + self.i_h(x[:, i])
      self.h = F.relu(self.h_h(self.h))
      outs.append(self.h_o(self.h))
    # Keep the state's values but cut it from the autograd graph.
    self.h = self.h.detach()
    return torch.stack(outs, dim=1)

  def reset(self):
    """Clear the hidden state."""
    self.h = 0


In [112]:
def loss_func(inp, targ):
  """Cross-entropy over every position of every sequence.

  `inp` holds one score per class in its last dimension and `targ` holds the
  matching class ids; all leading dimensions are flattened before calling
  `F.cross_entropy`.
  """
  # The number of classes comes from `inp` itself, not from the notebook-level `vocab`.
  # `reshape` is used instead of `view` so non-contiguous tensors are accepted too.
  return F.cross_entropy(inp.reshape(-1, inp.shape[-1]), targ.reshape(-1))

In [119]:
# Train LMModel4 for one epoch, using the flattening `loss_func` because the model
# returns one prediction per token position.
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func = loss_func,
                  metrics = accuracy, cbs = ModelResetter)
learn.fit_one_cycle(1, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.991287,2.02902,0.46582,00:00
