<a href="https://colab.research.google.com/github/lkarjun/fastai-workouts/blob/master/Lesson_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package

In [3]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[K     |████████████████████████████████| 727kB 30.0MB/s 
[K     |████████████████████████████████| 194kB 43.0MB/s 
[K     |████████████████████████████████| 1.2MB 48.3MB/s 
[K     |████████████████████████████████| 51kB 9.1MB/s 
[K     |████████████████████████████████| 61kB 10.1MB/s 
[K     |████████████████████████████████| 61kB 11.9MB/s 
[?25hMounted at /content/gdrive


In [4]:
from fastbook import *
from fastai.text.all import *

# Loading texts



In [5]:
path = untar_data(URLs.HUMAN_NUMBERS)
Path.BASE_PATH = path
path.ls()

(#2) [Path('valid.txt'),Path('train.txt')]

In [6]:
lines = L()
with open(path/'train.txt') as f: lines+=L(*f.readlines())
with open(path/'valid.txt') as f: lines+=L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [7]:
text = ' . '.join([l.strip() for l in lines])
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [8]:
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [9]:
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [10]:
word2idx = {w:i for i, w in enumerate(vocab)}
nums = L(word2idx[i] for i in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

# Our First Language Model from Scratch

In [11]:
L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [12]:
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [13]:
bs = 64
cut = int(len(seqs) * .8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs = 64, shuffle=False)

# Our Language Model in PyTorch

In [20]:
class LMModel1(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)

  def forward(self, x):
        h = F.relu(self.h_h(self.i_h(x[:,0])))
        h = h + self.i_h(x[:,1])
        h = F.relu(self.h_h(h))
        h = h + self.i_h(x[:,2])
        h = F.relu(self.h_h(h))
        return self.h_o(h)

In [21]:
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy,
                metrics = accuracy)

In [22]:
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.816274,1.964143,0.460185,00:01
1,1.423805,1.739964,0.473259,00:01
2,1.430327,1.685172,0.485382,00:01
3,1.38839,1.657033,0.470406,00:01


In [23]:
n, counts = 0, torch.zeros(len(vocab))

for x, y in dls.valid:
  n += y.shape[0]
  for i in range_of(vocab): 
    counts[i] += (y==i).long().sum()

idx = torch.argmax(counts)
idx, vocab[idx.item()], counts[idx].item()/n

(tensor(29), 'thousand', 0.15165200855716662)

In [24]:
c = Counter(tokens[cut:])
mc = c.most_common(5)
mc

[('thousand', 7104),
 ('.', 7103),
 ('hundred', 6405),
 ('nine', 2440),
 ('eight', 2344)]

In [25]:
mc[0][1] / len(tokens[cut:])

0.15353028894988222

# First RNN

In [51]:
class LMModel2(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)

  def forward(self, x):
    h = 0
    for i in range(3):
      h = h + self.i_h(x[:, i])
      h = F.relu(self.h_h(h))
    return self.h_o(h)

In [52]:
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, 
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.789902,1.983067,0.46589,00:01
1,1.375125,1.787143,0.467792,00:01
2,1.398314,1.623682,0.49584,00:01
3,1.357892,1.654664,0.419301,00:01


# Improving The RNN

## Maintain State

In [45]:
class LMModel3(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  
        self.h_h = nn.Linear(n_hidden, n_hidden)     
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
        
    def forward(self, x):
        for i in range(3):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
        out = self.h_o(self.h)
        self.h = self.h.detach()
        return out
    
    def reset(self): self.h = 0

In [58]:
a = torch.rand(4, requires_grad=True)
print(a.requires_grad)
b = a.detach()
b.requires_grad

# Detach throw away the gradient.

True


False

In [59]:
m = len(seqs)//bs
m, bs, len(seqs)

(328, 64, 21031)

In [60]:
def group_chunks(ds, bs):
  m = len(ds) // bs
  new_ds = L()
  for i in range(m): new_ds += L(ds[i + m * j] for j in range(bs))
  return new_ds

In [62]:
cut = int(len(seqs) * .8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs = bs, drop_last = True, shuffle = False
)

In [63]:
ModelResetter??

In [64]:
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func = F.cross_entropy,
                 metrics = accuracy, cbs = ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.681508,1.891548,0.467067,00:01
1,1.338465,1.691868,0.435096,00:01
2,1.136347,1.667587,0.5,00:01
3,1.048985,1.638949,0.549038,00:01
4,0.987948,1.633759,0.567788,00:01
5,0.933869,1.535209,0.578606,00:01
6,0.893617,1.669236,0.595913,00:01
7,0.828426,1.518214,0.606731,00:01
8,0.789039,1.536025,0.609615,00:01
9,0.773665,1.54348,0.612019,00:01


## Creating More Signal

In [70]:
sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [71]:
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [76]:
class LMModel4(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = 0
  
  def forward(self, x):
    outs = []
    for i in range(sl):
      self.h = self.h + self.i_h(x[:, i])
      self.h = F.relu(self.h_h(self.h))
      outs.append(self.h_o(self.h))
    self.h = self.h.detach()
    return torch.stack(outs, dim=1)
  
  def reset(self): self.h = 0


In [77]:
def loss_func(inp, targ):
  return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

In [79]:
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func = loss_func,
                  metrics = accuracy, cbs = ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.202626,2.964301,0.265462,00:00
1,2.299167,1.991426,0.455892,00:00
2,1.729516,1.827375,0.467936,00:00
3,1.44489,1.982106,0.47876,00:00
4,1.267259,1.800314,0.54126,00:00
5,1.13833,1.804239,0.525879,00:00
6,1.038432,1.749226,0.52653,00:00
7,0.958852,1.688386,0.590088,00:00
8,0.905461,1.618653,0.586914,00:00
9,0.848806,1.766231,0.605876,00:00
