# Data preparation

In [3]:
import torch
import torch.nn as nn

In [4]:
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.8/719.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [5]:
from fastai.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)

In [6]:
Path.BASE_PATH = path
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [7]:
text = ' . '.join([l.strip() for l in lines])
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [8]:
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [9]:
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [10]:
word2idx = {w:i for i,w in enumerate(vocab)}
nums = L(word2idx[i] for i in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [11]:
L((tokens[i:i+2], tokens[i+2]) for i in range(0,len(tokens)-2,2))

(#31547) [(['one', '.'], 'two'),(['two', '.'], 'three'),(['three', '.'], 'four'),(['four', '.'], 'five'),(['five', '.'], 'six'),(['six', '.'], 'seven'),(['seven', '.'], 'eight'),(['eight', '.'], 'nine'),(['nine', '.'], 'ten'),(['ten', '.'], 'eleven')...]

In [12]:
seqs = L((tensor(nums[i:i+2]), nums[i+2]) for i in range(0,len(nums)-2,2))
seqs

(#31547) [(tensor([0, 1]), 2),(tensor([2, 1]), 3),(tensor([3, 1]), 4),(tensor([4, 1]), 5),(tensor([5, 1]), 6),(tensor([6, 1]), 7),(tensor([7, 1]), 8),(tensor([8, 1]), 9),(tensor([9, 1]), 10),(tensor([10,  1]), 11)...]

In [13]:
bs = 64
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=64, shuffle=False)

# Questionaire 5
Write a module that predicts the third word given the previous two words of a
sentence.

In [14]:
class ModelQ5(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.inp = nn.Embedding(vocab_sz, n_hidden)
        self.hidden = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden, vocab_sz)


    def forward(self, x):
        h = 0
        for i in range(2):
            h = h + self.inp(x[:, i])
            h = F.relu(h + self.hidden(self.inp(x[:, i])))

        return self.out(h)

In [15]:
learn = Learner(dls, ModelQ5(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.81108,2.418282,0.298098,00:06
1,1.372512,2.082177,0.387322,00:10
2,1.433461,1.794139,0.460539,00:13
3,1.422006,1.801781,0.38225,00:09


# More data preparation

In [16]:
def group_chunks(ds, bs):
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))
    return new_ds

In [17]:
sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [18]:
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

# Questionaire 13
Write code to print out the first few batches of the validation set, including converting the token IDs back into English strings, as we showed for batches of IMDb data in Chapter 10.

In [19]:
len(dls.valid)

12

In [20]:
list(dls.valid)[0]

(tensor([[29, 26,  3,  ...,  1,  8, 29],
         [ 0, 28, 18,  ..., 28, 20,  1],
         [28, 22,  6,  ...,  0, 28, 22],
         ...,
         [ 9, 29,  8,  ...,  1,  9, 29],
         [ 1,  9, 29,  ...,  3,  1,  9],
         [28, 20,  2,  ...,  9, 28, 20]]),
 tensor([[26,  3,  1,  ...,  8, 29, 26],
         [28, 18,  1,  ..., 20,  1,  8],
         [22,  6,  1,  ..., 28, 22,  8],
         ...,
         [29,  8, 28,  ...,  9, 29,  8],
         [ 9, 29,  8,  ...,  1,  9, 29],
         [20,  2,  1,  ..., 28, 20,  4]]))

In [21]:
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [22]:
@patch
def show_batch(self:DataLoaders, vocab, max_n=1, max_toks=100):
    batches = list(self.valid)[:max_n]
    dic = {0:'x', 1:'y'}
    for i in range(min(len(self.valid), max_n)):
        for j in range(2):
            print(f'{dic[j]}{i}: ')
            count = 0
            for k in batches[i][j]:
                for l in k:
                    print(vocab[l], end=' ')
                    count += 1

                print()
                if count > max_toks:
                    break

            print()

In [23]:
dls.show_batch(vocab, max_n=2)

x0: 
thousand eighty three . eight thousand eighty four . eight thousand eighty five . eight thousand 
one hundred eighteen . eight thousand one hundred nineteen . eight thousand one hundred twenty . 
hundred forty six . eight thousand one hundred forty seven . eight thousand one hundred forty 
one hundred seventy four . eight thousand one hundred seventy five . eight thousand one hundred 
hundred two . eight thousand two hundred three . eight thousand two hundred four . eight 
two . eight thousand two hundred thirty three . eight thousand two hundred thirty four . 
hundred sixty . eight thousand two hundred sixty one . eight thousand two hundred sixty two 

y0: 
eighty three . eight thousand eighty four . eight thousand eighty five . eight thousand eighty 
hundred eighteen . eight thousand one hundred nineteen . eight thousand one hundred twenty . eight 
forty six . eight thousand one hundred forty seven . eight thousand one hundred forty eight 
hundred seventy four . eight thousand o

# Further Research 2: Create an LSTM model from scratch

In-book implementation (with pytorch LSTM module)

In [44]:
class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        raw, self.h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        self.h = [h_.detach() for h_ in self.h]
        return self.h_o(out),raw,out

    def reset(self):
        for h in self.h: h.zero_()


In [45]:
learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4),
    loss_func=CrossEntropyLossFlat(), metrics=accuracy)
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.635747,1.948849,0.476562,00:02
1,1.643492,1.28316,0.632487,00:03
2,0.897427,0.90551,0.775391,00:02
3,0.436292,0.596966,0.828125,00:02
4,0.220455,0.581759,0.832601,00:02
5,0.123104,0.57045,0.838949,00:02
6,0.07561,0.505292,0.85498,00:03
7,0.052278,0.490127,0.866455,00:02
8,0.037712,0.479434,0.869466,00:02
9,0.029503,0.500043,0.861328,00:02


Verbose:

In [24]:
class LSTM_Chip(Module):
    def __init__(self, i_n, hn):
        self.forget_gate = nn.Linear(i_n + hn, hn)
        self.input_gate = nn.Linear(i_n + hn, hn)
        self.cell_gate = nn.Linear(i_n + hn, hn)
        self.output_gate = nn.Linear(i_n + hn, hn)

    def forward(self, inp, state):
        h, c = state
        h = torch.cat([h, inp], dim=1)
        c = c * torch.sigmoid(self.forget_gate(h))
        c = c + torch.sigmoid(self.input_gate(h)) * self.cell_gate(h).tanh()
        h = torch.sigmoid(self.output_gate(h)) * torch.tanh(c)
        return h, (h, c)

Refactored:

In [25]:
class LSTM_Ref(Module):
    def __init__(self, i_n, hn):
        self.i_g = nn.Linear(i_n, 4*hn, bias=True)
        self.h_g = nn.Linear(hn, 4*hn, bias=True)

    def forward(self, inp, state):
        h, c = state
        gates = (self.i_g(inp) + self.h_g(h)).chunk(4, 1)
        forget_gate, input_gate, output_gate = map(torch.sigmoid, gates[:3])
        cell_gate = gates[3].tanh()
        c = c * forget_gate + input_gate * cell_gate
        h = output_gate * torch.tanh(c)
        return h, (h, c)


Let's put it in a model

In cannonical dropout method I should return dropped instead of raw, however after many experiments I realised that when using dropout just for regularization and returning raw I get better results

In [26]:
class LSTM_Model(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.n_layers = n_layers
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = LSTM_Ref(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.h = [torch.zeros(2, bs, n_hidden) for _ in range(n_layers)]
        self.drop = nn.Dropout(p)

    def forward(self, x):
        raw = []
        dropped = []
        for i in range(sl):
            for j in range(self.n_layers):
                y, self.h[j] = self.rnn(self.i_h(x[:, i]), self.h[j])

            raw.append(y)
            y = self.drop(y)
            dropped.append(y)

        for i in range(self.n_layers):
            self.h[i] = torch.stack(self.h[i])

        self.h = [h_.detach() for h_ in self.h]
        raw = torch.stack(raw, dim=1)
        dropped = torch.stack(dropped, dim=1)

        # RNNRegularizer config:
        self.rnn.out = dropped
        self.rnn.raw_out = raw

        return self.h_o(raw)

    def reset(self):
        for h in self.h:
            h.zero_()

In [27]:
learn = Learner(dls, LSTM_Model(len(vocab), 64, 2, 0.5),
    loss_func=CrossEntropyLossFlat(), metrics=accuracy,
    cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])
learn.fit_one_cycle(15, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.778816,2.516778,0.375488,00:02
1,1.931565,2.088421,0.338298,00:02
2,1.563989,1.858421,0.39681,00:03
3,1.198632,1.396879,0.565918,00:03
4,0.766268,0.912037,0.719076,00:03
5,0.44501,0.659433,0.815999,00:02
6,0.269486,0.534309,0.865397,00:03
7,0.180512,0.499417,0.87443,00:03
8,0.135877,0.462129,0.883708,00:04
9,0.111229,0.458973,0.883057,00:02


# Further Research 3: GRU architecture
Search the internet for the GRU architecture and implement it from scratch, and
try training a model. See if you can get results similar to those we saw in this
chapter. Compare your results to the results of PyTorch’s built-in GRU module

Let's first use the built in GRU module

In [125]:
class Torch_GRU(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.GRU(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.h = torch.zeros(n_layers, bs, n_hidden)

    def forward(self, x):
        raw, self.h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        self.h = self.h.detach()

        # RNNRegularizer config:
        self.rnn.out = out
        self.rnn.raw_out = raw

        return self.h_o(out)

    def reset(self):
        self.h.zero_()

In [129]:
learn = Learner(dls, Torch_GRU(len(vocab), 64, 4, 0.5),
    loss_func=CrossEntropyLossFlat(), metrics=accuracy,
    cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.786015,2.150858,0.444824,00:06
1,1.8246,1.333833,0.659993,00:04
2,0.992794,0.741336,0.800863,00:06
3,0.517223,0.528107,0.852051,00:04
4,0.275163,0.403291,0.87793,00:04
5,0.162597,0.318267,0.904948,00:05
6,0.107959,0.274084,0.923991,00:04
7,0.079868,0.32734,0.896077,00:04
8,0.063948,0.266987,0.927002,00:05
9,0.054332,0.292912,0.90804,00:04


Sometimes it is perfoming much better than LSTM (I have got over 95% accuracy a few times), I assume thats because its simpler architecture fits better to this simple task, in a way at least

I found out that pytorch implementation of GRU module differs from the cannonical version, therefore I will test both

In [115]:
class GRU_Standard(Module):
    def __init__(self, i_s, n_hidden, n_layers):
        inp = i_s + n_hidden
        self.reset = nn.Linear(inp, n_hidden, bias=True)
        self.update = nn.Linear(inp, n_hidden, bias=True)
        self.outp = nn.Linear(inp, n_hidden, bias=True)

    def forward(self, x, state):
        inp = torch.cat([state, x], dim=1)
        u = torch.sigmoid(self.reset(inp)) * state
        z = torch.sigmoid(self.update(inp))
        inp2 = torch.cat([state, u], dim=1)
        ht = self.outp(inp2).tanh()
        state = z * state + (1-z) * ht
        return state, state

In [116]:
class GRU_TorchLike(Module):
    def __init__(self, i_s, n_hidden, n_layers):
        inp = i_s + n_hidden
        self.reset = nn.Linear(inp, n_hidden, bias=True)
        self.update = nn.Linear(inp, n_hidden, bias=True)
        self.outx = nn.Linear(i_s, n_hidden, bias=True)
        self.outs = nn.Linear(n_hidden, n_hidden, bias=True)

    def forward(self, x, state):
        inp = torch.cat([state, x], dim=1)
        r = torch.sigmoid(self.reset(inp))
        z = torch.sigmoid(self.update(inp))
        ht = (self.outx(x) + r * self.outs(state)).tanh()
        state = z * state + (1-z) * ht
        return state, state

In [122]:
class GRU_Model(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.n_layers = n_layers
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = GRU_TorchLike(n_hidden, n_hidden, n_layers)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.drop = nn.Dropout(p)
        self.h = [torch.zeros(bs, n_hidden) for _ in range(n_layers)]

    def forward(self, x):
        raw = []
        dropped = []
        for i in range(sl):
            for j in range(self.n_layers):
                y, self.h[j] = self.rnn(self.i_h(x[:, i]), self.h[j])

            raw.append(y)
            y = self.drop(y)
            dropped.append(y)

        raw = torch.stack(raw, dim=1)
        dropped = torch.stack(dropped, dim=1)
        self.h = [h.detach() for h in self.h]

        # RNNRegularizer config:
        self.rnn.out = dropped
        self.rnn.raw_out = raw

        return self.h_o(dropped)

    def reset(self):
        for h in self.h:
            h.zero_()

In [123]:
learn = Learner(dls, GRU_Model(len(vocab), 64, 4, 0.5),
    loss_func=CrossEntropyLossFlat(), metrics=accuracy,
    cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,3.866516,2.546429,0.280518,00:04
1,2.483351,2.016129,0.375651,00:06
2,1.86933,1.471062,0.553874,00:05
3,1.338608,1.128061,0.651123,00:04
4,0.961023,0.877481,0.726318,00:05
5,0.717956,0.675056,0.802897,00:04
6,0.553062,0.615733,0.812419,00:06
7,0.447666,0.513644,0.869222,00:05
8,0.376951,0.467945,0.891357,00:04
9,0.328746,0.399597,0.921143,00:05


The TorchLike version of the module seems to perform better here, although because pytorch built-in implementation initializes weights better than me, my implementatations usually ends up with much worse results, and to be honest the result above is kinda a miracle :P usually I was getting 88% accuracy