In [1]:
from fastai.text.all import *

# Chapter 12 - Language Model from scratch

In [2]:
# Resolve the local copy of the HUMAN_NUMBERS dataset via fastai's `untar_data`;
# `path` is reused by the cells that read train.txt / valid.txt.
path = untar_data(URLs.HUMAN_NUMBERS)
# List the entries of the dataset directory.
path.ls()

(#2) [Path('C:/Users/oneir/.fastai/data/human_numbers/train.txt'),Path('C:/Users/oneir/.fastai/data/human_numbers/valid.txt')]

## Preparation

In [11]:
# Collect the raw lines of both dataset files (training file first) into a single `L`.
lines = L()

for split_file in ('train.txt', 'valid.txt'):
    with open(path/split_file) as fh:
        lines += L(*fh.readlines())

lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [13]:
# Strip the surrounding whitespace/newline from every line, then glue the lines together
# with ' . ' so that consecutive numbers are separated by a standalone '.' token.
stripped_lines = [line.strip() for line in lines]
text = ' . '.join(stripped_lines)
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [8]:
# Tokenise by splitting on single spaces; the ' . ' separators therefore become '.' tokens.
tokens = text.split(' ')
# Vocabulary: the distinct tokens, as returned by fastcore's `L.unique()`.
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [14]:
# Give every vocabulary entry its position as an integer id, then encode the whole
# token stream with those ids.
word2idx = dict(zip(vocab, range(len(vocab))))
nums = L(word2idx[tok] for tok in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

## First simple LM

We'll take three consecutive tokens as input and predict the token that follows them.

In [16]:
# Preview the samples as words: each one is (three consecutive tokens, the token after them),
# with a new window starting every 3 tokens so the inputs do not overlap.
# The range stops at len(tokens)-3 so that every start whose target index (start+3)
# is still inside `tokens` is included, even when the last window ends on the final token.
L((tokens[start:start+3], tokens[start+3]) for start in range(0, len(tokens)-3, 3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [20]:
# Build the numeric samples: (tensor of three consecutive token ids, id of the next token),
# with a new window starting every 3 ids so the inputs do not overlap.
# The range stops at len(nums)-3 so that every start whose target index (start+3)
# is still inside `nums` is included, even when the last window ends on the final id.
seqs = L((tensor(nums[start:start+3]), nums[start+3]) for start in range(0, len(nums)-3, 3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [22]:
# Batch size, and a positional 80/20 split: the first 80% of the samples are used for
# training and the remainder for validation.
bs = 64
cut = int(0.8 * len(seqs))
# shuffle=False is passed so the DataLoaders are built without shuffling.
dls = DataLoaders.from_dsets(
    seqs[:cut],
    seqs[cut:],
    shuffle=False,
    bs=bs,
)

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck
Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck


In [33]:
class LM1(Module):
    """Minimal next-token language model.

    Each context token is embedded and folded into a running hidden state through a
    single shared linear layer; the final hidden state is projected to one score per
    vocabulary entry.

    Args:
        vocab_size: number of distinct tokens (embedding rows and output width).
        n_hidden: width of the embedding and of the hidden state.
    """
    def __init__(self, vocab_size, n_hidden):
        # No explicit super().__init__() call: fastai's `Module` base class handles it.
        self.embedding = nn.Embedding(vocab_size, n_hidden)
        self.fc1 = nn.Linear(n_hidden, n_hidden)
        self.fc2 = nn.Linear(n_hidden, vocab_size)

    def _apply_net(self, x):
        """Embed `x` and apply the hidden linear layer (no activation)."""
        return self.fc1(self.embedding(x))

    def forward(self, x):
        """Return the output-layer scores for a batch of token-id sequences.

        `x` is indexed as `x[:, i]` (batch first, token position second). Any number
        of positions >= 1 is accepted; for three positions the computation is exactly
        the original hand-unrolled one.
        """
        # First token: there is no previous hidden state to add to.
        h = F.relu(self._apply_net(x[:, 0]))

        # Remaining tokens: add the token embedding to the hidden state, then apply
        # the same hidden layer again (weights are shared across positions).
        for i in range(1, x.shape[1]):
            h = F.relu(self.fc1(h + self.embedding(x[:, i])))

        return self.fc2(h)

In [34]:
# Train LM1 with a 64-wide hidden state using cross-entropy loss and report accuracy;
# four epochs of one-cycle training with a maximum learning rate of 1e-3.
learn = Learner(
    dls,
    LM1(vocab_size=len(vocab), n_hidden=64),
    loss_func=F.cross_entropy,
    metrics=accuracy,
)
learn.fit_one_cycle(n_epoch=4, lr_max=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.825217,1.953809,0.467316,00:00
1,1.368626,1.787102,0.468267,00:00
2,1.410161,1.632404,0.489185,00:00
3,1.378835,1.602587,0.494176,00:00


### Baseline

Let's compare this to a naive baseline: always predicting the most common target token in the validation set.

In [48]:
# Majority-class baseline: tally how often each vocabulary index occurs as a target
# in the validation set.
n = 0
counts = torch.zeros(len(vocab))

for _,y in dls.valid:
    n += len(y)
    # One histogram per batch instead of a Python loop of per-index comparisons.
    # .cpu() keeps the batch on the same device as `counts` if batches live on a GPU;
    # minlength pads the histogram to one slot per vocabulary entry.
    counts += torch.bincount(y.cpu(), minlength=len(vocab))

# Most frequent target index, its word, and the accuracy obtained by always predicting it.
idx = torch.argmax(counts)
idx, vocab[idx.item()], counts[idx]/n

(tensor(29), 'thousand', tensor(0.1517))