<a href="https://colab.research.google.com/github/phucb2/lm-hackers/blob/main/NLP_DeepDive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from fastai.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)

In [2]:
path

Path('/root/.fastai/data/human_numbers')

In [3]:
path.ls()

(#2) [Path('/root/.fastai/data/human_numbers/train.txt'),Path('/root/.fastai/data/human_numbers/valid.txt')]

In [4]:
!cat {path}/train.txt | head

one 
two 
three 
four 
five 
six 
seven 
eight 
nine 
ten 


In [5]:
!cat {path}/valid.txt | head

eight thousand one 
eight thousand two 
eight thousand three 
eight thousand four 
eight thousand five 
eight thousand six 
eight thousand seven 
eight thousand eight 
eight thousand nine 
eight thousand ten 


In [6]:
contents = []
with open(path/'train.txt') as file:
  contents += file.readlines()
with open(path/'valid.txt') as file:
  contents += file.readlines()

In [7]:
contents[0].strip()

'one'

In [8]:
text = ' . '.join([c.strip() for c in contents])

In [9]:
vocab = sorted(list(set(text.split())))

In [10]:
word2idx = {w:i for i, w in enumerate(vocab)}
idx2word = {i:w for i, w in enumerate(vocab)}

encode = lambda x: [word2idx[w] for w in x]
decode = lambda x: ' '.join([idx2word[i] for i in x.tolist()])

In [59]:
block_size = 3
tok_text = [word2idx[w] for w in text.split()]
blocks = [(tensor(tok_text[i:i+3]), tok_text[i+3]) for i in range(0, len(tok_text) - 3, 3)]

In [61]:
decode(blocks[-1][0])

'thousand nine hundred'

In [74]:
batch_size = 32

train_size = int(len(blocks) * 0.8)
dls = DataLoaders.from_dsets(blocks[:train_size], blocks[train_size:], bs=batch_size, shuffle=False)

In [75]:
next(iter(dls))

<fastai.data.core.TfmdDL at 0x78d0b46c8310>

In [76]:
class MyModel1(Module):
  def __init__(self, vocab_size, n_hidden):
    self.embedding = nn.Embedding(vocab_size, n_hidden)
    self.linear1 = nn.Linear(n_hidden, n_hidden)
    self.linear2 = nn.Linear(n_hidden, vocab_size)

  def forward(self, x):
    xt0, xt1, xt2 = x[:, 0], x[:, 1], x[:, 2]
    ht0 = F.relu(self.linear1(self.embedding(xt0)))
    ht1 = F.relu(self.linear1(ht0 + self.embedding(xt1)))
    ht2 = F.relu(self.linear1(ht1 + self.embedding(xt2)))
    return self.linear2(ht2)

In [77]:
xb = torch.randint(0, len(vocab), (batch_size, 3))
yb = torch.randint(0, len(vocab), (batch_size, 1))

In [78]:
m = MyModel1(len(vocab), 64)
with torch.no_grad():
  o = m(xb)
o.shape

torch.Size([32, 30])

In [81]:
learn = Learner(dls, MyModel1(len(vocab), 128), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(20, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.50557,1.979384,0.481816,00:03
1,1.277574,2.075715,0.492275,00:04
2,1.125904,1.824558,0.506061,00:04
3,1.074388,1.937783,0.485144,00:03
4,1.075069,2.008507,0.500357,00:03
5,1.080431,2.558194,0.485619,00:04
6,1.036296,2.744169,0.496553,00:04
7,1.069334,1.915261,0.507012,00:04
8,1.030374,2.311141,0.506299,00:04
9,1.048479,2.709433,0.507012,00:04


In [19]:
class LMModel1(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)

    def forward(self, x):
        h = F.relu(self.h_h(self.i_h(x[:,0])))
        h = h + self.i_h(x[:,1])
        h = F.relu(self.h_h(h))
        h = h + self.i_h(x[:,2])
        h = F.relu(self.h_h(h))
        return self.h_o(h)

In [20]:
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.246966,1.583526,0.473851,00:09
1,1.185816,1.660742,0.474168,00:09
2,1.213201,1.669734,0.474168,00:09
3,1.620471,1.704624,0.47401,00:09


In [21]:
class LMModel2(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.emb_i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)

  def forward(self, x):
    h = 0
    for i in range(3):
      h += self.i_h(x[:, i])
      h = F.relu(self.h_h(h))
    return self.h_o(h)

In [22]:
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.250449,1.623366,0.473693,00:10
1,1.221574,1.853922,0.474168,00:10
2,1.228235,1.757317,0.474168,00:10
3,1.61017,1.673392,0.474168,00:10


In [38]:
# def group_chunks(ds, bs):
#   m = len(ds) // bs
#   chunks = L()
#   for i in range(m): # for each batch
#     chunks += L(ds[i + j*m] for j in range(bs))
#   return chunks

def group_chunks(ds, bs):
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))
    return new_ds

In [39]:
group_chunks(text.split(), 32)

(#63072) ['one','.','eighty','.','four','twenty','fifty','hundred','seven','.'...]

In [40]:
el = L(range(12))
group_chunks(el, 4)

(#12) [0,3,6,9,1,4,7,10,2,5...]

In [69]:
# batch_size = 64
# train_size = int(len(blocks)*0.8)
# dls = DataLoaders.from_dsets(
#     group_chunks(blocks[:train_size], batch_size),
#     group_chunks(blocks[train_size:], batch_size),
#     shuffle=False, bs=batch_size, drop_last=True
# )
bs = 64
batch_size = bs
cut = int(len(blocks) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(blocks[:cut], bs),
    group_chunks(blocks[cut:], bs),
    bs=bs, drop_last=True, shuffle=False)

In [70]:
class LMModel3(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = 0

  def forward(self, x):
    for i in range(3):
      self.h = self.h + self.i_h(x[:, i]) # self.h += self.i_h(x[:, i])
      self.h = F.relu(self.h_h(self.h))
    out = self.h_o(self.h)
    self.h = self.h.detach()
    return out

  def reset(self): self.h=0

In [71]:
# class LMModel3(Module):
#     def __init__(self, vocab_sz, n_hidden):
#         self.i_h = nn.Embedding(vocab_sz, n_hidden)
#         self.h_h = nn.Linear(n_hidden, n_hidden)
#         self.h_o = nn.Linear(n_hidden,vocab_sz)
#         self.h = 0

#     def forward(self, x):
#         for i in range(3):
#             self.h = self.h + self.i_h(x[:,i])
#             self.h = F.relu(self.h_h(self.h))
#         out = self.h_o(self.h)
#         self.h = self.h.detach()
#         return out

#     def reset(self): self.h = 0

In [73]:
# torch.autograd.set_detect_anomaly(True)
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(20, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.10069,1.91281,0.477163,00:02
1,1.434285,1.792653,0.422837,00:02
2,1.209179,1.677742,0.482692,00:02
3,1.068935,1.780653,0.471394,00:02
4,1.021763,1.675339,0.515144,00:02
5,0.971681,1.709825,0.551202,00:02
6,0.95493,1.719184,0.560337,00:02
7,0.930293,1.644374,0.546394,00:02
8,0.919593,1.761062,0.560096,00:02
9,0.890928,1.660768,0.560096,00:02
