<a href="https://colab.research.google.com/github/mmsamiei/just-practice-deep/blob/master/language-modeling-torchtext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torchtext
from torchtext import data
import spacy

from spacy.symbols import ORTH
my_tok = spacy.load('en')

def spacy_tok(x):
  return [tok.text for tok in my_tok.tokenizer(x)]

TEXT = data.Field(lower=True, tokenize=spacy_tok)

In [0]:
my_tok.tokenizer.add_special_case("don't", [{ORTH: "do"},{ORTH: "n't"}])

In [0]:
from torchtext.datasets import WikiText2

train, valid, test = WikiText2.splits(TEXT)

In [4]:
len(train)

1

In [0]:
TEXT.build_vocab(train, vectors="glove.6B.200d")

In [0]:
import torch
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=32,
    bptt_len=30, # this is where we specify the sequence length
    device=torch.device("cuda"),
    repeat=False)

In [0]:
b = next(iter(train_iter))

In [8]:
b


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 30x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 30x32 (GPU 0)]

In [9]:
vars(b).keys()

dict_keys(['batch_size', 'dataset', 'fields', 'text', 'target'])

In [10]:
b.text.shape

torch.Size([30, 32])

In [11]:
b.text[:5,:3]

tensor([[   12,  1934, 20015],
        [   13,    10,    30],
        [   12,    32,     2],
        [   15,   472, 10782],
        [ 3875,    22,  3276]], device='cuda:0')

In [12]:
b.target[:5, :3]

tensor([[   13,    10,    30],
        [   12,    32,     2],
        [   15,   472, 10782],
        [ 3875,    22,  3276],
        [ 3895,   323,     6]], device='cuda:0')

**Training!**

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V

class RNNModel(nn.Module):
  def __init__(self, ntoken, ninp, nhid, nlayers, bsz, dropout=0.5, tie_weights=True):
    super(RNNModel, self).__init__()
    self.nhid, self.nlayers, self.bsz = nhid, nlayers, bsz
    self.drop = nn.Dropout(dropout)
    self.encoder = nn.Embedding(ntoken, ninp)
    self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout = dropout)
    self.decoder = nn.Linear(nhid, ntoken)
    self.hidden = self.init_hidden(bsz)
  
  def init_weights(self):
    initrange = 0.1
    self.encoder.weight.data.uniform_(-initrange, initrange)
    self.decoder.bias.data.fill_(0)
    self.decoder.weight.data.uniform_(-initrange, initrange)

  def forward(self, input):
    emb = self.drop(self.encoder(input))
    output, self.hidden = self.rnn(emb, self.hidden)
    output = self.drop(output)
    decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
    return decoded.view(output.size(0), output.size(1), decoded.size(1))

  def init_hidden(self, bsz):
    weight = next(self.parameters()).data
    return (V(weight.new(self.nlayers, bsz, self.nhid).zero_().cuda()),
                V(weight.new(self.nlayers, bsz, self.nhid).zero_()).cuda())
  
  def reset_history(self):
        self.hidden = tuple(V(v.data) for v in self.hidden)


In [14]:
weight_matrix = TEXT.vocab.vectors
BATCH_SIZE = 32
model = RNNModel(weight_matrix.size(0), weight_matrix.size(1), 200, 1, BATCH_SIZE)
model.encoder.weight.data.copy_(weight_matrix)
model.cuda()

  "num_layers={}".format(dropout, num_layers))


RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(28870, 200)
  (rnn): LSTM(200, 200, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=28870, bias=True)
)

In [0]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.7, 0.99))
n_tokens = weight_matrix.size(0)

In [0]:
from tqdm import tqdm 
def train_epoch(epoch):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_iter):
    # reset the hidden state or else the model will try to backpropagate to the
    # beginning of the dataset, requiring lots of time and a lot of memory
         model.reset_history()
 
    optimizer.zero_grad()
 
    text, targets = batch.text, batch.target
    prediction = model(text)
    # pytorch currently only supports cross entropy loss for inputs of 2 or 4 dimensions.
    # we therefore flatten the predictions out across the batch axis so that it becomes
    # shape (batch_size * sequence_length, n_tokens)
    # in accordance to this, we reshape the targets to be
    # shape (batch_size * sequence_length)
    loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
    loss.backward()
 
    optimizer.step()
 
    epoch_loss += loss.data * prediction.size(0) * prediction.size(1)
 
    epoch_loss /= len(train.examples[0].text)
 
    # monitor the loss
    val_loss = 0
    model.eval()
    for batch in valid_iter:
        model.reset_history()
        text, targets = batch.text, batch.target
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        val_loss += loss.data * text.size(0)
    val_loss /= len(valid.examples[0].text)
 
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

In [17]:
n_epochs = 3
for epoch in range(1, n_epochs + 1):
    train_epoch(epoch)

100%|██████████| 2330/2330 [00:00<00:00, 4618.57it/s]
  0%|          | 0/2330 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.0037, Validation Loss: 0.3201


100%|██████████| 2330/2330 [00:00<00:00, 4513.43it/s]
  0%|          | 0/2330 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.0037, Validation Loss: 0.3192


100%|██████████| 2330/2330 [00:00<00:00, 4657.04it/s]


Epoch: 3, Training Loss: 0.0036, Validation Loss: 0.3182


In [0]:
import numpy as np
def word_ids_to_sentence(id_tensor, vocab, join=None):
  if isinstance(id_tensor, torch.LongTensor):
    ids = id_tensor.transpose(0, 1).contiguous().view(-1)
  elif isinstance(id_tensor, np.ndarray):
        ids = id_tensor.transpose().reshape(-1)
  batch = [vocab.itos[ind] for ind in ids]
  if join is None:
        return batch
  else:
        return join.join(batch)

In [19]:
arrs = model(b.text).cpu().data.numpy()
word_ids_to_sentence(np.argmax(arrs, axis=2), TEXT.vocab, join=' ')

'the the the allegation gel running stamps 52nd anticyclone clerical clerical clerical nogić baritone jewelry pace the pole > solbakken solbakken patch of the of the the the gel the the the the the the the the the the the the the the dismisses > solbakken baritone the responsibilities the the the owasco clerical clerical and clerical the the the the the the 237 carrier pole convoys solbakken celebrates duchovny the the clerical the the the the the the the kurdish averaged averaged the the the the the the the for 52nd = clerical = = = = clerical clerical clerical nogić nogić allegation = = = clerical baritone shrimp = = = = clerical clerical clerical duchovny and the the the the the the the clerical the the the the the forehead > solbakken lbw the the clerical clerical the the clerical the clerical clerical clerical of clerical the the the the the the the the the the of the the the the the clerical clerical clerical stretched the the the the the the the the the the 1691 the the the the 