In [None]:
!pip install torchtext==0.6.0

##Loading Libraries and Data


In [2]:
# imports
from collections import defaultdict, Counter
import numpy as np
import math
import tqdm
import random
import pdb

import torch
from torch import nn
import torch.nn.functional as F
import torchtext

# download and load the data
text_field = torchtext.data.Field()
datasets = torchtext.datasets.WikiText2.splits(root='/content/drive/MyDrive/NLP/Language_Modeling', text_field=text_field)
train_dataset, validation_dataset, test_dataset = datasets

text_field.build_vocab(train_dataset, validation_dataset, test_dataset)
vocab = text_field.vocab
vocab_size = len(vocab)

train_text = train_dataset.examples[0].text # a list of tokens (strings)
validation_text = validation_dataset.examples[0].text

print(validation_text[:30])

['<eos>', '=', 'Homarus', 'gammarus', '=', '<eos>', '<eos>', 'Homarus', 'gammarus', ',', 'known', 'as', 'the', 'European', 'lobster', 'or', 'common', 'lobster', ',', 'is', 'a', 'species', 'of', '<unk>', 'lobster', 'from', 'the', 'eastern', 'Atlantic', 'Ocean']


##Unigram Model

In [None]:
class UnigramModel:
  def __init__(self, train_text):
    self.counts = Counter(train_text)
    self.total_count = len(train_text)

  def probability(self, word):
    return self.counts[word] / self.total_count

  def next_word_probabilities(self, text_prefix):
    return [self.probability(word) for word in vocab.itos]

  def perplexity(self, full_text):
    log_probs = []
    for word in full_text:
      log_probs.append(math.log(self.probability(word),2))

    return 2 ** -np.mean(log_probs)

In [None]:
unigram_demonstration_model = UnigramModel(train_text)
print('unigram validation perplexity:',
      unigram_demonstration_model.perplexity(validation_text))

unigram validation perplexity: 965.0860734119312


In [None]:
def check_validity(model):
    """Performs several sanity checks on your model:
    1) That next_word_probabilities returns a valid distribution
    2) That perplexity matches a perplexity calculated from next_word_probabilities

    Although it is possible to calculate perplexity from next_word_probabilities,
    it is still good to have a separate more efficient method that only computes
    the probabilities of observed words.
    """

    log_probabilities = []
    for i in range(10):
        prefix = validation_text[:i]
        probs = model.next_word_probabilities(prefix)
        assert min(probs) >= 0, "Negative value in next_word_probabilities"
        assert max(probs) <= 1 + 1e-8, "Value larger than 1 in next_word_probabilities"
        assert abs(sum(probs)-1) < 1e-4, "next_word_probabilities do not sum to 1"

        word_id = vocab.stoi[validation_text[i]]
        selected_prob = probs[word_id]
        log_probabilities.append(math.log(selected_prob))

    perplexity = math.exp(-np.mean(log_probabilities))
    your_perplexity = model.perplexity(validation_text[:10])
    assert abs(perplexity-your_perplexity) < 0.1, "your perplexity does not " + \
    "match the one we calculated from `next_word_probabilities`,\n" + \
    "at least one of `perplexity` or `next_word_probabilities` is incorrect.\n" + \
    f"we calcuated {perplexity} from `next_word_probabilities`,\n" + \
    f"but your perplexity function returned {your_perplexity} (on a small sample)."


In [None]:
check_validity(unigram_demonstration_model)

In [18]:
def generate_text(model, n=20, prefix=('<eos>', '<eos>')):
  prefix = list(prefix)
  for _ in range(n):
    probs = model.next_word_probabilities(prefix)
    word = random.choices(vocab.itos, probs)[0]
    prefix.append(word)

  return ' '.join(prefix)


print(generate_text(unigram_demonstration_model))

## N-gram Model

In [None]:
def n_grams(text, n):
    n_gram = []

    for i in range(len(text)-(n-1)):
      words = []
      j = i
      while(j<=len(text) and j-i+1 <= n):
        words.extend([text[j]])
        j+=1
      n_gram.append(tuple(words))

    return n_gram

In [None]:
class NGramModel:
  def __init__(self, train_text, n=2, alpha = 3e-3):
    self.n = n
    self.smoothing = alpha
    self.counts_n_gram = Counter(n_grams(train_text, n))
    self.counts_n_gram_1 = Counter(n_grams(train_text, n-1))


  def n_gram_probability(self, n_gram):
    assert len(n_gram) == self.n
    if(self.n>1):
        return (self.counts_n_gram[tuple(n_gram)] + self.smoothing) / (self.counts_n_gram_1[tuple(n_gram[:len(n_gram)-1])] + (vocab_size * self.smoothing))
    else:
        return (self.counts_n_gram[tuple(n_gram)] + self.smoothing) / (len(train_text) + (vocab_size * self.smoothing))

  def next_word_probabilities(self, text_prefix):
    probs = []
    for word in vocab.itos:
      if(len(text_prefix)<self.n-1):
         probs.append(1/vocab_size)
      else:
        n_gram = text_prefix[(len(text_prefix) - self.n+1):len(text_prefix)] + [word]
        probs.append(self.n_gram_probability(n_gram))

    return probs

  def perplexity(self, full_text):
    log_probs = []
    for i in range(len(full_text)):
      if(i>=self.n-1):
        n_gram = full_text[i-self.n+1:i+1]
        log_probs.append(math.log(self.n_gram_probability(n_gram), 2))
      else:
        log_probs.append(1/vocab_size)
    return 2 ** -np.mean(log_probs)

In [None]:
unigram_model = NGramModel(train_text, 1)
print('unigram validation perplexity:', unigram_model.perplexity(validation_text)) # this should be the almost the same as our unigram model perplexity above

bigram_model = NGramModel(train_text, n=2)
print('bigram validation perplexity:', bigram_model.perplexity(validation_text))

trigram_model = NGramModel(train_text, n=3)
print('trigram validation perplexity:', trigram_model.perplexity(validation_text)) # this won't do very well...

unigram validation perplexity: 965.0913686618096
bigram validation perplexity: 504.4054886536929
trigram validation perplexity: 2965.381793306292


In [None]:
generate_text(unigram_model)

"<eos> <eos> <unk> 2003 Medical A usually way sent possible the school into the . grew Echmarcach toll had Benares 's tenure"

In [None]:
generate_text(bigram_model)

"<eos> <eos> Joe 's nose , and Tennyson pyramids wonderful Piedras TA 1751 joins Dentists Graves Tomb outgassing Hancock fledglings anarchy NC"

In [None]:
generate_text(trigram_model)

'<eos> <eos> = = Career = mm unjust paradoxical Although Baibars Wynne indefinitely discretion margins Performing Nadu O. motion suggests 1754 Armstrong'

In [None]:
# Free up some RAM.
del bigram_model
del trigram_model

## Discounted Backoff

This basic model works okay for bigrams, but a better strategy (especially for higher-order models) is to use backoff.  Implement backoff with absolute discounting.
$$P\left(w_i|w_{i-n+1}^{i-1}\right)=\frac{max\left\{C(w_{i-n+1}^i)-\delta,0\right\}}{\sum_{w_i} C(w_{i-n+1}^i)} + \alpha(w_{i-n+1}^{i-1}) P(w_i|w_{i-n+2}^{i-1})$$

$$\alpha\left(w_{i-n+1}^{i-1}\right)=\frac{\delta N_{1+}(w_{i-n+1}^{i-1})}{{\sum_{w_i} C(w_{i-n+1}^i)}}$$
where $N_{1+}$ is the number of words that appear after the previous $n-1$ words (the number of times the max will select something other than 0 in the first equation).  If $\sum_{w_i} C(w_{i-n+1}^i)=0$, use the lower order model probability directly (the above equations would have a division by 0).


In [None]:
class DiscountBackoffModel(NGramModel):
  def __init__(self, train_text, lower_order_model, n=2, delta=0.9):
    super().__init__(train_text, n = n)
    self.lower_order_model = lower_order_model
    self.discount = delta
    self.counts_n_gram = Counter(n_grams(train_text,n))
    self.counts_n_gram_1 = Counter(n_grams(train_text,n-1))

  def n_gram_probability(self, n_gram):
    assert len(n_gram) == self.n
    n_gram_1 = n_gram[:len(n_gram)-1]
    total_count_n_gram = 0
    N = 0
    for word in vocab.itos:
      text = n_gram_1 + [word]
      total_count_n_gram += self.counts_n_gram[tuple(text)]
      N += min(self.counts_n_gram[tuple(text)],1)

    if(total_count_n_gram == 0):
      return self.lower_order_model.n_gram_probability(n_gram[1:])

    prob = max(self.counts_n_gram[tuple(n_gram)] - self.discount, 0) / total_count_n_gram
    alpha = (self.discount * N) / total_count_n_gram
    prob += alpha * self.lower_order_model.n_gram_probability(n_gram[1:])
    return prob

In [None]:
bigram_backoff_model = DiscountBackoffModel(train_text, unigram_model, 2)
trigram_backoff_model = DiscountBackoffModel(train_text, bigram_backoff_model, 3)
print('trigram backoff validation perplexity:', trigram_backoff_model.perplexity(validation_text))

trigram backoff validation perplexity: 271.0957323219511


In [None]:
del unigram_model
del bigram_backoff_model
del trigram_backoff_model

## Neural N-gram Model

In [None]:
def ids(tokens):
  return [vocab.stoi[t] for t in tokens]

assert torch.cuda.is_available()

class NeuralNgramDataset(torch.utils.data.Dataset):
  def __init__(self, text_token_ids, n):
    self.text_token_ids = text_token_ids
    self.n = n

  def __len__(self):
    return len(self.text_token_ids)

  def __getitem__(self,i):
    if i < self.n-1:
      prev_token_ids = [vocab.stoi['<eos>']] * (self.n-i-1) + self.text_token_ids[:i]
    else:
      prev_token_ids = self.text_token_ids[i-self.n+1:i]

    assert len(prev_token_ids) == self.n-1

    x = torch.tensor(prev_token_ids)
    y = torch.tensor(self.text_token_ids[i])
    return x, y

class NeuralNGramNetwork(nn.Module):
  def __init__(self,n):
    super().__init__()
    self.n = n
    self.net = nn.Sequential(
    nn.Linear((n-1)*128,1024),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(1024,128),
    nn.ReLU(),
    nn.Dropout(0.1),
    )
    self.final_layer = nn.Linear(128, vocab_size)

  def forward(self,x):
    embeds = F.embedding(x,self.final_layer.weight)
    if(len(embeds)>2):
      embeds = torch.flatten(embeds,1)
    else:
      embeds = torch.flatten(embeds)
    out = self.net(embeds)
    out = F.log_softmax(self.final_layer(out))
    return out


class NeuralNGramModel:
  def __init__(self,n):
    self.n = n
    self.network = NeuralNGramNetwork(n).cuda()
    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = torch.optim.Adam(self.network.parameters(),1e-3)

  def train(self):
    dataset = NeuralNgramDataset(ids(train_text), self.n)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size =128, shuffle=True)
    for epoch in range(10):
      print('epoch: {}'.format(epoch + 1))
      running_loss = 0
      for prefix, target in (train_loader):
        prefix = prefix.cuda()
        target = target.cuda()
        output = self.network(prefix)
        loss = self.criterion(output,target)
        running_loss += loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
      print('Loss: {:.6f}'.format(running_loss/len(train_loader)))

  def next_word_probabilities(self, text_prefix):
    self.network.eval()
    probs = []
    prefix = text_prefix + ["<eos>"]
    dataset = NeuralNgramDataset(ids(prefix), self.n)
    prefix, target = dataset[len(prefix)-1]
    prefix = prefix.cuda()
    output = self.network(prefix)
    for i in range(vocab_size):
      probs.append(math.exp(output[i]))
    return probs

  def perplexity(self,text):
    log_probs = []
    self.network.eval()
    dataset = NeuralNgramDataset(ids(text), self.n)
    for i in range(len(text)):
       prefix, target = dataset[i]
       prefix = prefix.cuda()
       target = target.cuda()
       output = self.network(prefix)
       log_probs.append(math.log(math.exp(output[target]),2))
    return 2 ** -np.mean(log_probs)

In [None]:
neural_trigram_model = NeuralNGramModel(3)
check_validity(neural_trigram_model)
neural_trigram_model.train()

  out = F.log_softmax(self.final_layer(out))


epoch: 1
Loss: 5.919364
epoch: 2
Loss: 5.387251
epoch: 3
Loss: 5.195302
epoch: 4
Loss: 5.075787
epoch: 5
Loss: 4.991659
epoch: 6
Loss: 4.926989
epoch: 7
Loss: 4.877513
epoch: 8
Loss: 4.836218
epoch: 9
Loss: 4.803491
epoch: 10
Loss: 4.776093


In [None]:
print('neural trigram validation perplexity:', neural_trigram_model.perplexity(validation_text))

  out = F.log_softmax(self.final_layer(out))


neural trigram validation perplexity: 264.9605479065314


In [None]:
# Delete model we don't need.
del neural_trigram_model

##LSTM Language Model

In [46]:
class LSTMNetwork (nn.Module):

  def __init__(self):
    super().__init__()
    self.lstm = nn.LSTM(input_size = 128, hidden_size = 512, num_layers = 3, dropout = 0.5)
    self.dropout = nn.Dropout(0.5)
    self.linear_1 = nn.Linear(512, 128)
    self.linear_2 = nn.Linear(128, vocab_size)

  def forward(self, x, state):
    embeds = F.embedding(x,self.linear_2.weight)
    out, hidden = self.lstm(embeds, state)
    out = self.dropout(out)
    out = self.linear_1(out)
    out = F.log_softmax(self.linear_2(out))
    return out, hidden

class LSTMModel:

  def __init__(self):
    self.network = LSTMNetwork().cuda()
    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = torch.optim.Adam(self.network.parameters(),1e-3)

  def detach_hidden(self, hidden):
    hidden, cell = hidden
    hidden = hidden.detach()
    cell = cell.detach()
    return hidden, cell

  def init_hidden(self, batch_size):
    hidden = torch.zeros(3, batch_size, 512).cuda()
    cell = torch.zeros(3, batch_size, 512).cuda()
    return hidden, cell

  def train(self):
    self.network.train()
    train_iterator = torchtext.data.BPTTIterator(train_dataset, batch_size = 64,
                                                 bptt_len=32, device = 'cuda')
    for epoch in range(20):
      print('epoch: {}'.format(epoch+1))
      running_loss = 0
      hidden = self.init_hidden(batch_size = 64)
      for batch in train_iterator:
        prefix = batch.text.cuda()
        target = batch.target.cuda()

        self.network.zero_grad()

        hidden = self.detach_hidden(hidden)
        output, hidden = self.network(prefix, hidden)

        loss = self.criterion(output.view(output.size(0)*output.size(1), output.size(2)), target.view(-1))
        running_loss += loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
      print('Loss: {:.6f}'.format(running_loss/len(train_iterator)))


  def next_word_probabilities(self, text_prefix):
    prefix_token_tensor = torch.tensor(ids(text_prefix), device='cuda').view(-1, 1)
    self.network.eval()
    probs = []
    hidden = self.init_hidden()
    output, hidden = self.network(prefix_token_tensor, hidden)
    for i in range(vocab_size):
      probs.append(math.exp(output[i]))
    return probs

  def dataset_perplexity(self, torchtext_dataset):
    with torch.no_grad():
      iterator = torchtext.data.BPTTIterator(torchtext_dataset, batch_size = 32, bptt_len=32, device='cuda')
      losses = []
      self.network.eval()
      hidden = self.init_hidden(batch_size = 32)
      for batch in iterator:
        prefix = batch.text.cuda()
        target = batch.target.cuda()
        output, hidden = self.network(prefix, hidden)
        loss = F.cross_entropy(output.view(output.size(0)*output.size(1), output.size(2)), target.view(-1))
        losses.append(loss.cpu().numpy())
      return math.exp(np.mean(losses))

In [47]:
lstm_model = LSTMModel()
lstm_model.train()

epoch: 1


  out = F.log_softmax(self.linear_2(out))


Loss: 7.138511
epoch: 2
Loss: 6.111068
epoch: 3
Loss: 5.860744
epoch: 4
Loss: 5.619581
epoch: 5
Loss: 5.464509
epoch: 6
Loss: 5.347710
epoch: 7
Loss: 5.240488
epoch: 8
Loss: 5.152400
epoch: 9
Loss: 5.089320
epoch: 10
Loss: 5.018318
epoch: 11
Loss: 4.955593
epoch: 12
Loss: 4.904957
epoch: 13
Loss: 4.861936
epoch: 14
Loss: 4.818425
epoch: 15
Loss: 4.776217
epoch: 16
Loss: 4.750140
epoch: 17
Loss: 4.743425
epoch: 18
Loss: 4.689441
epoch: 19
Loss: 4.682596
epoch: 20
Loss: 4.643999


In [48]:
print('lstm validation perplexity:', lstm_model.dataset_perplexity(validation_dataset))

  out = F.log_softmax(self.linear_2(out))


lstm validation perplexity: 157.97293265712904
