# Language Models

One of the important and almost backbone of different NLP related tasks!

`# TODO: Add more details`

We will explore statistical and machine learning methods!
Lets starts with statistical methods first!

### Statistical method

In [1]:
# Thanks https://nlpforhackers.io/language-models/
import random
from collections import Counter, defaultdict

import nltk
from nltk.corpus import reuters
from nltk import bigrams, trigrams

In [None]:
nltk.download('punkt')
nltk.download('reuters')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maqboolkhan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
uni_grams = Counter(reuters.words())
total_count = len(reuters.words())
 

# Compute the probabilities (uni-grams)
for word in uni_grams:
    uni_grams[word] /= float(total_count)

In [14]:
uni_grams_counter = Counter(uni_grams)
uni_grams_counter.most_common(10)

[('.', 0.055021758950689205),
 (',', 0.042047741270415905),
 ('the', 0.033849129031826936),
 ('of', 0.02090707135390124),
 ('to', 0.01977743054365126),
 ('in', 0.015386126221089999),
 ('said', 0.014657438167564549),
 ('and', 0.014552260705293332),
 ('a', 0.013650988639090802),
 ('mln', 0.010481137497159917)]

In [15]:
def generate_text(grams, n, context, length):
    text = list(context)
    context = context[0] if n == 2 else context # bigrams had , in tne context tuple hence to remedy that comma!
    
    for i in range(length):
        sum = 0
        
        r = random.random()
        
        if context:
            candidates = grams[context]
        else:
            candidates = grams
        
    
        for k in candidates.keys():
            sum += candidates[k]
            if sum > r:
                text.append(k)
                
                if context:
                    context = (k) if n == 2 else (context[2-n], k)
                
                break
    text = ['None' if token == None else token  for token in text] # Replacing None with 'None'
    return ' '.join(text)

In [653]:
generate_text(uni_grams, 1, (), 10)

', IN Allegis banks 348 said April intervention total year'

In [227]:
def calc_probs(grams):
    # Let's transform the counts to probabilities
    for context in grams:
        total_count = float(sum(grams[context].values()))
        for next_word in grams[context]:
            grams[context][next_word] /= total_count

In [228]:
bi_grams = defaultdict(lambda: defaultdict(lambda: 0))
for sentence in reuters.sents():
    for w1, w2 in bigrams(sentence, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>"):
        bi_grams[(w1)][w2] += 1 

calc_probs(bi_grams)

In [231]:
generate_text(bi_grams, 2, ('The',), 10)

'The Federal Savings System Inc and Japanese intervention ," he added'

In [239]:
tri_grams = defaultdict(lambda: defaultdict(lambda: 0))
 
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>"):
        tri_grams[(w1, w2)][w3] += 1
    
calc_probs(tri_grams)

In [242]:
generate_text(tri_grams, 3, ('<s>', 'The'), 10)

'<s> The bonus award was made during the morning after analyst Daniel'

### Perplexity

In [223]:
# https://stats.stackexchange.com/a/143638/291743
N = 0
summation = 0
for word in nltk.tokenize.wordpunct_tokenize(reuters.raw()):
    N += 1
    summation += math.log(uni_grams[word], 2)

print("Uni gram perplexity: ", pow(2, -summation * (1/N)))

Uni gram perplexity:  1077.8271341207844


In [249]:
# https://towardsdatascience.com/perplexity-in-language-models-87a196019a94
N = 0
summation = 0
for sentence in reuters.sents():
    N += len(sentence) + 2 # +2 for <s> and </s> 
    for w1, w2 in bigrams(sentence, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>"): 
        summation += math.log(bi_grams[(w1)][w2], 2)
print("Bi gram perplexity: ", pow(2, -summation * (1/N)))

Bi gram perplexity:  40.65095064037762


In [254]:
N = 0
summation = 0
for sentence in reuters.sents():
    N += len(sentence) + 4 # +4 for <s> and </s> 
    for w1, w2, w3 in trigrams(sentence, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>"): 
        summation += math.log(tri_grams[(w1, w2)][w3], 2)
print("Tri gram perplexity: ", pow(2, -summation * (1/N)))

Tri gram perplexity:  5.9633012418563265


In [93]:
import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from torchtext.vocab import build_vocab_from_iterator
import torch.nn.functional as F

from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
hyp_params = {
    "batch_size": 32,
    "embedding_dim": 125,
    "hidden_dim": 2,
    "sequence_len": 10
}

In [4]:
class LMDataset(Dataset):
    def __init__(self, nltk_corpus, sentence_window = 50, train_vocab=None):
        self.corpus = nltk.tokenize.wordpunct_tokenize(nltk_corpus)
        self.vocab = train_vocab if train_vocab else self._build_vocab()
        self.seq_len = sentence_window
        
        self.slider = -1

    def __len__(self):
        return math.floor(len(self.corpus)/self.seq_len)
    
    def __getitem__(self, item):
        
        self.slider += 1
        
        src_text_tokens = self.corpus[self.slider * self.seq_len : (self.slider + 1) * self.seq_len]
        trg_text_tokens = self.corpus[(self.slider * self.seq_len) + 1 : ((self.slider + 1) * self.seq_len) + 1]
        
        
        return {
            'src': self.vocab.lookup_indices(src_text_tokens),
            'trg': self.vocab.lookup_indices(trg_text_tokens)
        }
    
    def _build_vocab(self):
        vocab = build_vocab_from_iterator([self.corpus], specials=["<unk>","<pad>"])
        vocab.set_default_index(vocab['<unk>'])

        return vocab

In [5]:
def collate_fn(batch, pad_value, device):
    trgs = []
    srcs = []
    for row in batch:        
        srcs.append(torch.tensor(row["src"], dtype=torch.long).to(device))
        trgs.append(torch.tensor(row["trg"]).to(device))

    padded_srcs = pad_sequence(srcs, padding_value=pad_value)
    padded_trgs = pad_sequence(trgs, padding_value=pad_value)
    return {"src": padded_srcs, "trg": padded_trgs}

train_lmds = LMDataset(reuters.raw(), hyp_params["sequence_len"])

pad_value = train_lmds.vocab['<pad>']

train_dt = DataLoader(train_lmds, 
                      batch_size=hyp_params["batch_size"], 
                      shuffle=True,
                      collate_fn=lambda batch_size: collate_fn(batch_size, pad_value, device))

In [203]:
class LM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LM, self).__init__()


        # Embedding is just an lookup table of size "vocab_size"
        # and each element has "embedding_size" dimension
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden_state=None, cell_state=None):
        # Shape --> [Sequence_length , batch_size , embedding dims]
        embedding = self.embedding(x)
        # Shape --> (output) [Sequence_length , batch_size , hidden_size]
        # Shape --> (hs, cs) [num_layers, batch_size size, hidden_size]
        if hidden_state != None:
            outputs, (hidden_state, cell_state) = self.LSTM(embedding, (hidden_state, cell_state))
        else:
            outputs, (hidden_state, cell_state) = self.LSTM(embedding)
        
        '''
            Unlike Classification task, 
            here we are making use of outputs from our LSTM.
        '''
        # shape --> (linear_outputs) (10, 32, 41602) sentence len, batch size, vocab size
        linear_outputs = self.fc(outputs)
        
        return linear_outputs, hidden_state, cell_state

In [204]:
model = LM(len(train_lmds.vocab), hyp_params["embedding_dim"], hyp_params["hidden_dim"]).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [7]:
for epoch in range(10):
    model.train()
    epoch_loss = 0
    print('Epoch: ', epoch)
    for idx, batch in enumerate(tqdm(train_dt)):
        src = batch["src"]  # shape --> e.g. (10, 32) sentence len, batch size
        trg = batch["trg"]  # shape --> e.g. (10, 32) sentence len, batch size
        
        trg = trg.view(-1) # making them linear (1d) --> bsz * seq len
 
        # Clear the accumulating gradients
        optimizer.zero_grad()

        # shape --> (10, 32, 41602) sentence len, batch size, vocab
        output = model(src)
        
        # Calculate the loss value for every epoch
        loss = criterion(output.view(-1, len(train_lmds.vocab)), trg)

        # Calculate the gradients for weights & biases using back-propagation
        loss.backward()

        epoch_loss += loss.item()

        # Clip the gradient value is it exceeds > 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the weights values
        optimizer.step()
    print(f'\tTrain loss: {epoch_loss/len(train_dt)}, Train perplexity: {math.exp(epoch_loss/len(train_dt))}')
    train_lmds.slider = -1

Epoch:  0


100%|██████████| 5378/5378 [01:58<00:00, 45.32it/s]


	Train loss:  7.6194728451973885
Epoch:  1


100%|██████████| 5378/5378 [01:54<00:00, 46.90it/s]


	Train loss:  6.86116912720504
Epoch:  2


100%|██████████| 5378/5378 [02:00<00:00, 44.61it/s]


	Train loss:  6.690379174237093
Epoch:  3


100%|██████████| 5378/5378 [01:56<00:00, 46.13it/s]


	Train loss:  6.571957478517935
Epoch:  4


100%|██████████| 5378/5378 [01:56<00:00, 46.15it/s]


	Train loss:  6.48696317208128
Epoch:  5


100%|██████████| 5378/5378 [01:56<00:00, 46.08it/s]


	Train loss:  6.415047833892039
Epoch:  6


100%|██████████| 5378/5378 [01:56<00:00, 46.04it/s]


	Train loss:  6.354719042556354
Epoch:  7


100%|██████████| 5378/5378 [01:57<00:00, 45.96it/s]


	Train loss:  6.306523078699988
Epoch:  8


100%|██████████| 5378/5378 [01:56<00:00, 46.03it/s]


	Train loss:  6.269592829910341
Epoch:  9


100%|██████████| 5378/5378 [01:58<00:00, 45.51it/s]

	Train loss:  6.2395854212618





In [8]:
torch.save(model.state_dict(),'lm.pt')

In [205]:
pre_model = torch.load('lm.pt', map_location=device)
model.load_state_dict(pre_model)
model.eval()

LM(
  (embedding): Embedding(41602, 125)
  (LSTM): LSTM(125, 2)
  (fc): Linear(in_features=2, out_features=41602, bias=True)
)

In [264]:
inp = "The"

hs = None
cs = None
for i in range(10):
    inp_ind = torch.tensor([train_lmds.vocab[inp]]).unsqueeze(1)
    output, hs, cs = model(inp_ind, hs, cs)
    word_weights = output.squeeze().exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0]
    inp = train_lmds.vocab.lookup_token(word_idx)
    print(inp, end=" ")

Bank and a market - Europe visible expected its third 

### Calculating perplexity of a single sentence
But we only have input? How to manage it?

In [329]:
# Thanks: https://github.com/flairNLP/flair/issues/498#issuecomment-465192107
sentence = reuters.sents()[0]

# this was the main step
inp = sentence[:-1]
trg = sentence[1:]

trg_tensor = torch.tensor(train_lmds.vocab.lookup_indices(trg))

inp_tensor = torch.tensor(train_lmds.vocab.lookup_indices(inp)).unsqueeze(1)
output, _, _ = model(inp_tensor)

loss = criterion(output.view(-1, len(train_lmds.vocab)), trg_tensor).item()

math.exp(loss)

822.3764490590262

In [None]:
hs = None
cs = None
sentence = reuters.sents()[0]
probs = 0
for idx in range(len(sentence)):
    inp_ind = torch.tensor(train_lmds.vocab.lookup_indices(sentence[:idx+1])).unsqueeze(1)
    output, _, _ = model(inp_ind)
    output = output.squeeze().detach()
    if idx > 0:
        # Getting the last one (so far probabilities)
        output = output[idx]
    
    probs += output[train_lmds.vocab[sentence[idx]]].item()

print(probs/len(sentence))