# Language Models

Language models are one of the most important building blocks, and arguably the backbone, of many NLP tasks!

`# TODO: Add more details`

We will explore statistical and machine learning methods!
Let's start with the statistical methods first!

### Statistical method

In [2]:
# Thanks https://nlpforhackers.io/language-models/
import random
from collections import Counter, defaultdict

import nltk
from nltk.corpus import reuters
from nltk import bigrams, trigrams

In [None]:
# Download the 'punkt' and 'reuters' NLTK data packages; the Reuters corpus
# is read by the n-gram cells and by the dataset construction further down.
nltk.download('punkt')
nltk.download('reuters')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maqboolkhan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Count every token of the Reuters corpus in a single pass.
unigram_counts = Counter(reuters.words())

# The corpus size is the sum of the counts, so there is no need to build and
# traverse the corpus view a second time just to measure its length.
total_count = sum(unigram_counts.values())

# Compute the probabilities (uni-grams): relative frequency of each token.
# Kept as a Counter so that most_common() remains available on the result.
uni_grams = Counter({
    word: count / total_count
    for word, count in unigram_counts.items()
})

In [4]:
# Wrap the unigram probabilities in a Counter so most_common() can rank them,
# then display the ten most probable tokens.
uni_grams_counter = Counter(uni_grams)
uni_grams_counter.most_common(10)

[('.', 0.055021758950689205),
 (',', 0.042047741270415905),
 ('the', 0.033849129031826936),
 ('of', 0.02090707135390124),
 ('to', 0.01977743054365126),
 ('in', 0.015386126221089999),
 ('said', 0.014657438167564549),
 ('and', 0.014552260705293332),
 ('a', 0.013650988639090802),
 ('mln', 0.010481137497159917)]

In [5]:
def generate_text(grams, n, context, length):
    """Sample text from an n-gram model.

    Args:
        grams: For ``n == 1`` a mapping ``token -> probability``. For
            ``n >= 2`` a mapping ``context -> {token -> probability}``, keyed
            by the bare previous token when ``n == 2`` and by a tuple of the
            previous ``n - 1`` tokens otherwise.
        n: Order of the model (1 = unigram, 2 = bigram, 3 = trigram, ...).
        context: Tuple of seed tokens; ``()`` for a unigram model.
        length: Maximum number of tokens to sample.

    Returns:
        The seed tokens followed by the sampled tokens, joined by single
        spaces. ``None`` tokens are rendered as the string 'None'. Generation
        stops early when the current context has no recorded continuation.
    """
    text = list(context)
    if n == 2:
        # Bigram tables are keyed by the bare previous token, not a 1-tuple.
        context = context[0]

    for _ in range(length):
        # Decide by model order, not by the truthiness of ``context``: a
        # sampled ``None`` token is a perfectly valid bigram context and must
        # not be mistaken for "no context".
        if n > 1:
            # .get() avoids inserting empty entries into a defaultdict.
            candidates = grams.get(context)
        else:
            candidates = grams
        if not candidates:
            break

        threshold = random.random()
        cumulative = 0.0
        # Walk the cumulative distribution until it passes the threshold.
        # If rounding keeps the total just below the threshold, ``chosen``
        # ends up as the last candidate instead of emitting nothing.
        for token, probability in candidates.items():
            chosen = token
            cumulative += probability
            if cumulative > threshold:
                break

        text.append(chosen)

        # Slide the context window forward by one token.
        if n == 2:
            context = chosen
        elif n > 2:
            context = tuple(context[1:]) + (chosen,)

    # Replacing None with 'None' so the tokens can be joined as strings.
    return ' '.join('None' if token is None else token for token in text)

In [653]:
# Sample 10 tokens from the unigram model; the empty tuple means no seed context.
generate_text(uni_grams, 1, (), 10)

', IN Allegis banks 348 said April intervention total year'

In [6]:
def calc_probs(grams):
    """Normalise nested n-gram counts into conditional probabilities, in place.

    Args:
        grams: Mapping ``context -> {next_word -> count}``. Every inner count
            is overwritten with ``count / total`` where ``total`` is the sum
            of the counts recorded for that context.

    Returns:
        None; ``grams`` itself is modified.
    """
    for followers in grams.values():
        denominator = float(sum(followers.values()))
        # Only values are reassigned, so iterating over items() is safe.
        for token, count in followers.items():
            followers[token] = count / denominator

In [7]:
# Bigram table: previous token -> {next token -> count}. Sentences are padded
# on both sides so that sentence starts and ends are modelled as well.
bi_grams = defaultdict(lambda: defaultdict(int))
for sent in reuters.sents():
    padded_pairs = bigrams(sent, pad_left=True, pad_right=True)
    for previous, following in padded_pairs:
        bi_grams[previous][following] += 1

# Turn the raw counts into conditional probabilities.
calc_probs(bi_grams)

In [8]:
# Generate up to 10 further tokens with the bigram model, seeded with 'The'.
generate_text(bi_grams, 2, ('The',), 10)

'The company in its pretax securities by protectionist bills or sale'

In [None]:
# Trigram table: (w1, w2) context -> {w3 -> count}. Sentences are padded on
# both sides so that sentence starts and ends are modelled as well.
tri_grams = defaultdict(lambda: defaultdict(int))

for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        tri_grams[(w1, w2)][w3] += 1

# Reuse the shared normaliser rather than duplicating its loop here; this also
# stops the cell from overwriting the notebook-level ``total_count``.
calc_probs(tri_grams)

In [660]:
# Seed the trigram model with the two-token context (None, 'The') and sample
# up to 10 further tokens; None tokens are printed as the string 'None'.
generate_text(tri_grams, 3, (None, 'The'), 10)

'None The present five - year Canadian bonds only about 760 oil'

In [74]:
import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from torchtext.vocab import build_vocab_from_iterator

from tqdm import tqdm

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [78]:
# Hyper-parameters shared by the data pipeline and the LSTM language model.
#   batch_size    - number of windows per DataLoader batch
#   embedding_dim - size of each token embedding passed to the LM
#   hidden_dim    - LSTM hidden size passed to the LM
#                   NOTE(review): 2 is unusually small for a language model;
#                   confirm this is intentional and not a debugging leftover.
#   sequence_len  - tokens per training window (LMDataset's sentence_window)
hyp_params = {
    "batch_size": 32,
    "embedding_dim": 125,
    "hidden_dim": 2,
    "sequence_len": 10
}

In [79]:
class LMDataset(Dataset):
    """Map-style dataset of fixed-length next-token prediction windows.

    The raw text is tokenised once. Item ``i`` covers tokens
    ``[i * seq_len, (i + 1) * seq_len)`` as the source and the same span
    shifted right by one token as the target.

    Args:
        nltk_corpus: Raw corpus text (a single string).
        sentence_window: Number of tokens per window; must be positive.
        train_vocab: Optional pre-built vocabulary to reuse (for example the
            training vocabulary when building a validation set). A new one is
            built from this corpus when omitted.
    """

    def __init__(self, nltk_corpus, sentence_window = 50, train_vocab=None):
        if sentence_window <= 0:
            raise ValueError("sentence_window must be a positive integer")

        self.corpus = nltk.tokenize.wordpunct_tokenize(nltk_corpus)
        self.seq_len = sentence_window
        # Test against None explicitly: a supplied vocabulary must be used
        # even if it happens to evaluate as falsy (e.g. it is empty).
        self.vocab = train_vocab if train_vocab is not None else self._build_vocab()

        # No longer read by this class. Kept so that existing code which
        # resets ``dataset.slider = -1`` between epochs keeps working.
        self.slider = -1

    def __len__(self):
        """Number of windows that have a full ``seq_len``-token target."""
        # The target is shifted by one token, so one extra token beyond the
        # last source window is required.
        return max((len(self.corpus) - 1) // self.seq_len, 0)

    def __getitem__(self, item):
        """Return the ``item``-th window as vocabulary indices.

        Indexing is stateless: the same ``item`` always yields the same
        window, so shuffled or repeated access behaves correctly.

        Returns:
            dict with ``'src'`` and ``'trg'`` lists of token indices.

        Raises:
            IndexError: if ``item`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if item < 0:
            item += size
        if not 0 <= item < size:
            raise IndexError("LMDataset index out of range")

        start = item * self.seq_len
        stop = start + self.seq_len

        return {
            'src': self.vocab.lookup_indices(self.corpus[start:stop]),
            'trg': self.vocab.lookup_indices(self.corpus[start + 1:stop + 1])
        }

    def _build_vocab(self):
        """Build a vocabulary over the corpus tokens; unknowns map to <unk>."""
        vocab = build_vocab_from_iterator([self.corpus], specials=["<unk>","<pad>"])
        vocab.set_default_index(vocab['<unk>'])

        return vocab

In [80]:
def collate_fn(batch, pad_value, device):
    """Pad a list of dataset rows into sequence-first batch tensors.

    Args:
        batch: List of dicts, each with ``"src"`` and ``"trg"`` lists of
            token indices.
        pad_value: Index used to fill positions beyond a sequence's length.
        device: Device the padded tensors are moved to.

    Returns:
        dict with ``"src"`` and ``"trg"`` long tensors of shape
        ``(longest_sequence, batch_size)``.
    """
    # Both sides get an explicit integer dtype: without it an empty list
    # would be inferred as float32, which is not a valid index/target dtype.
    srcs = [torch.tensor(row["src"], dtype=torch.long) for row in batch]
    trgs = [torch.tensor(row["trg"], dtype=torch.long) for row in batch]

    # Pad first, then transfer each batch tensor to the device once instead
    # of moving every row separately.
    padded_srcs = pad_sequence(srcs, padding_value=pad_value).to(device)
    padded_trgs = pad_sequence(trgs, padding_value=pad_value).to(device)
    return {"src": padded_srcs, "trg": padded_trgs}

# Training windows over the raw Reuters text, one item per `sequence_len` tokens.
train_lmds = LMDataset(reuters.raw(), hyp_params["sequence_len"])

# Index of the padding token, used to fill short sequences within a batch.
pad_value = train_lmds.vocab['<pad>']


def _collate_train_batch(batch):
    """Bind the padding index and target device for the DataLoader."""
    return collate_fn(batch, pad_value, device)


train_dt = DataLoader(
    train_lmds,
    batch_size=hyp_params["batch_size"],
    shuffle=True,
    collate_fn=_collate_train_batch,
)

In [81]:
class LM(nn.Module):
    """Embedding -> LSTM -> linear language model.

    Args:
        vocab_size: Number of rows in the embedding lookup table.
        output_size: Number of scores produced per time step.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim):
        super().__init__()

        # Lookup table with one `embedding_dim`-sized vector per vocabulary entry.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.LSTM = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        # Projects every LSTM output step onto `output_size` scores.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Score the next token at every position of ``x``.

        Args:
            x: Token indices, sequence-first: ``[sequence_length, batch_size]``.

        Returns:
            Scores of shape ``[sequence_length, batch_size, output_size]``.
        """
        # [sequence_length, batch_size, embedding_dim]
        embedded = self.embedding(x)

        # Unlike a classification task, every per-step LSTM output is used,
        # so the final (hidden, cell) state pair is discarded.
        # [sequence_length, batch_size, hidden_dim]
        per_step_hidden, _ = self.LSTM(embedded)

        return self.fc(per_step_hidden)

In [None]:
# The vocabulary size is needed for the model and for flattening the logits;
# compute it once instead of on every training step.
vocab_size = len(train_lmds.vocab)

model = LM(vocab_size, vocab_size, hyp_params["embedding_dim"], hyp_params["hidden_dim"]).to(device)
optimizer = optim.Adam(model.parameters())
# Padded target positions carry no signal, so they are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_value)

for epoch in range(10):
    model.train()
    # Rewind the dataset cursor before every pass, so an interrupted or
    # repeated run of this cell always starts from the first window.
    train_lmds.slider = -1
    epoch_loss = 0.0
    print('Epoch: ', epoch)
    for batch in tqdm(train_dt):
        src = batch["src"]  # shape --> e.g. (10, 32) sentence len, batch size
        trg = batch["trg"]  # shape --> e.g. (10, 32) sentence len, batch size

        # Clear the accumulated gradients
        optimizer.zero_grad()

        # shape --> (10, 32, vocab_size) sentence len, batch size, trg vocab
        output = model(src)

        # Loss for this batch: flatten time and batch dimensions so that each
        # position is scored as an independent classification over the vocab.
        loss = criterion(output.reshape(-1, vocab_size), trg.reshape(-1))

        # Calculate the gradients for weights & biases using back-propagation
        loss.backward()

        # Accumulate a plain float so the epoch average prints as a number.
        epoch_loss += loss.item()

        # Rescale the gradients if their total norm exceeds 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the weight values
        optimizer.step()
    print('\tTrain loss: ', epoch_loss / len(train_dt))

# Leave the cursor rewound for any cell that iterates the dataset afterwards.
train_lmds.slider = -1