# DeepNLP 2 - Language Models

In this notebook, several Neural Language Models are implemented and analyzed.

### **To get started, first run the following cell to create a PyDrive client and download data to your own Google Drive.**

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

[?25l[K    1% |▎                               | 10kB 19.8MB/s eta 0:00:01[K    2% |▋                               | 20kB 4.8MB/s eta 0:00:01[K    3% |█                               | 30kB 6.8MB/s eta 0:00:01[K    4% |█▎                              | 40kB 4.3MB/s eta 0:00:01[K    5% |█▋                              | 51kB 5.3MB/s eta 0:00:01[K    6% |██                              | 61kB 6.3MB/s eta 0:00:01[K    7% |██▎                             | 71kB 7.0MB/s eta 0:00:01[K    8% |██▋                             | 81kB 7.9MB/s eta 0:00:01[K    9% |███                             | 92kB 8.7MB/s eta 0:00:01[K    10% |███▎                            | 102kB 7.0MB/s eta 0:00:01[K    11% |███▋                            | 112kB 7.1MB/s eta 0:00:01[K    12% |████                            | 122kB 9.4MB/s eta 0:00:01[K    13% |████▎                           | 133kB 9.4MB/s eta 0:00:01[K    14% |████▋                           | 143kB 16.5MB/s eta 0:00:01

... Now run the below cell to download all of the data.

In [0]:
import torch, pickle, os, sys, random, time
from torch import nn, optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

if not os.path.isdir('./checkpoints'):
  os.mkdir('./checkpoints')   # directory to save checkpoints

# Download id2word
f_wikitext = drive.CreateFile({'id': '1fBS7PyEOeQMuH5Ea1_hnEjU3PmFE7ZZc'})
f_wikitext.GetContentFile('./wikitext.pkl') 
with open('./wikitext.pkl', 'rb') as f_in:
  wikitext = pickle.load(f_in)

wikitext['train'] = torch.LongTensor(wikitext['train']).cuda()
wikitext['dev'] = torch.LongTensor(wikitext['valid']).cuda()
wikitext['test'] = torch.LongTensor(wikitext['test']).cuda()
idx_to_word = wikitext['id2word']

print("Wikitext data downloaded!")
# Demonstrate id2word
print('There are ' + str(len(idx_to_word)) + ' words in vocabulary')
for id in range(8):
  print('Word id ' + str(id) + " stands for '" + str(idx_to_word[id]) + "\'")
print('...')
print((wikitext['train'] > 0).sum())
    
print('Set up finished')


cuda:0
Wikitext data downloaded!
There are 28654 words in vocabulary
Word id 0 stands for '<pad>'
Word id 1 stands for '<unk>'
Word id 2 stands for '<bos>'
Word id 3 stands for '<eos>'
Word id 4 stands for 'the'
Word id 5 stands for ','
Word id 6 stands for '.'
Word id 7 stands for 'of'
...
tensor(1622368, device='cuda:0')
Set up finished


### Vanilla neural language models: 

The input to the model is a *minibatch* of sequences which takes the form of a  $N \times L$ matrix  where $N$ is the batch size and $L$ is the maximum sequence length. For each minibatch, the models produce an $N \times L \times V$ tensor where $V$ is the size of the vocabulary. This tensor stores a prediction of the next word for every position of every sequence in the batch. Note that each batch is padded to dimensionality $L=40$ using the special padding token <*pad>*; similarly, each sequence begins with the <*bos>* token and ends with the <*eos>* token.

To get familiar with the inputs and outputs, let's first take a look at a simple bigram language model. The Language Models are trained on the WikiText2 dataset which as ~2 million tokens. The *perplexity* metric is used to evaluate the models on the dev set.



In [1]:
# A simple model that predicts the next word given just the previous word
class BigramLM(nn.Module):
    def __init__(self, params):
        super(BigramLM, self).__init__()
        self.vocab_size = params['vocab_size']
        self.d_emb = params['d_emb']
        self.embeddings = nn.Embedding(self.vocab_size, self.d_emb)
        self.W = nn.Linear(self.d_emb, self.vocab_size) # output matrix
        

    def forward(self, batch): 
        # each example in a batch is of the form <BOS> w1 w2 ... wn <EOS>
        bsz, seq_len = batch.size()
        embs = self.embeddings(batch)
        #print(embs.size())
        logits = self.W(embs)
        return logits
      
# function to evaluate LM perplexity on some input data
def compute_perplexity(dataset, net, bsz=64):
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    num_examples, seq_len = dataset.size()
    

    batches = [(start, start + bsz) for start in\
               range(0, num_examples, bsz)]
   
    total_unmasked_tokens = 0. # init count of unpadded tokens
    nll = 0.
    for b_idx, (start, end) in enumerate(batches):
      
        batch = dataset[start:end]
        ut = torch.nonzero(batch).size(0)
        preds = net(batch)
        targets = batch[:, 1:].contiguous().view(-1)
        preds = preds[:, :-1, :].contiguous().view(-1, net.vocab_size)
        loss = criterion(preds, targets)
        nll += loss.detach()
        total_unmasked_tokens += ut

    perplexity = torch.exp(nll / total_unmasked_tokens).cpu()
    return perplexity.data
    

# training loop for language models
def train_lm(dataset, params, net):
    
    # computing the loss
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    num_examples, seq_len = dataset.size()  
    batches = [(start, start + params['batch_size']) for start in\
               range(0, num_examples, params['batch_size'])]
    #print(batches)
    for epoch in range(params['epochs']):
        ep_loss = 0.
        start_time = time.time()
        random.shuffle(batches)
        # for each batch, calculate loss and optimize model parameters            
        for b_idx, (start, end) in enumerate(batches):
            
            batch = dataset[start:end]
            preds = net(batch)
    
            preds = preds[:, :-1, :].contiguous().view(-1, net.vocab_size)
            targets = batch[:, 1:].contiguous().view(-1)
            loss = criterion(preds, targets)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            ep_loss += loss

        print('epoch: %d, loss: %0.2f, time: %0.2f sec, dev perplexity: %0.2f' %\
              (epoch, ep_loss, time.time()-start_time, compute_perplexity(wikitext['dev'], net)))

params = {}
params['vocab_size'] = len(idx_to_word)
params['d_emb'] = 50
params['batch_size'] = 128
params['epochs'] = 5
params['learning_rate'] = 0.001

bigramNet = BigramLM(params)
bigramNet.cuda()
train_lm(wikitext['train'], params, bigramNet)


NameError: name 'nn' is not defined

In [0]:
torch.save(bigramNet, 'Bigram.pkl') # save on colab machine
model_file = drive.CreateFile()
model_file.SetContentFile('Bigram.pkl') # copy the file from colab machine to google drive
model_file.Upload()

  "type " + obj.__name__ + ". It won't be checked "


#### Perplexity

Run the following cell to compute the perplexity on the training and validation set.

In [0]:
bigramNet.eval() # evaluation of the network
print('%s perplexity: %0.2f' % ('train', compute_perplexity(wikitext['train'], bigramNet)))
print('%s perplexity: %0.2f' % ('dev', compute_perplexity(wikitext['dev'], bigramNet)))
print('%s perplexity: %0.2f' % ('test', compute_perplexity(wikitext['test'], bigramNet)))

train perplexity: 249.88
dev perplexity: 259.63
test perplexity: 242.17


### Recurrent neural language models: 
In this part, a recurrent neural language model is implemented. Run the following cell to evaluate the RNNLM -

In [0]:
class RNNLM(nn.Module):
    def __init__(self, params):
        super(RNNLM, self).__init__()
        self.vocab_size = params['vocab_size']
        self.d_emb = params['d_emb'] #input_size
        self.embeddings = nn.Embedding(self.vocab_size, self.d_emb)
        self.d_hid = params['d_hid'] #hidden_size
        self.RNNLM = nn.LSTM(self.d_emb, self.d_hid, batch_first = True)
        self.W = nn.Linear(self.d_hid, self.vocab_size) # output matrix
    
    def init_hidden(self,bsz):
        hx = torch.randn(1, bsz, self.d_hid).cuda()
        cx = torch.randn(1, bsz, self.d_hid).cuda()
        return (hx, cx)
              
    def forward(self, batch):
        bsz, seq_len = batch.size()
        embs = self.embeddings(batch)
        (hx, cx) = self.init_hidden(bsz)   
        '''
        print ('...printing embs.size() = ', embs.size())
        print ('...printing hx.size() = ', hx.size())
        print ('...printing cx.size() = ', cx.size())
        '''
        out, (hx, cx) = self.RNNLM(embs, (hx, cx))
           
        logits = self.W(out)
        return logits 
       
params = {}
params['vocab_size'] = len(idx_to_word)
params['d_emb'] = 512
params['d_hid'] = 512
params['batch_size'] = 50
params['epochs'] = 5
params['learning_rate'] = 0.001

RNNnet = RNNLM(params)
RNNnet.cuda()
train_lm(wikitext['train'], params, RNNnet)

epoch: 0, loss: 7927.28, time: 226.63 sec, dev perplexity: 151.22
epoch: 1, loss: 6799.85, time: 226.52 sec, dev perplexity: 127.21
epoch: 2, loss: 6135.33, time: 226.65 sec, dev perplexity: 123.18
epoch: 3, loss: 5587.93, time: 226.62 sec, dev perplexity: 126.27
epoch: 4, loss: 5117.60, time: 226.62 sec, dev perplexity: 135.96


In [0]:
RNNnet.eval() # evaluate the network
print('%s perplexity: %0.2f' % ('train', compute_perplexity(wikitext['train'], RNNnet)))
print('%s perplexity: %0.2f' % ('dev', compute_perplexity(wikitext['dev'], RNNnet)))
print('%s perplexity: %0.2f' % ('test', compute_perplexity(wikitext['test'], RNNnet)))

train perplexity: 24.52
dev perplexity: 135.97
test perplexity: 127.26


In [0]:
# saving
torch.save(RNNnet, 'LSTM.pkl') #save on colab machine

  "type " + obj.__name__ + ". It won't be checked "
