<a href="https://colab.research.google.com/github/njainds/Colab_notebooks/blob/master/Train_LM_RNN/lm_train_v2.0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Language model training on the WikiText-2 dataset

In [0]:
# Data preparation
#DataSource: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
#Scripts:
#https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/02-intermediate/language_model
#https://github.com/pytorch/examples/tree/master/word_language_model

In [1]:
import torch
# Report the accelerator this session got. Querying the device name raises
# when no CUDA device is present, so only ask for it when CUDA is available;
# the rest of the notebook can still fall back to the CPU.
if torch.cuda.is_available():
  print("Name of GPU : {}".format(torch.cuda.get_device_name(0)))
else:
  print("Name of GPU : none (running on CPU)")
print("# of GPU : {}".format(torch.cuda.device_count()))


Name of GPU : Tesla T4
# of GPU : 1


In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
#os.mkdir('/content/drive/My Drive/LM_model')
#os.mkdir('/content/drive/My Drive/LM_model/data')
#os.mkdir('/content/drive/My Drive/LM_model/models')
#os.mkdir('/content/drive/My Drive/LM_model/outputs')
!ls "/content/drive/My Drive/LM_model"
os.chdir('/content/drive/My Drive/LM_model')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
data  models  outputs


In [0]:
#Downloading data
oc.chdir('./data')
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
!unzip wikitext-2-v1.zip
!head wikitext-2/wiki.test.tokens

In [0]:
import torch
import os

class Dictionary(object):
  """Two-way mapping between words and integer ids.

  Ids are handed out consecutively from 0 in the order words are first added.
  """
  def __init__(self):
    self.word2idx = {}
    self.idx2word = {}
    self.idx = 0
  def add_word(self,word):
    """Register ``word`` under the next free id; known words are left untouched."""
    if word in self.word2idx:
      return
    new_id = self.idx
    self.word2idx[word] = new_id
    self.idx2word[new_id] = word
    self.idx = new_id + 1
  def __len__(self):
    """Number of distinct words registered so far."""
    return len(self.word2idx)

class Corpus(object):
  """Text corpus that encodes whitespace-separated tokens through a ``Dictionary``."""
  def __init__(self):
    self.dictionary = Dictionary()
  def get_data(self, path, batch_size=20):
    """Encode the file at ``path`` as a 2-D tensor of word ids.

    Each line is split on whitespace and terminated with an ``'<eos>'`` token.
    Words not yet in ``self.dictionary`` are added to it as a side effect.

    Args:
      path: text file to read (decoded as UTF-8).
      batch_size: number of rows in the returned tensor.

    Returns:
      LongTensor of shape ``(batch_size, n_tokens // batch_size)``; tokens that
      do not fill a complete column are dropped from the end.
    """
    token_ids = []
    # Explicit encoding so decoding does not depend on the platform default.
    with open(path, 'r', encoding='utf8') as f:
      for line in f:
        for word in line.split() + ['<eos>']:
          # add_word only assigns an id on first sight, so registering and
          # looking up in the same pass yields the same ids as two passes.
          self.dictionary.add_word(word)
          token_ids.append(self.dictionary.word2idx[word])
    ids = torch.tensor(token_ids, dtype=torch.long)
    num_batches = ids.size(0)//batch_size
    ids = ids[:num_batches*batch_size]
    # Give both dimensions explicitly: view(batch_size, -1) cannot infer the
    # second dimension when there are fewer than batch_size tokens.
    return ids.view(batch_size, num_batches)

In [0]:
import torch.nn as nn
import torch.nn.utils as clip_grad_norm_
import numpy as np

class config(object):
  """Hyperparameters and runtime settings shared by the notebook cells."""
  def __init__(self):
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Model dimensions
    self.embed_size = 128
    self.hidden_size = 1024
    self.device = torch.device(device_name)
    self.num_layers = 1

    # Training / sampling schedule
    self.num_epochs = 5
    self.num_samples = 1000     # number of words to be sampled
    self.batch_size = 20
    self.seq_length = 30
    self.learning_rate = 0.002

In [5]:
%%time
# Load the training split: builds the vocabulary and the batched token-id matrix.
con = config()
corpus = Corpus()
# ids is 2-D with con.batch_size rows; each row is one contiguous stream of word ids.
ids = corpus.get_data('./data/wikitext-2/wiki.train.tokens', con.batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // con.seq_length # number of seq_length-wide windows per row (the step total shown when logging)
# Author's size notes for this file: 2088620 tokens -> ids of shape (20, 104431) -> 3481 windows of 30 tokens per row

CPU times: user 10.1 s, sys: 1.04 s, total: 11.1 s
Wall time: 11.1 s


In [6]:
%%time
#Load validation data
# Encode the held-out split with the *training* dictionary. A separate Corpus()
# would number its words independently, so its ids would not refer to the same
# words as the embedding rows the model learns.
valcorpus = corpus
vids = valcorpus.get_data('./data/wikitext-2/wiki.test.tokens', con.batch_size)
val_vocab_size = len(valcorpus.dictionary)
# get_data appends any word first seen in this file to the shared dictionary,
# so refresh vocab_size to keep every id inside the model's embedding table.
vocab_size = val_vocab_size
#245560 | 20, 12278 | 20,30,409|

CPU times: user 1.19 s, sys: 118 ms, total: 1.31 s
Wall time: 1.32 s


In [0]:
class RNNLM(nn.Module):
  """LSTM language model: embedding -> LSTM (batch_first) -> linear projection to vocabulary logits."""
  def __init__(self,vocab_size, embed_size, hidden_size, num_layers):
    super().__init__()
    self.embed = nn.Embedding(vocab_size,embed_size)
    self.lstm = nn.LSTM(embed_size,hidden_size,num_layers,batch_first=True)
    self.linear = nn.Linear(hidden_size, vocab_size)
  def forward(self,x,h):
    """Score the next word at every position of ``x``.

    Args:
      x: word ids indexed as (batch, seq_len).
      h: ``(h0, c0)`` initial LSTM state.

    Returns:
      ``(logits, (h, c))`` where ``logits`` has one row per (batch, position)
      pair, i.e. shape (batch*seq_len, vocab_size), and ``(h, c)`` is the final
      LSTM state.
    """
    embedded = self.embed(x)
    hidden_seq, (h,c) = self.lstm(embedded,h)
    # Merge the batch and time axes so the projection yields one logit row per token.
    flat_hidden = hidden_seq.reshape(-1, hidden_seq.size(-1))
    logits = self.linear(flat_hidden)
    return logits, (h,c)
  

In [0]:
# Build the LSTM language model on the configured device, with a cross-entropy
# loss over the vocabulary logits and an Adam optimizer over all model parameters.
model = RNNLM(vocab_size, con.embed_size, con.hidden_size, con.num_layers).to(con.device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = con.learning_rate)

In [0]:
#for name, param in model.named_parameters():
#  print(name,param.size())

In [0]:
def detach(states):
  """Return a list with every tensor in ``states`` detached from the autograd graph.

  Used on the LSTM state between mini-batches so that backpropagation stops at
  the start of the current window instead of reaching into earlier ones.
  """
  detached = []
  for tensor in states:
    detached.append(tensor.detach())
  return detached

#Training model
# One pass over `ids` per epoch in windows of con.seq_length tokens, followed by
# a gradient-free pass over the held-out `vids`.
for epoch in range(con.num_epochs):
  # Zero (h, c) LSTM states at the start of every epoch, one pair per stream.
  states = (torch.zeros(con.num_layers,con.batch_size,con.hidden_size).to(con.device),
            torch.zeros(con.num_layers,con.batch_size,con.hidden_size).to(con.device))
  vstates = (torch.zeros(con.num_layers,con.batch_size,con.hidden_size).to(con.device),
            torch.zeros(con.num_layers,con.batch_size,con.hidden_size).to(con.device))
  model.train()

  #Training set
  for i in range(0, ids.size(1)-con.seq_length, con.seq_length):
    #Get mini-batch and inputs; targets are the inputs shifted one token ahead
    inputs = ids[:,i:i+con.seq_length].to(con.device)
    targets = ids[:,(i+1):(i+1)+con.seq_length].to(con.device)

    #Forward pass
    # Carry the state values across windows but cut the graph, so backprop is
    # limited to the current window.
    states = detach(states)
    output,states = model(inputs,states)
    loss = criterion(output,targets.reshape(-1))

    #backward pass and optimize
    model.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(),0.5)
    optimizer.step()
    step = (i+1)//con.seq_length
    if step%100==0:
      print ('Step [{}/{}], Epoch [{}/{}], Train Loss: {:.4f}, Train Perplexity: {:5.2f}'
         .format(step, num_batches, epoch+1, con.num_epochs, loss.item(), np.exp(loss.item())))

  #Validation set
  # NOTE: vids must be encoded with the same Dictionary as ids, otherwise the
  # word ids do not line up with the model's embeddings and this loss is meaningless.
  model.eval()
  val_loss_sum = 0.0
  val_size = 0  # number of validation windows actually evaluated
  with torch.no_grad():  # no graph is built, which saves a lot of GPU memory
    for vi in range(0, vids.size(1)-con.seq_length, con.seq_length):
      #Get mini-batch and inputs from the held-out data (not the training ids)
      vinputs = vids[:,vi:vi+con.seq_length].to(con.device)
      vtargets = vids[:,(vi+1):(vi+1)+con.seq_length].to(con.device)

      #Forward pass
      voutput,vstates = model(vinputs,vstates)
      val_loss_sum += criterion(voutput,vtargets.reshape(-1)).item()
      val_size += 1
  # Average over the windows that were really evaluated; NaN if there were none.
  val_loss = val_loss_sum/val_size if val_size else float('nan')

  print('#################################################################')
  # The train figures reported here are those of the last training window of the epoch.
  print ('Epoch [{}/{}], Train Loss: {:.4f}, Train Perplexity: {:5.2f}, Val Loss: {:.4f}, Val Perplexity: {:5.2f}'
         .format(epoch+1, con.num_epochs, loss.item(), np.exp(loss.item()), val_loss, np.exp(val_loss)))

In [0]:
#Model Testing
# Generate con.num_samples words one at a time, feeding each sampled word back
# in as the next input, and write the text to ./outputs/sample.txt.
model.eval()
with torch.no_grad():
  with open('./outputs/sample.txt','w') as f:
    # Batch of one sequence, starting from a zero LSTM state.
    state = (torch.zeros(con.num_layers,1,con.hidden_size).to(con.device),
            torch.zeros(con.num_layers,1,con.hidden_size).to(con.device))
    # Pick the first word uniformly at random from the vocabulary.
    prob = torch.ones(vocab_size)
    current_word = torch.multinomial(prob,num_samples=1).unsqueeze(1).to(con.device)
    for i in range(con.num_samples):
      output,state = model(current_word,state)
      # Softmax gives the same sampling distribution as exp() of the logits but
      # cannot overflow to inf, which torch.multinomial rejects.
      prob = torch.softmax(output, dim=1)
      word_id = torch.multinomial(prob,num_samples=1).item()
      current_word.fill_(word_id)
      word = corpus.dictionary.idx2word[word_id]
      # End-of-sentence markers become line breaks in the output file.
      word = '\n' if word == '<eos>' else word + ' '
      f.write(word)
      if (i+1) % 100 == 0:
        print('Sampled [{}/{}] words and save to {}'.format(i+1, con.num_samples, '/outputs/sample.txt'))

# Persist the trained weights (written to the current working directory).
torch.save(model.state_dict(), 'model2.ckpt')
    

Sampled [100/1000] words and save to /outputs/sample.txt
Sampled [200/1000] words and save to /outputs/sample.txt
Sampled [300/1000] words and save to /outputs/sample.txt
Sampled [400/1000] words and save to /outputs/sample.txt
Sampled [500/1000] words and save to /outputs/sample.txt
Sampled [600/1000] words and save to /outputs/sample.txt
Sampled [700/1000] words and save to /outputs/sample.txt
Sampled [800/1000] words and save to /outputs/sample.txt
Sampled [900/1000] words and save to /outputs/sample.txt
Sampled [1000/1000] words and save to /outputs/sample.txt


In [0]:
# Next steps:
# - Report loss and perplexity on the validation set after each epoch, and on the test set at the end of training
# - Fine-tune using the learning rate and regularisation
# - Implement AWD-LSTM with DropConnect (dropout on the hidden-to-hidden weights) and variable batch size
# - Tie the weights of the last fully connected layer to the embeddings
# - Evaluate the language model