# Setup

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import sys
import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from google.colab import drive
import random
import math
# Mount Google Drive so the homework folder under /content/gdrive is reachable.
drive.mount('/content/gdrive/')
# Install/upgrade Weights & Biases in the runtime before importing it.
!pip install wandb --upgrade
import wandb
# Authenticate this session so the later wandb.init / wandb.log calls can record metrics.
wandb.login()

In [None]:
# Work from the homework folder on the mounted Drive; the relative '../dataset',
# '../fixtures' and 'experiments' paths used in later cells resolve from here.
cd /content/gdrive/MyDrive/IDL/HW/HW4/P1/handout_5/hw4/

/content/gdrive/MyDrive/IDL/HW/HW4/P1/handout_5/hw4


# Model Tools

In [None]:
def log_softmax(x, axis):
    """Return the log-softmax of ``x`` taken along ``axis``.

    The per-slice maximum is subtracted before exponentiating so that large
    logits do not overflow; the result has the same shape as ``x``.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    # log of the partition function of the shifted logits
    log_normaliser = np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
    return shifted - log_normaliser


def array_to_str(arr, vocab):
    """Look up every index of ``arr`` in ``vocab`` and join the words with single spaces."""
    words = [vocab[token_id] for token_id in arr]
    return " ".join(words)


def test_prediction(out, targ):
    """Mean negative log-likelihood of the target indices ``targ`` under the logits ``out``.

    ``out`` is normalised with ``log_softmax`` along axis 1, then one
    log-probability per row is picked with ``targ`` and the negated mean is returned.
    """
    log_probs = log_softmax(out, 1)
    row_ids = np.arange(log_probs.shape[0])
    target_log_probs = log_probs[row_ids, targ]
    return -np.mean(target_log_probs)

def test_generation(inp, pred, vocab):
    """Format each input row and its generated row as one ``Input | Output`` line.

    Rows of ``inp`` and ``pred`` are converted to words with ``array_to_str``;
    the returned string has one newline-terminated line per row of ``inp``.
    """
    report_lines = []
    for row in range(inp.shape[0]):
        prompt_text = array_to_str(inp[row], vocab)
        generated_text = array_to_str(pred[row], vocab)
        report_lines.append(u"Input | Output #{}: {} | {}\n".format(row, prompt_text, generated_text))
    return u"".join(report_lines)

In [None]:
# load all that we need
# Paths are relative to the working directory selected by the `cd` cell above.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code; only use it on trusted files.
dataset = np.load('../dataset/wiki.train.npy', allow_pickle=True)
devset = np.load('../dataset/wiki.valid.npy', allow_pickle=True)
# Fixtures consumed by LanguageModelTrainer.test(): the prediction archives are
# indexed with 'inp' / 'out', the generation arrays are used as prompts.
fixtures_pred = np.load('../fixtures/prediction.npz')  # dev
fixtures_gen = np.load('../fixtures/generation.npy')  # dev
fixtures_pred_test = np.load('../fixtures/prediction_test.npz')  # test
fixtures_gen_test = np.load('../fixtures/generation_test.npy')  # test
# Index -> word lookup used by array_to_str.
vocab = np.load('../dataset/vocab.npy')

In [None]:
# data loader
class LanguageModelDataLoader(DataLoader):

    def __init__(self,seq_p, variance_seq, dataset, batch_size, shuffle=True, sequence_len = 10):       
        self.data = np.concatenate(dataset, axis=0)
        self.sequence_len = sequence_len
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seq_p = seq_p
        self.variance_seq = variance_seq

    def __iter__(self):
        #We need to change our sequences every epoch
        seq_draw = np.random.binomial(1,self.seq_p,1)
        if seq_draw == 1:
          self.sequence_len = self.sequence_len
          self.sequence_len = abs(math.floor(np.random.normal(loc = self.sequence_len, scale = self.variance_seq )))+1
        else:
          self.sequence_len = self.sequence_len/2
          self.sequence_len =abs(math.floor(np.random.normal(loc = self.sequence_len, scale = self.variance_seq )))+1

        #Breaking the inputs and targets out 
        inputs = [self.data[index*self.sequence_len:self.sequence_len*index + self.sequence_len] for index in range(len(self.data)//self.sequence_len)]
        targets = [self.data[index*self.sequence_len+1:self.sequence_len*index + self.sequence_len+1] for index in range(len(self.data)//self.sequence_len)]
        assert len(inputs)==len(targets), "inputs and targets are not the same length"
        assert len(inputs[0]) == len(targets[0]), "inputs and targets are not the same length"

        #Creating a list of batches
        inputs_batched = [inputs[index*self.batch_size:index*self.batch_size+self.batch_size] for index in range(len(inputs)//self.batch_size)]#Missing out on the end of the data, fix this or use small batch size
        targets_batched = [targets[index*self.batch_size:index*self.batch_size+self.batch_size] for index in range(len(targets)//self.batch_size)]#Missing out on the end of the data, fix this or use small batch size
        
        #Converting the list of batches so that each batch is in the shape of Batch x Sequence length
        for batch_index in range(len(inputs_batched)):
          batch_refined = np.stack(inputs_batched[batch_index])
          inputs_batched[batch_index] = batch_refined
        for batch_index in range(len(targets_batched)):
          batch_refined = np.stack(targets_batched[batch_index])
          targets_batched[batch_index] = batch_refined

        data = list(zip(inputs_batched, targets_batched))
        
        if self.shuffle == True:
          random.shuffle(data)
        
        for inputs, targets in data:
          yield torch.from_numpy(inputs), torch.from_numpy(targets)



In [None]:
class LanguageModel(nn.Module):
    """Embedding -> dropout -> multi-layer LSTM -> linear decoder language model.

    Args:
        e_size: embedding dimension.
        h_l_size: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout applied between LSTM layers (passed to ``nn.LSTM``).
        bidirectional: whether the LSTM is bidirectional.
        sequence_len: unused; kept so existing callers keep working.
        vocab_size: number of tokens in the vocabulary.
        embedding_dropout: dropout probability applied to the embedded input.

    The decoder weight is tied to the embedding matrix only when the LSTM output
    width equals ``e_size``. Otherwise the shapes are incompatible and the
    decoder keeps its own weight.
    """

    def __init__(self, e_size = 400, h_l_size = 1150, num_layers = 3, dropout = .2, bidirectional = False, sequence_len = 10, vocab_size = 33278, embedding_dropout = 0.5):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = e_size)
        self.dropout = nn.Dropout(p = embedding_dropout)

        #LSTM model takes in inputs as N X L X H
        self.LSTM = nn.LSTM(input_size = e_size, hidden_size = h_l_size, num_layers = num_layers, dropout = dropout, bidirectional = bidirectional, batch_first = True)

        # A bidirectional LSTM concatenates both directions, doubling the feature width.
        lstm_out_size = h_l_size * (2 if bidirectional else 1)
        self.linear_layers = nn.Linear(lstm_out_size, vocab_size, bias=True)

        #apply weight tying (only possible when decoder input width == embedding width)
        if lstm_out_size == e_size:
            self.linear_layers.weight = self.embedding.weight

        self.softmax = nn.LogSoftmax(dim = 2)

    def forward(self, x, prediction = False):
        """Run the model on token ids ``x`` of shape (batch, length).

        Returns a (batch, length, vocab) tensor: raw logits when ``prediction``
        is true, log-probabilities otherwise.
        """
        #Embedding transforms data from B X L --> B X L X Input size
        x = self.embedding(x)
        x = self.dropout(x)
        x, _ = self.LSTM(x)
        x = self.linear_layers(x)

        # The prediction fixtures are scored from raw logits, so skip log-softmax there.
        if not prediction:
            x = self.softmax(x)

        return x

In [None]:
class LangaugeModelPrecise(nn.Module):
  """Language model built from three explicit ``nn.LSTMCell`` layers unrolled over time.

  Args:
      linear_layer_h_0: width of the hidden layer in the output MLP.
      e_size: embedding dimension (also the hidden size of the last LSTM cell).
      h_0_size: hidden size of the first two LSTM cells.
      dropout: dropout probability used between LSTM cells and at the MLP input.
      vocab_size: number of tokens in the vocabulary.

  NOTE: ``forward`` returns a time-major tensor (time, batch, vocab), unlike
  ``LanguageModel`` which returns (batch, time, vocab).
  """

  def __init__(self, linear_layer_h_0, e_size = 400, h_0_size = 1150, dropout = .5, vocab_size = 33278):
    super(LangaugeModelPrecise, self).__init__()

    self.vocab_size = vocab_size
    self.e_size = e_size
    self.h_0_size = h_0_size

    # vocab_size (not a hard-coded constant) sizes both the embedding and the output layer.
    self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = e_size)
    self.lstm0 = nn.LSTMCell(input_size=e_size, hidden_size = h_0_size)
    self.lstm1 = nn.LSTMCell(input_size=h_0_size, hidden_size = h_0_size)
    self.lstm2 = nn.LSTMCell(input_size=h_0_size, hidden_size = e_size)

    self.bn_0 = nn.BatchNorm1d(num_features = e_size)
    self.bn_1 = nn.BatchNorm1d(num_features = linear_layer_h_0)
    self.dropout = nn.Dropout(p = dropout)
    self.activation = nn.ReLU()

    self.linear_layers = nn.Sequential(self.bn_0, nn.ReLU(),self.dropout, nn.Linear(e_size, linear_layer_h_0, bias=False),self.bn_1, nn.ReLU(), torch.nn.Dropout(), nn.Linear(linear_layer_h_0, vocab_size))
    self.softmax = nn.LogSoftmax(dim = 2)

  def forward(self, x, prediction = False):
    """Run the model on token ids ``x`` of shape (batch, time).

    Returns a (time, batch, vocab) tensor: raw logits when ``prediction`` is
    true, log-probabilities otherwise.
    """
    x = self.embedding(x)
    #Output = B x time X size ---> time X B X size
    x = torch.permute(x, (1,0,2))
    time_len, batch_size, _ = x.shape

    #init states; new_zeros puts them on the same device/dtype as the activations
    lstm0_states = (x.new_zeros(batch_size, self.h_0_size), x.new_zeros(batch_size, self.h_0_size))
    lstm1_states = (x.new_zeros(batch_size, self.h_0_size), x.new_zeros(batch_size, self.h_0_size))
    lstm2_states = (x.new_zeros(batch_size, self.e_size), x.new_zeros(batch_size, self.e_size))

    #per-step outputs, stacked after the loop
    step_outputs = []

    #Looping through time
    for time_step in range(time_len):

      #Getting through lstm layers
      lstm0_states = self.lstm0(x[time_step], lstm0_states)
      h_0 = self.dropout(self.activation(lstm0_states[0]))

      lstm1_states = self.lstm1(h_0, lstm1_states)
      h_1 = self.dropout(self.activation(lstm1_states[0]))

      lstm2_states = self.lstm2(h_1, lstm2_states)
      h_2 = lstm2_states[0]

      #Linear layers
      step_outputs.append(self.linear_layers(h_2))

    if step_outputs:
      stored_output = torch.stack(step_outputs, dim = 0)
    else:
      # zero-length input: keep the (time, batch, vocab) contract
      stored_output = x.new_empty((0, batch_size, self.vocab_size))

    if not prediction:
      stored_output = self.softmax(stored_output)

    return stored_output



# Training/Testing

In [None]:
# TODO: define other hyperparameters here
# Single source of hyperparameters; the same dict is handed to wandb.init as the
# run config, so purely descriptive entries are kept alongside the values that
# the code actually reads.
config = {
    # --- optimisation ---
    "epochs": 50,
    "lr": 0.1,
    "optimizer": "adam",
    "batch_size": 128,
    "schedular": "ReduceLROnPlateau",
    "weight_decay": 5e-6,
    # --- model description ---
    "LSTM_hidden": 1150,
    "LSTM_layers": 3,
    "dropout": 0.25,
    # --- ReduceLROnPlateau ---
    "patience": 2,
    "factor": 0.2,
    # --- experiment notes (logged only) ---
    "embedding": "wordembedding",
    "search_type": "greedy",
    "random_batch": "yes",
    "dynamic sequence": "yes",
    "locked dropout": "None",
    "Emedding dropout": "None",
    # --- LSTM construction ---
    "num_layers": 3,
    "dropout_lstm": 0.2,
    "bidirectional": False,
    # --- variable-length sequence sampling ---
    "seq_len": 50,
    "seq_p": 0.8,
    "variance_seq": 10,
    # --- layer sizes ---
    "linear_layer_h_0": 600,
    "e_size": 1150,
    "h_0_size": 1150,
}

# TODO: use Adam or ASGD, and increase the embedding size.

In [None]:
# model trainer

class LanguageModelTrainer:
    """Trains a language model and evaluates it on the prediction/generation fixtures.

    The optimizer and LR scheduler are configured from the module-level
    ``config`` dict (keys ``lr``, ``weight_decay``, ``patience``, ``factor``).
    Metrics are logged to the active wandb run.
    """

    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model

            :param model: nn.Module returning (batch, length, vocab) log-probabilities
            :param loader: iterable yielding (inputs, targets) batches
            :param max_epochs: only used when printing progress
            :param run_id: sub-folder of ``experiments`` that ``save`` writes into
        """
        # feel free to add any other parameters here
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)


        # TODO: Define your optimizer and criterion here
        self.optimizer = torch.optim.Adam(model.parameters(),lr =config['lr'] , weight_decay=config['weight_decay'])
        # NLLLoss expects log-probabilities, which the models emit when prediction=False.
        self.criterion = nn.NLLLoss()
        self.schedular = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min', patience=config['patience'], factor=config['factor'], verbose = True)

    def train(self):
        """Run one epoch over ``self.loader`` and record/log the mean batch loss."""
        self.model.train() # set to training mode
        total_loss = 0.0
        num_batches = 0

        for inputs, targets in self.loader:
            total_loss += self.train_batch(inputs, targets)
            num_batches += 1

        # An empty loader yields NaN instead of failing on an undefined counter.
        epoch_loss = total_loss / num_batches if num_batches else float('nan')
        self.epochs += 1
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)
        wandb.log({"Training Average Loss":epoch_loss, "epoch":self.epochs })

    def train_batch(self, inputs, targets):
        """Do one optimisation step on a (batch, length) batch and return the loss as a float."""
        #Inputs are B x Sequence Length

        self.optimizer.zero_grad()
        inputs, targets = inputs.to(self.device), targets.to(self.device)
        output = self.model(inputs)

        #FOR LSTM Output is given in dim B X L X C, must change to B X C X L
        output = output.permute(0,2,1)

        #FOR LSTMCELL time X B X size --> B X C X L
        #output = output.permute(1, 2, 0)

        loss = self.criterion(output, targets.long())
        loss.backward()
        self.optimizer.step()
        # Return a plain float: returning the tensor would keep every batch's
        # autograd graph alive while the epoch total is accumulated.
        return loss.item()

    def test(self):
        """Evaluate on the dev/test fixtures, step the LR scheduler and return the dev NLL."""
        self.model.eval() # set to eval mode
        # Evaluation needs no gradients; no_grad avoids building autograd graphs.
        with torch.no_grad():
            predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
            generated_logits = TestLanguageModel.generation(fixtures_gen, 10, self.model) # generated predictions for 10 words
            generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 10, self.model)
            # generate predictions for test data
            predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions

        self.predictions.append(predictions)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)

        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)

        self.predictions_test.append(predictions_test)

        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs, self.max_epochs, nll))
        # ReduceLROnPlateau is driven by the validation NLL.
        self.schedular.step(nll)

        wandb.log({"Testing Average Loss":nll})
        return nll

    def save(self):
        """Write the model weights and the latest predictions/generations under experiments/<run_id>/."""
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [None]:
class TestLanguageModel:
    """Stateless helpers that run a trained model on NumPy token arrays.

    Both entry points expect ``model(tokens, prediction=...)`` to return a
    (batch, length, vocab) tensor, as ``LanguageModel`` does.
    """

    @staticmethod
    def _to_model_input(inp, model):
        """Convert a NumPy index array to a long tensor on the model's own device."""
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
        return torch.from_numpy(np.asarray(inp)).long().to(device)

    @staticmethod
    def prediction(inp, model):
        """
            Predict the word following each input sequence.

            :param inp: np.ndarray of token ids, shape (batch size, length)
            :param model: the language model to query
            :return: a np.ndarray of logits, shape (batch size, vocab size)
        """
        tokens = TestLanguageModel._to_model_input(inp, model)
        with torch.no_grad():
            predictions = model(tokens, prediction = True)
        # Predictions is in the form of B X L X C; keep the last time step only
        predicted_word = predictions[:, -1, :]
        return predicted_word.cpu().detach().numpy()

    @staticmethod
    def generation(inp, forward, model):
        """
            Generate a sequence of words given a starting sequence (greedy decoding).

            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :param model: the language model to query
            :return: generated words (batch size, forward)
        """
        #inp shape is B x Seq_len
        tokens = TestLanguageModel._to_model_input(inp, model)
        generated_words = []

        with torch.no_grad():
            for _ in range(forward):
                #B X L X C is our output dimensions
                output = model(tokens)
                next_words = torch.argmax(output[:, -1, :], dim = 1, keepdim = True)
                generated_words.append(next_words)
                # The model interface carries no hidden state between calls, so the
                # new word is appended and the whole sequence is re-run. Feeding
                # only the last word would discard all preceding context.
                tokens = torch.cat([tokens, next_words], dim = 1)

        if not generated_words:
            return np.empty((tokens.shape[0], 0), dtype=np.int64)
        return torch.cat(generated_words, dim = 1).cpu().detach().numpy()
        

In [None]:
# One output folder per run, named after the launch time in whole seconds.
run_id = str(int(time.time()))
# makedirs creates the 'experiments' parent as needed and, with exist_ok, does not
# fail if the cell is re-run within the same second or the folder already exists.
os.makedirs('./experiments/%s' % run_id, exist_ok=True)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1637808207


In [None]:
# Build the LSTM language model from the shared hyperparameter dict.
model = LanguageModel(
    e_size=config['e_size'],
    h_l_size=config['LSTM_hidden'],
    num_layers=config['num_layers'],
    dropout=config['dropout_lstm'],
    bidirectional=config['bidirectional'],
)
# Alternative LSTMCell-based model (the trainer's output permute must be switched too):
#model = LangaugeModelPrecise(linear_layer_h_0=config["linear_layer_h_0"], e_size = config['e_size'], h_0_size=config['h_0_size'],dropout=config['dropout'])

# Training batches with a per-epoch randomly sampled sequence length.
loader = LanguageModelDataLoader(
    seq_p=config["seq_p"],
    variance_seq=config['variance_seq'],
    dataset=dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    sequence_len=config["seq_len"],
)

trainer = LanguageModelTrainer(
    model=model,
    loader=loader,
    max_epochs=config['epochs'],
    run_id=run_id,
)

In [None]:
# Train for the configured number of epochs, checkpointing whenever the
# validation NLL improves on the best value seen so far.
best_nll = 1e30
wandb.init(project="Language Model", config=config)
for epoch in range(config['epochs']):
    trainer.train()
    nll = trainer.test()
    improved = nll < best_nll
    if not improved:
        continue
    best_nll = nll
    message = "Saving model, predictions and generated output for epoch "+str(epoch)+" with NLL: "+ str(best_nll)
    print(message)
    trainer.save()
wandb.finish()

In [None]:
# plot training curves
epoch_axis = range(1, trainer.epochs + 1)
# Losses may have been recorded as (possibly CUDA, grad-tracking) tensors, which
# matplotlib cannot convert; float() works for tensors, NumPy scalars and floats.
train_curve = [float(loss) for loss in trainer.train_losses]
val_curve = [float(loss) for loss in trainer.val_losses]
plt.figure()
plt.plot(epoch_axis, train_curve, label='Training losses')
plt.plot(epoch_axis, val_curve, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# Show the text generated after the most recent evaluation pass.
latest_generated = trainer.generated[-1]
print(latest_generated)