In [1]:
import torch 
import torch.nn as nn

import numpy as np
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
from torch.nn.parameter import Parameter
#from .module import Module
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
# Use the GPU if you have one
if torch.cuda.is_available():
    print("Using GPU: " + str(args.gpu_no))
    device = torch.device("cuda")
    torch.cuda.set_device(args.gpu_no)
else:
    print("WARNING: You are about to run on cpu, and this will likely run out \
      of memory. \n You can try setting batch_size=1 to reduce memory usage")
    device = torch.device("cpu")

 You can try setting batch_size=1 to reduce memory usage


In [3]:
class Softmax:
    def predict(self, x):
        exp_scores = np.exp(x)
        return exp_scores / np.sum(exp_scores)

    def loss(self, x, y):
        probs = self.predict(x)
        return -np.log(probs[y])

    def diff(self, x, y):
        probs = self.predict(x)
        probs[y] -= 1.0
        return probs

In [9]:
torch.manual_seed(1)

# Problem 1

    
class Layer(nn.Module):
  
  def __init__(self, hidden_size):
  
    super(Layer, self).__init__()
    self.hidden_size=hidden_size
    self.fc = nn.Linear (self.hidden_size, self.hidden_size, bias = True)
    self.rec = nn.Linear (self.hidden_size, self.hidden_size, bias = True)
    

class RNN(nn.Module): # Implement GRU.
    
    def __init__(self, emb_size, hidden_size, seq_len, batch_size, vocab_size, num_layers, dp_keep_prob):
        """
        emb_size:     The number of units in the input embeddings
        hidden_size:  The number of hidden units per layer
        seq_len:      The length of the input sequences
        vocab_size:   The number of tokens in the vocabulary (10,000 for Penn TreeBank)
        num_layers:   The depth of the stack (i.e. the number of hidden layers at 
                      each time-step)
        dp_keep_prob: The probability of *not* dropping out units in the 
                      non-recurrent connections.
                      Do not apply dropout on recurrent connections.
        """
        super(RNN, self).__init__()

        self.emb_size = emb_size 
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dp_keep_prob = dp_keep_prob
        
        # Network definition
        rnn_type = 'GRU'
        self.drop = nn.Dropout(dp_keep_prob)
        self.embedding = nn.Embedding (self.vocab_size, self.emb_size)
        self.rnn = getattr(nn, rnn_type)(self.emb_size, self.hidden_size, self.num_layers, dropout=dp_keep_prob)
        self.decoder = nn.Linear(hidden_size, vocab_size)
        
        self.init_weights() 

    def init_weights(self):
        # TODO ========================
        # Initialize the embedding and output weights uniformly in the range [-0.1, 0.1]
        # and output biases to 0 (in place). The embeddings should not use a bias vector.
        # Initialize all other (i.e. recurrent and linear) weights AND biases uniformly 
        # in the range [-k, k] where k is the square root of 1/hidden_size
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
    
    def init_hidden(self, bsz):
        # TODO ========================
        # initialize the hidden states to zero
        weight = next(self.parameters())
        return weight.new_zeros(self.nlayers, bsz, self.nhid)

    def forward(self, inputs, hidden):
        # TODO ========================
        # Compute the forward pass, using nested python for loops.
        # The outer for loop should iterate over timesteps, and the 
        # inner for loop should iterate over hidden layers of the stack. 
        # 
        # Within these for loops, use the parameter tensors and/or nn.modules you 
        # created in __init__ to compute the recurrent updates according to the 
        # equations provided in the .tex of the assignment.
        #
        # Note that those equations are for a single hidden-layer RNN, not a stacked
        # RNN. For a stacked RNN, the hidden states of the l-th layer are used as 
        # inputs to to the {l+1}-st layer (taking the place of the input sequence).

        emb = self.drop(self.embedding(inputs))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
    
    def generate(self, input, hidden, generated_seq_len):
        # TODO ========================
        # Compute the forward pass, as in the self.forward method (above).
        # You'll probably want to copy substantial portions of that code here.
        # 
        # We "seed" the generation by providing the first inputs.
        # Subsequent inputs are generated by sampling from the output distribution, 
        # as described in the tex (Problem 5.3)
        # Unlike for self.forward, you WILL need to apply the softmax activation 
        # function here in order to compute the parameters of the categorical 
        # distributions to be sampled from at each time-step.

        """
        Arguments:
            - input: A mini-batch of input tokens (NOT sequences!)
                            shape: (batch_size)
            - hidden: The initial hidden states for every layer of the stacked RNN.
                            shape: (num_layers, batch_size, hidden_size)
            - generated_seq_len: The length of the sequence to generate.
                           Note that this can be different than the length used 
                           for training (self.seq_len)
        Returns:
            - Sampled sequences of tokens
                        shape: (generated_seq_len, batch_size)
        """
        pass
        #return samples
    

In [10]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
            
    def __len__(self):
        return len(self.word2idx)

In [11]:
class TextProcess(object):
    
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words: 
                    self.dictionary.add_word(word)  
        #Create a 1-D tensor that contains the index of all the words in the file
        rep_tensor = torch.LongTensor(tokens)
        index = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    rep_tensor[index] = self.dictionary.word2idx[word]
                    index += 1
        #Find out how many batches we need   
        num_batches = rep_tensor.shape[0] // batch_size   
        print("# Batches: ", num_batches)
        print("Batch size: ", batch_size)
        #Remove the remainder (Filter out the ones that don't fit)
        rep_tensor = rep_tensor[:num_batches*batch_size]
        # return (batch_size,num_batches)
        print("vocab_size: ", len(self.dictionary.idx2word))
        rep_tensor = rep_tensor.view(batch_size, -1)
        print("Rep_tensor shape: ", rep_tensor.shape)
        return rep_tensor

In [12]:
doc = TextProcess()
train_data = doc.get_data('data/ptb.char.train.txt', 20)
num_steps = 20
epoch_size = 50

# Batches:  250874
Batch size:  20
vocab_size:  50
Rep_tensor shape:  torch.Size([20, 250874])


In [13]:

vocab_size = 50
hidden_dim = 100
np.random.seed(10)
#vocab_size=50
emb_size=256
batch_size=20
hidden_size=200
seq_len=50
num_layers=2
dp_keep_prob=0.35
lr =1e-4
# LEARNING RATE SCHEDULE    
lr_decay_base = 1 / 1.15
m_flat_lr = 14.0 # we will not touch lr for the first m_flat_lr epochs
model = RNN(emb_size=emb_size, hidden_size=hidden_size, 
                seq_len=seq_len, batch_size=batch_size,
                vocab_size=vocab_size, num_layers=num_layers, 
                dp_keep_prob=dp_keep_prob) 

In [14]:
# LOSS FUNCTION
loss_fn = nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(), lr)

In [None]:
print("\n########## Running Main Loop ##########################")
loss_arr = []
num_epochs = 50

#===============================ADDED
num_examples_seen = 0
num_steps=30
losses = []
for epoch in range(num_epochs):
    states=(torch.zeros(num_layers,num_steps,hidden_size))
    #states=(torch.zeros(num_layers,batch_size,hidden_size))
    for i in range (0, train_data.size(1)-num_steps,num_steps):
        time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        inputs = train_data[:, i:(i+num_steps)]
        targets = train_data[:, (i+1):(i+1)+num_steps]
        outputs,_ = model(inputs,states)
        loss = loss_fn(outputs.contiguous().view(-1, model.vocab_size), targets.reshape(-1))  
        #Back propagation
        model.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        optimizer.step()
        
        step=(i+1) // num_steps
        
        if step % 100 == 0:
            print ("Time: {}, Timestep:{} Epoch [{}/{}], loss: {:4f}".format(time,step,epoch+1,num_epochs,loss.item()))
    loss_arr.append(loss.item())
    print("Loss: ", loss_arr)


########## Running Main Loop ##########################
Time: 2019-03-21 13:19:34, Timestep:0 Epoch [1/50], loss: 2.422590
Time: 2019-03-21 13:19:40, Timestep:100 Epoch [1/50], loss: 2.377115
Time: 2019-03-21 13:19:48, Timestep:200 Epoch [1/50], loss: 2.326538
Time: 2019-03-21 13:19:56, Timestep:300 Epoch [1/50], loss: 2.309144
