# Let's review last assignment

In [1]:
import torch

In [2]:
trainFolder = '../data/names/train/'
validFolder = '../data/names/valid/'
testFolder = '../data/names/test/'

In [3]:
from utils.data_utils import CharCorpus
corpus = CharCorpus(
    trainFolder+'m.txt', trainFolder+'f.txt',
    validFolder+'m.txt', validFolder+'f.txt',
    testFolder+'m.txt', testFolder+'f.txt', limit=10000)

In [6]:
corpus.train_1[1]

tensor([  8,   9,   2,  10,   6,  11,  12,   7,   0])

In [7]:
numDistinctChars = len(corpus.dictionary)

In [8]:
import torch.nn as nn
import torch.nn.functional as F

class LRClassifier(nn.Module):

    def __init__(self, num_labels, vocab_size):
        super(LRClassifier, self).__init__()
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec))

In [9]:
clf = LRClassifier(2, numDistinctChars)

In [10]:
def createMiniBatch(masculines, feminines, batch_size, batch_num, vecLen):
    data = masculines[batch_num: batch_num + batch_size // 2]
    data.extend(feminines[batch_num: batch_num + batch_size // 2])
    data = list(map(lambda x: indicesToFeatures(x, vecLen), data))
    data = torch.stack(data)
    labels = [1] * (batch_size // 2) + [0] * (batch_size // 2)
    labels = torch.tensor(labels)
    return data, labels


def indicesToFeatures(seq, vecLen):
    out = torch.zeros(vecLen)
    for ind in seq:
        out[ind] += 1
    return out

In [11]:
def trainEpoch(epochNum, model, loss_function, optimizer, cuda=False):
    for batchNum in range(numBatches):
        # Step 1 PyTorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get the datapoints and labels
        dataBatch, labelsBatch = createMiniBatch(
            corpus.train_1, corpus.train_2, batchSize, batchNum, numDistinctChars)
        if cuda:
            dataBatch = dataBatch.cuda()
            labelsBatch = labelsBatch.cuda()
        # Step 3. Run our forward pass.
        logProbs = model(dataBatch)
        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(logProbs, labelsBatch)
        if batchNum % 20 == 0:
            print('\rEpoch: {0}, Batch: {1}, loss:{2:.5f}'.format(epochNum, batchNum, loss), flush=True, end=" ")
        loss.backward()
        optimizer.step()
        
def trainModel(numEpochs, model, cuda=False):
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    for epochNum in range(numEpochs):
        trainEpoch(epochNum, model, loss_function, optimizer, cuda) 

In [12]:
import torch.optim as optim
numEpochs = 10

loss_function = nn.NLLLoss()
optimizer = optim.SGD(clf.parameters(), lr=0.01)
batchSize = 10
numBatches = 300
        
trainModel(numEpochs, clf)

Epoch: 0, Batch: 140, loss:0.48918 

  # This is added back by InteractiveShellApp.init_path()


Epoch: 9, Batch: 280, loss:0.31179    

In [13]:
class MyDNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.firstLayer = nn.Linear(input_dim, 200)
        self.secondLayer = nn.Linear(200, 100)
        self.thirdLayer = nn.Linear(100, 2)
        self.relu = nn.ReLU()
        
    def forward(self, bow_vec):
        out = self.firstLayer(bow_vec)
        out = self.relu(out)
        out = self.secondLayer(out)
        out = self.relu(out)
        out = self.thirdLayer(out)
        return F.log_softmax(out)

In [14]:
dnn = MyDNN(numDistinctChars)
dnn = dnn.cuda()

In [15]:
numEpochs = 10

trainModel(numEpochs, dnn, cuda=True)

Epoch: 0, Batch: 120, loss:0.64391 

  from ipykernel import kernelapp as app


Epoch: 9, Batch: 280, loss:0.18959 

# What are we missing out?

<img src="files/images/rnn_unfold.png">

### From Wikipedia:

A recurrent neural network (RNN) is a class of artificial neural network where connections between nodes form a directed graph along a sequence. This allows it to exhibit temporal dynamic behavior for a time sequence. Unlike feedforward neural networks, RNNs can use their internal state (memory) to process sequences of inputs. This makes them applicable to tasks such as unsegmented, connected handwriting recognition or speech recognition

$X_i$: feature vector of size input_size
$h_i$: hidden state vector of size hidden_size

What happens inside cell A? It depends on the cell type, for "vanilla RNN":

$$h_t = tanh(W_{hh} h_{t-1} + W_{xh} x_t)$$

<img src="images/vanilla_rnn_cell.png">

The "output" of an RNN is either the full sequence of hidden states or the last hidden state

In [24]:
from torch.autograd.variable import Variable


class RNNClassifier(nn.Module):

    def __init__(self, vocab_size, emb_size, hidden_size, output_size, n_layers=1, verbose=False):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.verbose = verbose

        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.rnn = nn.RNN(emb_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        # Note: we run this all at once (over the whole input sequence)

        # input = B x S . size(0) = B
        batch_size = input.size(0)

        # input:  B x S  -- (transpose) --> S x B
        input = input.t()

        # Embedding S x B -> S x B x I (embedding size)
        if self.verbose:
            print("\t input", input.size())
        embedded = self.embedding(input)
        if self.verbose:
            print("\t embedding", embedded.size())

        # Make a hidden
        hidden = self._init_hidden(batch_size)

        output, hidden = self.rnn(embedded, hidden)
        if self.verbose:
            print("\t RNN hidden output", hidden.size())
        hidden = hidden[0, :, :]
        # Use the last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(hidden)
        if self.verbose:
            print("\t fc output", fc_output.size())
        return F.log_softmax(fc_output)

    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return Variable(hidden)

In [48]:
def padToMaxLen(seq):
    if seq.shape[0] > maxLen:
        seq = seq[:maxLen]
    zeros = torch.zeros(maxLen - seq.shape[0]).long()
    return torch.cat((seq, zeros))

def createMiniBatch(masculines, feminines, batch_size, batch_num, vecLen):
    data = masculines[batch_num: batch_num + batch_size // 2]
    data.extend(feminines[batch_num: batch_num + batch_size // 2])
    data = list(map(padToMaxLen, data))
    data = torch.stack(data).long()
    labels = [1] * (batch_size // 2) + [0] * (batch_size // 2)
    labels = torch.tensor(labels)
    return data, labels

In [26]:
torch.backends.cudnn.enabled = True

In [27]:
maxLen = 15

In [28]:
dataBatch, labelsBatch = createMiniBatch(
        corpus.train_1, corpus.train_2, batchSize, 0, numDistinctChars)

In [31]:
clf = RNNClassifier(numDistinctChars, 200, 200, 2)
clf(dataBatch)



tensor([[-0.8419, -0.5637],
        [-0.8417, -0.5638],
        [-0.8406, -0.5646],
        [-0.8423, -0.5634],
        [-0.8424, -0.5633],
        [-0.8443, -0.5619],
        [-0.8426, -0.5632],
        [-0.8426, -0.5632],
        [-0.8426, -0.5632],
        [-0.8415, -0.5640]])

In [32]:
trainModel(numEpochs, clf, cuda=False)

Epoch: 0, Batch: 20, loss:0.69089 



Epoch: 9, Batch: 280, loss:0.01372 

## Let's go back to sentiment analysis, this time using chars as features!

In [34]:
trainFolder = '../data/sentiment/train/'
validFolder = '../data/sentiment/valid/'
testFolder = '../data/sentiment/test/'

In [35]:
corpus = CharCorpus(
    trainFolder+'positive.txt', trainFolder+'negative.txt',
    validFolder+'positive.txt', validFolder+'negative.txt',
    testFolder+'positive.txt', testFolder+'negative.txt', limit=10000)

In [53]:
maxLen = 120
numDistinctChars = len(corpus.dictionary)

In [54]:
clf = RNNClassifier(numDistinctChars, 100, 100, 2)

In [55]:
trainModel(numEpochs, clf, cuda=False)

Epoch: 0, Batch: 0, loss:0.70028 



Epoch: 2, Batch: 100, loss:0.69315 

KeyboardInterrupt: 

<img src="files/images/vanishing_gradient.png">

Vanilla RNNs are not "smart" enough to backpropagate through time for long sequences, common problems are:
    - vanishing gradients (loss does not improve)
    - exploding gradients (loss goes to NaN)

## Sepp Hochreiter; Jürgen Schmidhuber: Long-Short Memory Cells
<img src="images/lstm_rnn_cell.png">

Bonus joke: Look for #schmidubered

<img src="images/schimdubered.jpg">