### Deep Learning for NLP with PyTorch

Tutorial with deep learning in PyTorch.

In [1]:
# Imports
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x11edc3370>

In [9]:
x_1 = torch.randn(4, 3)
x_2 = torch.randn(2, 3)
y_2 = torch.randn(2, 5)
# second arg specifies which axis to concat along
z_2 = torch.cat([x_2, y_2], dim=1)
z_1 = torch.cat([x_1,x_2])
print(z_2)
print(z_1)
print(z_2.view(4,4)) # similar to reshape in numpy 


 0.8673 -0.2732 -0.4608 -0.2871 -1.1619  0.0276  0.5652 -0.0115
-0.0991  0.4728  1.0049  0.6706 -0.4929  1.5050 -2.3264  1.6169
[torch.FloatTensor of size 2x8]


-0.6368  1.0429  0.4903
 1.0318 -0.5989  1.6015
-1.0735 -1.2173  0.6472
-0.0412 -0.1775 -0.5000
 0.8673 -0.2732 -0.4608
-0.0991  0.4728  1.0049
[torch.FloatTensor of size 6x3]


 0.8673 -0.2732 -0.4608 -0.2871
-1.1619  0.0276  0.5652 -0.0115
-0.0991  0.4728  1.0049  0.6706
-0.4929  1.5050 -2.3264  1.6169
[torch.FloatTensor of size 4x4]



### Affine Maps
One of the core workhorses of deep learning is the affine map, which is a function $f(x)$ where

$$f(x)=Ax+b$$

for a matrix $A$ and vectors $x,b$. The parameters to be learned here are $A$ and $b$. Often, $b$ is refered to as the bias term.

In [10]:
lin = nn.Linear(5, 3)  # maps from R^5 to R^3, parameters A, b
# data is 2x5.  A maps from 5 to 3... can we map "data" under A?
data = autograd.Variable(torch.randn(2, 5))
print(lin(data))  # yes

Variable containing:
-0.2007 -0.1253 -0.5372
 0.7078 -1.2168 -0.0176
[torch.FloatTensor of size 2x3]



### Non-Linearities

In [14]:
# In pytorch, most non-linearities are in torch.functional (we have it imported as F)
# Note that non-linearites typically don't have parameters like affine maps do.
# That is, they don't have weights that are updated during training.
torch.manual_seed(1223)
data = autograd.Variable(torch.randn(2, 2))
print(data)
print(F.relu(data))

Variable containing:
-1.2032 -0.3849
 1.4087  0.8899
[torch.FloatTensor of size 2x2]

Variable containing:
 0.0000  0.0000
 1.4087  0.8899
[torch.FloatTensor of size 2x2]



### Softmax and Probabilities
The function $Softmax(x)$ is also just a non-linearity, but it is special in that it usually is the last operation done in a network. This is because it takes in a vector of real numbers and returns a probability distribution. Its definition is as follows. Let $x$ be a vector of real numbers (positive, negative, whatever, there are no constraints). Then the $i$’th component of $Softmax(x)$ is

$$\frac{exp(x_i)}{\sum_jexp(x_j)}$$

It should be clear that the output is a probability distribution: each element is non-negative and the sum over all components is 1.  You could also think of it as just applying an element-wise exponentiation operator to the input to make everything non-negative and then dividing by the normalization constant.

In [21]:
# Softmax is also in torch.nn.functional
data = autograd.Variable(torch.randn(5))
print(data)
print(F.softmax(data,dim=0))
print(F.softmax(data,dim=0).sum())  # Sums to 1 because it is a distribution!
# print(F.log_softmax(data))  # theres also log_softmax

Variable containing:
-1.4995
-3.3450
-0.4310
-0.0604
 0.7900
[torch.FloatTensor of size 5]

Variable containing:
 0.0551
 0.0087
 0.1603
 0.2323
 0.5436
[torch.FloatTensor of size 5]

Variable containing:
 1
[torch.FloatTensor of size 1]



### Simple BOW neural net

Denote this BOW vector as $x$. The output of our network is:

$$log Softmax(Ax+b)$$

That is, we pass the input through an affine map and then do log softmax.

In [39]:
torch.manual_seed(253)
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

test_data = [("Yo creo que si".split(), "SPANISH"),
             ("it is lost on me".split(), "ENGLISH")]

# word_to_ix maps each word in the vocab to a unique integer, which will be its
# index into the Bag of words vector
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2

class bow_nn(nn.Module):
    def __init__(self, vocab_size, num_labs):
        super(bow_nn,self).__init__()
        self.lin = nn.Linear(vocab_size, num_labs)
        
    def forward(self,inpt):
        return F.log_softmax(self.lin(inpt),dim=1)

    
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)


def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])


model = bow_nn(VOCAB_SIZE,NUM_LABELS)

# the model knows its parameters.  The first output below is A, the second is b.
# Whenever you assign a component to a class variable in the __init__ function
# of a module, which was done with the line
# self.linear = nn.Linear(...)
# Then through some Python magic from the Pytorch devs, your module
# in this case, bow_nn() will store knowledge of the nn.Linear's parameters
for param in model.parameters():
    print(param)

# To run the model, pass in a BoW vector, but wrapped in an autograd.Variable
sample = data[0]
bow_vector = make_bow_vector(sample[0], word_to_ix)
log_probs = model(autograd.Variable(bow_vector))
print(log_probs)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}
Parameter containing:

Columns 0 to 9 
-0.1619  0.0140  0.1859 -0.1057 -0.1404 -0.0096  0.1937  0.1890 -0.1916  0.0392
 0.1409 -0.1065  0.0049 -0.0887  0.0053  0.0161  0.1223  0.1224  0.0216  0.0138

Columns 10 to 19 
 0.1858  0.1494 -0.0888  0.0736 -0.1078 -0.0605  0.1188  0.1785 -0.0868  0.0224
-0.0993 -0.1635 -0.0157  0.0312 -0.1743  0.1342 -0.1573  0.1487 -0.1127 -0.1398

Columns 20 to 25 
-0.0849  0.0402 -0.1435  0.0128 -0.1493 -0.1225
-0.0188  0.0035  0.1894 -0.0861 -0.0907 -0.1083
[torch.FloatTensor of size 2x26]

Parameter containing:
 0.0125
-0.1506
[torch.FloatTensor of size 2]

Variable containing:
-0.7064 -0.6800
[torch.FloatTensor of size 1x2]



In [56]:
label_to_ix = {'SPANISH': 0, 'ENGLISH': 1}

# Run on test data before we train, just to see a before-and-after
for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# Print the matrix column corresponding to "creo"
print(next(model.parameters())[:, word_to_ix["creo"]])

loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(100):
    for sent, lab in data:
        val = autograd.Variable(make_bow_vector(sent, word_to_ix))
        target = autograd.Variable(make_target(lab, label_to_ix))
        
        # zero the gradient 
        model.zero_grad()
        
        # Forward pass
        log_probs = model(val)
        
        # Calculate loss
        loss = loss_func(log_probs, target)
        
        # Backward pass
        loss.backward()
        
        # Update params 
        optimizer.step()
        
for sent,lab in test_data:
    val = autograd.Variable(make_bow_vector(sent, word_to_ix))
    target = autograd.Variable(make_target(lab, label_to_ix))
    # Forward pass
    log_probs = model(val)
    _,pred = torch.max(log_probs,dim=1)
    print(log_probs.data.numpy(),'correct?',torch.equal(target,pred))
print()
# Index corresponding to Spanish goes up, English goes down!
print(next(model.parameters())[:, word_to_ix["creo"]])

Variable containing:
-0.0299 -3.5262
[torch.FloatTensor of size 1x2]

Variable containing:
-3.7251 -0.0244
[torch.FloatTensor of size 1x2]

Variable containing:
 0.8014
-0.7149
[torch.FloatTensor of size 2]

[[-0.02909533 -3.5516901 ]] correct? True
[[-3.76237464 -0.02350255]] correct? True

Variable containing:
 0.8075
-0.7209
[torch.FloatTensor of size 2]



In [58]:
# Saving the model
torch.save(model,'data/models/nlptest.pt')

# Loading the model
nm = torch.load('data/models/nlptest.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [59]:
for sent,lab in test_data:
    val = autograd.Variable(make_bow_vector(sent, word_to_ix))
    target = autograd.Variable(make_target(lab, label_to_ix))
    # Forward pass
    log_probs = nm(val)
    _,pred = torch.max(log_probs,dim=1)
    print(log_probs.data.numpy(),'correct?',torch.equal(target,pred))

[[-0.02909533 -3.5516901 ]] correct? True
[[-3.76237464 -0.02350255]] correct? True


## Word Embedings

   Word embeddings are dense vectors of real numbers, one per word in your vocabulary. In NLP, it is almost always the case that your features are words! But how should you represent a word in a computer? You could store its ascii character representation, but that only tells you what the word is, it doesn’t say much about what it means (you might be able to derive its part of speech from its affixes, or properties from its capitalization, but not much). Even more, in what sense could you combine these representations? We often want dense outputs from our neural networks, where the inputs are |V||V| dimensional, where VV is our vocabulary, but often the outputs are only a few dimensional (if we are only predicting a handful of labels, for instance). How do we get from a massive dimensional space to a smaller dimensional space?


   In summary, word embeddings are a representation of the *semantics* of a word, efficiently encoding semantic information that might be relevant to the task at hand. You can embed other things too: part of speech tags, parse trees, anything! The idea of feature embeddings is central to the field.

In [72]:
# Using the Embedding module 
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
hello_embed = embeds(autograd.Variable(lookup_tensor))
print(hello_embed) # changes every time if no random seed

Variable containing:
-2.4978 -0.5175  0.8160  0.8567 -0.9662
[torch.FloatTensor of size 1x5]



In [110]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler,self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size*embedding_dim,128)
        self.linear2 = nn.Linear(128,vocab_size)
        
    def forward(self, out):
        out = self.embed(out).view((1,-1))
        out = self.linear1(out)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.log_softmax(out,dim=1)
        return out
    
model = NGramLanguageModeler(len(vocab),EMBEDDING_DIM,CONTEXT_SIZE)
loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(),lr=0.001)

losses=[]
for epoch in range(15):
    total_loss = torch.Tensor([0])
    for context,target in trigrams:
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in variables)
        context_idxs = [word_to_ix[w] for w in context]
        context_var = autograd.Variable(torch.LongTensor(context_idxs))
        
        # Prepare the target for comparison
        target_var = autograd.Variable(torch.LongTensor([word_to_ix[target]]))
        
        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()
        
        outp = model(context_var)
        loss = loss_func(outp,target_var)
        
        # Backprop and update coefficients
        loss.backward()
        optimizer.step()
        
        total_loss+=loss.data
    losses.append(total_loss)
for ll in losses:
    print(ll.numpy()[0])
        

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]
525.72
523.285
520.87
518.471
516.091
513.726
511.375
509.038
506.712
504.397
502.092
499.797
497.511
495.234
492.962


## Computing Word Embeddings: Continuous Bag-of-Words

The Continuous Bag-of-Words model (CBOW) is frequently used in NLP deep learning. It is a model that tries to predict words given the context of a few words before and a few words after the target word. This is distinct from language modeling, since CBOW is not sequential and does not have to be probabilistic. Typcially, CBOW is used to quickly train word embeddings, and these embeddings are used to initialize the embeddings of some more complicated model. Usually, this is referred to as pretraining embeddings. It almost always helps performance a couple of percent.

In [122]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print('Data:\n',data[:5],'\n')


class CBOW(nn.Module):

    def __init__(self, vocab_size, context_size,embedding_dim):
        super(CBOW,self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(2*context_size*embedding_dim, vocab_size)
        

    def forward(self, inpts):
        x = self.embed(inpts).view((1,-1))
        x = self.linear(x)
        log_probs = F.log_softmax(x,dim=1)
        return log_probs
        

# create your model and train.  here are some functions to help you make
# the data ready for use by your module


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)


make_context_vector(data[0][0], word_to_ix)  # example

# Make essentials
model = CBOW(vocab_size, CONTEXT_SIZE, 15)
loss_func = nn.NLLLoss()
op = optim.Adam(model.parameters(), lr=1e-2)


for epoch in range(20):
    total_loss = torch.Tensor([0])
    for context, target in data:
        # Format inputs
        context_v = make_context_vector(context,word_to_ix)
        
        # Format target
        target_v = make_context_vector([target],word_to_ix)
        
        # Zero gradients 
        model.zero_grad()
        
        # Forward and loss calculation
        log_probs = model(context_v)
        loss = loss_func(log_probs,target_v)
        
        # Backprop and coef update
        loss.backward()
        op.step()
        
        total_loss+=loss.data
    print('Loss at epoch {}: {}'.format(epoch+1,total_loss.numpy()[0]))

# Accuracy
right=0
for context, target in data:
    # Format inputs
    context_v = make_context_vector(context,word_to_ix)

    # Format target
    target_v = make_context_vector([target],word_to_ix)

    # Zero gradients 
    model.zero_grad()

    # Forward and loss calculation
    log_probs = model(context_v)
    _,pred = torch.max(log_probs,dim=1)
    right += torch.equal(pred,target_v)
print('Trained model accuracy: {}%'.format(right/len(data)*100))

Data:
 [(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')] 

Loss at epoch 1: 240.81631469726562
Loss at epoch 2: 78.46018981933594
Loss at epoch 3: 11.565673828125
Loss at epoch 4: 4.080151557922363
Loss at epoch 5: 2.399442195892334
Loss at epoch 6: 1.63668954372406
Loss at epoch 7: 1.2057476043701172
Loss at epoch 8: 0.9326581358909607
Loss at epoch 9: 0.7465099692344666
Loss at epoch 10: 0.6129368543624878
Loss at epoch 11: 0.5133311152458191
Loss at epoch 12: 0.4367898106575012
Loss at epoch 13: 0.37653249502182007
Loss at epoch 14: 0.3281404972076416
Loss at epoch 15: 0.2886223793029785
Loss at epoch 16: 0.2558879554271698
Loss at epoch 17: 0.22843746840953827
Loss at epoch 18: 0.20517009496688843
Loss at epoch 19: 0.18526168167591095
Loss at epoch 20: 0.16808423399925232
Trained model accuracy: 100.0%


## Sequence models (RNN and LSTM)

At this point, we have seen various feed-forward networks. That is, there is no state maintained by the network at all. This might not be the behavior we want. Sequence models are central to NLP: they are models where there is some sort of dependence through time between your inputs. The classical example of a sequence model is the Hidden Markov Model for part-of-speech tagging. Another example is the conditional random field.

A recurrent neural network is a network that maintains some kind of state. For example, its output could be used as part of the next input, so that information can propogate along as the network passes over the sequence. In the case of an LSTM, for each element in the sequence, there is a corresponding hidden state $h_t$, which in principle can contain information from arbitrary points earlier in the sequence. We can use the hidden state to predict words in a language model, part-of-speech tags, and a myriad of other things.

### LSTM’s in Pytorch
Before getting to the example, note a few things. Pytorch’s LSTM expects all of its inputs to be 3D tensors. The semantics of the axes of these tensors is important. The first axis is the sequence itself, the second indexes instances in the mini-batch, and the third indexes elements of the input. We haven’t discussed mini-batching, so lets just ignore that and assume we will always have just 1 dimension on the second axis.

In [128]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [autograd.Variable(torch.randn((1, 3)))
          for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(
    torch.randn((1, 1, 3))))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(inputs.size()) # first is number of inputs, second is dimension of mini-batches, third is size of each input
print(out)
print(hidden)

torch.Size([5, 1, 3])
Variable containing:
(0 ,.,.) = 
  0.0543 -0.4309  0.0259

(1 ,.,.) = 
  0.0692 -0.0798  0.1166

(2 ,.,.) = 
  0.0720 -0.0669  0.2205

(3 ,.,.) = 
  0.0220  0.2183  0.1193

(4 ,.,.) = 
  0.0286  0.1278  0.2187
[torch.FloatTensor of size 5x1x3]

(Variable containing:
(0 ,.,.) = 
  0.0286  0.1278  0.2187
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
  0.0417  0.4001  0.5227
[torch.FloatTensor of size 1x1x3]
)


## Part-of-Speech tagging 

In [164]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)


training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix,char_to_ix = {},{}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
        for c in word:
            if c not in char_to_ix:
                char_to_ix[c] = len(char_to_ix)
print(word_to_ix)
print(char_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space,dim=1)
        return tag_scores
    
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
#         print(tag_scores.size(),targets.size())
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
{'T': 0, 'h': 1, 'e': 2, 'd': 3, 'o': 4, 'g': 5, 'a': 6, 't': 7, 'p': 8, 'l': 9, 'E': 10, 'v': 11, 'r': 12, 'y': 13, 'b': 14, 'k': 15}
Variable containing:
-1.3709 -1.1310 -0.8594
-1.2458 -1.3437 -0.7953
-1.1938 -1.3985 -0.7986
-1.2377 -1.3353 -0.8055
-1.1234 -1.3530 -0.8762
[torch.FloatTensor of size 5x3]

Variable containing:
-0.1648 -1.9717 -4.3662
-3.2699 -0.0471 -4.8310
-3.6398 -3.6653 -0.0532
-0.0528 -3.6088 -3.7148
-4.8709 -0.0086 -7.0126
[torch.FloatTensor of size 5x3]



### Model with additional character embedding



In [168]:
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

class cLSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, char_size, tagset_size):
        super(cLSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.char_embeddings = nn.Embedding(char_size, embedding_dim)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.clstm = nn.LSTM(embedding_dim, hidden_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.chidden = self.init_hidden()
        self.shidden = self.init_hidden()
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence, chars):
        sembeds = self.word_embeddings(sentence)
        cembeds = self.char_embeddings(chars)
        
        clstm_out, self.chidden = self.lstm(cembeds.view(len(chars), 1, -1), self.chidden)
        slstm_in = torch.cat((clstm_out,sembeds.view(len(sentence), 1, -1)))
        slstm_out, self.shidden = self.lstm(slstm_in, self.shidden)

        lin_inpt = slstm_out.view(slstm_out.size(0),-1)
        tag_space = self.hidden2tag(lin_inpt[-len(sentence):])
        tag_scores = F.log_softmax(tag_space,dim=1)
        return tag_scores
    
model = cLSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
cinputs = prepare_sequence([c for w in training_data[0][0] for c in list(w)],char_to_ix)
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs,cinputs)
print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    total_loss = torch.Tensor([0])
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.shidden = model.init_hidden()
        model.chidden = model.init_hidden()
        
        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        chars = [c for w in sentence for c in list(w)]
        sentence_in = prepare_sequence(sentence, word_to_ix)
        char_in = prepare_sequence(chars, char_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in, char_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        total_loss+=loss.data
        
    if epoch %50==49:
        print('Loss at epoch {}: {}'.format(epoch,total_loss.numpy()[0]))
    
# See what the scores are after training
cinputs = prepare_sequence([c for w in training_data[0][0] for c in list(w)],char_to_ix)
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs,cinputs)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)

Variable containing:
-1.2533 -1.0090 -1.0502
-1.2714 -1.0059 -1.0390
-1.1486 -1.1510 -1.0035
-1.0832 -1.1961 -1.0241
-1.1649 -1.1424 -0.9969
[torch.FloatTensor of size 5x3]

Loss at epoch 49: 1.9785442352294922
Loss at epoch 99: 1.4877008199691772
Loss at epoch 149: 0.7288872599601746
Loss at epoch 199: 0.29509446024894714
Loss at epoch 249: 0.14594370126724243
Loss at epoch 299: 0.08934877812862396
Variable containing:
-0.0811 -3.0344 -3.5128
-3.8920 -0.0229 -6.0855
-2.9772 -4.4400 -0.0648
-0.0510 -4.7633 -3.1904
-3.7659 -0.0280 -5.4069
[torch.FloatTensor of size 5x3]



## Making Dynamic Decisions and the Bi-LSTM CRF
### Named entity recognition
#### Dynamic versus Static Deep Learning Toolkits
Pytorch is a dynamic neural network kit. Another example of a dynamic kit is Dynet (I mention this because working with Pytorch and Dynet is similar. If you see an example in Dynet, it will probably help you implement it in Pytorch). The opposite is the static tool kit, which includes Theano, Keras, TensorFlow, etc. The core difference is the following:

In a static toolkit, you define a computation graph once, compile it, and then stream instances to it.
In a dynamic toolkit, you define a computation graph for each instance. It is never compiled and is executed on-the-fly

#### Implementation Notes:

The example below implements the forward algorithm in log space to compute the partition function, and the viterbi algorithm to decode. Backpropagation will compute the gradients automatically for us. We don’t have to do anything by hand.

The implementation is not optimized. If you understand what is going on, you’ll probably quickly see that iterating over the next tag in the forward algorithm could probably be done in one big operation. I wanted to code to be more readable. If you want to make the relevant change, you could probably use this tagger for real tasks.

In [169]:
# Helpers
def to_scalar(var):
    # returns a python float
    return var.view(-1).data.tolist()[0]


def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return to_scalar(idx)


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [170]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)),
                autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = autograd.Variable(init_alphas)

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward variables at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = autograd.Variable(torch.Tensor([0]))
        tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = autograd.Variable(init_vvars)
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id])
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """ CUSTOM LOSS FUNCTION """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [173]:
torch.manual_seed(1293)
# Training 
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Make up some training data
training_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# Check predictions before training
precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]])
print(model(precheck_sent))

# Make sure prepare_sequence from earlier in the LSTM section is loaded
for epoch in range(
        300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.LongTensor([tag_to_ix[t] for t in tags])

        # Step 3. Run our forward pass.
        neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        neg_log_likelihood.backward()
        optimizer.step()

# Check predictions after training
precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
print(model(precheck_sent))
# We got it!

(Variable containing:
 13.7093
[torch.FloatTensor of size 1]
, [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
(Variable containing:
 19.9582
[torch.FloatTensor of size 1]
, [0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])
