In [31]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
torch.manual_seed(1)

<torch._C.Generator at 0x107ae5030>

In [147]:
def to_scalar(var):
    """Return the first element of ``var`` as a plain Python number.

    The tensor is flattened first, so the "first element" is the one at
    index 0 in row-major order regardless of the input's shape.
    """
    flattened = var.view(-1)
    as_list = flattened.data.tolist()
    return as_list[0]


def argmax(vec):
    """Return the position of the largest entry along dim 1 as a Python int.

    Only the result for the first row is returned (via ``to_scalar``).
    """
    best_indices = torch.max(vec, 1)[1]
    return to_scalar(best_indices)


def prepare_sequence(seq, to_ix, unk_ix=29):
    """Convert a sequence of tokens into a LongTensor Variable of indices.

    Args:
        seq: iterable of tokens (e.g. the words of one sentence).
        to_ix: mapping from token to integer index.
        unk_ix: index substituted for tokens missing from ``to_ix``.
            The default (29) keeps the behaviour this notebook was run with;
            note that in this notebook's vocabulary 29 is the index of the
            word 'a', so unknown words are embedded as 'a'. Pass a dedicated
            unknown-word index if the vocabulary has one.

    Returns:
        ``autograd.Variable`` wrapping a 1-D ``torch.LongTensor`` with one
        index per token.
    """
    idxs = [to_ix.get(w, unk_ix) for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Numerically stable log(sum(exp(vec))) for a score row of shape (1, N).

    The largest score is subtracted before exponentiating so that large
    scores do not overflow, and added back afterwards.

    Args:
        vec: tensor/Variable of shape (1, N) holding scores in log space.

    Returns:
        The log-sum-exp over all entries of ``vec``.
    """
    # Take the max of the first row directly on the tensor. This yields the
    # same value as looking up vec[0, argmax(vec)] but avoids converting the
    # index to a Python number on every call (this function sits in the
    # innermost loop of the forward algorithm).
    max_score = torch.max(vec, 1)[0][0]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))


In [6]:
class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM tagger with a CRF output layer.

    A bidirectional LSTM turns word embeddings into per-token emission
    scores over the tag set, and a learned transition matrix scores adjacent
    tag pairs. Train by minimising ``neg_log_likelihood``; calling the model
    (``forward``) returns the Viterbi-decoded best tag path.

    Notes:
        * ``START_TAG`` and ``STOP_TAG`` are read as module-level globals and
          must be defined, and be keys of ``tag_to_ix``, before instantiation.
        * The model handles one sentence at a time: the batch dimension is
          fixed to 1 in ``init_hidden`` and ``_get_lstm_features``.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Build the embedding, BiLSTM, projection and transition parameters.

        Args:
            vocab_size: number of rows in the word-embedding table.
            tag_to_ix: mapping from tag string to index; must contain
                START_TAG and STOP_TAG.
            embedding_dim: size of each word embedding.
            hidden_dim: total BiLSTM output size (each direction gets
                ``hidden_dim // 2``).
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        # Reverse lookup (index -> tag); not used by the methods of this class.
        self.ix_to_tag = {v:k for k,v in self.tag_to_ix.items()}

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh pair of random initial LSTM states.

        Each tensor has shape (2, 1, hidden_dim // 2): two directions of a
        single layer, batch size 1.
        """
        return (autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)),
                autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)))

    def _forward_alg(self, feats):
        """Run the CRF forward algorithm over the emission scores.

        Args:
            feats: per-token emission scores; iterated row by row, each row
                indexed by tag id.

        Returns:
            The log-sum-exp of the scores of all tag paths that end in
            STOP_TAG (the log partition function).
        """
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = autograd.Variable(init_alphas)

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward variables at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var))
            forward_var = torch.cat(alphas_t).view(1, -1)
        # Add the score of moving from every tag into STOP_TAG.
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Compute emission scores for one sentence.

        Args:
            sentence: 1-D tensor/Variable of word indices.

        Returns:
            Scores of shape (len(sentence), tagset_size), one row per token.
        """
        # Start every sentence from a fresh random hidden state.
        self.hidden = self.init_hidden()
        # Reshape to (seq_len, batch=1, embedding_dim) for the LSTM.
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # Drop the batch dimension before projecting into tag space.
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Score a given tag sequence under the current parameters.

        Args:
            feats: per-token emission scores (one row per token).
            tags: LongTensor of gold tag indices, one per token.

        Returns:
            Sum of transition and emission scores along the path
            START_TAG -> tags... -> STOP_TAG.
        """
        # Gives the score of a provided tag sequence
        score = autograd.Variable(torch.Tensor([0]))
        # Prepend START_TAG so that tags[i] is the tag preceding token i.
        tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path for the given emission scores.

        Args:
            feats: per-token emission scores (one row per token).

        Returns:
            Tuple ``(path_score, best_path)`` where ``best_path`` is a list
            of tag indices, one per token (the start tag is removed).
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = autograd.Variable(init_vvars)
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id])
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we don't want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Training loss for one sentence.

        Args:
            sentence: 1-D tensor/Variable of word indices.
            tags: LongTensor of gold tag indices, one per token.

        Returns:
            Forward-algorithm score minus the score of the gold path, i.e.
            the negative log-likelihood of ``tags`` given ``sentence``.
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # don't confuse this with _forward_alg above.
        """Predict the best tag sequence for one sentence.

        Returns:
            Tuple ``(score, tag_seq)`` as produced by ``_viterbi_decode``.
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq


In [37]:
def load_data(path):
    """Load a tab-separated tagging file into (tokens, tags) pairs.

    Every line must contain exactly two tab-separated fields: the
    whitespace-separated tokens and the whitespace-separated tags. The first
    and last item of each field are dropped (in this notebook's data they are
    presumably the sentence boundary markers — verify against the file).

    Args:
        path: path of the file to read.

    Returns:
        List of ``(tokens, tags)`` tuples, each element a list of strings.

    Raises:
        ValueError: if a line does not split into exactly two fields.
    """
    # The context manager closes the file even when a malformed line makes
    # parsing raise.
    with open(path) as f:
        rows = [line.split("\t") for line in f]
    return [(tokens.split()[1:-1], tags.split()[1:-1]) for (tokens, tags) in rows]

def create_word_to_ix(training_data):
    """Build a word -> index vocabulary from (sentence, tags) pairs.

    Indices are assigned in order of first appearance, starting at 0.
    """
    vocab = {}
    for tokens, _ in training_data:
        for token in tokens:
            # setdefault only inserts when the token has not been seen yet.
            vocab.setdefault(token, len(vocab))
    return vocab

def create_tag_to_ix(training_data,START_TAG=None,STOP_TAG=None):
    """Build a tag -> index mapping from (sentence, tags) pairs.

    When both START_TAG and STOP_TAG are given (truthy), they are reserved as
    indices 0 and 1; the remaining tags are numbered in order of first
    appearance.
    """
    reserve_markers = START_TAG and STOP_TAG
    tag_to_ix = {START_TAG: 0, STOP_TAG: 1} if reserve_markers else {}
    for _, tag_seq in training_data:
        for label in tag_seq:
            tag_to_ix.setdefault(label, len(tag_to_ix))
    return tag_to_ix

In [129]:
# Load the train/test splits as lists of (tokens, tags) pairs.
# Paths are relative to the notebook's working directory.
training_data = load_data("../data/mit-movie-1/processed/train.bio")
testing_data = load_data("../data/mit-movie-1/processed/test.bio")

In [39]:
# Sentence boundary tags; BiLSTM_CRF reads these two names as globals.
START_TAG = "BOS"
STOP_TAG = "EOS"
# Model sizes. NOTE(review): these are very small for a ~6.7k-word
# vocabulary — presumably carried over from a toy example; TODO confirm.
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Index maps are built from the training split only, so words that appear
# only in the test split are out-of-vocabulary.
tag_to_ix = create_tag_to_ix(training_data,START_TAG,STOP_TAG)
word_to_ix = create_word_to_ix(training_data)

In [41]:
# Vocabulary size and number of tags (the tag count includes BOS/EOS).
len(word_to_ix),len(tag_to_ix)

(6710, 27)

In [42]:
# List every tag in index order (dict insertion order).
for tag in tag_to_ix : print(tag)

BOS
EOS
O
B-ACTOR
I-ACTOR
B-YEAR
B-TITLE
B-GENRE
I-GENRE
B-DIRECTOR
I-DIRECTOR
B-SONG
I-SONG
B-PLOT
I-PLOT
B-REVIEW
B-CHARACTER
I-CHARACTER
B-RATING
B-RATINGS_AVERAGE
I-RATINGS_AVERAGE
I-TITLE
I-RATING
B-TRAILER
I-TRAILER
I-REVIEW
I-YEAR


In [43]:
# Instantiate the tagger and a plain SGD optimizer with L2 weight decay.
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [44]:
# Check predictions before training
precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]])
print(model(precheck_sent))

(Variable containing:
 12.0993
[torch.FloatTensor of size 1]
, [3, 21, 24, 3, 21])


In [None]:
# Convert every training sentence to a tensor of word indices once, so the
# epoch loop does not repeat the vocabulary lookup.
sentences_in = [prepare_sequence(sentence, word_to_ix) for (sentence,tags) in training_data]


# Train with one SGD update per sentence (batch size 1).
for epoch in range(
        10):  # number of full passes over the training set
    print("epoch = ",epoch)
    # NOTE(review): the last training example is skipped by the [:-1] slice —
    # presumably the final row of the file is malformed; TODO confirm.
    for idx ,(sentence, tags) in enumerate(training_data[:-1]):
        if idx%1000 == 0 : print(idx)
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Variables of word indices.
        sentence_in = sentences_in[idx]
        targets = torch.LongTensor([tag_to_ix[t] for t in tags])

        # Step 3. Run our forward pass.
        neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        neg_log_likelihood.backward()
        optimizer.step()

# Check predictions after training
precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
print(model(precheck_sent))
# Compare the printed tag indices with the gold tags of training_data[0].

epoch =  0
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
epoch =  1
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
epoch =  2
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
epoch =  3
0
1000
2000
3000
4000
5000
6000


In [116]:
# Reverse lookup (tag index -> tag string); persist_output reads this global.
ix_to_tag = {v:k for k,v in tag_to_ix.items()}

In [136]:
def persist_output(path,preds,data):
    """Write predictions next to the gold tags, one token per line.

    Each output line has five space-separated columns:
    ``token UNK UNK gold_tag predicted_tag``.

    Args:
        path: output file path (overwritten if it exists).
        preds: sequence of model outputs; element ``[1]`` of each entry is the
            list of predicted tag indices for the sentence at the same
            position in ``data``.
        data: sequence of ``(tokens, gold_tags)`` pairs.

    Note:
        Tag indices are translated with the module-level ``ix_to_tag`` mapping,
        which must be defined before calling. Every row is also printed to
        stdout, as in the original implementation.
    """
    # The context manager guarantees the file is flushed and closed even if a
    # lookup below raises part-way through.
    with open(path, "w") as fout:
        for row, pred in enumerate(preds):
            sentence, actual = data[row][0], data[row][1]
            predicted = [ix_to_tag[ix] for ix in pred[1]]
            print(predicted,actual,sentence)
            for idx in range(len(predicted)):
                fout.write(" ".join([sentence[idx],'UNK','UNK',actual[idx],predicted[idx]]) + "\n")

In [128]:
# Tag the training sentences with the trained model and dump the results.
# The last sentence is left out of the predictions, mirroring the training loop.
path = "train.res"
precheck_sents = [prepare_sequence(sentence, word_to_ix) for sentence, _ in training_data]
preds = [model(sent) for sent in precheck_sents[:-1]]
persist_output(path,preds,training_data)

['i',
 'want',
 'to',
 'find',
 'the',
 'movie',
 'with',
 'scarlett',
 'o',
 'hara',
 'in',
 'it']

In [148]:
# Tag the test sentences with the trained model (the last sentence is left out).
# Words missing from word_to_ix fall back to prepare_sequence's unknown index.
path = "test.res"
precheck_sents = [prepare_sequence(sentence, word_to_ix) for sentence, _ in testing_data]
preds = [model(sent) for sent in precheck_sents[:-1]]


In [149]:
# Write the test predictions to `path`; each row is echoed as
# (predicted tags, gold tags, tokens).
persist_output(path,preds,testing_data)

['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'B-GENRE', 'O', 'B-TITLE', 'I-TITLE'] ['O', 'O', 'O', 'O', 'B-GENRE', 'I-GENRE', 'O', 'B-YEAR', 'I-YEAR'] ['are', 'there', 'any', 'good', 'romantic', 'comedies', 'out', 'right', 'now']
['O', 'O', 'O', 'O', 'O', 'B-GENRE', 'O', 'O'] ['O', 'O', 'O', 'O', 'O', 'B-PLOT', 'I-PLOT', 'I-PLOT'] ['show', 'me', 'a', 'movie', 'about', 'cars', 'that', 'talk']
['O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR'] ['O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR'] ['list', 'the', 'five', 'star', 'rated', 'movies', 'starring', 'mel', 'gibson']
['O', 'B-GENRE', 'I-GENRE', 'O', 'O', 'B-TITLE', 'I-TITLE', 'I-TITLE'] ['O', 'B-GENRE', 'I-GENRE', 'O', 'O', 'O', 'O', 'B-YEAR'] ['what', 'science', 'fiction', 'films', 'have', 'come', 'out', 'recently']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'O'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TITLE', 'I-TITLE

['O', 'O', 'O', 'O', 'O', 'B-DIRECTOR', 'I-DIRECTOR'] ['O', 'O', 'O', 'O', 'O', 'B-DIRECTOR', 'I-DIRECTOR'] ['what', 'movies', 'are', 'directed', 'by', 'garry', 'marshall']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] ['O', 'O', 'O', 'O', 'O', 'B-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE'] ['what', 'is', 'the', 'plot', 'of', 'requiem', 'for', 'a', 'heavyweight']
['O', 'O', 'B-ACTOR', 'I-ACTOR', 'B-GENRE', 'O'] ['O', 'O', 'B-DIRECTOR', 'I-DIRECTOR', 'B-GENRE', 'I-GENRE'] ['name', 'a', 'paul', 'williams', 'musical', 'film']
['O', 'O', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'O', 'O', 'O', 'O'] ['O', 'O', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'O', 'O', 'B-REVIEW', 'O'] ['show', 'me', 'the', 'movie', 'that', 'judy', 'holliday', 'won', 'an', 'oscar', 'for']
['O', 'O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE'] ['O', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'O', 'B-ACTOR', 'I-ACTOR'] ['locate', 'that', 'movie', 'with', 'sandra', '

['O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'O', 'O', 'B-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE'] ['O', 'O', 'O', 'B-RATINGS_AVERAGE', 'O', 'B-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE'] ['find', 'a', 'viewers', 'review', 'for', 'the', 'fast', 'and', 'the', 'furious']
['O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'O', 'O', 'O', 'B-DIRECTOR', 'I-DIRECTOR'] ['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'O', 'O', 'O', 'B-PLOT', 'I-PLOT'] ['show', 'me', 'an', 'lee', 'meriweather', 'film', 'about', 'a', 'torch', 'singer']
['O', 'O', 'O', 'B-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE'] ['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'I-ACTOR', 'I-ACTOR', 'O', 'B-ACTOR', 'I-ACTOR'] ['what', 'film', 'featured', 'mel', 'gibson', 'danny', 'glover', 'and', 'jet', 'li']
['O', 'O', 'O', 'B-GENRE', 'O', 'O', 'O', 'O', 'B-PLOT'] ['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'B-PLOT', 'I-PLOT', 'I-PLOT', 'I-PLOT'] ['which', 'movie', 'features', 'jack', 'blacks', 'voice'

['O', 'O', 'O', 'B-GENRE', 'I-GENRE', 'O', 'O', 'B-GENRE', 'O', 'O', 'O', 'O'] ['O', 'O', 'O', 'B-GENRE', 'I-GENRE', 'O', 'O', 'B-PLOT', 'O', 'O', 'B-RATINGS_AVERAGE', 'O'] ['is', 'there', 'a', 'spaghetti', 'western', 'about', 'a', 'sheriff', 'with', 'an', 'average', 'rating']
['O', 'O', 'O', 'B-GENRE', 'O', 'O', 'O', 'O', 'O', 'B-RATING', 'I-RATING', 'O', 'B-TITLE', 'I-TITLE'] ['O', 'O', 'O', 'B-GENRE', 'O', 'O', 'O', 'O', 'O', 'B-RATING', 'I-RATING', 'O', 'B-ACTOR', 'I-ACTOR'] ['is', 'there', 'a', 'sport', 'movie', 'with', 'a', 'rating', 'of', 'pg', '13', 'starring', 'katherine', 'hepburn']
['O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'B-RATING', 'O', 'B-GENRE', 'I-GENRE', 'O', 'O', 'O', 'O', 'O', 'B-YEAR', 'I-YEAR', 'O', 'B-DIRECTOR', 'I-DIRECTOR'] ['O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'B-RATING', 'O', 'B-GENRE', 'I-GENRE', 'O', 'B-PLOT', 'O', 'O', 'O', 'B-YEAR', 'I-YEAR', 'O', 'B-ACTOR', 'I-ACTOR'] ['is', 'there', 'a', 'very', 'popular', 'pg', 'rate

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TITLE', 'I-TITLE'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TITLE', 'I-TITLE'] ['what', 'is', 'a', 'summary', 'of', 'the', 'plot', 'of', 'the', 'movie', 'wall', 'e']
['O', 'O', 'O', 'O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE'] ['O', 'O', 'O', 'B-PLOT', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE'] ['what', 'is', 'a', 'trickery', 'movie', 'with', 'excellent', 'ratings']
['O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'B-GENRE', 'O', 'O', 'O', 'B-YEAR', 'I-YEAR', 'I-YEAR'] ['O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'B-GENRE', 'O', 'O', 'O', 'B-YEAR', 'I-YEAR', 'I-YEAR'] ['what', 'is', 'a', 'very', 'good', 'western', 'film', 'in', 'the', 'past', 'seven', 'decades']
['O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'B-RATING', 'I-RATING', 'B-GENRE', 'I-GENRE', 'O'] ['O', 'O', 'O', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'B-RATING', 'I-RATING', 'B-GENRE', 'I-GENRE', 'O'] [

In [146]:
# Index of the word 'a'; prepare_sequence uses this same value (29) as the
# fallback index for unknown words.
word_to_ix['a']

29

In [158]:
# NOTE(review): `precheck_sents[2:]` is a list of tensors, not one tensor, so
# the embedding call below would presumably fail; a single sentence
# (`precheck_sents[2]`) may have been intended — TODO confirm.
sent = precheck_sents[2:]
embeds = model.word_embeds(sent).view(len(sent), 1, -1)

In [165]:
# Sort sentences longest-first (presumably in preparation for packing padded
# batches). NOTE(review): this rebinds precheck_sents, so its order no longer
# matches testing_data.
precheck_sents = sorted(precheck_sents,key=lambda x : len(x),reverse=True)

In [None]:
# NOTE(review): unfinished scratch cell — pack_padded_sequence requires at
# least the padded input and the sequence lengths, so this call would raise
# a TypeError if run as written.
nn.utils.rnn.pack_padded_sequence()

In [None]:
# NOTE(review): stray fragment that duplicates the tail of prepare_sequence;
# `idxs` is not defined at this level and `return` outside a function is a
# SyntaxError, so this cell cannot run as written.
tensor = torch.LongTensor(idxs)
return autograd.Variable(tensor)