In [1]:
import subprocess
import argparse
import sys
import gzip
import cPickle

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np

class Classifier(object):
    """Abstract base class: concrete classifiers supply ``train`` and ``inference``."""

    def __init__(self):
        """The base class keeps no state, so there is nothing to initialise."""

    def train(self):
        """Fit the classifier.

        Subclasses must override this method; the base version always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError("Train method not implemented")

    def inference(self):
        """Run prediction.

        Subclasses must override this method; the base version always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError("Inference method not implemented")



def conlleval(p, g, w, filename='tempfile.txt'):
    '''
    Write predictions in the column format read by conlleval.pl and score them.

    INPUT:
    p :: predictions  (one sequence of label strings per sentence)
    g :: groundtruth  (one sequence of label strings per sentence)
    w :: corresponding words (one sequence of word strings per sentence)
    filename :: name of the file where the predictions are written;
    it is overwritten and then handed to get_perf()

    OUTPUT:
    whatever get_perf(filename) returns for the written file
    '''
    # Collect the pieces in a list and join once instead of growing a string
    # with += inside the loops.
    chunks = []
    for sl, sp, sw in zip(g, p, w):
        chunks.append('BOS O O\n')
        # One line per token: word, gold label, predicted label.
        chunks.extend(ww + ' ' + wl + ' ' + wp + '\n'
                      for wl, wp, ww in zip(sl, sp, sw))
        chunks.append('EOS O O\n\n')

    # The context manager closes the file even if the write fails.
    with open(filename, 'w') as f:
        f.write(''.join(chunks))

    return get_perf(filename)

def get_perf(filename):
    ''' run conlleval.pl perl script to obtain precision/recall and F1 score

    The content of `filename` is piped to `perl conlleval.pl` (the script is
    looked up relative to the current working directory) and the first output
    line containing 'accuracy' is parsed.

    Returns a (precision, recall, f1score) tuple of floats.
    Raises ValueError when the script output has no 'accuracy' line.
    '''
    _conlleval = 'conlleval.pl'

    # Read the input up front so the file handle is closed deterministically.
    with open(filename) as handle:
        predictions = handle.read()

    # universal_newlines=True keeps the pipes in text mode, so the str-based
    # parsing below works on both Python 2 and Python 3.
    proc = subprocess.Popen(["perl", _conlleval],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    stdout, _ = proc.communicate(predictions)

    out = None
    for line in stdout.split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break
    if out is None:
        # Previously this surfaced as an unrelated unbound-local error.
        raise ValueError("no 'accuracy' line in conlleval.pl output: %r" % (stdout,))

    # Token positions follow the summary line of the conlleval.pl in use; the
    # last two characters are dropped (presumably a trailing '%;' -- confirm
    # against the script's actual output).
    precision = float(out[6][:-2])
    recall    = float(out[8][:-2])
    f1score   = float(out[10])

    return (precision, recall, f1score)

class MyNNClassifier(Classifier):
    """Placeholder classifier: every method is currently a no-op."""

    def __init__(self):
        """No state is set up (the base initialiser is not invoked)."""

    def train(self):
        """Overrides the base method; does nothing and returns None."""

    def inference(self):
        """Overrides the base method; does nothing and returns None."""

In [2]:
class NeuralNet(nn.Module):  # inheriting from nn.Module!
    """One-hidden-layer feed-forward network: linear -> tanh -> linear -> softmax.

    Args:
        num_input_nodes: size of the last dimension of the input.
        num_hidden_nodes: width of the hidden layer.
        output_dimension: number of output classes.
    """

    def __init__(self, num_input_nodes, num_hidden_nodes, output_dimension):
        super(NeuralNet, self).__init__()
        self.input_linear = nn.Linear(num_input_nodes, num_hidden_nodes)
        self.output_linear = nn.Linear(num_hidden_nodes, output_dimension)

    def forward(self, input_vector):
        """Return class probabilities for `input_vector`.

        The softmax is taken over the last dimension, so every slice along it
        sums to 1 regardless of how many leading (batch) dimensions there are.
        """
        hidden = torch.tanh(self.input_linear(input_vector))
        scores = self.output_linear(hidden)
        # An explicit dim avoids the deprecated implicit choice, which picks
        # dim 0 for 3-D input and would normalise across the batch.
        return F.softmax(scores, dim=-1)

In [6]:
# argparser = argparse.ArgumentParser()
# argparser.add_argument("--data", type=str, default="atis.small.pkl.gz", help="The zipped dataset")

# parsed_args = argparser.parse_args(sys.argv[1:])

filename = "atis.small.pkl.gz"
f = gzip.open(filename,'rb')
train_set, valid_set, test_set, dicts = cPickle.load(f)

# print "train_set ", train_set

train_lex, _, train_y = train_set
valid_lex, _, valid_y = valid_set
test_lex,  _,  test_y  = test_set

# print "train_lex ", train_lex
# print "train_y ", train_y

idx2label = dict((k,v) for v,k in dicts['labels2idx'].iteritems())
idx2word  = dict((k,v) for v,k in dicts['words2idx'].iteritems())

'''
To have a look what the original data look like, commnet them before your submission
'''
print "length train data ", len(train_lex), " ", len(train_y)

length train data  3983   3983


In [244]:
# Derive the sizes here so this cell runs on a fresh kernel; they were
# previously only defined by a later cell (NameError on Restart & Run All).
VOCAB_SIZE = len(idx2word)
NUM_LABELS = len(idx2label)
WORD_EMBEDDING_DIM = 300

# One random (untrained) feature row per word id.
# Alternative one-hot features: torch.eye(VOCAB_SIZE, VOCAB_SIZE)
word_embeddings = torch.rand(VOCAB_SIZE, WORD_EMBEDDING_DIM)
# One-hot row per label id; the extra last row (index NUM_LABELS) is used by
# the code below as the "previous label" of the first word in a sentence.
tag_embeddings = torch.eye(NUM_LABELS + 1, NUM_LABELS + 1)
def create_embedding(train_x, train_y):
    NUM_LABELS = len(idx2label)
    VOCAB_SIZE = len(idx2word)
    word_embedding_list = []
    label_list = []
    
    for sentence, labels in zip(train_x, train_y):
        prev_label = tag_embeddings[NUM_LABELS]
        for word, label in zip(sentence, labels):
            word_embedding = word_embeddings[word]
            input_vector = torch.cat((word_embedding.view(1,-1), prev_label.view(1,-1)), 1)
            word_embedding_list.append(input_vector)
            prev_label = tag_embeddings[label]
            # for mse loss
            label_tensor = torch.LongTensor(NUM_LABELS).zero_().view(1,-1)
            label_tensor[0,label] = 1
            label_tensor = label_tensor.float()
            label_list.append(label_tensor)
            
            # for cross entropy loss since multi target not supported
#             label_tensor = torch.LongTensor([label.item()])
#             label_tensor = label_tensor.long()
#             label_list.append(label_tensor)
    print "word embedding list ", len(word_embedding_list)
    print "label list ", len(label_list)
#     print "label list 0 ", label_list[0]
    return word_embedding_list, label_list

In [251]:
word_embedding_list, label_list = create_embedding(train_lex, train_y)

'''
implement you training loop here
'''
# NUM_LABELS = len(idx2label)
VOCAB_SIZE = len(idx2word)
HIDDEN_NODES = 1000
NUM_LABELS = len(idx2label)
word_embedding_list = torch.stack(word_embedding_list)
word_embedding_list = torch.squeeze(word_embedding_list)
label_list = torch.stack(label_list)
label_list = torch.squeeze(label_list)
NUM_INPUT_NODES = word_embedding_list[0].size()[0]
print "number of input nodes ", NUM_INPUT_NODES
print "word_embeddings ", word_embedding_list.size()
print "label list ", label_list.size()

model = NeuralNet(NUM_INPUT_NODES, HIDDEN_NODES, NUM_LABELS)


loss_function = nn.MSELoss()
# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.1
# #                       , momentum=0.9
#                      )
words = autograd.Variable(word_embedding_list
#                           , requires_grad=True
                         )
label = autograd.Variable(label_list
                          , requires_grad=False
                         )


word embedding list  45388
label list  45388
number of input nodes  428
word_embeddings  torch.Size([45388, 428])
label list  torch.Size([45388, 127])


In [340]:
# Step size for Adam, kept as a named constant so it is easy to find and tune.
LEARNING_RATE = 0.00005
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [449]:
for epoch in range(10000):
    probs = model(words)
    loss = loss_function(probs, label)
    print "loss ", loss.data[0]
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

loss  0.00133857852779
loss  0.00133808888495
loss  0.00133757316507
loss  0.00133706117049
loss  0.001336540794
loss  0.00133599853143
loss  0.00133547571022
loss  0.00133495568298
loss  0.00133446638938
loss  0.00133399281185
loss  0.00133343169
loss  0.00133290269878


KeyboardInterrupt: 

In [450]:
def get_input_vector(word_embeddings, prev_label, word):
    """Build the model input for one word.

    Args:
        word_embeddings: tensor indexable by word id, one feature row per word.
        prev_label: tensor holding the features of the previous label.
        word: id of the current word.

    Returns:
        A Variable of shape (1, -1): the word's row followed by `prev_label`,
        both flattened and concatenated along dim 1.
    """
    # Look the row up once; an unused duplicate lookup was removed.
    word_feature = autograd.Variable(word_embeddings[word])
    prev_label = autograd.Variable(prev_label)
    return torch.cat((word_feature.view(1, -1), prev_label.view(1, -1)), 1)

In [451]:
def greedy_inference(model, sentence, word_embeddings, tag_embeddings, NUM_LABELS, verbose=False):
    """Label a sentence left to right, always taking the highest-scoring label.

    Args:
        model: callable mapping an input row to a (1, number of labels) score row.
        sentence: sequence of word ids.
        word_embeddings: per-word features, passed on to get_input_vector.
        tag_embeddings: indexable by label id; entry NUM_LABELS is used as the
            "previous label" of the first word.
        NUM_LABELS: number of real labels.
        verbose: when True, print the chosen labels and the per-word model
            outputs. These debug dumps used to be unconditional.

    Returns:
        Integer numpy array with one predicted label id per word.
    """
    # Integer dtype: the entries are label ids and are later used as indices/keys.
    output_labels = np.zeros(len(sentence), dtype=int)
    prev_label = tag_embeddings[NUM_LABELS]
    prob_list = []
    for i, word in enumerate(sentence):
        input_vector = get_input_vector(word_embeddings, prev_label, word)
        probs = model(input_vector)
        prob_list.append(probs)
        _, best = torch.max(probs, 1)
        # int() gives a plain Python int whether .data[0] is a number or a 0-d tensor.
        predicted_label = int(best.data[0])
        # The prediction becomes the "previous label" feature of the next word.
        prev_label = tag_embeddings[predicted_label]
        output_labels[i] = predicted_label
    if verbose:
        print(output_labels)
        print(prob_list)
    return output_labels

In [452]:
def viterbi_inference(model, sentence, word_embeddings, tag_embeddings, NUM_LABELS):
    """Find the highest-scoring label sequence for a sentence with Viterbi decoding.

    Args:
        model: callable mapping an input row to a row of NUM_LABELS scores
            (assumed to be probabilities conditioned on the previous label --
            confirm for models other than the softmax network in this notebook).
        sentence: sequence of word ids.
        word_embeddings: per-word features, passed on to get_input_vector.
        tag_embeddings: indexable by label id; entry NUM_LABELS is used as the
            "previous label" of the first word.
        NUM_LABELS: number of real labels.

    Returns:
        Integer numpy array with one label id per word; empty for an empty sentence.
    """
    num_words = len(sentence)
    if num_words == 0:
        # Nothing to decode; the backtrace below would index into an empty array.
        return np.zeros(0, dtype=int)

    # dp[k, i]: best log-score of any label sequence for words 0..i ending in label k.
    # Log-space sums replace probability products, which underflow on long sentences.
    dp = np.zeros((NUM_LABELS, num_words))
    # back_pointers[k, i]: label of word i-1 on that best sequence (-1 = no predecessor).
    back_pointers = np.full((NUM_LABELS, num_words), -1, dtype=int)

    with np.errstate(divide='ignore'):  # log(0) = -inf is a valid "impossible" score
        # The first word is conditioned on the extra start-of-sentence tag row.
        input_vector = get_input_vector(word_embeddings, tag_embeddings[NUM_LABELS], sentence[0])
        probs = model(input_vector).data.numpy().reshape(-1)
        dp[:, 0] = np.log(probs.astype(np.float64))

        for i in range(1, num_words):
            # word_table[k, j]: score of label k at word i given label j at word i-1.
            word_table = np.zeros((NUM_LABELS, NUM_LABELS))
            for j in range(NUM_LABELS):
                input_vector = get_input_vector(word_embeddings, tag_embeddings[j], sentence[i])
                probs = model(input_vector).data.numpy().reshape(-1)
                word_table[:, j] = dp[j, i - 1] + np.log(probs.astype(np.float64))
            dp[:, i] = word_table.max(axis=1)
            # On ties keep the highest previous-label index, as the original scan did.
            back_pointers[:, i] = NUM_LABELS - 1 - np.argmax(word_table[:, ::-1], axis=1)

    # Backtrace: start from the best final label (lowest index on ties).
    output_labels = np.zeros(num_words, dtype=int)
    output_labels[-1] = np.argmax(dp[:, -1])
    for i in range(num_words - 1, 0, -1):
        output_labels[i - 1] = back_pointers[output_labels[i], i]
    return output_labels

In [None]:
# Decode every test sentence with Viterbi and translate the label ids to label names.
predictions_test = []
for x in test_lex:
    decoded_ids = viterbi_inference(model, x, word_embeddings, tag_embeddings, NUM_LABELS)
    predictions_test.append([idx2label[t] for t in decoded_ids])

In [None]:
print "predictions ", predictions_test[0]
groundtruth_test = [ map(lambda t: idx2label[t], y) for y in test_y ]

In [None]:
print "groundtruth ", groundtruth_test[0]
words_test = [ map(lambda t: idx2word[t], w) for w in test_lex ]
test_precision, test_recall, test_f1score = conlleval(predictions_test, groundtruth_test, words_test)

In [None]:
print test_precision, test_recall, test_f1score
# print idx2label