In [1]:
import subprocess
import argparse
import sys
import gzip
import cPickle

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np

class Classifier(object):
    """Abstract base for classifiers: subclasses supply ``train`` and ``inference``."""

    def __init__(self):
        # Nothing to initialise at the base level.
        return None

    def train(self):
        """
        Placeholder for the training routine; subclasses must override it.

        Raises NotImplementedError when called on the base class.
        """
        message = "Train method not implemented"
        raise NotImplementedError(message)

    def inference(self):
        """
        Placeholder for the inference routine; subclasses must override it.

        Raises NotImplementedError when called on the base class.
        """
        message = "Inference method not implemented"
        raise NotImplementedError(message)



def conlleval(p, g, w, filename='tempfile.txt'):
    '''
    Write the predictions to ``filename`` in the layout read by the
    conlleval.pl script and return the scores computed by ``get_perf``.

    INPUT:
    p :: predictions, one sequence of label strings per sentence
    g :: groundtruth, one sequence of label strings per sentence
    w :: corresponding words, one sequence of strings per sentence
    filename :: name of the file where the predictions are written;
    it will be the input of the conlleval.pl script

    OUTPUT:
    the value returned by ``get_perf(filename)``, i.e. the performance
    in terms of precision, recall and f1 score
    '''
    # Collect the lines in a list instead of growing one string with +=.
    lines = []
    for sl, sp, sw in zip(g, p, w):
        lines.append('BOS O O\n')
        # One token per line: word, groundtruth label, predicted label.
        for wl, wp, ww in zip(sl, sp, sw):
            lines.append(ww + ' ' + wl + ' ' + wp + '\n')
        lines.append('EOS O O\n\n')

    # The context manager closes the file even if the write fails.
    with open(filename, 'w') as f:
        f.writelines(lines)

    return get_perf(filename)

def get_perf(filename):
    '''
    Run the conlleval.pl perl script to obtain precision/recall and F1 score.

    filename :: file whose contents are piped to the script's stdin

    Returns a tuple (precision, recall, f1score) of floats parsed from the
    first output line that contains 'accuracy'.

    Raises ValueError if the script prints no such line.
    '''
    _conlleval = 'conlleval.pl'

    # Read the input up front so the file handle is closed deterministically.
    with open(filename) as f:
        data = f.read()

    # universal_newlines=True opens the pipes in text mode, so stdout is a
    # str (not bytes) and the '\n' split below also works on Python 3.
    proc = subprocess.Popen(["perl", _conlleval], stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, universal_newlines=True)
    stdout, _ = proc.communicate(data)

    out = None
    for line in stdout.split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break
    if out is None:
        # Previously this fell through to a NameError on the unbound `out`.
        raise ValueError("no 'accuracy' line found in conlleval.pl output")

    # Whitespace-separated token positions of the three scores; [:-2] drops
    # the two trailing characters of the token (presumably a '%;' suffix --
    # confirm against the conlleval.pl version in use).
    precision = float(out[6][:-2])
    recall    = float(out[8][:-2])
    f1score   = float(out[10])

    return (precision, recall, f1score)

class MyNNClassifier(Classifier):
    """Skeleton subclass of Classifier; every method is currently a no-op stub."""

    def __init__(self):
        # Intentionally empty; the base initialiser is not invoked.
        return None

    def train(self):
        # Stub: overrides the base method and does nothing.
        return None

    def inference(self):
        # Stub: overrides the base method and does nothing.
        return None

In [2]:
class NeuralNet(nn.Module):  # inheriting from nn.Module!
    """
    Feed-forward network: linear layer, tanh, linear layer, softmax.

    num_input_nodes  :: size of the last dimension of the input
    num_hidden_nodes :: width of the hidden layer
    output_dimension :: number of output classes
    """

    def __init__(self, num_input_nodes, num_hidden_nodes, output_dimension):
        super(NeuralNet, self).__init__()
        self.input_linear = nn.Linear(num_input_nodes, num_hidden_nodes)
        self.output_linear = nn.Linear(num_hidden_nodes, output_dimension)

    def forward(self, input_vector):
        """
        Return class probabilities for ``input_vector``.

        The output has the same leading dimensions as the input and
        ``output_dimension`` entries along the last one, which sum to 1.
        """
        # torch.tanh replaces the deprecated F.tanh.
        hidden = torch.tanh(self.input_linear(input_vector))
        scores = self.output_linear(hidden)
        # Normalise over the class dimension explicitly. Calling softmax
        # without `dim` is deprecated; for the 2-D inputs used in this
        # notebook the implicit choice was also the last dimension, so the
        # results are unchanged there.
        return F.softmax(scores, dim=-1)

In [3]:
# argparser = argparse.ArgumentParser()
# argparser.add_argument("--data", type=str, default="atis.small.pkl.gz", help="The zipped dataset")

# parsed_args = argparser.parse_args(sys.argv[1:])

filename = "atis.small.pkl.gz"
f = gzip.open(filename,'rb')
train_set, valid_set, test_set, dicts = cPickle.load(f)

# print "train_set ", train_set

train_lex, _, train_y = train_set
valid_lex, _, valid_y = valid_set
test_lex,  _,  test_y  = test_set

# print "train_lex ", train_lex
# print "train_y ", train_y

idx2label = dict((k,v) for v,k in dicts['labels2idx'].iteritems())
idx2word  = dict((k,v) for v,k in dicts['words2idx'].iteritems())

'''
To have a look what the original data look like, commnet them before your submission
'''
print "length train data ", len(train_lex), " ", len(train_y)

length train data  3983   3983


In [4]:
# Fix the RNG seed so the random word embeddings (and any later torch
# initialisation in the same run) are reproducible after Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

VOCAB_SIZE = len(idx2word)
NUM_LABELS = len(idx2label)
WORD_EMBEDDING_DIM = 300

# One random feature vector per vocabulary entry; the table itself is not
# handed to the optimizer anywhere in this notebook.
word_embeddings = torch.rand(VOCAB_SIZE, WORD_EMBEDDING_DIM)
#     word_embeddings = torch.eye(VOCAB_SIZE, VOCAB_SIZE)
# One-hot tag vectors. The extra row at index NUM_LABELS is used as the
# "previous label" for the first word of a sentence.
tag_embeddings = torch.eye(NUM_LABELS+1, NUM_LABELS+1)
def create_embedding(train_x, train_y):
    NUM_LABELS = len(idx2label)
    VOCAB_SIZE = len(idx2word)
    word_embedding_list = []
    label_list = []
    
    for sentence, labels in zip(train_x, train_y):
        prev_label = tag_embeddings[NUM_LABELS]
        for word, label in zip(sentence, labels):
            word_embedding = word_embeddings[word]
            input_vector = torch.cat((word_embedding.view(1,-1), prev_label.view(1,-1)), 1)
            word_embedding_list.append(input_vector)
            prev_label = tag_embeddings[label]
            # for mse loss
            label_tensor = torch.LongTensor(NUM_LABELS).zero_().view(1,-1)
            label_tensor[0,label] = 1
            label_tensor = label_tensor.float()
            label_list.append(label_tensor)
            
            # for cross entropy loss since multi target not supported
#             label_tensor = torch.LongTensor([label.item()])
#             label_tensor = label_tensor.long()
#             label_list.append(label_tensor)
    print "word embedding list ", len(word_embedding_list)
    print "label list ", len(label_list)
#     print "label list 0 ", label_list[0]
    return word_embedding_list, label_list

In [5]:
word_embedding_list, label_list = create_embedding(train_lex, train_y)

'''
implement you training loop here
'''
# NUM_LABELS = len(idx2label)
VOCAB_SIZE = len(idx2word)
HIDDEN_NODES = 1000
NUM_LABELS = len(idx2label)
word_embedding_list = torch.stack(word_embedding_list)
word_embedding_list = torch.squeeze(word_embedding_list)
label_list = torch.stack(label_list)
label_list = torch.squeeze(label_list)
NUM_INPUT_NODES = word_embedding_list[0].size()[0]
print "number of input nodes ", NUM_INPUT_NODES
print "word_embeddings ", word_embedding_list.size()
print "label list ", label_list.size()

model = NeuralNet(NUM_INPUT_NODES, HIDDEN_NODES, NUM_LABELS)


loss_function = nn.MSELoss()
# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.1
# #                       , momentum=0.9
#                      )
words = autograd.Variable(word_embedding_list
#                           , requires_grad=True
                         )
label = autograd.Variable(label_list
                          , requires_grad=False
                         )


word embedding list  45388
label list  45388
number of input nodes  428
word_embeddings  torch.Size([45388, 428])
label list  torch.Size([45388, 127])


In [72]:
# Adam over all model parameters with a fixed step size.
LEARNING_RATE = 0.00005
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Counter initialised alongside the optimizer; no active code in this
# notebook reads it.
count = 0

In [91]:
for epoch in range(10000):
#     count += 1
    probs = model(words)
    loss = loss_function(probs, label)
    print "loss ", loss.data[0] 
#     + "epoch ", count
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

loss  0.000886002439074
loss  0.000885973044205
loss  0.000885978806764
loss  0.000885977642611
loss  0.000885967048816
loss  0.000885956105776
loss  0.000885950110387
loss  0.000885917805135
loss  0.000885944697075
loss  0.000885924324393
loss  0.00088591966778
loss  0.000885897141416
loss  0.000885898247361
loss  0.00088587513892
loss  0.000885880144779
loss  0.000885872053914
loss  0.000885855697561
loss  0.000885844347067
loss  0.000885824905708
loss  0.000885834742803
loss  0.000885797722731
loss  0.000885804416612
loss  0.000885781948455
loss  0.000885791494511
loss  0.000885787769221
loss  0.000885769084562
loss  0.000885749934241
loss  0.000885756628122
loss  0.000885751622263
loss  0.000885743647814
loss  0.000885732413735
loss  0.000885736430064
loss  0.000885726360139
loss  0.000885704706889
loss  0.000885700283106
loss  0.000885676825419
loss  0.000885666930117
loss  0.00088565639453
loss  0.000885648536496
loss  0.000885648129042
loss  0.000885628105607
loss  0.00088561989

loss  0.000883124535903
loss  0.000883118365891
loss  0.000883103290107
loss  0.000883101427462
loss  0.000883076281752
loss  0.000883093976881
loss  0.000883067783434
loss  0.000883064582013
loss  0.00088304001838
loss  0.000883040542249
loss  0.000883042928763
loss  0.000883048167452
loss  0.000883033324499
loss  0.000883034605067
loss  0.000882992520928
loss  0.00088298326591
loss  0.000882992171682
loss  0.000882970460225
loss  0.000883005734067
loss  0.000882959109731
loss  0.00088296161266
loss  0.000882946769707
loss  0.000882980413735
loss  0.00088299147319
loss  0.000882958702277
loss  0.000882955850102
loss  0.000882914231624
loss  0.000882904743776
loss  0.00088287179824
loss  0.000882855791133
loss  0.000882875930984
loss  0.000882869178895
loss  0.00088284316007
loss  0.00088284839876
loss  0.000882796070073
loss  0.000882796302903
loss  0.000882773019839
loss  0.000882781401742
loss  0.000882773951162
loss  0.000882738037035
loss  0.000882729713339
loss  0.000882756954525

KeyboardInterrupt: 

In [92]:
# Persist the model weights and the word-embedding table used to build the
# model inputs.
MODEL_PARAMETERS_PATH = 'parameters.pt'
WORD_EMBEDDINGS_PATH = 'word_embeddings.pt'
torch.save(model.state_dict(), MODEL_PARAMETERS_PATH)
torch.save(word_embeddings, WORD_EMBEDDINGS_PATH)

In [93]:
def get_input_vector(word_embeddings, prev_label, word):
    """
    Build the model input for one word.

    word_embeddings :: tensor indexed by word index along its first dimension
    prev_label      :: tensor representing the previous label
    word            :: index of the current word

    Returns a tensor with a single row: the word's embedding followed by
    ``prev_label``, both flattened.
    """
    # The row is looked up once (the earlier duplicate lookup was unused).
    word_feature = autograd.Variable(word_embeddings[word])
    prev_label = autograd.Variable(prev_label)
    return torch.cat((word_feature.view(1, -1), prev_label.view(1, -1)), 1)

In [94]:
def greedy_inference(model, sentence, word_embeddings, tag_embeddings, NUM_LABELS):
    """
    Label a sentence left to right, always taking the most probable label.

    model           :: callable mapping an input row to per-label scores
    sentence        :: sequence of word indices
    word_embeddings :: tensor of word vectors, indexed by word index
    tag_embeddings  :: tensor of label vectors; row NUM_LABELS is used as
                       the "previous label" of the first word
    NUM_LABELS      :: number of labels

    Returns an integer numpy array with one label index per word (empty for
    an empty sentence), the same kind of array viterbi_inference returns.
    """
    # Integer dtype: the entries are label indices, not measurements.
    output_labels = np.zeros(len(sentence), dtype=int)
    prev_label = tag_embeddings[NUM_LABELS]
    for i, word in enumerate(sentence):
        input_vector = get_input_vector(word_embeddings, prev_label, word)
        probs = model(input_vector)
        _, best = torch.max(probs, 1)
        # int() gives a plain Python index whether `.data[0]` yields a
        # number or a 0-dim tensor.
        predicted_label = int(best.data[0])
        # The prediction becomes the "previous label" of the next word.
        prev_label = tag_embeddings[predicted_label]
        output_labels[i] = predicted_label
    return output_labels

In [95]:
def viterbi_inference(model, sentence, word_embeddings, tag_embeddings, NUM_LABELS):
    """
    Label a sentence with the highest-probability label sequence (Viterbi).

    model           :: callable mapping an input row to per-label
                       probabilities (read through ``.data.numpy()``)
    sentence        :: sequence of word indices
    word_embeddings :: tensor of word vectors, indexed by word index
    tag_embeddings  :: tensor of label vectors; row NUM_LABELS is used as
                       the "previous label" of the first word
    NUM_LABELS      :: number of labels

    Returns an integer numpy array with one label index per word; an empty
    array for an empty sentence.
    """
    num_words = len(sentence)
    # Builtin int instead of the np.int alias, which newer NumPy removed.
    output_labels = np.zeros(num_words, dtype=int)
    if num_words == 0:
        # Nothing to decode; previously this raised IndexError.
        return output_labels

    # dp[k, i + 1]: probability of the best path that ends in label k at word i.
    dp = np.zeros((NUM_LABELS, num_words + 1))
    # back_pointers[k, i]: previous label on that best path. Column 0 is
    # never read during the backtrace and keeps the -1 sentinel.
    back_pointers = np.full((NUM_LABELS, num_words), -1, dtype=int)

    # First word: conditioned on the start row of the tag embeddings.
    input_vector = get_input_vector(word_embeddings, tag_embeddings[NUM_LABELS], sentence[0])
    dp[:, 1] = model(input_vector).data.numpy().reshape(-1)

    for i in range(1, num_words):
        # word_table[k, j]: best score of reaching label k at word i via
        # previous label j.
        word_table = np.zeros((NUM_LABELS, NUM_LABELS))
        for j in range(NUM_LABELS):
            input_vector = get_input_vector(word_embeddings, tag_embeddings[j], sentence[i])
            probs = model(input_vector).data.numpy().reshape(-1)
            word_table[:, j] = np.multiply(dp[j, i], probs)
        dp[:, i + 1] = word_table.max(1)
        # Arg-max per row, picking the LAST maximal column on ties (the
        # tie-break of the earlier scan): search the reversed rows and map
        # the position back.
        back_pointers[:, i] = NUM_LABELS - 1 - np.argmax(word_table[:, ::-1], axis=1)

    # Best final label (first maximum), then follow the back pointers.
    output_labels[num_words - 1] = np.argmax(dp[:, num_words])
    for i in range(num_words - 1, 0, -1):
        output_labels[i - 1] = back_pointers[output_labels[i], i]
    return output_labels

In [98]:
predictions_valid = [ map(lambda t: idx2label[t],
    viterbi_inference(model, x, word_embeddings, tag_embeddings, NUM_LABELS)) for x in valid_lex]
# print "predictions ", predictions_valid[0]
groundtruth_valid = [ map(lambda t: idx2label[t], y) for y in valid_y ]

# print "groundtruth ", groundtruth_valid[0]
words_valid = [ map(lambda t: idx2word[t], w) for w in valid_lex ]
valid_precision, valid_recall, valid_f1score = conlleval(predictions_valid, groundtruth_valid, words_valid)
print "Validation results ", valid_precision, valid_recall, valid_f1score

Validation results  66.38 66.13 66.25


In [99]:
predictions_test = [ map(lambda t: idx2label[t],
 viterbi_inference(model, x, word_embeddings, tag_embeddings, NUM_LABELS)) for x in test_lex]
# print "predictions ", predictions_test[0]
groundtruth_test = [ map(lambda t: idx2label[t], y) for y in test_y ]

# print "groundtruth ", groundtruth_test[0]
words_test = [ map(lambda t: idx2word[t], w) for w in test_lex ]
test_precision, test_recall, test_f1score = conlleval(predictions_test, groundtruth_test, words_test)
print "Test Results ", test_precision, test_recall, test_f1score

Test Results  65.09 63.24 64.15


In [83]:
# Sanity check: number of rows in the tag table (printed) and in the word
# table (shown as the cell's result).
print len(tag_embeddings)
len(word_embeddings)

128


572