In [1]:
import pyvw
import random
import pandas as pd
import cPickle

pd.options.display.max_colwidth = 250

# Understand ATIS dataset structure

In [2]:
with open("/tweetsdb/ner/datasets/atis.pkl") as f:
    train, test, dicts = cPickle.load(f)

In [3]:
idx2words = {v: k for k, v in dicts['words2idx'].iteritems()}
idx2labels = {v: k for k, v in dicts['labels2idx'].iteritems()}
idx2tables = {v: k for k, v in dicts['tables2idx'].iteritems()}

In [28]:
print train[0][0] #contains sequence of integers, each of which maps to a word
print ' '.join(map(lambda x: idx2words[x], train[0][0])) #map word indices to words in dictionary
print ''
print train[2][0] #contains sequence of integers, each of which maps to a label
print ' '.join(map(lambda x: idx2labels[x], train[2][0])) #map label indices to label names

[232 542 502 196 208  77  62  10  35  40  58 234 137  62  11 234 481 321]
i want to fly from boston at DIGITDIGITDIGIT am and arrive in denver at DIGITDIGITDIGITDIGIT in the morning

[126 126 126 126 126  48 126  35  99 126 126 126  78 126  14 126 126  12]
O O O O O B-fromloc.city_name O B-depart_time.time I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O B-arrive_time.period_of_day


## Generate ATIS dataset

In [21]:
#Convert the dataset into a format compatible with Vowpal Wabbit
training_set = []
for i in xrange(len(train[0])):
    zip_label_ent_idx = zip(train[2][i], train[0][i])
    label_ent_actual = [(int(i[0]), idx2words[i[1]]) for i in zip_label_ent_idx]
    training_set.append(label_ent_actual)
random.shuffle(training_set)

In [30]:
training_set[:3] #training_set is a list of lists, where each sublist contains (label, word) tuples

[[(126, 'show'),
  (126, 'me'),
  (126, 'all'),
  (126, 'flights'),
  (126, 'from'),
  (48, 'charlotte')],
 [(126, 'show'),
  (126, 'all'),
  (44, 'nonstop'),
  (126, 'flights'),
  (126, 'from'),
  (48, 'boston'),
  (126, 'to'),
  (78, 'atlanta')],
 [(126, 'in'),
  (126, 'flight'),
  (51, 'meal'),
  (48, 'oakland'),
  (126, 'to'),
  (78, 'philadelphia'),
  (26, 'saturday')]]

In [23]:
# Similarly for the test set
test_set = []
for i in xrange(len(test[0])):
    zip_label_ent_idx = zip(test[2][i], test[0][i])
    label_ent_actual = [(int(i[0]), idx2words[i[1]]) for i in zip_label_ent_idx]
    test_set.append(label_ent_actual)

In [26]:
test_set[0]

[(126, 'i'),
 (126, 'would'),
 (126, 'like'),
 (126, 'to'),
 (126, 'find'),
 (126, 'a'),
 (126, 'flight'),
 (126, 'from'),
 (48, 'charlotte'),
 (126, 'to'),
 (78, 'las'),
 (123, 'vegas'),
 (126, 'that'),
 (126, 'makes'),
 (126, 'a'),
 (126, 'stop'),
 (126, 'in'),
 (71, 'st.'),
 (119, 'louis')]

# Define Vowpal Wabbit class

In [9]:
class SequenceLabeler(pyvw.SearchTask):
    def __init__(self, vw, sch, num_actions):
        # you must must must initialize the parent class
        # this will automatically store self.sch <- sch, self.vw <- vw
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)
        
        # set whatever options you want
        sch.set_options( sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES )

    def _run(self, sentence):   # it's called _run to remind you that you shouldn't call it directly!
        output = []
        for n in range(len(sentence)):
            pos,word = sentence[n]
            # use "with...as..." to guarantee that the example is finished properly
            with self.vw.example({'w': [word]}) as ex:
                pred = self.sch.predict(examples=ex, my_tag=n+1, oracle=pos, condition=[(n,'p'), (n-1, 'q')])
                output.append(pred)
        return output

In [10]:
vw = pyvw.vw("--search 3 --search_task hook --ring_size 1024")

## Training

In [11]:
sequenceLabeler = vw.init_search_task(SequenceLabeler)
for i in xrange(3):
    sequenceLabeler.learn(training_set[:10])

## Test

When Vowpal Wabbit makes a prediction on test set sentences, it only outputs labels 1 & 2. The actual labels go from 0 to 126.

In [18]:
pred = []
for i in random.sample(xrange(len(test_set)), 10):
    test_example = [ (999, word[1]) for word in test_set[i] ]
    test_labels  = [ label[0] for label in test_set[i] ]
    print 'input sentence:', ' '.join([word[1] for word in test_set[i]])
    print 'actual labels:', ' '.join([str(label) for label in test_labels])
    print 'predicted labels:', ' '.join([str(pred) for pred in sequenceLabeler.predict(test_example)])
    print ''

input sentence: what is the lowest fare from washington dc to montreal
actual labels: 126 126 126 21 126 126 48 49 126 78
predicted labels: 1 1 1 1 1 1 2 1 1 2

input sentence: flight numbers on american airlines from phoenix to milwaukee
actual labels: 126 126 126 2 83 126 48 126 78
predicted labels: 1 1 1 2 1 1 2 1 1

input sentence: list airports in la
actual labels: 126 126 126 17
predicted labels: 1 1 1 2

input sentence: i would like to see flights from cincinnati to salt lake city
actual labels: 126 126 126 126 126 126 126 48 126 78 123 123
predicted labels: 1 1 1 1 2 1 1 2 1 1 2 1

input sentence: get last flight from oakland to salt lake city on wednesday or first flight from oakland to salt lake city on thursday
actual labels: 126 42 126 126 48 126 78 123 123 126 26 56 42 126 126 48 126 78 123 123 126 26
predicted labels: 1 1 1 1 2 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 1 2

input sentence: list flights from oakland to salt lake city leaving after DIGITDIGITDIGITDIGIT wednesday
actual

In [14]:
print idx2labels[1], idx2labels[2]

B-airline_code B-airline_name
