In [50]:
import os
import sys
import re
import numpy as np
from sklearn.linear_model import LogisticRegression
from IPython.lib import passwd
password = passwd('ai2')
password

'sha1:ccd3c0a10888:c93dea2ba57a49f2f27a006e5e52a94bff9ec2f6'

In [3]:
class example_ind(object):
    def __init__(self, sentences, mask, question, answer, hints):
        '''
            Object which contains relevant information for inputting into the
            model, but whose elements are integer indicies into a word vector
            matrix.
        '''
        self.sentences = sentences  # stored as a matrix, rows as sentences. Cols are zero-padded
        self.mask = mask            # boolean matrix M_{ij} = 1 if word j is in sentence i (0 for padding)
        self.question = question    # vector of indices
        self.answer = answer        # vector of indices
        self.hints = hints          # 0-1 vector equal to the length of the # of sentences. 1 iff sentence i is relevant

    def __repr__(self):
        return ("Training example: \n\t Info: %s \n\t Question: %s \n\t Answer: %s \n\t Hint: %s \n"
                % (self.sentences, self.question, self.answer, self.hints))

In [4]:
class example(object):
    def __init__(self, sentences, question, answer, hints):
        '''
            Object which contains relevant information for inputting into the
            model.
        '''
        self.sentences = sentences
        self.question = question
        self.answer = answer
        self.hints = hints

    def __repr__(self):
        return ("Training example: \n\t Info: %s \n\t Question: %s \n\t Answer: %s \n\t Hint: %s \n"
                % (self.sentences, self.question, self.answer, self.hints))

In [6]:
class wordVectors(object):
    def __init__(self, dataset):
        self.words_to_idx, self.idx_to_word = self._map_words_to_idx(dataset)

    def _map_words_to_idx(self, dataset):
        tokens = []
        for example in dataset:
            # add all supporting sentence words
            for sentence in example.sentences:
                tokens += tokenize(sentence)

            tokens += tokenize(example.question)
            tokens += tokenize(example.answer)

        tokens = set(tokens)

        # loop over the tokens and establish a canonical word <-> idx mapping
        words_to_idx = {}
        idx_to_words = {}
        counter = 0
        for token in tokens:
            token = token.lower()
            if token not in words_to_idx:
                words_to_idx[token] = counter
                idx_to_words[counter] = token
                counter += 1

        return words_to_idx, idx_to_words
    
    def get_wv_matrix(self, dimension, glove_dir=None):
        r = 0.001
        self.wv_matrix = np.random.rand(dimension, len(self.words_to_idx)) * 2 * r - r  # TODO: pick initialization carefully
        if glove_dir is not None:
            pretrained = load_glove_vectors(glove_dir, dimension)

            for word in self.words_to_idx:
                if word in pretrained:
                    self.wv_matrix[:, self.words_to_idx[word]] = pretrained[word].ravel()

        return self.wv_matrix

In [7]:
def examples_to_example_ind(wordVectors, examples):
    outputs = []
    for example in examples:
        new_sents = []
        for sentence in example.sentences:
            new_sents.append(np.array([wordVectors.words_to_idx[word] for word in tokenize(sentence)], dtype='int32'))

        sentences = np.zeros((len(new_sents), max(len(s) for s in new_sents)), dtype='int32')
        mask = np.zeros_like(sentences,  dtype='int32')
        for i, sent in enumerate(new_sents):
            sentences[i, :len(sent)] = sent
            mask[i, :len(sent)] = 1

        new_quest = np.array([wordVectors.words_to_idx[word] for word in tokenize(example.question)], dtype='int32')
        new_ans = np.array([wordVectors.words_to_idx[word] for word in tokenize(example.answer)], dtype='int32')

        new_hints = np.zeros((sentences.shape[0], ), dtype='int32')
        new_hints[example.hints] = 1
        outputs.append(example_ind(sentences, mask, new_quest, new_ans, new_hints))

    return outputs

In [8]:
def fix_directions(examples):
    directions = {'n': 'north', 'e': 'east', 's': 'south', 'w': 'west'}
    for example in examples:
        dirs = example.answer.split(',')
        newdirs = [directions[d] for d in dirs]
        example.answer = " ".join(newdirs)

In [9]:
def file_to_examples(file):
    f = open(file, "r")
    lines = f.readlines()
    information = []
    questans = []

    # Want tuples (information, information ..., information, answer)
    for line in lines:
        split = line.strip().split('\t')
        linesplit = split[0].split(' ')
        linenum = int(linesplit[0])
        sentence = " ".join(linesplit[1:]).strip()

        # Signals start of new set
        if linenum == 1:
            information = []
            hint_to_arr_idx = {}
            diff = 1

        # For each question, add as the information all of the previous
        # sentences that could have been relevent.
        if sentence[-1] == "?":
            question = sentence
            answer = split[1]
            hints = map(int, split[2].split(' '))

            hint_idxs = [hint_to_arr_idx[i] for i in hints]

            questans.append(example(sentences=list(information),
                                    answer=answer,
                                    question=question,
                                    hints=hint_idxs))
            diff += 1
        else:
            information.append(sentence)
            hint_to_arr_idx[linenum] = linenum - diff

    return questans

In [10]:
def file_to_relevant_examples(file):
    f = open(file, "r")
    lines = f.readlines()
    information = []
    questans = []
    all_info = []

    # Want tuples (information, information ..., information, answer)
    for line in lines:
        split = line.strip().split('\t')
        linesplit = split[0].split(' ')
        linenum = int(linesplit[0])
        sentence = " ".join(linesplit[1:]).strip()

        # Signals start of new set
        if linenum == 1:
            information = []
            all_info = []

        all_info.append(sentence)
        # For each question, add as the information all of the previous
        # sentences that could have been relevent.
        if sentence[-1] == "?":
            question = sentence
            answer = split[1]
            hint = split[2]

            relevant = [all_info[i-1] for i in map(int, hint.split(' '))]
            questans.append(example(sentences=list(relevant),
                                    answer=answer,
                                    question=question,
                                    hints=hint))
        else:
            information.append(sentence)

    return questans

In [11]:
def tokenize(sentence):
    return [token.lower() for token in re.findall(r"[\w']+|[.,!?;]", sentence)]

In [12]:
def get_file_path(datadir, tasknum, test=False):
    fnames = {
        1: "single-supporting-fact",
        2: "two-supporting-facts",
        3: "three-supporting-facts",
        4: "two-arg-relations",
        5: "three-arg-relations",
        6: "yes-no-questions",
        7: "counting",
        8: "lists-sets",
        9: "simple-negation",
        10: "indefinite-knowledge",
        11: "basic-coreference",
        12: "conjunction",
        13: "compound-coreference",
        14: "time-reasoning",
        15: "basic-deduction",
        16: "basic-induction",
        17: "positional-reasoning",
        18: "size-reasoning",
        19: "path-finding",
        20: "agents-motivations"
    }
    if(tasknum < 1 or tasknum > 20):
        raise NotImplementedError("Task %d is not valid" % tasknum)

    traintest = "test" if test else "train"
    fname = ("qa%d_%s_%s.txt") % (tasknum, fnames[tasknum], traintest)
    return datadir + fname

In [14]:
def get_relevant_data(datadir, tasknum, test=False):
    train_examples = file_to_relevant_examples(get_file_path(datadir, tasknum, False))
    test_examples = file_to_relevant_examples(get_file_path(datadir, tasknum, True))

    if tasknum == 19:
        # hack to replace directions with their actual words
        fix_directions(train_examples)
        fix_directions(test_examples)
    if test:
        print( 'WARNING: Loading TEST SET')
        return train_examples, test_examples
    else:
        return train_examples, None

In [24]:
def get_data(datadir, tasknum, test=False):
    train_examples = file_to_examples(get_file_path(datadir, tasknum, False))
    test_examples = file_to_examples(get_file_path(datadir, tasknum, True))

    if tasknum == 19:
        # hack to replace directions with their actual words
        fix_directions(train_examples)
        fix_directions(test_examples)
    if test:
        return train_examples, test_examples
    else:
        return train_examples, None

In [20]:
def load_glove_vectors(dimension):
    if dimension not in [50, 100, 200, 300]:
        raise NotImplementedError('No Glove Vectors with dimension %d' % dimension)
    file_name = 'glove.6B.%dd.txt' % dimension
    file_path = '/media/ai2-rey/data_disk/data_sets/glove.6B/' + file_name
    wvecs = {}
    print( 'loading glove vectors')
    with open(file_path) as f_glove:
        for i, line in enumerate(f_glove):
            elems = line.split()
            word = elems[0]
            vec = np.array([float(x) for x in elems[1:]]).reshape(-1, 1)
            wvecs[word] = vec
            if i % 20000 == 0:
                print( i)
    print ('done')
    return wvecs

In [22]:
def get_design_matrix(dset):
    X = np.zeros((len(dset), 2 * n))
    for i, ex in enumerate(dset):
        for sentence in ex.sentences:
            for word in tokenize(sentence):
                X[i, words_to_idx[word]] += 1
        for word in tokenize(ex.question):
            X[i, n + words_to_idx[word]] += 1
    y = np.array([words_to_idx[tokenize(ex.answer)[0]] for ex in dset])
    return X, y

In [29]:
datadir = '/media/ai2-rey/data_disk/data_sets/bAbI/tasks_1-20_v1-2/en-10k/'
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')

for task in range(1,21):
    train_ex, test_ex = get_data(datadir, task, test=True)
    
    word_vectors = wordVectors(train_ex)
    words_to_idx, idx_to_words = word_vectors.words_to_idx, word_vectors.idx_to_word
    n = len(words_to_idx.keys())
    
    X_train, y_train = get_design_matrix(train_ex)
    X_test, y_test = get_design_matrix(test_ex)
    
    clf.fit(X_train, y_train)
    print (task)
    print('Training Accuracy: ', clf.score(X_train, y_train))
    print('Testing Accuracy: ', clf.score(X_test, y_test))

1
Training Accuracy:  0.4285
Testing Accuracy:  0.437
2
Training Accuracy:  0.3501
Testing Accuracy:  0.362
3
Training Accuracy:  0.2995
Testing Accuracy:  0.259
4
Training Accuracy:  0.6946
Testing Accuracy:  0.695
5
Training Accuracy:  0.6416
Testing Accuracy:  0.613
6
Training Accuracy:  0.5146
Testing Accuracy:  0.474
7
Training Accuracy:  0.7175
Testing Accuracy:  0.735
8
Training Accuracy:  0.7042
Testing Accuracy:  0.69
9
Training Accuracy:  0.6278
Testing Accuracy:  0.616
10
Training Accuracy:  0.4773
Testing Accuracy:  0.452
11
Training Accuracy:  0.413
Testing Accuracy:  0.407
12
Training Accuracy:  0.417
Testing Accuracy:  0.402
13
Training Accuracy:  0.4473
Testing Accuracy:  0.393
14
Training Accuracy:  0.502
Testing Accuracy:  0.456
15
Training Accuracy:  0.556
Testing Accuracy:  0.57
16
Training Accuracy:  0.4731
Testing Accuracy:  0.489
17
Training Accuracy:  0.5133
Testing Accuracy:  0.511
18
Training Accuracy:  0.529
Testing Accuracy:  0.558
19
Training Accuracy:  0.2

In [49]:
train_ex[1]

Training example: 
	 Info: ['Sumit is tired.', 'Sumit went back to the bedroom.'] 
	 Question: Why did sumit go to the bedroom? 
	 Answer: tired 
	 Hint: [0] 

In [43]:
X_train[0][:]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.])

In [47]:
y_train[1]

22

In [48]:
idx_to_words[22]

'tired'