In [42]:
import numpy as np
import nltk
import sys
import re
import string
import collections

from nengo import spa
from nengo.spa import pointer

In [43]:
with open('data/en/qa1_single-supporting-fact_test.txt','r') as f:
    text = f.read()  

tokens = nltk.word_tokenize(text)
tokens = [x.lower() for x in set(tokens)]
tokens = [x for x in tokens if x not in string.punctuation]
tokens = [x for x in tokens if not x.isdigit()]

pos_tags = nltk.data.load('help/tagsets/upenn_tagset.pickle')

In [44]:
class TextVocabulary(spa.Vocabulary):
    def __getitem__(self, key):
        value = self.pointers.get(key, None)
        if value is None:
            value = self.create_pointer()
            self.add(key, value)
        return value

    def add(self, key, p):
        if not isinstance(p, pointer.SemanticPointer):
            p = pointer.SemanticPointer(p)

        if key in self.pointers:
            raise KeyError("The semantic pointer '%s' already exists" % key)

        self.pointers[key] = p
        self.keys.append(key)
        self.vectors = np.vstack([self.vectors, p.v])

        # Generate vector pairs
        if self.include_pairs and len(self.keys) > 1:
            for k in self.keys[:-1]:
                self.key_pairs.append('%s*%s' % (k, key))
                v = (self.pointers[k] * p).v
                self.vector_pairs = np.vstack([self.vector_pairs, v])

D = 512
    
wrd_voc = TextVocabulary(D)
pos_voc = TextVocabulary(D)

for token in tokens:
    wrd_voc[token]
    
for pos in pos_tags:
    pos_voc[pos]

In [45]:
print pos_voc.keys
print ''
print wrd_voc.keys

['PRP$', 'VBG', 'VBD', '``', 'VBN', ',', "''", 'VBP', 'WDT', 'JJ', 'WP', 'VBZ', 'DT', 'RP', '$', 'NN', ')', '(', 'FW', 'POS', '.', 'TO', 'LS', 'RB', ':', 'NNS', 'NNP', 'VB', 'WRB', 'CC', 'PDT', 'RBS', 'RBR', 'CD', 'PRP', 'EX', 'IN', 'WP$', 'MD', 'NNPS', '--', 'JJS', 'JJR', 'SYM', 'UH']

['office', 'is', 'moved', 'back', 'daniel', 'bedroom', 'john', 'mary', 'bathroom', 'to', 'travelled', 'hallway', 'garden', 'sandra', 'where', 'the', 'kitchen', 'journeyed', 'went']


In [46]:
from collections import namedtuple


Fact = namedtuple('Fact', ['sentence'])
Query = namedtuple('Query', ['sentence', 'answer', 'support'])


def load(filename):
    stories = [[]]

    with open(filename) as f:
        for line in f:
            idx, parsed = parse_line(line)
            if idx < len(stories[-1]):
                stories.append([])
            stories[-1].append(parsed)

    return stories


def parse_line(line):
    idx, sentence = line.split(' ', 1)
    idx = int(idx)
    if '?' in sentence:
        parsed = parse_query(sentence)
    else:
        parsed = parse_fact(sentence)
    return idx, parsed


def parse_fact(fact):
    return Fact(fact.strip())

def parse_query(sentence):
    query, answer, support = sentence.split('\t')
    return Query(query.strip(), answer.strip(), int(support))

In [49]:
stories = load('data/en/qa1_single-supporting-fact_test.txt')
story = stories[0]

for item in story:
    if isinstance(item, Fact):
        tagged = nltk.pos_tag(item.sentence.split())
        print tagged
        sen_sum = pointer.SemanticPointer(np.zeros(D))
        for pair in tagged:
            wrd = pair[0]
            pos = pair[1]
            sen_sum += wrd_voc[wrd] * pos_voc[pos]
        break

unbind = sen_sum * ~wrd_voc['travelled']
unbind.normalize()

[('John', 'NNP'), ('travelled', 'VBD'), ('to', 'TO'), ('the', 'DT'), ('hallway.', 'NNP')]


In [50]:
print pos_voc.text(unbind, minimum_count=4)

0.41VBD;0.08.;0.08);0.06PRP
