In [1]:
# Concatenate all the training data into a single text file: output_train.txt
# The task files are expected to be in the current working directory.
filenames_tr = [
    'qa1_single-supporting-fact_train.txt',
    'qa2_two-supporting-facts_train.txt',
    'qa3_three-supporting-facts_train.txt',
    'qa4_two-arg-relations_train.txt',
    'qa5_three-arg-relations_train.txt',
    'qa6_yes-no-questions_train.txt',
    'qa7_counting_train.txt',
    'qa8_lists-sets_train.txt',
    'qa9_simple-negation_train.txt',
    'qa10_indefinite-knowledge_train.txt',
    'qa11_basic-coreference_train.txt',
    'qa12_conjunction_train.txt',
    'qa13_compound-coreference_train.txt',
    'qa14_time-reasoning_train.txt',
    'qa15_basic-deduction_train.txt',
    'qa16_basic-induction_train.txt',
    'qa17_positional-reasoning_train.txt',
    'qa18_size-reasoning_train.txt',
    'qa19_path-finding_train.txt',
    'qa20_agents-motivations_train.txt',
]
with open('output_train.txt', 'w') as outfile:
    for fname in filenames_tr:
        last_line = ''
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)
                last_line = line
        # If a task file does not end with a newline, its last line would be
        # glued to the first line of the next file and corrupt both records.
        if last_line and not last_line.endswith('\n'):
            outfile.write('\n')
                


In [2]:
# Concatenate all the testing data into a single text file: output_test.txt
# The task files are expected to be in the current working directory.
filenames_te = [
    'qa1_single-supporting-fact_test.txt',
    'qa2_two-supporting-facts_test.txt',
    'qa3_three-supporting-facts_test.txt',
    'qa4_two-arg-relations_test.txt',
    'qa5_three-arg-relations_test.txt',
    'qa6_yes-no-questions_test.txt',
    'qa7_counting_test.txt',
    'qa8_lists-sets_test.txt',
    'qa9_simple-negation_test.txt',
    'qa10_indefinite-knowledge_test.txt',
    'qa11_basic-coreference_test.txt',
    'qa12_conjunction_test.txt',
    'qa13_compound-coreference_test.txt',
    'qa14_time-reasoning_test.txt',
    'qa15_basic-deduction_test.txt',
    'qa16_basic-induction_test.txt',
    'qa17_positional-reasoning_test.txt',
    'qa18_size-reasoning_test.txt',
    'qa19_path-finding_test.txt',
    'qa20_agents-motivations_test.txt',
]
with open('output_test.txt', 'w') as outfile:
    for fname in filenames_te:
        last_line = ''
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)
                last_line = line
        # If a task file does not end with a newline, its last line would be
        # glued to the first line of the next file and corrupt both records.
        if last_line and not last_line.endswith('\n'):
            outfile.write('\n')

In [3]:
from __future__ import print_function

# Import the libraries needed in this project: keras, functools, numpy, re and string

from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
import numpy as np
import re,string

"""
   General description of the functions:
   The input text files are read in get_stories.
   In this function, parse_stories is utilized to parse the text in every line of the text file,
   and the stories, questions and answers are separated.
   The tokenize function splits a sentence into tokens and excludes the punctuation.
   parse_stories returns a large nested list consisting of each story, question and answer.
   vectorize_stories converts the lists obtained from parse_stories into three matrices of corresponding numbers.
   The three matrices are the story matrix, the question matrix and the answer matrix.
"""

def tokenize(sent):
    """Return the tokens of a sentence without punctuation.

    Every character listed in ``string.punctuation`` is deleted first (so
    ``"apple,milk"`` becomes the single token ``"applemilk"``), then the
    remaining text is split on runs of non-word characters.

    Args:
        sent: the sentence to tokenize, as a text string.

    Returns:
        A list of non-empty, whitespace-stripped tokens; an empty list when
        the sentence contains nothing but punctuation and whitespace.
    """
    # str.translate(None, chars) only exists for Python 2 byte strings, so the
    # punctuation is filtered out character by character instead.
    cleaned = ''.join(ch for ch in sent if ch not in string.punctuation)
    # The pattern must not be able to match the empty string: since Python 3.7
    # re.split also splits on empty matches, which would cut the text between
    # every character. The capture group is kept so that separators made of
    # symbols outside string.punctuation still come through as tokens.
    pieces = re.split(r'(\W+)', cleaned)
    return [piece.strip() for piece in pieces if piece.strip()]


def parse_stories(lines, only_supporting=False):
    """Parse stories provided in the bAbI tasks format.

    Every non-blank line has the form ``"<id> <text>"``. An id of 1 starts a
    new story. A line whose text contains tabs is a question line of the form
    ``question<TAB>answer<TAB>supporting ids``; any other line is a statement
    that is tokenized and added to the current story.

    Args:
        lines: iterable of text lines.
        only_supporting: if true, only the sentences whose ids are listed as
            supporting the answer are kept for each question; otherwise all
            statements seen so far in the current story are kept.

    Returns:
        A list of ``(substory, question, answer)`` tuples, where ``substory``
        is a list of tokenized sentences and ``question`` / ``answer`` are
        token lists.

    Raises:
        ValueError: if a non-blank line does not start with an integer id
            followed by a space, or a question line does not have exactly
            three tab-separated fields.
    """
    data = []
    story = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines carry no id and would break the split below; skip them.
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            # Line ids restart at 1 whenever a new story begins.
            story = []
        if '\t' in line:
            # Divide the question line into the question, answer and supporting ids
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            a = tokenize(a)
            if only_supporting:
                # Supporting ids are 1-based line numbers within the story.
                supporting_ids = [int(i) for i in supporting.split()]
                substory = [story[i - 1] for i in supporting_ids]
            else:
                # Drop the empty placeholders left behind by earlier questions.
                substory = [x for x in story if x]
            data.append((substory, q, a))
            # Placeholder keeps story[i - 1] aligned with the line ids.
            story.append('')
        else:
            story.append(tokenize(line))
    return data


def get_stories(f, only_supporting=False, max_length=None):
    """Read a bAbI file and return its stories flattened to single token lists.

    The lines of ``f`` are parsed with parse_stories, then the sentences of
    each story are concatenated into one flat list of tokens.

    Args:
        f: an open file-like object providing ``readlines()``.
        only_supporting: passed through to parse_stories.
        max_length: if supplied (and non-zero), stories whose flattened length
            is not strictly below ``max_length`` tokens are discarded.

    Returns:
        A list of ``(story_tokens, question_tokens, answer_tokens)`` tuples.
    """
    parsed = parse_stories(f.readlines(), only_supporting=only_supporting)
    data = []
    for story, q, answer in parsed:
        # Flatten once, in linear time. Unlike reduce() without an initial
        # value, this also copes with a story that has no sentences.
        flat_story = [token for sentence in story for token in sentence]
        if not max_length or len(flat_story) < max_length:
            data.append((flat_story, q, answer))
    return data


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    """Convert every word of the given data into its corresponding number.

    Args:
        data: iterable of (story, query, answer) tuples of token lists.
        word_idx: mapping from word to integer index; a word missing from it
            raises KeyError.
        story_maxlen: ``maxlen`` handed to pad_sequences for the stories.
        query_maxlen: ``maxlen`` handed to pad_sequences for the queries.

    Returns:
        A tuple (stories, queries, answers): the first two are the results of
        pad_sequences, the third is ``np.array`` of the encoded answers.
    """
    def encode(words):
        # Look up the index of each word, keeping the word order.
        return [word_idx[word] for word in words]

    story_ids, query_ids, answer_ids = [], [], []
    for story, query, answer in data:
        story_ids.append(encode(story))
        query_ids.append(encode(query))
        answer_ids.append(encode(answer))

    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(answer_ids))


# Read the concatenated training and testing files as (story, question, answer) tuples
with open('output_train.txt') as inputfile:
    train_stories = get_stories(inputfile)

with open('output_test.txt') as inputfile:
    test_stories = get_stories(inputfile)

# Collect the unique words of both data sets and compute the vocabulary size
all_stories = train_stories + test_stories
vocab = set()
for story, q, answer in all_stories:
    vocab.update(story, q, answer)
vocab = sorted(vocab)
# One more than the number of words, because word indices start at 1 (see word_idx)
vocab_size = len(vocab) + 1

# Compute the maximum length of the stories and of the queries
story_maxlen = max(len(story_tokens) for story_tokens, _, _ in all_stories)
query_maxlen = max(len(query_tokens) for _, query_tokens, _ in all_stories)

# Print the information obtained
print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training stories:', len(train_stories))
print('Number of test stories:', len(test_stories))
print('-')

# Map every unique word to its 1-based position in the sorted vocabulary
word_idx = {word: index for index, word in enumerate(vocab, 1)}

# Convert the lists of words into arrays of the corresponding numbers
inputs_train, queries_train, answers_train = vectorize_stories(
    train_stories, word_idx, story_maxlen, query_maxlen)

inputs_test, queries_test, answers_test = vectorize_stories(
    test_stories, word_idx, story_maxlen, query_maxlen)

# Put the story array and the question array side by side in one matrix
matrix_train = np.concatenate((inputs_train, queries_train), axis=1)
matrix_test = np.concatenate((inputs_test, queries_test), axis=1)


Using TensorFlow backend.


-
Vocab size: 191 unique words
Story max length: 1120 words
Query max length: 11 words
Number of training stories: 20000
Number of test stories: 20000
-


In [4]:
# Import the random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Seed for the forest's random number generator, so that re-running the cell
# gives the same accuracy (the value itself is arbitrary).
RANDOM_SEED = 42

# Fit the classifier on the training samples, then score it on the testing samples.
# The score is the mean accuracy.
# The answers are flattened with ravel() because scikit-learn expects a 1-D label
# array and warns when it is given a column vector.
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
forest.fit(matrix_train, answers_train.ravel())
score = forest.score(matrix_test, answers_test.ravel())
print('The final mean accuracy is', score)




The final mean accuracy is 0.4214
