In [1]:
from __future__ import print_function

# Import the libraries needed in this project: keras, functools, numpy, re, string and glob

from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
import numpy as np
import re,string
from glob import glob

def tokenize(sent):
    """Return the word tokens of a sentence with punctuation removed.

    Every character in ``string.punctuation`` is deleted first (so
    ``"apple,football"`` becomes the single token ``"applefootball"``),
    then the remaining text is split into runs of word characters.

    Args:
        sent: The sentence to tokenize.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    # The two-argument form str.translate(None, chars) only exists for
    # Python 2 byte strings; a regex character class strips the same
    # characters on both Python 2 and Python 3.
    cleaned = re.sub('[%s]' % re.escape(string.punctuation), '', sent)
    # Collect word runs directly instead of splitting on an optional
    # pattern: a pattern that can match the empty string makes re.split
    # break between every character on Python 3.7+.
    return re.findall(r'\w+', cleaned)


def parse_stories(lines, only_supporting=False):
    """Parse stories provided in the bAbI tasks format.

    Each non-blank line starts with an integer id followed by a space.
    An id of 1 starts a new story. A line containing tabs is a question
    of the form ``question<TAB>answer<TAB>supporting ids``; any other
    line is a statement that is tokenized and appended to the story.

    Args:
        lines: Iterable of text lines in the format described above.
        only_supporting: If true, only the sentences whose ids are listed
            as supporting the answer are kept for each question;
            otherwise every statement seen so far in the story is kept.

    Returns:
        A list of ``(substory, question, answer)`` tuples, where
        ``substory`` is a list of tokenized sentences and ``question``
        and ``answer`` are token lists.
    """
    data = []
    story = []
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # has no id to split off; skip it instead of crashing.
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            # Ids restart at 1 at the beginning of every story.
            story = []
        if '\t' in line:
            # Divide the line into the question, the answer and the ids of
            # the supporting sentences.
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            a = tokenize(a)
            if only_supporting:
                # Supporting ids are 1-based positions within the story.
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                # Drop the empty placeholders left by earlier questions.
                substory = [x for x in story if x]
            data.append((substory, q, a))
            # Placeholder so that later supporting ids still line up with
            # positions in the story list.
            story.append('')
        else:
            story.append(tokenize(line))
    return data


def get_stories(f, only_supporting=False, max_length=None):
    """Read a bAbI task file and return its stories as flat token lists.

    The file is parsed with ``parse_stories`` and the sentences of each
    story are concatenated into a single list of tokens.

    Args:
        f: An open file-like object providing ``readlines()``.
        only_supporting: Forwarded to ``parse_stories``.
        max_length: If supplied (and non-zero), stories whose flattened
            length is not strictly smaller than ``max_length`` tokens are
            discarded.

    Returns:
        A list of ``(story_tokens, question_tokens, answer_tokens)`` tuples.
    """
    stories = []
    parsed = parse_stories(f.readlines(), only_supporting=only_supporting)
    for story, q, answer in parsed:
        # Flatten once per story. Unlike reduce() without an initial value,
        # this also handles a story that has no sentences yet, and it avoids
        # repeated list concatenation.
        flat_story = [token for sentence in story for token in sentence]
        if not max_length or len(flat_story) < max_length:
            stories.append((flat_story, q, answer))
    return stories


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    # Convert each (story, query, answer) triple into sequences of the
    # integer ids that word_idx assigns to its words. Stories and queries
    # are padded to story_maxlen / query_maxlen; answers are returned as a
    # numpy array.
    def encode(words):
        # Look up the id of every word; unknown words raise KeyError.
        return [word_idx[word] for word in words]

    encoded = [(encode(story), encode(query), encode(answer))
               for story, query, answer in data]
    story_ids = [triple[0] for triple in encoded]
    query_ids = [triple[1] for triple in encoded]
    answer_ids = [triple[2] for triple in encoded]

    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(answer_ids))

# Random forest classifier used as the baseline model. Imported once here
# instead of on every iteration of the task loop.
from sklearn.ensemble import RandomForestClassifier

# Locate the training and testing file of each of the 20 tasks in the current
# directory. `range` is used instead of the Python-2-only `xrange` so the
# script also runs on Python 3. An IndexError here means a task file is missing.
train_data_path = [glob('qa' + str(t) + '_*_train.txt')[0] for t in range(1, 21)]
test_data_path = [glob('qa' + str(t) + '_*_test.txt')[0] for t in range(1, 21)]

for i, (train_path, test_path) in enumerate(zip(train_data_path, test_data_path)):
    # Read the training and testing stories of the current task
    with open(train_path) as inputfile:
        train_stories = get_stories(inputfile)

    with open(test_path) as inputfile:
        test_stories = get_stories(inputfile)

    all_stories = train_stories + test_stories

    # Collect the unique words in the data and compute the vocabulary size
    # (+1 because index 0 is left unused by the word indices below)
    vocab = set()
    for story, q, answer in all_stories:
        vocab |= set(story + q + answer)
    vocab = sorted(vocab)
    vocab_size = len(vocab) + 1
    # Calculate the max length of stories and queries
    story_maxlen = max(len(s) for s, _, _ in all_stories)
    query_maxlen = max(len(query) for _, query, _ in all_stories)

    # Print the information obtained
    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Story max length:', story_maxlen, 'words')
    print('Query max length:', query_maxlen, 'words')
    print('Number of training stories:', len(train_stories))
    print('Number of test stories:', len(test_stories))
    print('-')

    # Map every unique word to a unique integer index, starting at 1
    word_idx = dict((c, idx + 1) for idx, c in enumerate(vocab))

    # Convert the lists of words into arrays of word indices
    inputs_train, queries_train, answers_train = vectorize_stories(train_stories,
                                                                   word_idx,
                                                                   story_maxlen,
                                                                   query_maxlen)

    inputs_test, queries_test, answers_test = vectorize_stories(test_stories,
                                                                word_idx,
                                                                story_maxlen,
                                                                query_maxlen)

    # Concatenate the story array and the question array into one feature matrix
    matrix_train = np.concatenate((inputs_train, queries_train), axis=1)
    matrix_test = np.concatenate((inputs_test, queries_test), axis=1)

    # Fit the classifier on the training samples and evaluate it on the
    # testing samples; the score is the mean accuracy
    score = RandomForestClassifier(n_estimators=100).fit(matrix_train, answers_train).score(matrix_test, answers_test)
    print(i + 1, score)

Using TensorFlow backend.


-
Vocab size: 20 unique words
Story max length: 56 words
Query max length: 3 words
Number of training stories: 1000
Number of test stories: 1000
-




1 0.381
-
Vocab size: 34 unique words
Story max length: 464 words
Query max length: 4 words
Number of training stories: 1000
Number of test stories: 1000
-
2 0.196
-
Vocab size: 35 unique words
Story max length: 1120 words
Query max length: 7 words
Number of training stories: 1000
Number of test stories: 1000
-
3 0.158
-
Vocab size: 16 unique words
Story max length: 14 words
Query max length: 6 words
Number of training stories: 1000
Number of test stories: 1000
-
4 0.677
-
Vocab size: 40 unique words
Story max length: 525 words
Query max length: 7 words
Number of training stories: 1000
Number of test stories: 1000
-
5 0.424
-
Vocab size: 36 unique words
Story max length: 130 words
Query max length: 5 words
Number of training stories: 1000
Number of test stories: 1000
-
6 0.535
-
Vocab size: 44 unique words
Story max length: 182 words
Query max length: 6 words
Number of training stories: 1000
Number of test stories: 1000
-
7 0.705
-
Vocab size: 46 unique words
Story max length: 300 word