In [15]:
import re
from nltk import wordpunct_tokenize, FreqDist
import nltk
import itertools
import numpy as np
from pprint import pprint
from time import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer
from sklearn.pipeline import Pipeline, make_pipeline
from rnn import softmax, CustomRNN, train_with_sgd

# Read the whole training book into memory. The context manager guarantees the
# file handle is closed; 'utf-8-sig' is the codec that strips a leading BOM.
with open('corpus.txt', 'r', encoding='utf-8-sig') as corpus_file:
    raw_corpus = corpus_file.read()

## Training corpus pre-processing
A single book text file is provided for the training corpus. Pre-processing is very light here.

In [16]:
## Training corpus pre-processing
# A single book text file is provided for the training corpus. Pre-processing is very light here.

# Everything from the book title onwards is kept; this drops headers, toc, etc.
# Raw string so that `\s` and `\n` are interpreted by the regex engine rather
# than relying on Python's handling of (invalid) string escapes.
title_match_regex = r'\n{3,}\s+THE SECRET CACHE\n{3,}.*'
title_match = re.search(title_match_regex, raw_corpus, flags=re.M | re.S)
if title_match is None:
    # Fail with an explicit message instead of an AttributeError on `None.group()`
    raise ValueError('Title marker not found in raw corpus; cannot strip the front matter')
corpus = title_match.group()
corpus = corpus.replace('\n', ' ')
corpus = re.sub(r' {2,}', ' ', corpus)  # replace multiple blanks by one
# Remove runs of four hyphens: '----' is used later as the placeholder tag for the "be" verb
corpus = corpus.replace('----', '')
print('Corpus length after preprocessing:')
print('- {} characters\n- {} words'.format(len(corpus), len(corpus.split())))

Corpus length after preprocessing:
- 3301707 characters
- 585838 words


## Training set creation
The training set is created in three steps:

* targets are extracted ("be" forms)
* targets are removed from text
* a small text window (typically 10 words) is extracted around each target. This small text will be used for features creation

In [17]:
# Surface forms of the verb "to be" that serve as prediction targets
be_forms = [
    'am', 'are', 'were', 'was',
    'is', 'been', 'being', 'be',
]

# Placeholder written into the token stream wherever a target is removed
substitute = '----'

# Split the pre-processed corpus into tokens with NLTK's wordpunct tokenizer
tokens = wordpunct_tokenize(corpus)

def find_targets(tokens):
    """Collect, in order of appearance, every token that is one of `be_forms`.

    The comparison is an exact (case-sensitive) membership test against the
    module-level `be_forms` list.
    """
    found = []
    for token in tokens:
        if token in be_forms:
            found.append(token)
    return found
    
def remove_targets(tokens):
    """Return a copy of `tokens` with every `be_forms` member masked.

    Each matching token is swapped for the module-level `substitute`
    placeholder; all other tokens are kept unchanged and in order.
    """
    masked = []
    for token in tokens:
        if token in be_forms:
            masked.append(substitute)
        else:
            masked.append(token)
    return masked

# Order matters: the targets must be read from the token list BEFORE they are
# masked, because the next statement overwrites `tokens`.
targets = find_targets(tokens)

# From here on `tokens` holds the masked stream (targets replaced by `substitute`)
tokens = remove_targets(tokens)

def create_windows(tokens, window_size=5):
    """Create context windows surrounding each masked "be" form.

    Parameters
    ----------
    tokens : list of str
        Token stream in which targets have been replaced by the module-level
        `substitute` placeholder.
    window_size : int
        Maximum number of tokens taken on each side of a placeholder.

    Returns
    -------
    (left_contexts, right_contexts) : tuple of two lists of str
        One space-joined string per placeholder, in order of appearance.
        The left context is in reading order; the right context is reversed,
        so in both strings the token adjacent to the target comes last.
        Windows are shorter than `window_size` near either end of `tokens`.
    """
    left_contexts = []
    right_contexts = []
    for i, word in enumerate(tokens):
        if word != substitute:
            continue

        # Clamp the start at 0: a negative start index would wrap around to
        # the end of the list and yield an empty or wrong left window for
        # targets closer than `window_size` to the start.
        left_start = max(0, i - window_size)
        left_contexts.append(' '.join(tokens[left_start:i]))

        # Slicing past the end of the list simply truncates the window
        right_window = tokens[i + 1:i + window_size + 1][::-1]
        right_contexts.append(' '.join(right_window))
    return left_contexts, right_contexts

# Extract up to 10 tokens of context on each side of every masked target
l_contexts, r_contexts = create_windows(tokens=tokens, window_size=10)

# Map each "be" form string to an integer class id
target_encoder = LabelEncoder()
target_encoder.fit(targets)
y = target_encoder.transform(targets)

In [18]:
# Per-example sampling weights for the later over-sampling step
distrib = np.bincount(y)          # number of examples in each class id
prob = 1.0 / distrib[y]           # each example weighs 1 / (size of its class)
prob = prob / prob.sum()          # normalise so the weights sum to one

# Break every context string back into its whitespace-separated tokens
left_tokenized_sentences = list(map(str.split, l_contexts))
right_tokenized_sentences = list(map(str.split, r_contexts))

In [6]:
vocabulary_size = 5000
unknown_token = "UNKNOWN_TOKEN"

# NOTE: this rebinds `tokens` (until now the flat corpus token list) to the
# list of all left and right context windows.
tokens = left_tokenized_sentences + right_tokenized_sentences
word_freq = nltk.FreqDist(itertools.chain(*tokens))
print("Found %d unique words tokens." % len(word_freq.items()))

# Get the most common words and build index_to_word and word_to_index vectors.
# One slot of the vocabulary is reserved for the unknown token, appended last.
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [word for word, _ in vocab]
index_to_word.append(unknown_token)
word_to_index = {w: i for i, w in enumerate(index_to_word)}


def _replace_unknown_words(sentences):
    """Return `sentences` with every out-of-vocabulary word swapped for `unknown_token`."""
    return [[w if w in word_to_index else unknown_token for w in sent] for sent in sentences]


def _sentences_to_indices(sentences):
    """Return an array holding the vocabulary index of each word of each sentence."""
    return np.asarray([[word_to_index[w] for w in sent] for sent in sentences])


# Replace all words not in our vocabulary with the unknown token
left_tokenized_sentences = _replace_unknown_words(left_tokenized_sentences)
right_tokenized_sentences = _replace_unknown_words(right_tokenized_sentences)

# Create the training data
X_train_left = _sentences_to_indices(left_tokenized_sentences)
X_train_right = _sentences_to_indices(right_tokenized_sentences)
# The encoder was already fitted on `targets` when `y` was built; reuse that
# fit instead of re-fitting so that `y_train` and `y` share one label mapping.
y_train = np.asarray(target_encoder.transform(targets))

Found 18836 unique words tokens.


## Train Bidirectional-RNN
Train one RNN on the left-windowed text and another one on the right-windowed text. Their output predictions will be used afterwards by a meta-model that combines the predictions from both the following and the preceding words. 

Note that this network is not a Bidirectional RNN strictly speaking, as the two networks are trained separately.

In [20]:
n_samples = 10000 # total number of samples used to train RNNs and meta-model
n_train_rnn = 7000 # number of samples used to train both RNNs
output_dim = len(np.unique(y))

# Split the examples into two DISJOINT pools *before* over-sampling. Slicing a
# single with-replacement sample would let the same example appear both in the
# RNN training set and in the meta-model (blend) set, so the blend set would
# not be held out.
# NOTE(review): no random seed is set, so the selection differs between runs.
shuffled_indices = np.random.permutation(len(y))
n_rnn_pool = int(round(len(y) * n_train_rnn / float(n_samples)))
rnn_pool = shuffled_indices[:n_rnn_pool]
blend_pool = shuffled_indices[n_rnn_pool:]


def _oversample(pool, size):
    """Draw `size` indices from `pool` with replacement, weighted by `prob` (inverse class frequency)."""
    weights = prob[pool] / prob[pool].sum()  # re-normalise the weights within the pool
    return np.random.choice(pool, size=size, replace=True, p=weights)


# Randomly select examples with over-sampling to handle class imbalance
mask = _oversample(rnn_pool, n_train_rnn)
mask_blend = _oversample(blend_pool, n_samples - n_train_rnn)
full_mask = np.concatenate((mask, mask_blend))

y_pred = {}   # per side: RNN output on its own training samples
y_blend = {}  # per side: RNN output on the blend samples (meta-model features)

side_descriptions = {'left': 'preceding', 'right': 'following'}

for side, data in [('left', X_train_left), ('right', X_train_right)]:
    print('Training %s RNN on %s words' % (side, side_descriptions[side]))
    model = CustomRNN(vocabulary_size, output_dim=output_dim, bptt_truncate=5)
    losses = train_with_sgd(model, data[mask], y_train[mask],
                            nepoch=10, evaluate_loss_after=5, learning_rate=0.01)
    y_pred[side] = model.predict_probas(data[mask])
    # This is accuracy on the very samples the RNN was trained on (training accuracy)
    print('Score ' + side + ': %.03f' % accuracy_score(y_train[mask], np.argmax(y_pred[side], axis=1)))
    y_blend[side] = model.predict_probas(data[mask_blend])

Training RNN on preceeding words
2017-06-07 01:39:32: Loss after num_examples_seen=0 epoch=0: 2.080189
2017-06-07 01:40:03: Loss after num_examples_seen=3500 epoch=5: 1.755644
Training a second RNN on following words
Score left: 0.439
Training RNN on preceeding words
2017-06-07 01:40:35: Loss after num_examples_seen=0 epoch=0: 2.079493
2017-06-07 01:41:07: Loss after num_examples_seen=3500 epoch=5: 1.853613
Training a second RNN on following words
Score right: 0.414


In [23]:
# Count how often each "be" form is predicted by the left and right RNNs
print('Left-RNN predictions:')
left_class_ids = np.argmax(y_pred['left'], axis=1)
left_labels = target_encoder.inverse_transform(left_class_ids)
counts_left = np.unique(left_labels, return_counts=True)
print(list(zip(*counts_left)))

print('Right-RNN predictions:')
right_class_ids = np.argmax(y_pred['right'], axis=1)
right_labels = target_encoder.inverse_transform(right_class_ids)
counts_right = np.unique(right_labels, return_counts=True)
print(list(zip(*counts_right)))

Left-RNN predictions:
[('am', 77), ('are', 28), ('be', 5), ('been', 227), ('being', 59), ('is', 105), ('was', 77), ('were', 122)]
Right-RNN predictions:
[('am', 136), ('are', 73), ('be', 151), ('been', 143), ('is', 166), ('was', 31)]


## Blended meta-model
Use predictions from the left RNN and right RNN in a meta-model, to predict the class (i.e. the correct "be" form).

The meta-model trains on a held-out training set, unseen while training the RNNs.

In [30]:
# Build the meta-model features by placing the left-RNN and right-RNN outputs side by side
X_blend = np.concatenate([y_blend[side] for side in ('left', 'right')], axis=1)
y_true = y[mask_blend]

# Use a large C to avoid strong regularization
clf = LogisticRegression(C=100.)
clf.fit(X_blend, y_true)

# Distribution of the labels predicted on the same data the meta-model was fitted on
blend_predictions = clf.predict(X_blend)
predicted_labels = target_encoder.inverse_transform(blend_predictions)
counts = np.unique(predicted_labels, return_counts=True)

print('Meta-model average accuracy: %.3f ' % clf.score(X_blend, y_true))
print('Predictions distribution')
print(list(zip(*counts)))

Meta-model average accuracy: 0.347 
Predictions distribution
[('am', 38), ('are', 22), ('be', 33), ('been', 44), ('being', 25), ('is', 31), ('was', 4), ('were', 103)]
