In [1]:
import re
from nltk import wordpunct_tokenize, FreqDist
import nltk
import itertools
import numpy as np
from pprint import pprint
from time import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer
from sklearn.pipeline import Pipeline, make_pipeline

# Read the whole corpus into memory. The 'utf-8-sig' codec strips a leading
# byte-order mark if present; the `with` block closes the file handle
# as soon as the text has been read.
with open('corpus.txt', 'r', encoding='utf-8-sig') as corpus_file:
    raw_corpus = corpus_file.read()

## Preprocess training corpus

In [2]:
# Keep only the text starting at the first match of the title pattern
# (used to remove headers, toc, etc.). With re.S the trailing `.*` also spans
# newlines, so the match runs to the end of the raw corpus.
# Raw string: `\s` must reach the regex engine untouched instead of being
# parsed as an (invalid) string escape.
title_match_regex = r'\n{3,}\s+THE SECRET CACHE\n{3,}.*'
title_match = re.search(title_match_regex, raw_corpus, flags=re.M | re.S)
if title_match is None:
    # Fail with an explicit message rather than an AttributeError on None.
    raise ValueError('Title pattern not found in the raw corpus: {!r}'.format(title_match_regex))
corpus = title_match.group()
corpus = corpus.replace('\n', ' ')
corpus = re.sub(r' {2,}', ' ', corpus) # replace multiple blanks by one
corpus = corpus.replace('----', '') # remove runs of four hyphens, which we'll use as a tag for the be verb
print('Corpus length after preprocessing:')
print('- {} characters\n- {} words'.format(len(corpus), len(corpus.split())))

Corpus length after preprocessing:
- 3301707 characters
- 585838 words


In [3]:
# Forms of the verb "be" used as prediction targets. Tokens are compared
# against this list exactly, so only these lower-case spellings match.
be_forms = ['am','are','were','was','is','been','being','be']
# Tag written in place of every "be" form removed from the token stream.
substitute = '----'
# Tokenize the preprocessed corpus with NLTK's wordpunct_tokenize.
tokens = wordpunct_tokenize(corpus)

def find_targets(tokens):
    """ Collect, in order of appearance, every token that is one of the be forms. """
    found = []
    for token in tokens:
        if token in be_forms:
            found.append(token)
    return found
    
def remove_targets(tokens):
    """ Return a new token list in which every be form is swapped for the substitute tag. """
    masked = []
    for token in tokens:
        if token in be_forms:
            masked.append(substitute)
        else:
            masked.append(token)
    return masked

# Extract the labels first: the statement below overwrites `tokens`, and the
# original "be" forms cannot be recovered from the masked token list.
targets = find_targets(tokens)

# Mask every "be" form in the token stream with the substitute tag.
tokens = remove_targets(tokens)

def create_windows(tokens, window_size=5):
    """ Create windows surrounding be forms.

    For every occurrence of the substitute tag in `tokens`, collect up to
    `window_size` tokens on each side of it.

    Returns a pair `(left_contexts, right_contexts)` of lists holding one
    space-joined string per occurrence. Left contexts are in reading order;
    right contexts are reversed, so in both strings the token closest to the
    tag comes last. Windows are shorter than `window_size` (possibly empty)
    next to either end of `tokens`.
    """
    left_contexts = []
    right_contexts = []
    for i, word in enumerate(tokens):
        if word != substitute:
            continue

        # Clamp the start at 0: a negative start index would wrap around to
        # the end of the list and give an empty (or wrong) left context for
        # tags found within the first `window_size` tokens.
        start = max(0, i - window_size)
        left_contexts.append(' '.join(tokens[start:i]))

        # Slicing past the end of the list is safe; it just yields fewer tokens.
        following = tokens[i + 1:i + window_size + 1]
        right_contexts.append(' '.join(following[::-1]))
    return left_contexts, right_contexts

# Context strings (10 tokens at most) on each side of every masked "be" form
l_contexts, r_contexts = create_windows(
    tokens,
    window_size=10,
)

# Learn the mapping from target names to integer labels, then encode the targets
target_encoder = LabelEncoder().fit(targets)
y = target_encoder.transform(targets)

Number of "be" verb occurences: 18647
Targets distribution:
	am: 255
	are: 1056
	be: 2483
	been: 1404
	being: 327
	is: 2867
	was: 8012
	were: 2243


### Build training set

In [4]:
# Per-sample sampling probabilities for later over-sampling: each sample is
# weighted by the inverse of its class count, then the weights are normalized
# so that they sum to 1.
distrib = np.bincount(y)
prob = 1.0 / distrib[y]
prob = prob / prob.sum()

In [5]:
# Split every space-joined context string back into a list of tokens
left_tokenized_sentences = list(map(str.split, l_contexts))
right_tokenized_sentences = list(map(str.split, r_contexts))

In [6]:

vocabulary_size = 8000

unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Count word frequencies over the contexts from both sides of the targets
tokens = left_tokenized_sentences + right_tokenized_sentences
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokens))
print("Found %d unique words tokens." % len(word_freq))

# Keep the most frequent words; the last index is reserved for the unknown token
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [word for word, _ in vocab] + [unknown_token]
word_to_index = {word: index for index, word in enumerate(index_to_word)}

# Map every out-of-vocabulary word to the unknown token (both lists are updated in place)
for sentences in (left_tokenized_sentences, right_tokenized_sentences):
    for i, sent in enumerate(sentences):
        sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

# Build the training data: word indices for each context, integer labels for the targets
X_train_left = np.asarray(
    [[word_to_index[w] for w in sent] for sent in left_tokenized_sentences]
)
X_train_right = np.asarray(
    [[word_to_index[w] for w in sent] for sent in right_tokenized_sentences]
)
y_train = np.asarray(target_encoder.fit_transform(targets))

Found 18836 unique words tokens.


### Custom implementation of RNN and Back-propagation through time (based on [this tutorial](http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-1-introduction-to-rnns/))


In [8]:
from rnn import softmax, CustomRNN, train_with_sgd

## Train Bidirectional-RNN

In [11]:
# Seed NumPy's global RNG so that the index sampling below is repeatable.
# Statement order matters here: everything that draws from np.random after
# this line sees a state determined by the calls before it.
np.random.seed(10)
# One output unit per distinct label value in y.
output_dim=len(np.unique(y))
n_samples = 1000
n_train_rnn = 700

# Draw n_samples indices into y, with replacement and with the per-sample
# probabilities `prob`.
# NOTE(review): because the draw is with replacement, the same index can end
# up in both `mask` and `mask_blend`, so the two subsets are not guaranteed
# to be disjoint.
full_mask = np.random.choice(range(len(y)), size=n_samples, replace=True, p=prob)
# First n_train_rnn draws train the RNNs; the remaining draws are kept for the blend step.
mask = full_mask[:n_train_rnn]
mask_blend = full_mask[n_train_rnn:]

# Per-side outputs of predict_probas: on the training draws (y_pred) and on
# the blend draws (y_blend).
y_pred = {}
y_blend = {}

# Train one model per context side, each on the same sampled indices.
# CustomRNN and train_with_sgd come from the local `rnn` module; their
# internals are not visible in this notebook.
for side, data in [('left', X_train_left), ('right', X_train_right)]:
    model = CustomRNN(vocabulary_size, output_dim=output_dim, bptt_truncate=5)
    losses = train_with_sgd(model, data[mask], y_train[mask], 
                            nepoch=10, evaluate_loss_after=5, learning_rate=0.01)
    
    # NOTE(review): the score printed below is computed on the samples the
    # model was just trained on (data[mask]), i.e. it is a training accuracy.
    y_pred[side] = model.predict_probas(data[mask])
    print('Score ' + side + ': %.03f' % accuracy_score(y_train[mask], np.argmax(y_pred[side], axis=1)))
    y_blend[side] = model.predict_probas(data[mask_blend])

2017-06-07 01:13:58: Loss after num_examples_seen=0 epoch=0: 2.080306
2017-06-07 01:14:28: Loss after num_examples_seen=3500 epoch=5: 1.700008
Score left: 0.369
2017-06-07 01:14:59: Loss after num_examples_seen=0 epoch=0: 2.079476
2017-06-07 01:15:30: Loss after num_examples_seen=3500 epoch=5: 1.840428
Score right: 0.259


In [12]:
# Count how often each "be" form is the arg-max prediction of the left-context
# model on the training draws (forms that are never predicted do not appear).
np.unique(target_encoder.inverse_transform(np.argmax(y_pred['left'], axis=1)), return_counts=True)

(array(['am', 'are', 'be', 'been', 'being', 'is', 'was'], 
       dtype='<U5'), array([128, 103,   2, 115, 182, 158,  12]))

In [13]:
# Count how often each "be" form is the arg-max prediction of the right-context
# model on the training draws (forms that are never predicted do not appear).
np.unique(target_encoder.inverse_transform(np.argmax(y_pred['right'], axis=1)), return_counts=True)

(array(['am', 'are', 'been', 'being', 'is', 'was'], 
       dtype='<U5'), array([ 51, 332, 145,   6, 139,  27]))

## Blended meta-model
Use the predictions of the left-context and right-context RNNs as input features for a meta-model that predicts the class (i.e. the correct "be" form).

In [14]:
import sklearn

# Features of the meta-model: the left and right models' predictions on the
# blend samples, placed side by side (one row per sample).
X_blend = np.concatenate([y_blend[side_name] for side_name in ('left', 'right')], axis=1)
y_true = y[mask_blend]

clf = LogisticRegression().fit(X_blend, y_true)

# Distribution of the forms predicted by the meta-model
print(np.unique(
    target_encoder.inverse_transform(clf.predict(X_blend)),
    return_counts=True,
))
# Accuracy measured on the very samples the meta-model was fitted on
print('Score %.3f ' % clf.score(X_blend, y_true))

(array(['am', 'are', 'being', 'were'], 
      dtype='<U5'), array([ 73,  33,  67, 127]))
Score 0.240 
