# Learning Word Vectors from Sherlock Holmes Series
Patrick Coady (pcoady@alum.mit.edu)

In [1]:
from wordvector import WordVector
from windowmodel import WindowModel
import docload

import numpy as np
import sklearn.utils
import matplotlib.pyplot as plt

### Load Books, Build Dictionary & Convert Books to Integer Vector
Start with these 3 books (all written by Sir Arthor Conan Doyle):
1. The Adventures of Sherlock Holmes
2. The Hound of the Baskervilles
3. The Sign of the Four

Load the books and build a dictionary of all unique words. The dictionary maps each unique word to an integer. All words are converted to lower case. And punctuation are treated as words (i.e. " , . ? and !). If the size of the book vocabulary exceeds the pre-set limit (**vocab_size**), then the most infrequent words are mapped to the last integer in the dictionary.

In [2]:
files = ['../data/adventures_of_sherlock_holmes.txt',
        '../data/hound_of_the_baskervilles.txt',
        '../data/sign_of_the_four.txt']
word_array, dictionary, num_lines, num_words = docload.build_word_array(
    files, vocab_size=50000, gutenberg=True)

print('Document loaded and processed: {} lines, {} words.'
      .format(num_lines, num_words))

Document loaded and processed: 24080 lines, 244986 words.


## Neural Net Architecture
![](notebook_images/NN_diagram.png)

In [3]:
print('Building training set ...')
x, y = WindowModel.build_training_set(word_array)

# shuffle and split 10% validation data
x_shuf, y_shuf = sklearn.utils.shuffle(x, y, random_state=0)
split = round(x_shuf.shape[0]*0.9)
x_val, y_val = (x_shuf[split:, :], y_shuf[split:, :])
x_train, y_train = (x[:split, :], y[:split, :])

print('Training set built.')
graph_params = {'batch_size': 32,
                'vocab_size': np.max(x)+1,
                'embed_size': 64,
                'hid_size': 64,
                'neg_samples': 64,
                'learn_rate': 0.01,
                'momentum': 0.9,
                'embed_noise': 0.1,
                'hid_noise': 0.1,
                'optimizer': 'Momentum'}
model = WindowModel(graph_params)
print('Model built. Vocab size = {}. Document length = {} words.'
      .format(np.max(x)+1, len(word_array)))

print('Training ...')
results = model.train(x_train, y_train, x_val, y_val, epochs=120, verbose=False)

word_vector_embed = WordVector(results['embed_weights'], dictionary)
word_vector_nce = WordVector(results['nce_weights'], dictionary)

Building training set ...
Training set built.
Model built. Vocab size = 11750. Document length = 244986 words.
Training ...
End Training: total batches = 826800. train loss = 1.93, val loss = 2.06


### 100 Most Common Words

In [4]:
print(word_vector_embed.words_in_range(0,100))

[',', '.', 'the', '"', 'and', 'i', 'of', 'to', 'a', 'that', 'it', 'in', 'he', 'you', 'was', 'his', 'is', 'my', 'have', 'had', 'with', 'as', 'at', '?', 'for', 'which', 'we', 'but', 'be', 'not', 'me', 'this', 'there', 'upon', 'him', 'said', 'from', 'so', 'no', 'on', 'one', 'all', 'holmes', 'been', 'her', 'were', 'what', 'very', 'by', 'your', 'an', 'she', 'are', '!', 'would', 'man', 'out', 'could', 'then', 'if', 'our', 'up', 'when', 'has', 'do', 'will', "'", 'us', 'who', 'some', 'into', 'sir', 'now', 'see', 'down', 'they', 'or', 'should', 'little', 'mr', 'well', 'more', 'over', 'can', 'may', 'know', 'about', 'am', 'them', 'think', 'only', 'must', 'did', 'here', 'come', 'time', 'than', 'how', 'two', 'before']


## Word Similarities
The model learns 2 word vector representations. 
1. The embedding vector from the one-hot input
2. The vector from the hidden layer to the network output

In general, the output layer vector seems to learn more meaningful vector representation of words. We quickly check the closest words (cosine similarity) to the word "six". Remember, this model had no human-labeled data or any data sources outside of the raw book text. The hidden layer to output matrix correctly finds that other numbers are most similar to "six".

In [19]:
word = "won't"
print('Embedding layer: 8 closest words to:', "'" + word + "'")
print(word_vector_embed.n_closest(word=word, num_closest=8, metric='cosine'), '\n')
print('Hidden-to-output layer: 8 closest words to:', "'" + word + "'")
print(word_vector_nce.n_closest(word=word, num_closest=8, metric='cosine'))

Embedding layer: 8 closest words to: 'won't'
['would', 'will', 'clogs', 'grip', 'indeed', 'refusal', 'ryder', 'unraveling'] 

Hidden-to-output layer: 8 closest words to: 'won't'
['alternately', 'll', 'would', 'may', 'pray', 'must', "didn't", "can't"]


In [27]:
word = "running"
print('Embedding layer: 8 closest words to:', "'" + word + "'")
print(word_vector_embed.n_closest(word=word, num_closest=8, metric='cosine'), '\n')
print('Hidden-to-output layer: 8 closest words to:', "'" + word + "'")
print(word_vector_nce.n_closest(word=word, num_closest=8, metric='cosine'))

Embedding layer: 8 closest words to: 'running'
["'i've", 'glances', "wi'", 'delved', 'swiftly', 'natur', 'chesterfield', 'lust'] 

Hidden-to-output layer: 8 closest words to: 'running'
['tugging', 'sharpened', 'wriggled', 'rushing', 'porters', 'sidled', 'trailing', "'singular"]


In [20]:
word = "seven"
print('Embedding layer: 8 closest words to:', "'" + word + "'")
print(word_vector_embed.n_closest(word=word, num_closest=8, metric='cosine'), '\n')
print('Hidden-to-output layer: 8 closest words to:', "'" + word + "'")
print(word_vector_nce.n_closest(word=word, num_closest=8, metric='cosine'))

Embedding layer: 8 closest words to: 'seven'
['ticking', 'drift', 'obstacles', 'sundials', 'exacted', 'ten', 'trip', 'rained'] 

Hidden-to-output layer: 8 closest words to: 'seven'
['five', 'eight', 'four', 'peace', 'six', 'many', 'twelve', 'greater']


In [8]:
# skipping first 100 words (i.e. 'the', 'if', 'and', '.', ',', ...) gives more
# interesting visualization
embed_2d, word_list = word_vector_nce.project_2d(100, 600) # t-sne projection
reverse_dict = word_vector_nce.get_reverse_dict()

In [9]:
print(word_vector_embed.analogy('duke', 'king', 'princess', 5))
print(word_vector_nce.analogy('duke', 'king', 'princess', 5))

['king', 'lonely', 'corps', 'saints', 'princess']
['princess', 'undertaking', 'reincarnation', "'77", 'crumbling']


In [10]:
print(word_vector_embed.analogy('remember', 'forget', 'accept', 5))
print(word_vector_nce.analogy('remember', 'forget', 'accept', 5))

['accept', 'forget', 'strategically', 'particulars', 'morstan']
['accept', 'commence', 'playing', 'entering', 'rejoin']


In [11]:
passage = [x for x in map(lambda x: reverse_dict[x], word_array[12200:12300])]

In [12]:
readable = ''
for word in passage:
    if word == '"':
        readable += word
    elif word in ['?', '!', '.', ',']:
        readable += word + ' '
    else: 
        readable += ' ' + word
print(readable)

"" well,  it is just as i have been telling you,  mr.  sherlock holmes, " said jabez wilson,  mopping his forehead" i have a small pawnbroker's business at coburg square,  near the city.  it's not a very large affair,  and of late years it has not done more than just give me a living.  i used to be able to keep two assistants,  but now i only keep one and i would have a job to pay him but that he is willing to come for half wages


In [14]:
x, y = WindowModel.build_training_set(word_array[(12200-2):(12300+2)])
y_hat = model.predict(x, 120)
passage_predict = [x for x in map(lambda x: reverse_dict[x], y_hat[0])]

In [15]:
readable = ''
for word in passage_predict:
    if word == '"':
        readable += word
    elif word in ['?', '!', '.', ',']:
        readable += word + ' '
    else: 
        readable += ' ' + word
print(readable)

"" yes,  that was,  as i have been with you,  mr.  sherlock holmes, " said holmes. ,  with his. ,  i am a great blue.  at baskerville square,  with the house.  did not a little white time,  and of a.  it is not been more than i let me a week.  i came to be able to have a hours,  and as the shall an,  which i should have been time and get.  but what it is able to be in an was
