In [1]:
"""
Learning word vectors from Sherlock Holmes series

Patrick Coady (pcoady@alum.mit.edu)
"""

from wordvector import WordVector
from windowmodel import WindowModel
import docload
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

In [2]:
# UNCOMMENT below to load and process a document for first time

files = ['../data/adventures_of_sherlock_holmes.txt',
        '../data/hound_of_the_baskervilles.txt',
        '../data/sign_of_the_four.txt']
word_array, dictionary, num_lines, num_words = docload.build_word_array(
    files, vocab_size=50000, gutenberg=True)
# save processed book for quick future load
docload.save_word_array('../data/aofsh', word_array, dictionary)

print('Document loaded and processed: {} lines, {} words.'
      .format(num_lines, num_words))

Document loaded and processed: 24080 lines, 244986 words.


In [3]:
## UNCOMMENT below to load previously processed book
## aofsh = previously processed Adventures of Sherlock Holmes

# word_array, dictionary = docload.load_word_array('../data/aofsh')

In [4]:
print('Building training set ...')
x, y = WindowModel.build_training_set(word_array)

# shuffle and split 10% validation data
x, y = shuffle(x, y, random_state=0)
split = round(x.shape[0]*0.9)
x_val, y_val = (x[split:, :], y[split:, :])
x, y = (x[:split, :], y[:split, :])

print('Training set built.')
graph_params = {'batch_size': 32,
                'vocab_size': np.max(x)+1,
                'embed_size': 64,
                'hid_size': 64,
                'neg_samples': 64,
                'learn_rate': 0.002,
                'name': 'sherlock'}  # name for model save
model = WindowModel(graph_params)
print('Model built. Vocab size = {}. Document length = {} words.'
      .format(np.max(x)+1, len(word_array)))

print('Training ...')
results = model.train(x, y, x_val, y_val, epochs=30)

word_vector_embed = WordVector(results['embed_weights'], dictionary)
word_vector_nce = WordVector(results['nce_weights'], dictionary)

Building training set ...
Training set built.
Model built. Vocab size = 11750. Document length = 244986 words.
Training ...
epoch 1: total batches = 6890. train loss = 113.16, val loss = 77.61
epoch 2: total batches = 13780. train loss = 62.71, val loss = 51.11
epoch 3: total batches = 20670. train loss = 43.10, val loss = 35.76
epoch 4: total batches = 27560. train loss = 31.17, val loss = 26.08
epoch 5: total batches = 34450. train loss = 23.10, val loss = 19.96
epoch 6: total batches = 41340. train loss = 17.65, val loss = 15.01
epoch 7: total batches = 48230. train loss = 13.76, val loss = 12.38
epoch 8: total batches = 55120. train loss = 11.06, val loss = 9.80
epoch 9: total batches = 62010. train loss = 9.09, val loss = 8.28
epoch 10: total batches = 68900. train loss = 7.76, val loss = 7.18
epoch 11: total batches = 75790. train loss = 6.82, val loss = 6.48
epoch 12: total batches = 82680. train loss = 6.19, val loss = 5.99
epoch 13: total batches = 89570. train loss = 5.79, va

In [5]:
print('100 most common words')
print(word_vector_embed.most_common(100))

100 most common words
[',', '.', 'the', '"', 'and', 'i', 'of', 'to', 'a', 'that', 'it', 'in', 'he', 'you', 'was', 'his', 'is', 'my', 'have', 'had', 'with', 'as', 'at', '?', 'for', 'which', 'we', 'but', 'be', 'not', 'me', 'this', 'there', 'upon', 'him', 'said', 'from', 'so', 'no', 'on', 'one', 'all', 'holmes', 'been', 'her', 'were', 'what', 'very', 'by', 'your', 'an', 'she', 'are', 'would', '!', 'man', 'out', 'could', 'then', 'if', 'our', 'up', 'when', 'has', 'do', 'will', "'", 'us', 'who', 'some', 'into', 'sir', 'now', 'see', 'down', 'they', 'or', 'should', 'little', 'mr', 'well', 'more', 'over', 'can', 'may', 'know', 'about', 'am', 'think', 'them', 'only', 'must', 'did', 'here', 'come', 'time', 'than', 'how', 'two', 'before']


In [6]:
word = "think"
print('10 closest words to: "', word, '". (based on cosine similarity)')
print(word_vector_embed.n_closest(word=word, num_closest=10, metric='cosine'))

10 closest words to: " think ". (based on cosine similarity)
['been', 'einen', 'sunlight', 'chesterfield', 'imply', 'desperately', 'burglary', 'island', 'objections', 'contact']


In [7]:
word = "who"
print('10 closest words to: "', word, '". (based on cosine similarity)')
print(word_vector_nce.n_closest(word=word, num_closest=10, metric='cosine'))

10 closest words to: " who ". (based on cosine similarity)
['farintosh', 'abrupt', 'mastery', 'scents', 'firelight', 'effected', 'texas', 'peeped', 'shed', "arthur's"]


In [8]:
embed_2d, word_list = word_vector2.project_2d(200, 700)
reverse_dict = word_vector2.get_reverse_dict()
minx, maxx, miny, maxy = (-10, 0, 20, 30)
# plt.ylim(miny, maxy)
# plt.xlim(minx, maxx)
plt.scatter(embed_2d[:,0], embed_2d[:,1])
for i in range(500):
    plt.text(embed_2d[i,0], embed_2d[i,1], reverse_dict[i], clip_on=True)
plt.show()

NameError: name 'word_vector2' is not defined

In [None]:
word_vector_embed.analogy('gentleman', 'lady', 'lord', 5)

In [20]:
x = np.array([[9,62,711,51]])
y_hat = model.predict(x, epoch=29)

In [21]:
y_hat[0].shape


(1,)

In [23]:
y_hat[0][0]

0

In [19]:
word_array[9000:9010]

array([  27,   10,   16, 1159,    9,   62,   51,  711,   51,   84], dtype=int32)