In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

In [3]:
# Utility function to clean text
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'
    # Better get rid of it
    text = re.sub(r'--', ' ', text)
    
    # Get rid of headings in square brackets
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles
    text = re.sub(r'Chapter \d+', '', text)
    
    # Get rid of extra whitespace
    text = ' '.join(text.split())
    
    return text

# Import all the Austen in the Project Gutenberg corpus
austen = ""
for novel in ['persuasion', 'emma']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work
    
# Clean the data
austen_clean = text_cleaner(austen)

In [5]:
# Parse the data.  This can take some time.
nlp = spacy.load('en', max_length = 1339689)
austen_doc = nlp(austen_clean)

In [6]:
# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)
    
print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['for', 'daughter', 'eld', 'give', 'thing', 'tempt']
We have 12591 sentences and 1339688 tokens.


In [23]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    workers=4, # Number of threads to run in parallel (if your comp does parallel processing)
    min_count=10, # Min word count threshold
    window=10, # Number of words around target word to consider
    sg=0, # Use CBOW because our corpus is small
    sample=1e-3, # Penalize frequent words
    size=300, # word vector length
    hs=1 # Use hierarchical softmax
    )

print('done!')

done!


In [24]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('mr', 0.8983513712882996), ('benwick', 0.773814857006073), ('harville', 0.7161682844161987), ('wentworth', 0.6972966194152832), ('symptom', 0.6767303943634033), ('anne', 0.6697307825088501), ('daughter', 0.6626896858215332), ('louisa', 0.6307271718978882), ('god', 0.6211845874786377), ('niece', 0.6106087565422058)]
0.6879075
marriage


  if np.issubdtype(vec.dtype, np.int):
  if sys.path[0] == '':
