### Load dependencies

In [70]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *

In [71]:
# Fetch the NLTK resources this notebook relies on. As the log below shows,
# packages that are already up to date are not downloaded again.
nltk.download('gutenberg')  # Project Gutenberg corpus, read via nltk.corpus.gutenberg
nltk.download('punkt')  # Punkt tokenizer data (its location is checked with nltk.data.find below)
nltk.download('stopwords')  # stop-word lists used to build `stpwrds`
nltk.download('punkt_tab')  # presumably the tokenizer data format newer NLTK releases expect -- TODO confirm

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/nahmed/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /Users/nahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nahmed/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [72]:
import string

import gensim
from gensim.models.phrases import Phraser, Phrases
from gensim.models.word2vec import Word2Vec

import spacy # for a lemmatization example

from sklearn.manifold import TSNE

import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure

### Load Data

In [73]:
from nltk.corpus import gutenberg
#from nltk import punkt

In [74]:
# Sanity check: print where NLTK found the installed Punkt tokenizer data.
print(nltk.data.find('tokenizers/punkt'))


/Users/nahmed/nltk_data/tokenizers/punkt


In [75]:
# a convenient method that handles newlines, as well as tokenizing sentences and words in one shot
gberg_sents = gutenberg.sents()

In [76]:
gberg_sents[0:6]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [77]:
gberg_sents[4][14]

'father'

#### Iteratively preprocess a sentence

a tokenized sentence:

In [78]:
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

to lowercase:

In [79]:
[w.lower() for w in gberg_sents[4]]

['she',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

remove stopwords and punctuation:

In [80]:
# Everything we want to filter out in the demo below: NLTK's English stop
# words followed by each single ASCII punctuation character.
stpwrds = [*stopwords.words('english'), *string.punctuation]

In [81]:
stpwrds

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [82]:
# Lowercase each token of the example sentence once, then keep only the
# tokens that are neither stop words nor punctuation.
[token for token in map(str.lower, gberg_sents[4]) if token not in stpwrds]

['youngest',
 'two',
 'daughters',
 'affectionate',
 'indulgent',
 'father',
 'consequence',
 'sister',
 'marriage',
 'mistress',
 'house',
 'early',
 'period']

stem words:

In [83]:
stemmer = PorterStemmer()

In [84]:
# Same filter as above, with every surviving token reduced to its Porter stem.
[stemmer.stem(token) for token in map(str.lower, gberg_sents[4]) if token not in stpwrds]

['youngest',
 'two',
 'daughter',
 'affection',
 'indulg',
 'father',
 'consequ',
 'sister',
 'marriag',
 'mistress',
 'hous',
 'earli',
 'period']

a lemmatization example:

In [85]:
# Load the small English spaCy pipeline used for the lemmatization example.
# If the model is not installed yet, run the following once and then re-run
# this cell:
#     spacy.cli.download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm') 

In [86]:
gutenberg.raw()[291:477]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [87]:
# Characters 291-477 of the concatenated raw corpus are the sentence displayed
# in the previous cell (the same sentence as gberg_sents[4]). spaCy is given
# the raw string rather than the NLTK tokens, so it tokenizes the text itself.
spacy_doc = nlp(gutenberg.raw()[291:477])

In [88]:
[w.lemma_ for w in spacy_doc]

['she',
 'be',
 'the',
 'young',
 'of',
 'the',
 'two',
 'daughter',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 '\n',
 'indulgent',
 'father',
 ';',
 'and',
 'have',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 '\n',
 'be',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

handle bigram collocations:

In [89]:
phrases = Phrases(gberg_sents) # train detector

In [90]:
bigram = Phraser(phrases) # create a more efficient Phraser object for transforming sentences

In [91]:
bigram.phrasegrams # output score of each bigram

{'two_daughters': 11.966813731181547,
 'her_sister': 17.7960829227865,
 "'_s": 31.066242737744524,
 'very_early': 11.01214147275924,
 'Her_mother': 13.529425062715127,
 'long_ago': 63.22343628984789,
 'more_than': 29.023584433996874,
 'had_been': 22.306024648925288,
 'an_excellent': 39.063874851750626,
 'Miss_Taylor': 453.75918026073305,
 'very_fond': 24.134280468850747,
 'passed_away': 12.35053642325912,
 'too_much': 31.376002029426687,
 'did_not': 11.72841621714281,
 'any_means': 14.096964108090186,
 'wedding_-': 17.4695197740113,
 'Her_father': 13.129571562488772,
 'after_dinner': 21.528548116881705,
 'self_-': 47.79018053120332,
 'sixteen_years': 107.0461671612265,
 'five_years': 40.128755673408115,
 'years_old': 54.735425236061104,
 'seven_years': 52.59411150244507,
 'each_other': 79.4168405322873,
 'a_mile': 12.783091600264584,
 'must_be': 10.229989650632808,
 'difference_between': 220.52537305244678,
 'could_not': 10.870983286982371,
 'having_been': 11.53801833156938,
 'miles_of

In [92]:
tokenized_sentence = "Jon lives in New York City".split()

In [93]:
tokenized_sentence

['Jon', 'lives', 'in', 'New', 'York', 'City']

In [94]:
bigram[tokenized_sentence]

['Jon', 'lives', 'in', 'New_York', 'City']

#### Preprocess the corpus

In [95]:
# as in Maas et al. (2011):
# - leave in stop words ("indicative of sentiment")
# - no stemming ("model learns similar representations of words of the same stem when data suggests it")
#
# Lowercase every token and drop tokens that are exactly one punctuation
# character. The punctuation lookup is built once, as a set, instead of
# rebuilding list(string.punctuation) for every token in the corpus, and each
# token is lowercased only once.
punctuation_chars = set(string.punctuation)
lower_sents = [
    [token for token in (w.lower() for w in s) if token not in punctuation_chars]
    for s in gberg_sents
]

In [96]:
lower_sents[0:5]

[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period']]

In [97]:
# Train a bigram detector on the lowercased, punctuation-free sentences using
# the default settings, then wrap it in a Phraser for transforming sentences.
lower_bigram = Phraser(Phrases(lower_sents))

In [98]:
lower_bigram.phrasegrams # miss taylor, mr woodhouse, mr weston

{'two_daughters': 11.080802900992637,
 'her_sister': 16.93971298099339,
 'very_early': 10.516998773665177,
 'her_mother': 10.70812618607742,
 'long_ago': 59.22644201533601,
 'more_than': 28.529926612065935,
 'had_been': 21.583193129694834,
 'an_excellent': 37.41859680854167,
 'sixteen_years': 131.42913000977518,
 'miss_taylor': 420.4340982546865,
 'mr_woodhouse': 104.19907841850323,
 'very_fond': 24.185726346489627,
 'passed_away': 11.751473221742694,
 'too_much': 30.363090173835406,
 'did_not': 10.846196223896685,
 'any_means': 14.294148100212627,
 'after_dinner': 18.60737125272944,
 'mr_weston': 91.63290824201266,
 'five_years': 37.66428596665674,
 'years_old': 48.59909444619029,
 'seven_years': 50.3345604292756,
 'each_other': 71.31277029783762,
 'well_informed': 14.185028016786626,
 'a_mile': 11.700110753652233,
 'difference_between': 207.86784241868986,
 'mrs_weston': 180.6778969011602,
 'could_not': 10.213333164207082,
 'having_been': 10.723750443105281,
 'sixteen_miles': 105.040

In [99]:
lower_bigram["jon lives in new york city".split()]

['jon', 'lives', 'in', 'new_york', 'city']

In [100]:
# Retrain the bigram detector with much stricter settings than the gensim
# defaults (documented as min_count=5, threshold=10.0): a pair must occur at
# least 32 times and score above 64 to be merged. This rebinds `lower_bigram`,
# replacing the default-settings model built above.
lower_bigram = Phraser(
    Phrases(
        lower_sents,
        min_count=32,
        threshold=64,
    )
)
lower_bigram.phrasegrams

{'miss_taylor': 156.44059469941823,
 'mr_woodhouse': 82.04651843976633,
 'mr_weston': 75.87438262077481,
 'mrs_weston': 160.6848509325892,
 'great_deal': 93.36368125424357,
 'mr_knightley': 161.74131790625913,
 'miss_woodhouse': 229.03802722366902,
 'years_ago': 74.31594785893046,
 'mr_elton': 121.3990121932397,
 'dare_say': 89.94000515807346,
 'frank_churchill': 1316.4456593286038,
 'miss_bates': 276.3958829169252,
 'drawing_room': 84.91494947493561,
 'mrs_goddard': 143.57843432545658,
 'miss_smith': 73.03442128232508,
 'few_minutes': 204.16834974753786,
 'john_knightley': 83.03755747111268,
 'don_t': 250.30957446808512,
 'good_natured': 88.69936184891343,
 'few_moments': 107.77584531675087,
 'thousand_pounds': 166.51834523092802,
 'o_clock': 89.14789088153574,
 'jane_fairfax': 654.556591758761,
 'miss_fairfax': 196.19987447261062,
 'ma_am': 157.25846601094193,
 'mrs_elton': 93.08931456265867,
 'forty_years': 90.60220877269607,
 'cut_off': 129.60290535032792,
 'ten_thousand': 84.00099

In [101]:
# Run every lowercased sentence through the bigram model so that detected
# word pairs are merged into single "first_second" tokens.
clean_sents = [lower_bigram[sentence] for sentence in lower_sents]

In [102]:
clean_sents[0:9]

[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period'],
 ['her',
  'mother',
  'had',
  'died',
  'too',
  'long',
  'ago',
  'for',
  'her',
  'to',
  'have',
  'more',
  'than',
  'an',
 

In [103]:
clean_sents[6] 

['sixteen',
 'years',
 'had',
 'miss_taylor',
 'been',
 'in',
 'mr_woodhouse',
 's',
 'family',
 'less',
 'as',
 'a',
 'governess',
 'than',
 'a',
 'friend',
 'very',
 'fond',
 'of',
 'both',
 'daughters',
 'but',
 'particularly',
 'of',
 'emma']

#### Run word2vec

In [104]:
# max_vocab_size can be used instead of min_count (which has increased here)
#model = Word2Vec(sentences=clean_sents, vector_size=64, sg=1, window=10, epochs=5, min_count=10, workers=4)
#model.save('clean_gutenberg_model.w2v')

#### Explore model

In [105]:
# Reuse the previously trained model so the notebook does not retrain on every
# run. If the saved file is missing (e.g. on a fresh checkout), train it with
# the hyperparameters from the "Run word2vec" cell and save it for next time.
# Note: a freshly trained model will not reproduce the exact similarity scores
# shown in the outputs below.
MODEL_PATH = 'clean_gutenberg_model.w2v'
try:
    model = gensim.models.Word2Vec.load(MODEL_PATH)
except FileNotFoundError:
    model = Word2Vec(
        sentences=clean_sents,
        vector_size=64,
        sg=1,
        window=10,
        epochs=5,
        min_count=10,
        workers=4,
    )
    model.save(MODEL_PATH)

In [106]:
len(model.wv.index_to_key) # would be 17k if we carried out no preprocessing

10329

In [107]:
model.wv['dog']

array([ 0.11170352, -0.01975164, -0.06077615,  0.0746507 ,  0.52928394,
       -0.4380223 ,  0.38018367,  0.01629548, -0.56401396,  0.04204961,
        0.18780634, -0.20944749, -0.21192774,  0.24859811, -0.05702229,
       -0.4978539 , -0.5740705 ,  0.4117754 , -0.20773852,  0.18435024,
       -0.06542065,  0.20911497,  0.07929189, -0.08785108,  0.12042707,
        0.2207345 , -0.49650133,  0.27293816, -0.01228218, -0.10952681,
        0.08144296,  0.18765336, -0.20427647, -0.06749586,  0.1120729 ,
       -0.65185034,  0.34308186, -0.19132315,  0.2511206 ,  0.15088737,
       -0.23394755, -0.21594656, -0.36667964, -0.51579547,  0.02793126,
        0.05298718, -0.24000369,  0.16590333, -0.31613466,  0.10384604,
       -0.3846141 ,  0.1235968 ,  0.3039016 ,  0.13040595,  0.47042772,
        0.713325  ,  0.3713773 ,  0.2473759 ,  0.19535154,  0.00370864,
       -0.073861  , -0.2059934 , -0.149284  , -0.18297836], dtype=float32)

In [108]:
len(model.wv['dog'])

64

In [109]:
model.wv.most_similar('dog', topn=3)

[('puppy', 0.7888665795326233),
 ('pet', 0.7561296224594116),
 ('thief', 0.7467076182365417)]

In [110]:
model.wv.most_similar('eat', topn=3)

[('bread', 0.8263170719146729),
 ('drink', 0.8125216960906982),
 ('meat', 0.7664123773574829)]

In [111]:
model.wv.most_similar('day', topn=3)

[('week', 0.7419387102127075),
 ('morning', 0.7376074194908142),
 ('month', 0.6915972828865051)]

In [112]:
model.wv.most_similar('father', topn=3)

[('mother', 0.8248094320297241),
 ('brother', 0.7591659426689148),
 ('sister', 0.7110846042633057)]

In [113]:
model.wv.most_similar('ma_am', topn=3) 

[('betty', 0.8638188242912292),
 ('madam', 0.8344510793685913),
 ('m_sure', 0.8287560343742371)]

In [114]:
model.wv.doesnt_match("mother father sister brother dog".split())

'dog'

In [115]:
model.wv.similarity('father', 'dog')

0.5373756

In [116]:
# Analogy by vector arithmetic: which words are closest to father - man + woman?
model.wv.most_similar(positive=['father', 'woman'], negative=['man']) 

[('mother', 0.8166472911834717),
 ('sister', 0.8013228178024292),
 ('daughter', 0.7926044464111328),
 ('wife', 0.7692066431045532),
 ('husband', 0.740737795829773),
 ('brother', 0.6997798681259155),
 ('tamar', 0.6961928009986877),
 ('rebekah', 0.6762732267379761),
 ('child', 0.6743687391281128),
 ('handmaid', 0.6735405921936035)]

In [117]:
model.wv.most_similar(positive=['husband', 'woman'], negative=['man']) 

[('wife', 0.7362794876098633),
 ('sister', 0.723841667175293),
 ('daughter', 0.6996491551399231),
 ('maid', 0.6623563766479492),
 ('mother', 0.6585159301757812),
 ('widow', 0.643583357334137),
 ('conceived', 0.6223747134208679),
 ('harlot', 0.615094006061554),
 ('child', 0.6141168475151062),
 ('womb', 0.6082120537757874)]

#### Reduce word vector dimensionality with t-SNE

In [118]:
#tsne = TSNE(n_components=2, random_state=42)  # Remove n_iter

In [119]:
#X_2d = tsne.fit_transform(model.wv[model.wv.index_to_key])

In [120]:
#coords_df = pd.DataFrame(X_2d, columns=['x','y'])
#coords_df['token'] = model.wv.key_to_index.keys()

In [121]:
#coords_df.head()

In [122]:
#coords_df.to_csv('clean_gutenberg_tsne.csv', index=False)

#### Visualise

In [123]:
# Reuse the cached 2-D t-SNE coordinates when available; otherwise compute
# them from the model's word vectors (slow) and cache them for next time.
TSNE_CSV = 'clean_gutenberg_tsne.csv'
try:
    # keep_default_na=False: by default read_csv turns strings such as "null",
    # "nan" or "NA" into NaN, which would silently blank out any vocabulary
    # token spelled that way.
    coords_df = pd.read_csv(TSNE_CSV, dtype={'token': str}, keep_default_na=False)
except FileNotFoundError:
    tsne = TSNE(n_components=2, random_state=42)
    X_2d = tsne.fit_transform(model.wv[model.wv.index_to_key])
    coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
    # index_to_key is in the same order as the vectors passed to t-SNE above.
    coords_df['token'] = model.wv.index_to_key
    coords_df.to_csv(TSNE_CSV, index=False)

In [124]:
output_notebook()

In [125]:
# Plot only a random subset of tokens to keep the figure legible. The seed is
# fixed so the same tokens are drawn on every run, and the sample size is
# capped at the number of available rows so a small vocabulary cannot raise.
subset_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [126]:
# Draw each sampled token as a text glyph at its 2-D coordinates.
p = figure(height=800, width=800)
_ = p.text(
    text=subset_df['token'],
    x=subset_df['x'],
    y=subset_df['y'],
)

In [127]:
show(p)