In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD


import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models, similarities, matutils
from textblob import TextBlob

import string
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()



Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [2]:
all_books_nosw_singularized = pd.read_pickle('data/allbooks_sing_nosw.pkl')


In [3]:
all_books_nosw_singularized

Unnamed: 0,chapter_title,book_title,cumulative_chapter_number,text
0,Dragonmount,The Eye of the world,1,palace still shook occasionally earth rumbled ...
1,An Empty Road,The Eye of the world,2,wheel time turn age come pas leaving memory be...
2,Strangers,The Eye of the world,3,rand mat carried first barrel common room mast...
3,The Peddler,The Eye of the world,4,cluster pot clattered banged wagon rumbled hea...
4,The Gleeman,The Eye of the world,5,door inn banged shut behind whitehaired spun a...
...,...,...,...,...
672,To Awaken,A Memory of Light,673,rand broke free darknes entered pattern fully ...
673,Watching the Flow Writhe,A Memory of Light,674,fought barely forsaken leaned rock ledge short...
674,A Brilliant Lance,A Memory of Light,675,many dead hundred thousand man trolloc lying p...
675,Light and Shadow,A Memory of Light,676,beneath emptines wolf dream consumed perrin co...


In [11]:
chapters = list(all_books_nosw_singularized['text'])
sentences = []
for chapter in chapters:
    sentences.append(chapter.split())


In [12]:
sentences[0]

['palace',
 'still',
 'shook',
 'occasionally',
 'earth',
 'rumbled',
 'memory',
 'groaned',
 'would',
 'deny',
 'happened',
 'bar',
 'sunlight',
 'cast',
 'rent',
 'wall',
 'made',
 'mote',
 'dust',
 'glitter',
 'yet',
 'hung',
 'air',
 'scorchmark',
 'marred',
 'wall',
 'floor',
 'ceiling',
 'broad',
 'black',
 'smear',
 'crossed',
 'blistered',
 'paint',
 'gilt',
 'oncebright',
 'mural',
 'soot',
 'overlaying',
 'crumbling',
 'frieze',
 'man',
 'animal',
 'seemed',
 'attempted',
 'walk',
 'madnes',
 'grew',
 'quiet',
 'dead',
 'lay',
 'everywhere',
 'man',
 'woman',
 'child',
 'struck',
 'attempted',
 'flight',
 'lightning',
 'flashed',
 'every',
 'corridor',
 'seized',
 'fire',
 'stalked',
 'sunken',
 'stone',
 'palace',
 'stone',
 'flowed',
 'sought',
 'almost',
 'alive',
 'stillnes',
 'came',
 'odd',
 'counterpoint',
 'colorful',
 'tapestry',
 'painting',
 'masterwork',
 'hung',
 'undisturbed',
 'except',
 'bulging',
 'wall',
 'pushed',
 'awry',
 'finely',
 'carved',
 'furnishing

In [13]:
model = models.Word2Vec(sentences=sentences)

In [35]:
def analogy(model, word1, word2, word3):
    """
    Word1 is to word2 as word3 is to ____
    
    """
    return model.wv.most_similar(positive=[word3, word2], negative=[word1])

In [59]:
analogy(model, 'death', 'feather', 'mountain')

[('patch', 0.8808349370956421),
 ('pile', 0.8698686361312866),
 ('mound', 0.8676391243934631),
 ('branch', 0.8640056252479553),
 ('charred', 0.8566128611564636),
 ('dirt', 0.8488328456878662),
 ('pine', 0.8482270240783691),
 ('dusty', 0.8478937149047852),
 ('chimney', 0.842915415763855),
 ('fountain', 0.8424304723739624)]

In [21]:
model.wv['rand']

array([ 1.2647893 ,  0.87539184, -0.14690855, -1.4398797 , -0.72815233,
        0.62691957,  1.2964183 ,  1.5620862 , -1.053662  , -0.07106052,
       -1.4280834 ,  0.06553512, -0.1678091 , -0.16337052, -1.4460572 ,
        1.2082964 ,  0.22239584, -1.8622708 ,  0.23363753,  0.8319343 ,
        0.59509164, -0.28850845,  0.99668145, -0.08815145, -0.35512275,
        0.39267936, -1.1862912 ,  0.6414035 , -1.1267184 , -2.5286522 ,
        1.1055104 , -0.6944974 ,  1.4864856 ,  0.1675514 ,  0.30319718,
       -1.1520995 , -1.1195104 ,  0.35014102, -2.1543245 ,  0.85262704,
       -0.43884754,  0.20891573, -0.62485915,  0.11979114,  0.2933149 ,
       -0.1665919 ,  0.72766197, -1.3317894 , -0.51869184,  0.13167895,
       -0.27207398,  0.5205721 ,  0.900303  , -0.44251072, -1.2829995 ,
       -1.051829  ,  1.0972345 , -0.2610633 , -1.7302569 ,  0.68326503,
       -0.07234459, -0.94209826, -0.22066213, -1.6859617 ,  0.80562705,
       -1.0232506 , -0.79005635,  1.2233381 ,  1.845674  ,  1.13