In [1]:
import numpy as np
import pandas as pd

In [2]:
import gensim

In [3]:
import os

In [4]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [5]:
import nltk
from nltk.corpus import stopwords

In [6]:
books = []
for filename in os.listdir('Data'):
    filepath = os.path.join('Data',filename)
    with open(filepath, encoding='ISO-8859-1') as f:
        corpus = f.read()
        raw_sent = sent_tokenize(corpus)
        for sent in raw_sent:
            books.append(simple_preprocess(sent))   

In [7]:
len(books)

145020

In [8]:
books

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait'],
 ['he',
  'was',
  'an',
  'old',
  'man',
  'past',
  'fifty',
  'and',
  'he',
  'had',
  'seen',
  'the',
  'lordlings',
  'come',
  'and',
  'go'],
 ['dead', 'is', 'dead', 'he', 'said'],
 ['we', 'have', 'no', 'business', 'with', 'the', 'dead'],
 ['are', 'they', 'dead'],
 ['royce', 'asked', 'softly'],
 ['what', 'proof', 'have', 'we'],
 ['will', 'saw', 'them', 'gared', 'said'],
 ['if',
  'he',
  'says',
  'they',
  'are',
  'dead',
  'that',
  'proof',


In [9]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4
)

In [10]:
model.build_vocab(books)

In [11]:
print(model.wv.key_to_index) # vocabulary



In [12]:
model.train(books, total_examples=model.corpus_count, epochs = model.epochs)

(6570455, 8628190)

In [13]:
model.wv.most_similar('game')

[('roads', 0.7324135899543762),
 ('maesters', 0.7006590962409973),
 ('enemy', 0.6913532614707947),
 ('slavers', 0.6800565719604492),
 ('whores', 0.6764947175979614),
 ('reason', 0.675395667552948),
 ('country', 0.6752565503120422),
 ('ways', 0.6732255816459656),
 ('immortal', 0.6721555590629578),
 ('trouble', 0.657966673374176)]

In [14]:
model.wv.doesnt_match(['game','throne','king','queen'])

'game'

In [15]:
model.wv.most_similar('ankit') # out of vocabulary

KeyError: "Key 'ankit' not present in vocabulary"

In [16]:
model.wv['kings']

array([ 0.25428724, -0.5093093 , -0.30320838,  0.33222923, -1.769225  ,
        0.60981894, -0.02096075, -0.03360816, -0.07489543, -0.95036066,
       -1.0639912 ,  0.41214657,  0.39980862,  0.5056025 , -1.1743628 ,
       -0.06912631,  0.9914513 , -0.6590471 , -0.4357261 , -0.98325545,
       -0.05003123,  0.16053139,  0.01802812,  0.47534704, -1.3193599 ,
       -0.6084382 , -0.8308183 ,  0.22717033,  0.2663196 , -0.12810737,
       -1.4204059 ,  1.0651202 , -0.24102741, -0.33515826,  0.12611185,
        0.27207384, -1.0422441 , -0.5140524 , -0.76909405, -0.11877871,
       -0.95558935,  0.42009878, -0.58435774,  0.3200048 ,  0.8461805 ,
       -0.25737587,  0.52077454, -0.7414074 ,  0.93505317,  0.4550598 ,
       -0.19526315, -0.5989469 , -0.684046  , -1.774291  ,  0.5590645 ,
        1.0416232 ,  0.1179958 ,  0.18607472, -1.0305817 ,  0.8739114 ,
        0.637415  , -0.26194477, -0.10639223,  1.0980976 ,  0.2742103 ,
        0.27901486, -1.1819855 ,  0.07736759,  0.30095547, -0.25

In [17]:
model.wv.similarity('king','queen')

0.50075763

In [18]:
model.wv.similarity('arya','sansa')

0.8284079

In [19]:
model.wv.get_normed_vectors() # to see all vector together 

array([[-0.12663051, -0.06727301,  0.01881121, ..., -0.15728587,
         0.12267602,  0.1458247 ],
       [-0.11530391, -0.05488504,  0.12392443, ..., -0.08606807,
        -0.01076798,  0.0686364 ],
       [ 0.15016893, -0.09991801, -0.14354555, ..., -0.13042691,
         0.09734825, -0.06656829],
       ...,
       [ 0.03985105,  0.02423153, -0.05235585, ..., -0.09385668,
         0.08155461, -0.15068415],
       [-0.00107922, -0.00378898,  0.14944062, ..., -0.04264737,
         0.14138491, -0.07311466],
       [-0.0916058 ,  0.11560037,  0.12976128, ..., -0.07429948,
         0.11209701, -0.01485299]], dtype=float32)

In [20]:
model.wv.get_normed_vectors().shape

(17453, 100)

In [21]:
# to see which vector belongs to wchich word
y = model.wv.index_to_key

In [22]:
y

['the',
 'and',
 'to',
 'of',
 'he',
 'his',
 'was',
 'you',
 'her',
 'in',
 'it',
 'had',
 'that',
 'she',
 'as',
 'with',
 'him',
 'not',
 'but',
 'for',
 'they',
 'is',
 'at',
 'on',
 'said',
 'my',
 'have',
 'be',
 'lord',
 'them',
 'no',
 'from',
 'would',
 'were',
 'me',
 'your',
 'one',
 'all',
 'when',
 'will',
 'ser',
 'if',
 'so',
 'their',
 'we',
 'could',
 'are',
 'man',
 'there',
 'this',
 'up',
 'been',
 'what',
 'did',
 'by',
 'king',
 'do',
 'men',
 'back',
 'out',
 'more',
 'or',
 'who',
 'down',
 'well',
 'than',
 'only',
 'like',
 'jon',
 'some',
 'father',
 'old',
 'hand',
 'even',
 'too',
 'tyrion',
 'before',
 'never',
 'an',
 'off',
 'see',
 'know',
 'into',
 'made',
 'now',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'then',
 'how',
 'long',
 'has',
 'can',
 'might',
 'us',
 'come',
 'where',
 'here',
 'through',
 'still',
 'face',
 'head',
 'red',
 'll',
 'way',
 'boy',
 'page',
 'must',
 'once',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 

In [23]:
len(y)

17453

In [24]:
from sklearn.decomposition import PCA

In [25]:
pca = PCA(n_components=3)

In [26]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [27]:
X.shape

(17453, 3)

In [28]:
import plotly.express as px
fig = px.scatter_3d(X[:100], x =0 , y =1,z=2,color=y[:100])
fig.show()

In [30]:
fig = px.scatter_3d(X[200:500], x =0 , y =1,z=2,color=y[200:500])
fig.show()