# Word Embeddings (word2vec)

**Word embeddings** are **vector** representations of words.

**Word2Vec** is one of the most popular techniques for learning word embeddings using a shallow neural network.

In [1]:
# installing packages
! pip install gensim



In [2]:
# importing packages
import nltk
import pandas as pd
import string
import gensim
import bokeh.io
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
from bokeh.resources import INLINE

# render matplotlib figures inline in the notebook output
%matplotlib inline
# fetch the NLTK 'gutenberg' corpus package (reported as already up-to-date when present)
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/msonjap/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

## Get corpus of Project Gutenberg books

In [3]:
# getting public domain books
from nltk.corpus import gutenberg

In [4]:
# listing the file ids of the public domain texts bundled in the corpus
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [5]:
# counting every token in the corpus, across all files
# (presumably this includes punctuation tokens, which are only filtered out
#  further down in the notebook -- TODO confirm)
len(gutenberg.words())

2621613

## Tokenize

In [6]:
# getting the corpus as sentences, each sentence being a list of word tokens
gberg_sents = gutenberg.sents()

In [7]:
# peeking at the second sentence (index 1) to see the tokenized form
gberg_sents[1]

['VOLUME', 'I']

## Process the corpus

In [8]:
# removing (only) capitalization and punctuation from the corpus
# the punctuation lookup is built once, as a set, instead of a fresh list per token;
# note that only tokens that are exactly one punctuation character are dropped
punctuation = set(string.punctuation)
lower_sents = [
    [word.lower() for word in sentence if word.lower() not in punctuation]
    for sentence in gberg_sents
]

## Word embeddings

In [None]:
# training a word2vec model on the cleaned sentences
# gensim 4.0 renamed two constructor arguments (size -> vector_size, iter -> epochs);
# pick the spelling that matches the installed version so the cell runs on both
if int(gensim.__version__.split('.')[0]) >= 4:
    version_kwargs = {'vector_size': 64, 'epochs': 5}
else:
    version_kwargs = {'size': 64, 'iter': 5}

model = Word2Vec(sentences=lower_sents,
                 sg=1,           # 1 selects skip-gram (otherwise CBOW)
                 window=10,      # context window size
                 min_count=10,   # ignore words seen fewer than 10 times
                 workers=4,      # training threads
                 **version_kwargs)

In [None]:
# looking up the learned embedding vector for a single word
model.wv["dog"]

array([ 0.22279131,  0.29291525, -0.10614601,  0.04117888,  0.00455246,
       -0.18285047, -0.1974895 ,  0.1709207 , -0.00104911,  0.27018604,
       -0.1106019 ,  0.33000317, -0.2310855 ,  0.12785162, -0.14745755,
       -0.48652875, -0.21596232, -0.5863022 , -0.03720129, -0.5991778 ,
       -0.09051408,  0.01368778, -0.18260558, -0.05264063, -0.22875598,
        0.05219515, -0.28427622,  0.32841358, -0.2831135 , -0.43946558,
       -0.1576695 , -0.03575251, -0.11319762, -0.07995832, -0.32105467,
        0.10908943, -0.45934325,  0.2865859 , -0.29813007,  0.03949017,
       -0.55328125,  0.00140527, -0.02543302,  0.567693  ,  0.1200754 ,
        0.43267724,  0.7768422 ,  0.42127737, -0.33191466,  0.29339343,
       -0.16830774, -0.4158343 ,  0.11786711,  0.36775258,  0.12001032,
       -0.01372659,  0.11720549,  0.38484454, -0.36438483, -0.13066961,
       -0.3189706 , -0.18296607, -0.40573075,  0.28403422], dtype=float32)

In [None]:
# getting the top-n words most similar to a given word, ranked by cosine similarity;
# each result is a (word, similarity score) pair
model.wv.most_similar('father', topn=4)

[('mother', 0.8260801434516907),
 ('brother', 0.7541067004203796),
 ('cousin', 0.7053717374801636),
 ('uncle', 0.6931771039962769)]

In [None]:
# getting the similarity score between two words (a single float)
model.wv.similarity('father','dog')

0.58774924

## Visualize the word similarities

In [None]:
# reducing the 64 dimensions down to 2 in order to visualize

# vocabulary as an ordered list of tokens: gensim >= 4 exposes it as `index_to_key`
# (the old `vocab` dict raises AttributeError there); gensim 3.x only has `vocab`
tokens = getattr(model.wv, 'index_to_key', None)
if tokens is None:
    tokens = list(model.wv.vocab)

# 1000 optimisation iterations is TSNE's default, so it is left implicit: the keyword
# was renamed n_iter -> max_iter in scikit-learn 1.5, and naming either one breaks
# on the other side of that change.
# random_state pins the projection so the layout is repeatable for a given model.
tsne = TSNE(n_components=2, random_state=42)
X_2d = tsne.fit_transform(model.wv[tokens])
coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
# the same list was used to fetch the vectors, so rows and tokens stay aligned
coords_df['token'] = tokens

In [None]:
# static 2-D scatter plot of the projected coordinates, one translucent dot per token
scatter_opts = dict(figsize=(12, 12), marker='.', s=10, alpha=0.2)
_ = coords_df.plot.scatter(x='x', y='y', **scatter_opts)

In [None]:
# creating a Bokeh interactive visualization
bokeh.io.output_notebook(INLINE)
# plotting at most 5000 randomly sampled tokens; the cap at the frame length avoids
# the ValueError DataFrame.sample raises when n exceeds the number of rows
subset_df = coords_df.sample(n=min(5000, len(coords_df)))
# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3.0 removed
p = figure(width=800, height=800)
# drawing each token as a text label at its 2-D coordinates
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)
show(p)