In [None]:
# Notebook for visualizing word embeddings. 
#
# Uses t-SNE from sklearn.manifold followed by adjustText (https://github.com/Phlya/adjustText).
#
# For the similarity metric for t-SNE, I used cosine similarity since it tends to produce 
# better visualizations than Euclidean.
#
# The code below runs t-SNE on the GloVe (https://nlp.stanford.edu/projects/glove/) 
# embeddings for the most frequent 25K words, plots the top 3K words*, then uses 
# adjustText to spread out the text labels so they are more readable.
# 
# The files containing the embeddings (glove.840B.300d.top25k.txt) and vocabulary 
# (vocab.filt.top3k.txt) are provided with this download.
# 
# *Some manual filtering was done to the vocabulary to remove some NSFW word types.
#
# Kevin Gimpel
# 2019-2020

%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.manifold import TSNE

# Seed Python's built-in `random` module. With no argument the seed comes from
# OS entropy / the current time, so this does not make runs repeatable, and it
# has no effect on NumPy-based randomness (which scikit-learn relies on).
random.seed()

def loadQueryWordsAsSet(filename):
    """Read a vocabulary file and return its words as a set.

    The first whitespace-separated token of every line is taken as the word;
    any further tokens on the line are ignored. Blank (or whitespace-only)
    lines are skipped instead of raising IndexError.

    Args:
        filename: Path to a UTF-8 text file with one word per line.

    Returns:
        A set containing the distinct words found in the file.
    """
    print("Loading query words (as a set) from file", filename)
    queryWords = set()
    # Context manager guarantees the handle is closed, even on errors.
    # The encoding is explicit so that non-ASCII words load identically on
    # every platform rather than depending on the locale default.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Nothing on this line (e.g. a trailing empty line).
                continue
            queryWords.add(splitLine[0])
    print("Done. ",len(queryWords)," query words loaded!")
    return queryWords

def loadEmbeddings(filename):
    """Read word embeddings from a text file into a dictionary.

    Each non-blank line is split on whitespace: the first token is the word
    and all remaining tokens are parsed as floats forming its vector. If a
    word occurs on more than one line, the last occurrence wins. Blank (or
    whitespace-only) lines are skipped instead of raising IndexError.

    Args:
        filename: Path to a UTF-8 text file in "word v1 v2 ... vn" format.

    Returns:
        A dict mapping each word (str) to a 1-D NumPy float array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    print("Loading embeddings from file", filename)
    model = {}
    # Context manager guarantees the handle is closed, even on errors.
    # The encoding is explicit so that non-ASCII tokens load identically on
    # every platform rather than depending on the locale default.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Nothing on this line (e.g. a trailing empty line).
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [None]:
from adjustText import adjust_text

In [None]:
# Default to a very large canvas (width, height in inches) so that words lying
# close together in the projection can still be told apart.
plt.rcParams.update({'figure.figsize': [100, 60]})

In [None]:
# We'll use the top 25K most common words in the GloVe embeddings.
# loadEmbeddings returns a dict mapping each word to its vector (a NumPy array).
gembs = loadEmbeddings("glove.840B.300d.top25k.txt")

In [None]:
# Word list in the dictionary's iteration order; position i here corresponds to row i of gX.
words = np.array(list(gembs))
# Matrix fed to t-SNE: one row vector per word. keys() and values() of a dict
# iterate in the same order, so rows stay aligned with `words`.
gX = np.array(list(gembs.values()))
# Expected shape: (25000, 300).
gX.shape

In [None]:
# Run t-SNE on the embeddings using cosine distance as the metric, PCA
# initialisation, and at most 2000 optimisation iterations.
# Note: no random_state is passed, so the layout can differ between runs.
tsne_params = dict(n_components=2, early_exaggeration=12, verbose=2, metric='cosine', init='pca')
try:
    # scikit-learn >= 1.5 names the iteration cap `max_iter`.
    mytsne = TSNE(max_iter=2000, **tsne_params)
except TypeError:
    # Older scikit-learn releases only accept the previous name, `n_iter`.
    mytsne = TSNE(n_iter=2000, **tsne_params)
gX_tsne = mytsne.fit_transform(gX)

In [None]:
# Load the set of words to plot. We only plot about 3000 words so that they are easier
# to distinguish visually, but note that we used many more embeddings when running
# t-SNE above, which helps us learn a better projection.
wordsToPlot = loadQueryWordsAsSet("vocab.filt.top3k.txt")

In [None]:
# Draw each selected word as a text label at its 2-D t-SNE position, save the
# raw layout, let adjustText spread the labels out, then save the adjusted layout.
fig = plt.figure()
alltexts = list()
# words[i] corresponds to row i of gX_tsne, so enumerate keeps them aligned.
for i, word in enumerate(words):
    # Only plot if the current word is a word we want to plot.
    if word not in wordsToPlot:
        continue
    # Place an invisible (size 0) point at the word's position
    # (presumably so that axis autoscaling covers every label location).
    plt.scatter(gX_tsne[i,0], gX_tsne[i,1], s=0)
    # Create a text element at that point.
    currtext = plt.text(gX_tsne[i,0], gX_tsne[i,1], word, family='sans-serif')
    # Store the text element so adjust_text can reposition it later.
    alltexts.append(currtext)

# Save a pdf of the visualization before we run adjustText.
plt.savefig('wordembviz-glove-tsne25k-plot3k-noadj.pdf', format='pdf')
# Run adjust_text on the text elements (note: this may take a very long time).
print('now running adjust_text...')
# Note: using autoalign=True tends to give better results in my experience, but takes much longer.
#numiters = adjust_text(alltexts, autoalign=True, lim=200)
#numiters = adjust_text(alltexts, autoalign=True, lim=20, save_steps=True, add_step_numbers=False, save_prefix='wordembviz-glove-tsne25k-plot3k-autoalign-step', save_format='pdf')
#numiters = adjust_text(alltexts, autoalign=False, lim=20, save_steps=True, add_step_numbers=False, save_prefix='wordembviz-glove-tsne25k-plot3k-step', save_format='pdf')
numiters = adjust_text(alltexts, autoalign=False, lim=200)
print('done adjust_text, num iterations: ', numiters)
# Save a second pdf with the adjusted label positions.
plt.savefig('wordembviz-glove-tsne25k-plot3k-adj.pdf', format='pdf')

# Call show() explicitly; a bare `plt.show` only references the function and
# makes the notebook echo its repr instead of rendering the figure.
plt.show()