In [None]:
!wget http://vis.mit.edu/embedding-comparator/raw_data/histwords/embeddings_1800.tsv
!wget http://vis.mit.edu/embedding-comparator/raw_data/histwords/embeddings_1990.tsv
!wget http://vis.mit.edu/embedding-comparator/raw_data/histwords/words.tsv

In [None]:
import numpy as np
import pandas as pd
import altair as alt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from tabulate import tabulate

# Lift Altair's cap on the number of rows a chart may embed; the plots below
# draw one point per word (6121 rows in this dataset).
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Welcome to our user study!
In this notebook, we have loaded in 2 learned word-embeddings. 

1.   `emb_1800`: word embeddings learned from Google Book data from 1800-1810. 
2.   `emb_1990`: word embeddings learned from Google Book data from 1990-2000.

Your job is to compare these two models. Feel free to write any code or use third-party tools that will help you understand the similarities and differences between these two spaces.

In [None]:
def load_emb(tsv_file):
  """
  Loads word embeddings from a tab-separated file.
  Each non-blank line holds one embedding as tab-separated floats.
  Args:
    tsv_file (str): path to the TSV file of embeddings
  Returns: numpy array of dimension (num_words, embedding_size).
  Raises:
    ValueError: if a field cannot be parsed as a float.
  """
  rows = []
  with open(tsv_file, 'r') as f:
    # Stream the file line by line instead of materializing it with readlines().
    for line in f:
      line = line.strip()
      if not line:
        # Blank lines (e.g. an extra newline at the end of the file) would
        # otherwise crash on float('').
        continue
      rows.append([float(val) for val in line.split('\t')])
  return np.array(rows)

def load_words(tsv_file):
  """
  Loads the word list from a tab-separated file.
  Only the first tab-separated field of each (stripped) line is kept.
  Args:
    tsv_file (str): path to the TSV file of words
  Returns: numpy array with one entry per line of the file.
  """
  first_fields = []
  with open(tsv_file, 'r') as handle:
    for line in handle.readlines():
      fields = line.strip().split('\t')
      first_fields.append(fields[0])
  return np.array(first_fields)

# Load the shared vocabulary and both embedding matrices from the downloaded
# TSV files.
words = load_words('words.tsv')
emb_1800 = load_emb('embeddings_1800.tsv')
emb_1990 = load_emb('embeddings_1990.tsv')

### The Data
We have loaded the word embeddings for you (`emb_1800` and `emb_1990`), along with a list of words (`words`). Each word (`words[i]`) corresponds to the embeddings at `emb_1800[i]` and `emb_1990[i]`.

These embeddings come from the HistWords: Word Embeddings for Historical Text dataset. More information can be found at https://nlp.stanford.edu/projects/histwords/data_description.html.

In [None]:
print('Word Embeddings from 1800s')
print(emb_1800)
print(emb_1800.shape)

print('\nWord Embeddings for 1990s')
print(emb_1990)
print(emb_1990.shape)

# Show only a sample of the vocabulary: printing every token floods the cell
# output. Inspect `words` directly if you need the full list.
print('\nWord Tokens')
print(list(words[:20]))
print(len(words))

Word Embeddings from 1800s
[[ 0.12144547  0.03922841 -0.00393031 ... -0.02984836 -0.01725622
  -0.07198833]
 [ 0.13627323 -0.03087659 -0.06814367 ...  0.00456374  0.02491411
  -0.03432105]
 [ 0.18926483  0.07692884 -0.01723753 ... -0.04495899  0.01214918
  -0.06378257]
 ...
 [-0.0720244   0.09037425 -0.02263902 ... -0.02500371  0.11514227
   0.02563751]
 [ 0.14392815  0.02951149  0.12126606 ... -0.03287033  0.03895128
  -0.00850182]
 [ 0.1341609   0.01522044  0.00886142 ...  0.01283991  0.05090913
   0.04606852]]
(6121, 300)

Word Embeddings for 1990s
[[ 0.07439162  0.1200703  -0.10586531 ... -0.10513076 -0.08256165
  -0.02761411]
 [ 0.06105045  0.01577934 -0.04813587 ...  0.03418795 -0.06492936
  -0.13170204]
 [ 0.07465506 -0.04898092 -0.11293666 ... -0.05590377  0.01517124
  -0.07435768]
 ...
 [-0.04167826  0.07050642 -0.02665427 ...  0.08708211  0.03796216
  -0.01513428]
 [ 0.04883374  0.00236569 -0.02426026 ... -0.01768698 -0.03616437
   0.00145427]
 [ 0.09784355  0.01847068  0.054

### Utility Functions
We have also provided a few utility functions you may find useful. Please compare these spaces how you regularly would. You are not obligated to use these functions, and you may change or amend them as you see fit.

You can also use the Embedding Projector (https://projector.tensorflow.org/) to explore an embedding space. To do so:


1.   Download the `embeddings_1800.tsv`, `embeddings_1990.tsv`, and `words.tsv` to your local machine by right clicking on the files on the left.
2.   Open the Embedding Projector and click `Load`.
3.   In step 1: upload an `embeddings_xxxx.tsv` file, and in step 2: upload `words.tsv`.

You are also welcome (and encouraged) to use any other tools you are familiar with or feel would help you complete this task. 



In [None]:
# Dimensionality Reduction Utility Functions
def apply_pca(data, n_components=2, random_state=None):
  """ 
  Applies PCA to the data. Reduces the dimensionality to n_components.
  Args:
    data (np.array): embeddings of dimension (num_words, embedding_size)
    n_components (int): the reduced embedding size
    random_state (int or None): seed forwarded to sklearn's PCA; pass an int
                                to get a reproducible projection when a
                                randomized solver is selected. The default
                                (None) keeps the previous behavior.
  Returns: numpy array of dimension (num_words, n_components).
  """
  reducer = PCA(n_components=n_components, random_state=random_state)
  return reducer.fit_transform(data)


def apply_tsne(data, n_components=2, random_state=None):
  """ 
  Applies TSNE to the data. Reduces the dimensionality to n_components.
  Args:
    data (np.array): embeddings of dimension (num_words, embedding_size)
    n_components (int): the reduced embedding size
    random_state (int or None): seed forwarded to sklearn's TSNE. t-SNE is
                                stochastic, so pass an int to get the same
                                layout on every run. The default (None)
                                keeps the previous, unseeded behavior.
  Returns: numpy array of dimension (num_words, n_components).
  """
  reducer = TSNE(n_components=n_components, random_state=random_state)
  return reducer.fit_transform(data)


def plot_reduction(data, words, reducer, title):
  """ 
  Projects the embeddings down to two dimensions and builds an interactive
  scatter plot in which each point shows its word as a tooltip.
  Args:
    data (np.array): embeddings of dimension (num_words, embedding_size)
    words (np.array): word labels corresponding to the embeddings (num_words) 
    reducer (function): function with parameters data and n_components that
                        reduces the dimensionality of the input data
    title (str): title for the plot
  Returns: the interactive altair chart.
  """
  projected = reducer(data, 2)

  # One row per word: its 2D coordinates plus the label used for the tooltip.
  points = pd.DataFrame({
    'x': [row[0] for row in projected],
    'y': [row[1] for row in projected],
    'word': words
  })

  scatter = alt.Chart(points, title=title).mark_point()
  scatter = scatter.encode(x='x:Q', y='y:Q', tooltip=['word'])
  return scatter.interactive()

In [None]:
# PCA projection of the 1800s embeddings; each point carries its word as a tooltip.
plot_reduction(emb_1800, words, apply_pca, '1800 Word Embeddings')

In [None]:
# PCA projection of the 1990s embeddings; each point carries its word as a tooltip.
plot_reduction(emb_1990, words, apply_pca, '1990 Word Embeddings')

In [None]:
# Nearest Neighbors Utils
def k_nearest_neighbors(emb, word, k, words, distances):
  """
  Returns the k nearest neighbors of the given word.
  Args:
    emb (np.array): word embeddings (not used by the lookup itself; kept so
                    existing callers keep working)
    word (str): word to get the nearest neighbors of
    k (int): number of nearest neighbors
    words (np.array): words corresponding to emb
    distances (np.array): precomputed pairwise distance matrix of dimension
                          (num_words, num_words), where distances[i][j] is
                          the distance between words[i] and words[j]
  Returns: list of [neighbor_word, distance] pairs, ordered from the closest
           neighbor to the farthest. The query word itself is never included.
  Raises:
    ValueError: if word does not appear in words.
  """
  # Accept plain lists too: `list == str` would compare as a single bool.
  words = np.asarray(words)
  matches = np.flatnonzero(words == word)
  if len(matches) == 0:
    raise ValueError('%s is not a valid word in the embedding space' %(word))
  word_i = matches[0]

  row = np.asarray(distances[word_i])
  # Stable sort keeps ties in vocabulary order, making the result deterministic.
  order = np.argsort(row, kind='stable')
  # Remove the query word by index rather than assuming it sorts first: with
  # duplicate vectors or floating-point noise another word can tie at (or
  # below) the self-distance and take position 0.
  order = order[order != word_i][:k]
  return [[words[neighbor_i], row[neighbor_i]] for neighbor_i in order]

def show_k_nearest_neighbors(word, neighbors):
  """
  Prints the query word followed by a table of its neighbors.
  Args:
    word (str): the word the neighbors belong to
    neighbors (list): rows of [neighbor, distance] to tabulate
  """
  heading = 'word: %s\n' %(word)
  print(heading)
  table = tabulate(neighbors, headers=['Neighbor', 'Distance'])
  print(table)
  print()

In [None]:
# Query word whose neighborhood is compared across the two embedding spaces.
word = 'good'
# Cosine distance between every pair of 1800s embeddings, computed once and
# passed to the neighbor lookup.
distances_1800 = pairwise_distances(emb_1800, metric='cosine')
neighbors_1800 = k_nearest_neighbors(emb_1800, word, 5, words, distances_1800)
show_k_nearest_neighbors(word, neighbors_1800)

word: good

Neighbor      Distance
----------  ----------
bad           0.471445
luck          0.519251
harm          0.545968
doing         0.58445
excellent     0.584673



In [None]:
# Same lookup against the 1990s embeddings. Relies on `word` having been set
# by an earlier cell.
distances_1990 = pairwise_distances(emb_1990, metric='cosine')
neighbors_1990 = k_nearest_neighbors(emb_1990, word, 5, words, distances_1990)
show_k_nearest_neighbors(word, neighbors_1990)

word: good

Neighbor      Distance
----------  ----------
bad           0.441446
excellent     0.555576
bye           0.583102
pretty        0.58808
luck          0.588224



### Your Code Here!