In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(12,8)})

import requests
import zipfile
import imageio

import datashader as ds
import datashader.transfer_functions as tf
import colorcet

import umap
import MulticoreTSNE
import fitsne

In [2]:
%time df = pd.read_csv('GoogleNews-vectors-negative300.txt', nrows=200000, skiprows=1, header=None, sep=' ')

FileNotFoundError: File b'GoogleNews-vectors-negative300.txt' does not exist

In [3]:
# Use the token column (column 0) as the row index so the remaining columns
# are purely numeric. The guard makes the cell safe to re-run: once column 0
# has been moved into the index it is no longer in df.columns, and a second
# set_index(0) would raise KeyError.
if 0 in df.columns:
    df = df.set_index(0)

NameError: name 'df' is not defined

In [None]:
# Peek at the first three rows to sanity-check the load.
df.iloc[:3]

In [None]:
# Dense float64 copy of the frame's values; this is the matrix fed to the
# embedding algorithms below.
data = df.values.astype(np.float64)

Next we pull the object IDs from the file names using pandas.

### Now we have our data in a list of vectors.  Let's extract the object IDs from the files and cast them to a data frame (in case we want to explore things further).

### Now let's use UMAP to embed these points into a two dimensional space.

In [None]:
fit = umap.UMAP(random_state=42, metric='cosine', gamma=2.0)
%time u = fit.fit_transform(data)

In [None]:
# Scatter plot of the UMAP embedding. The explicit Axes interface is used so
# the figure carries a title and axis labels; the trailing semicolon
# suppresses the return-value repr in the cell output.
fig, ax = plt.subplots()
ax.scatter(u[:, 0], u[:, 1], s=0.005)
ax.set(title='UMAP embedding', xlabel='UMAP 1', ylabel='UMAP 2');

In [None]:
# Density rendering of the UMAP embedding with datashader: aggregate the
# points onto an 800x600 canvas, colour the counts on a log scale with the
# colorcet "fire" map, and place the result on a black background.
cvs = ds.Canvas(plot_width=800, plot_height=600)
agg = cvs.points(pd.DataFrame(u, columns=['x_col', 'y_col']), 'x_col', 'y_col')
# tf.interpolate is the deprecated former name of tf.shade; use tf.shade so
# the cell runs on current datashader releases.
img = tf.set_background(
    tf.shade(agg, cmap=colorcet.fire, how='log'),
    "black",
)
img

In [None]:
# Save the UMAP coordinates alongside their words as CSV.
embedding = pd.DataFrame(dict(word=df.index, x=u[:, 0], y=u[:, 1]))
embedding.to_csv(path_or_buf="embedding_word_200_umap1.csv")

# t-SNE

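To get angular distance we need to L2-normalize the data: for unit-length vectors the Euclidean distance is a monotonic function of the angle between them ($\lVert a - b \rVert^2 = 2 - 2\cos\theta$).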

In [None]:
from sklearn.preprocessing import normalize

In [None]:
# Scale every row (axis=1) of `data` to unit L2 norm.
tsne_data = normalize(data, axis=1, norm='l2')

In [None]:
fit_tsne = MulticoreTSNE.MulticoreTSNE(n_jobs=1, random_state=42)
%time u_tsne = fit_tsne.fit_transform(tsne_data)

In [None]:
# Save the t-SNE coordinates alongside their words as CSV.
tsne_columns = {
    'word': df.index,
    'x': u_tsne[:, 0],
    'y': u_tsne[:, 1],
}
embedding = pd.DataFrame(tsne_columns)
embedding.to_csv("embedding_word_200_tsne1.csv")

In [None]:
# Scatter plot of the t-SNE embedding. The explicit Axes interface is used so
# the figure carries a title and axis labels; the trailing semicolon
# suppresses the return-value repr in the cell output.
fig, ax = plt.subplots()
ax.scatter(u_tsne[:, 0], u_tsne[:, 1], s=0.1)
ax.set(title='t-SNE embedding', xlabel='t-SNE 1', ylabel='t-SNE 2');

In [None]:
# Density rendering of the t-SNE embedding with datashader: aggregate the
# points onto an 800x600 canvas, colour the counts on a log scale with the
# colorcet "fire" map, and place the result on a black background.
cvs = ds.Canvas(plot_width=800, plot_height=600)
agg = cvs.points(pd.DataFrame(u_tsne, columns=['x_col', 'y_col']), 'x_col', 'y_col')
# tf.interpolate is the deprecated former name of tf.shade; use tf.shade so
# the cell runs on current datashader releases.
img = tf.set_background(
    tf.shade(agg, cmap=colorcet.fire, how='log'),
    "black",
)
img

### PCA

PCA is the old standby: it is blindingly fast to compute and is often used as an initialization for many of the more complex algorithms.  It isn't really a competitor and should instead be thought of as a strawman.  Given that other algorithms initialize with these values, one would hope that they can do better.

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
%time u_pca = pca.fit_transform(tsne_data)

In [None]:
# Scatter plot of the PCA projection. The explicit Axes interface is used so
# the figure carries a title and axis labels; the trailing semicolon
# suppresses the return-value repr in the cell output.
fig, ax = plt.subplots()
ax.scatter(u_pca[:, 0], u_pca[:, 1], s=0.005)
ax.set(title='PCA projection', xlabel='PC 1', ylabel='PC 2');

In [None]:
# Density rendering of the PCA projection with datashader: aggregate the
# points onto an 800x600 canvas, colour the counts on a log scale with the
# colorcet "fire" map, and place the result on a black background.
cvs = ds.Canvas(plot_width=800, plot_height=600)
agg = cvs.points(pd.DataFrame(u_pca, columns=['x_col', 'y_col']), 'x_col', 'y_col')
# tf.interpolate is the deprecated former name of tf.shade; use tf.shade so
# the cell runs on current datashader releases.
img = tf.set_background(
    tf.shade(agg, cmap=colorcet.fire, how='log'),
    "black",
)
img

## FIt-SNE

In [None]:
%time u_fitsne = fitsne.FItSNE(tsne_data.astype(np.double), nthreads=1, rand_seed=42)

In [None]:
# Save the FIt-SNE coordinates alongside their words as CSV.
fitsne_x = u_fitsne[:, 0]
fitsne_y = u_fitsne[:, 1]
embedding = pd.DataFrame({'word': df.index, 'x': fitsne_x, 'y': fitsne_y})
embedding.to_csv("embedding_word_200_fitsne1.csv")

In [None]:
# Scatter plot of the FIt-SNE embedding. The explicit Axes interface is used
# so the figure carries a title and axis labels; the trailing semicolon
# suppresses the return-value repr in the cell output.
fig, ax = plt.subplots()
ax.scatter(u_fitsne[:, 0], u_fitsne[:, 1], s=0.1)
ax.set(title='FIt-SNE embedding', xlabel='FIt-SNE 1', ylabel='FIt-SNE 2');

In [None]:
# Density rendering of the FIt-SNE embedding with datashader: aggregate the
# points onto an 800x600 canvas, colour the counts on a log scale with the
# colorcet "fire" map, and place the result on a black background.
cvs = ds.Canvas(plot_width=800, plot_height=600)
agg = cvs.points(pd.DataFrame(u_fitsne, columns=['x_col', 'y_col']), 'x_col', 'y_col')
# tf.interpolate is the deprecated former name of tf.shade; use tf.shade so
# the cell runs on current datashader releases.
img = tf.set_background(
    tf.shade(agg, cmap=colorcet.fire, how='log'),
    "black",
)
img