In [1]:
import sys

# Make the parent directory importable (presumably where the project-local
# `data` and `model` modules imported in the next cell live).
# Guarded so that re-running this cell does not keep appending duplicate
# '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
from data import load_ted_data, split_dataset, TedDataset
from model import MLP

In [3]:
# Load the tokenised talks and their labels from the TED XML dump.
tokens_ted, labels = load_ted_data('../ted_en-20160408.xml')

# Split tokens and labels into train / dev / test partitions.
# NOTE(review): tokens and labels are split by two independent calls; this
# only keeps them aligned if `split_dataset` is deterministic — confirm.
tokens_train, tokens_dev, tokens_test = split_dataset(tokens_ted)
labels_train, labels_dev, labels_test = split_dataset(labels)

# Wrap the training partition in a TedDataset built with min_frequency=10.
train_dataset = TedDataset(
    tokens_train,
    labels_train,
    min_frequency=10,
)

In [4]:
# Settings handed to the MLP constructor.
config = {
    'model_folder': '../tmp',
    'embedding_size': 50,
    'hidden_size': 25,
}

# Construct the model, derive its features from the training set, then
# build the network before restoring saved weights.
model = MLP(config)
model.initialize_features(data=train_dataset)
model.build_model()

# Checkpoint path is "<model_folder>/<lower-cased class name>.torch".
checkpoint_name = type(model).__name__.lower()
checkpoint_path = '{}/{}.torch'.format(model.config['model_folder'],
                                       checkpoint_name)
model.load(checkpoint_path)

In [5]:
# Number of leading entries to visualise. The same cut-off is applied to the
# embedding rows and to the vocabulary so that points and labels line up.
N_WORDS = 1000

# Move the weights to host memory before converting to NumPy: `.numpy()`
# raises for tensors stored on a GPU, while `.cpu()` returns the tensor
# unchanged when it is already on the CPU.
# NOTE(review): assumes row i of `model.emb.weight` corresponds to
# `train_dataset.vocabulary[i]` — confirm against the model/dataset code.
embedding = model.emb.weight.data.cpu().numpy()[:N_WORDS, :]
vocabulary = train_dataset.vocabulary[:N_WORDS]

In [6]:
from sklearn.manifold import TSNE

# Reduce the selected embedding rows to two dimensions for plotting.
# random_state is pinned so repeated runs use the same seed.
tsne = TSNE(
    n_components=2,
    random_state=0,
)
embeddings_tsne = tsne.fit_transform(embedding)

In [7]:
# Display the t-SNE result: one 2-D coordinate row per input embedding row.
embeddings_tsne

array([[ -8.78399754,  -1.74513268],
       [ -0.48540491,   8.24342346],
       [-12.68453693,   9.91200542],
       ..., 
       [ 12.32681274,   6.29628277],
       [  8.77551556, -15.62776661],
       [  8.847579  ,  15.92483711]], dtype=float32)

In [8]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

# Render bokeh output inline in the notebook.
output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

# One data source shared by the scatter glyphs and the text labels:
# x1/x2 are the two t-SNE coordinates, names are the vocabulary entries.
source = ColumnDataSource(data=dict(x1=embeddings_tsne[:,0],
                                    x2=embeddings_tsne[:,1],
                                    names=vocabulary))

p.scatter(x="x1", y="x2", size=8, source=source)

# Named `word_labels` rather than `labels` so the dataset labels loaded
# earlier in the notebook are not overwritten by a plot annotation object.
word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                       text_font_size="8pt", text_color="#555555",
                       source=source, text_align='center')
p.add_layout(word_labels)

show(p)