In [11]:
import os
import pickle

import numpy as np
import pandas as pd
import spacy

import settings

# Distributed vector representation model

In [None]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [None]:
# Location on disk where the trained word2vec model is written and read back
word2vec_filepath = os.path.join(settings.DATA_PATH, 'word2vec_model')

# Trigram sentences, read from 'trigram_sentences.txt' through gensim's LineSentence
trigram_sentences_path = os.path.join(settings.DATA_PATH, 'trigram_sentences.txt')
trigram_sentences = LineSentence(trigram_sentences_path)

### Train the word2vec model: set the vector dimension and the number of epochs

In [None]:
# Run this cell to retrain the model; the result is saved in "word2vec_filepath".
#
# Training hyper-parameters handed to Word2Vec below:
#   vector_dim   -> `size`   : word vector dimension
#   context_size -> `window` : context size
vector_dim = 100
context_size = 5
# Training rounds in total: the constructor call plus (epochs - 1) calls to train()
epochs = 20

# Set to False to skip training and use the previously saved model instead
retrain_word2vec = True

if retrain_word2vec:
    # Build the model from the trigram text; this call performs the first round
    text2vec = Word2Vec(trigram_sentences, size=vector_dim, window=context_size,
                        min_count=20, sg=1, workers=4)
    # Checkpoint after the first round
    text2vec.save(word2vec_filepath)
    # Run the remaining rounds, overwriting the checkpoint after each one
    for epoch in range(1, epochs):
        text2vec.train(trigram_sentences)
        text2vec.save(word2vec_filepath)

### Load trained word2vec model

In [None]:
# Load the vector representation from the trained model stored at 'word2vec_filepath'
text2vec = Word2Vec.load(word2vec_filepath)
# Precompute the normalised vectors; they are read further down through `syn0norm`
text2vec.init_sims()

# Show the training count recorded on the loaded model.
# The model object is `text2vec`; the name `food2vec` used here before is not
# defined anywhere in this notebook and raised NameError.
print('{} training epochs.'.format(text2vec.train_count))

### Convert the word2vec vectors to a pandas DataFrame

In [12]:
# Collect a (term, index, count) tuple for every vocabulary entry, ordered by
# ascending count (lowest count first). Use key=lambda entry: -entry[2] to get
# the highest counts first instead.
ordered_vocab = sorted(
    ((term, voc.index, voc.count) for term, voc in text2vec.vocab.items()),
    key=lambda entry: entry[2]
)

# Split the tuples into three parallel sequences: terms, indices, counts
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# pandas DataFrame of the normalised word vectors: one row per term,
# rows labelled with the term and kept in the order computed above
word_vectors = pd.DataFrame(text2vec.syn0norm[term_indices, :],
                            index=ordered_terms)

### Word similarity function

In [None]:
# Look up the words most closely related to a token in context
def get_context_related_words(token, topn=10):
    '''Return the topn words most similar to token as a {word: similarity} dict.'''
    neighbours = text2vec.most_similar(positive=[token], topn=topn)
    return {word: score for word, score in neighbours}

### Word-meaning linear algebra function

In [None]:
def word_algebra(add=[], subtract=[], topn=1):
    '''Print the topn terms that best match the combination of the given words.

    add      -- tokens passed as positive examples, e.g. ['token1', 'token2']
    subtract -- tokens passed as negative examples, e.g. ['token1', 'token2']

    Each matching term is printed on its own line; nothing is returned.
    '''
    matches = text2vec.most_similar(positive=add, negative=subtract, topn=topn)
    for term, _score in matches:
        print(term)

## t-SNE: t-distributed stochastic neighbor embedding
Maps high-dimensional data to a low-dimensional space (2 or 3 dimensions).

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Number of word vectors to feed into t-SNE.
# Kept under its own name: `tsne_vectors` is assigned further down to hold the
# t-SNE output, so storing the count under that name meant re-running this cell
# silently replaced the results with the integer 600.
tsne_vector_count = 600

# Take the data from the pandas DataFrame and drop the rows labelled with a
# stop word (errors='ignore': stop words missing from the index are skipped).
tsne_input = word_vectors.drop(spacy.en.STOPWORDS, errors=u'ignore')

# Keep only the first `tsne_vector_count` rows
tsne_input = tsne_input.head(tsne_vector_count)

# Path of the binary file holding the pickled t-SNE model ('tsne_model')
tsne_filepath = os.path.join(settings.DATA_PATH, 'tsne_model')

# Path of the binary file holding the t-SNE vectors ('tsne_vectors.npy')
tsne_vectors_filepath = os.path.join(settings.DATA_PATH, 'tsne_vectors.npy')


### t-SNE training

In [None]:
# Train the t-SNE dimension reduction on the selected word vectors, then
#   - pickle the fitted t-SNE model to 'tsne_filepath'
#   - save the resulting vectors to 'tsne_vectors_filepath'
# TODO: check additional t-SNE tweaks.
# NOTE(review): TSNE() is created without a random_state, so the layout
# presumably differs from run to run - confirm whether a fixed seed is wanted.

# Set to False to skip training and use the previously saved results instead
retrain_tsne = True

if retrain_tsne:
    tsne = TSNE()
    tsne_vectors = tsne.fit_transform(tsne_input.values)
    with open(tsne_filepath, 'wb') as f:
        pickle.dump(tsne, f)
    # Call numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
    # removed in pandas 2.0.
    np.save(tsne_vectors_filepath, tsne_vectors)

### Load pretrained t-SNE model

In [None]:
# Load the pickled t-SNE model.
# NOTE: pickle.load can execute arbitrary code; only load a file that this
# notebook produced itself.
with open(tsne_filepath, 'rb') as f:
    tsne = pickle.load(f)

# Load the t-SNE vectors from 'tsne_vectors_filepath'.
# Call numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
tsne_vectors = np.load(tsne_vectors_filepath)

# Convert tsne_vectors to a pandas DataFrame labelled with the same index as
# the t-SNE input, with the two coordinates as named columns
tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

In [13]:
# Sanity check: preview the first rows of the t-SNE coordinates.
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
work_capital,2.273404,10.040244
in_most_case,14.516599,4.91739
socalled,9.111805,4.795907
component,23.65216,3.79331
scope,-24.871328,-5.657409


In [14]:
# Copy the index (the terms) into a regular 'word' column of the data frame;
# the plot's hover tooltip refers to it as '@word'.
tsne_vectors['word'] = tsne_vectors.index

## Plot t-SNE data

In [15]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# Configure Bokeh to render its output inline in the notebook
output_notebook()

In [16]:
# Wrap the tsne_vectors DataFrame in a Bokeh ColumnDataSource so that glyphs
# and tooltips can refer to its columns by name
plot_data = ColumnDataSource(tsne_vectors)

# Create the (square) figure.
# NOTE(review): `plot_width` / `plot_height` and the 'resize' tool are only
# accepted by older Bokeh releases - confirm against the installed version.
tsne_plot = figure(title='Word embeddings for M&A domain vector space',
                   plot_width = 800,
                   plot_height = 800,
                   tools= ('pan, wheel_zoom, box_zoom,'
                           'box_select, resize, reset'),
                   active_scroll='wheel_zoom')

# Add a hover tool that shows the 'word' column of the hovered point
tsne_plot.add_tools(HoverTool(tooltips = '@word'))

# Draw every word as a translucent circle at its (x_coord, y_coord) position
tsne_plot.circle('x_coord', 'y_coord', source=plot_data,
                 color='blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color='black')

# Title font size
tsne_plot.title.text_font_size = value('16pt')

# Hide axes, grid lines and the outline so that only the points remain
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Display the figure. Without this call the plot is configured but never
# rendered (`show` is imported above but was not called in this cell).
show(tsne_plot)