In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import spacy

import settings

# Distributed vector representation model

In [2]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [4]:
# File name of the normalized corpus inside settings.NORMALIZED_DATA_PATH
NORMALIZED_TEXT_FILE_NAME = 'normalized_text.txt'
# Full path to the normalized corpus file
normalized_text_file = os.path.join(settings.NORMALIZED_DATA_PATH, NORMALIZED_TEXT_FILE_NAME)

In [5]:
# Load normalized text
# NOTE(review): LineSentence presumably streams the file one sentence per line
# instead of loading it into memory - confirm against the installed gensim version.
normalized_text = LineSentence(normalized_text_file)
# Path where the word2vec model is saved to and later loaded from
word2vec_filepath = os.path.join(settings.NORMALIZED_DATA_PATH, 'word2vec_model')

### Train the word2vec model. Set the vector dimension and the number of epochs.

In [6]:
# Run this cell to retrain the model. The model is saved in "word2vec_filepath".
# Most important training parameters:
# size - word vector dimension
# window - context size

# Training parameters
# Word vector dimension
vector_dim = 100
# Context size
context_size = 5
# Number of training rounds: one in the constructor plus (epochs - 1) train() calls
epochs = 20

# Set to False to skip training and use the previously saved model
RETRAIN_WORD2VEC = True

if RETRAIN_WORD2VEC:
    # Take the normalized (trigram) text and run the first training round.
    # NOTE(review): how many passes over the corpus the constructor and each
    # train() call perform depends on the gensim version - confirm.
    text2vec = Word2Vec(normalized_text, size=vector_dim, window=context_size,
                        min_count=20, sg=1, workers=4)
    # Save after the first round so a checkpoint exists before the loop starts
    text2vec.save(word2vec_filepath)
    # Run the remaining rounds, overwriting the checkpoint after each one
    for i in range(1, epochs):
        text2vec.train(normalized_text)
        print("This is epoch N-{}".format(i))
        text2vec.save(word2vec_filepath)

### Load trained word2vec model

In [8]:
# Load vector representation from trained 'word2vec_model'
text2vec = Word2Vec.load(word2vec_filepath)
# NOTE(review): presumably precomputes the normalized vectors (syn0norm)
# that the DataFrame cell reads - confirm against the installed gensim version.
text2vec.init_sims()

# Shows the model's train_count as the number of training epochs.
# NOTE(review): presumably incremented once per training call, not per pass - confirm.
print('{} training epochs.'.format(text2vec.train_count))

20 training epochs.


### Convert the word2vec vectors to a pandas DataFrame

In [9]:
# Collect one (term, index, count) record per vocabulary entry
ordered_vocab = []
for term, voc in text2vec.vocab.items():
    ordered_vocab.append((term, voc.index, voc.count))

# Order the records by ascending count (third field of each record).
# Use -record[2] instead to put the most frequent terms first.
ordered_vocab.sort(key=lambda record: record[2])

# Unzip the records into three parallel tuples: terms, indices, counts
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# Build the pandas data frame: one row of normalized vector components per term,
# rows picked from syn0norm by vocabulary index and labelled with the term
word_vectors = pd.DataFrame(
    text2vec.syn0norm[term_indices, :],
    index=ordered_terms,
)

In [16]:
# Preview rows 1-49 of the word-vector table (positional slice; row 0 is not shown)
word_vectors.iloc[1:50]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
licensee,0.031852,0.019403,-0.097552,-0.038575,-0.041155,0.098296,0.010953,0.146101,0.158197,0.1132,...,-0.073482,0.042011,-0.070181,-0.041359,0.019073,-0.197888,-0.016013,0.020854,-0.09058,0.075486
smith_moore_leatherwood_llp,0.004333,-0.182786,0.085152,-0.168357,-0.007947,-0.048193,0.00903,-0.089799,0.10867,-0.038184,...,-0.147976,0.075361,-0.208663,0.064516,-0.119285,-0.019405,-0.126509,-0.197764,-0.062363,-0.010732
ktla,0.036297,-0.109484,-0.194115,0.053863,0.018302,0.056144,-0.231593,0.048295,0.037637,0.121789,...,0.115552,0.073199,-0.143269,-0.079307,0.214425,0.123593,0.01756,-0.025347,-0.115492,0.03055
land_building,0.153641,-0.061031,0.006827,0.115745,0.046312,-0.201267,-0.133011,-0.033139,-0.017308,0.077886,...,0.073858,0.112958,0.114405,0.066535,0.167094,0.080539,-0.20346,0.034857,-0.147164,0.001367
obligatory,0.043459,-0.250421,0.030282,0.031152,-0.02584,0.038683,-0.189824,0.171932,-0.05721,-0.094105,...,-0.047514,0.013574,-0.069723,-0.021799,-0.063731,-0.133714,-0.225109,-0.073817,0.020206,-0.010243
broadcasting_segment,-0.032649,-0.02078,-0.054819,0.029976,-0.049132,0.110964,-0.063405,0.129736,-0.119845,0.017012,...,0.035264,0.018775,-0.067737,-0.014907,0.204278,0.040504,0.043438,0.025313,-0.03863,0.024217
multi,0.021568,-0.07126,0.026943,-0.014172,0.058215,-0.007215,-0.022585,0.121711,0.048283,0.006675,...,-0.015075,0.216692,0.144236,-0.217891,0.033411,-0.073512,-0.144628,-0.072741,-0.101999,-0.045995
proposal,-0.128872,-0.065134,0.014567,0.207158,0.065669,-0.024745,-0.07025,-0.01182,-0.141841,-0.147899,...,0.150639,0.340918,0.039162,0.006713,-0.023356,-0.204673,-0.001672,-0.074338,-0.08188,-0.002166
phones,-0.064605,-0.065115,0.046106,0.107942,-0.177545,-0.072098,-0.112624,0.027657,0.078143,0.062857,...,0.150285,-0.042596,0.222061,0.046646,-0.02849,-0.089464,0.062914,0.141921,-0.111336,-0.083787
mct,0.075137,-0.003903,-0.06879,0.207596,0.009695,-0.052115,-0.103006,0.148466,0.09071,-0.272532,...,-0.095807,0.11844,-0.183096,-0.011689,-0.006163,-0.147188,0.102247,0.182102,0.081169,0.144527


### Words similarity function

In [18]:
# Get words similar/related to a token according to the trained model
def get_context_related_words(token, topn=10):
    '''Return a {word: similarity} dictionary of the topn words most similar to token.'''
    neighbours = text2vec.most_similar(positive=[token], topn=topn)
    return {neighbour: score for neighbour, score in neighbours}

#### Testing the model's performance. Trying to find semantically close words and phrases.

Find semantically close words to word 'cost'

In [21]:
# Show the 20 words the model reports as most similar to 'cost'
get_context_related_words('cost', topn=20)

{'153.7': 0.38838207721710205,
 '2,472.9': 0.42199376225471497,
 '2,494.1': 0.4554964601993561,
 '27.7': 0.48493120074272156,
 '3,663.0': 0.4689284861087799,
 'amortize': 0.5120112895965576,
 'brokerage': 0.39271965622901917,
 'deferred': 0.3934595584869385,
 'discount': 0.40827229619026184,
 'expense': 0.44802170991897583,
 'expensing': 0.39666426181793213,
 'extraordinary': 0.39504194259643555,
 'gain': 0.3928598165512085,
 'human_resource': 0.43243271112442017,
 'legal': 0.414577454328537,
 'obstacle': 0.397826224565506,
 'pricing': 0.40854325890541077,
 'recognition': 0.4200395345687866,
 'retrospective_adoption_approximately': 0.4277220666408539,
 'technology': 0.39840424060821533}

### Word-vector arithmetic function (adding and subtracting word meanings)

In [None]:
def word_algebra(add=None, subtract=None, topn=1):
    '''Print the topn words returned for a combination of added and subtracted tokens.

    add=['token1','token2']       tokens passed to the model as positive examples
    subtract=['token1','token2']  tokens passed to the model as negative examples
    topn                          number of result words to print

    Each resulting term is printed on its own line; nothing is returned.
    '''
    # None defaults avoid sharing one mutable list between calls; an omitted
    # argument still reaches most_similar as an empty list, as before.
    positive = [] if add is None else add
    negative = [] if subtract is None else subtract
    answers = text2vec.most_similar(positive=positive, negative=negative, topn=topn)
    for term, _similarity in answers:
        print(term)

## t-SNE: t-distributed stochastic neighbor embedding
Maps high-dimensional data to a low-dimensional space (2 or 3 dimensions).

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Number of word vectors to feed into t-SNE.
# Kept under its own name: other cells bind `tsne_vectors` to the t-SNE output
# (first an array, then a data frame), so the count should not share that name.
tsne_vector_count = 600

# Take data from the pandas data frame and remove stopwords from it
# (errors='ignore' skips stopwords that are not present in the index),
# then keep only the first tsne_vector_count rows.
# NOTE(review): spacy.en.STOPWORDS is the path used by the spaCy version this
# notebook was written for - confirm it exists in the installed version.
tsne_input = (
    word_vectors
    .drop(spacy.en.STOPWORDS, errors=u'ignore')
    .head(tsne_vector_count)
)

# Path of the pickled t-SNE model file 'tsne_model'
tsne_filepath = os.path.join(settings.DATA_PATH, 'tsne_model')

# Path of the t-SNE output vectors file 'tsne_vectors.npy'
tsne_vectors_filepath = os.path.join(settings.DATA_PATH, 'tsne_vectors.npy')


### t-SNE training

In [None]:
# Trains the t-SNE dimension reduction. TODO: check additional tweaks (TSNE parameters).
# Saves the fitted t-SNE model in file 'tsne_filepath'
# Saves the t-SNE vectors in file 'tsne_vectors_filepath'

# Set to False to skip training and use the previously saved files
RETRAIN_TSNE = True
# Fixed seed so that repeated runs on the same input give a repeatable embedding
TSNE_RANDOM_STATE = 42

if RETRAIN_TSNE:
    tsne = TSNE(random_state=TSNE_RANDOM_STATE)
    tsne_vectors = tsne.fit_transform(tsne_input.values)
    with open(tsne_filepath, 'wb') as f:
        pickle.dump(tsne, f)
    # numpy is called directly; the pd.np alias is deprecated in pandas
    np.save(tsne_vectors_filepath, tsne_vectors)

### Load pretrained t-SNE model

In [None]:
# Loads the pickled t-SNE model.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook's own training cell, never files from an untrusted source.
with open(tsne_filepath, 'rb') as f:
    tsne = pickle.load(f)

# Loads the t-SNE coordinates from 'tsne_vectors_filepath'
# (numpy is called directly; the pd.np alias is deprecated in pandas)
tsne_coords = np.load(tsne_vectors_filepath)

# Converts the coordinates to a pandas data frame labelled with the words of
# tsne_input; the saved array must have one row per tsne_input row and two columns
tsne_vectors = pd.DataFrame(tsne_coords,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

In [13]:
# Sanity check: preview the first rows of the t-SNE coordinates
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
work_capital,2.273404,10.040244
in_most_case,14.516599,4.91739
socalled,9.111805,4.795907
component,23.65216,3.79331
scope,-24.871328,-5.657409


In [14]:
# Copy the index (the words) into a regular 'word' column;
# the plot's hover tooltip refers to it as '@word'
tsne_vectors['word'] = tsne_vectors.index

## Plot t-SNE data

In [15]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# NOTE(review): presumably configures bokeh to render plots inline in the notebook - confirm
output_notebook()

In [16]:
# Build an interactive scatter plot of the t-SNE coordinates, one circle per word.
# NOTE(review): no show(tsne_plot) call is visible in this part of the cell;
# confirm one follows, otherwise the figure is configured but never displayed.

# Wrap the tsne_vectors data frame in a bokeh ColumnDataSource
plot_data = ColumnDataSource(tsne_vectors)

# Create plot
# NOTE(review): the 'resize' tool and plot_width/plot_height may not be accepted
# by every bokeh version - confirm against the installed one.
tsne_plot = figure(title='Word embeddings for M&A domain vector space',
                   plot_width = 800,
                   plot_height = 800,
                   tools= ('pan, wheel_zoom, box_zoom,'
                           'box_select, resize, reset'),
                   active_scroll='wheel_zoom')

# Add hover tool to plot; the tooltip shows the 'word' column of plot_data
tsne_plot.add_tools(HoverTool(tooltips = '@word'))

# Plot words as circles positioned by the 'x_coord' / 'y_coord' columns
tsne_plot.circle('x_coord', 'y_coord', source=plot_data,
                 color='blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color='black')
# Title font size
tsne_plot.title.text_font_size = value('16pt')

# Axis parameters: hide both axes, the grid lines and the plot outline
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None