In [None]:
# Progress marker: announces the start of the word2vec visualization stage.
print("BEGIN visualize smart-filer_word2vec")

In [None]:
import time                               # perf mon 
import sklearn.manifold                   # dim-reduction for viz
import matplotlib.pyplot as plt           # plot
import seaborn                            # viz
import pandas as pd
import numpy as np
import pickle                             # load,save files
import random                             # randomize offsets

In [None]:
# intermediate variables
# Placeholder only: `_points` is rebound to a pandas DataFrame with the columns
# word/x/y by the "Mapping words to 2D-vector space" cell further down.
_points = []

def w2v_viz_filename():
    """Build the extension-less report path that encodes the run settings.

    The result starts with ``reports/w2v__arg`` and appends one
    ``-<tag>=<value>`` segment per ``USERARG_*`` global, in a fixed order.
    The ``USERARG_*`` names are read from the global namespace; they are not
    defined in this section of the notebook and must exist before the call.

    Returns:
        str: the path prefix. Callers append the file extension themselves.

    Raises:
        NameError: if one of the ``USERARG_*`` globals is undefined.
    """
    # (tag, value) pairs in the exact order they appear in the file name.
    # Every value is passed through str(), including the database file, so a
    # non-str value (e.g. a pathlib.Path) no longer raises TypeError.
    segments = (
        ('dblim', USERARG_RECORD_LIMITER),
        ('pos', USERARG_FILTER_POS),
        ('wlim', USERARG_FILTER_WORDS),
        ('qfea', USERARG_QTY_FEATURES),
        ('wmin', USERARG_QTY_MINWORDS),
        ('wndw', USERARG_SIZE_CONTEXT),
        ('samp', USERARG_DOWNSAMPLING),
        ('statespace', USERARG_DATABASEFILE),
    )
    return 'reports/w2v__arg' + ''.join(
        '-' + tag + '=' + str(value) for tag, value in segments
    )

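# NOTE(review): `model_w2v` is used by the cells below but is not assigned in
# this section; presumably it is created by an upstream step in the same
# kernel -- confirm. To load a previously pickled model instead, uncomment:
#model_w2v = pickle.load( open('../models/w2v.model', "rb" ) )
#model_w2v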

In [None]:
# --- Stage: reduce the word vectors to two dimensions with t-SNE ------------
print("\tInitializing tSNE model (t-Distributed Stochastic Neighbor Embedding)...")
iTime = time.time()

# Two output components; random_state is pinned to 0.
tsne = sklearn.manifold.TSNE(
    n_components=2,
    random_state=0,
)

# The embedding matrix is bound to its own name before fitting; the original
# author noted this two-step form ran faster than nesting the attribute access.
all_word_vectors_matrix = model_w2v.wv.vectors
all_word_vectors_matrix_2d = tsne.fit_transform(
    all_word_vectors_matrix
)

print(
    "\tCompleted in",
    round(time.time() - iTime, 3),
    "seconds",
)

In [None]:
print("\tMapping words to 2D-vector space...")
iTime = time.time()

# Word -> row-index lookup into the embedding matrix. gensim >= 4 exposes it
# as `key_to_index`; older releases keep the index on `vocab[word].index`.
_wv = model_w2v.wv
_word_to_row = getattr(_wv, "key_to_index", None)
if _word_to_row is None:
    _word_to_row = {word: entry.index for word, entry in _wv.vocab.items()}

# One (word, x, y) record per vocabulary entry, in vocabulary iteration order.
_points = pd.DataFrame(
    [
        (word, all_word_vectors_matrix_2d[row, 0], all_word_vectors_matrix_2d[row, 1])
        for word, row in _word_to_row.items()
    ],
    columns=["word", "x", "y"],
)

print("\tSaving point-data to data/w2v_tSNE_points.csv...")
_points.to_csv("data/w2v_tSNE_points.csv")

# A bare expression that is not the last statement of a cell is discarded, so
# the preview has to be printed explicitly to appear in the output.
print(_points.head(10))

print("\tCompleted in", round(time.time() - iTime,3),"seconds")

In [None]:
print("\tCompiling keyword windows...")
iTime = time.time()

# Half-width / half-height of a keyword window: USERARG_WINDOWSIZEPCT percent
# of the full extent of the 2-D point cloud. Kept as floats: truncating to int
# collapsed the window to zero size whenever the value was below 1, which left
# the keyword outside its own window.
_dx = USERARG_WINDOWSIZEPCT * (_points.x.max() - _points.x.min()) / 100
_dy = USERARG_WINDOWSIZEPCT * (_points.y.max() - _points.y.min()) / 100

# Word -> row-index lookup into the embedding matrix. gensim >= 4 exposes it
# as `key_to_index`; older releases keep the index on `vocab[word].index`.
_wv = model_w2v.wv
_word_to_row = getattr(_wv, "key_to_index", None)
if _word_to_row is None:
    _word_to_row = {word: entry.index for word, entry in _wv.vocab.items()}


def _keyword_window(word, row):
    """Return (word, x, y, xLB, xUB, yLB, yUB) for one keyword."""
    x, y = np.around(all_word_vectors_matrix_2d[row], 2)
    # Integer bounds are rounded outward (floor for lower, ceil for upper) so
    # the window always contains the keyword and spans at least +/- _dx, _dy.
    # int() alone truncates toward zero and skewed windows toward the origin.
    return (
        word, x, y,
        int(np.floor(x - _dx)), int(np.ceil(x + _dx)),
        int(np.floor(y - _dy)), int(np.ceil(y + _dy)),
    )


_points_kw = pd.DataFrame(
    [
        _keyword_window(word, row)
        for word, row in _word_to_row.items()
        if word in KEYWORDS
    ],
    columns=["word", "x", "y", "xLB", "xUB", "yLB", "yUB"],
)

# Reset the temporaries, as the original cell did.
_dx = None
_dy = None

print(_points_kw)

print("\tCompleted in", round(time.time() - iTime,3),"seconds")

In [None]:

def plot_region(x_bounds, y_bounds, label_all):
    """Scatter-plot the words that fall inside a rectangle of the 2-D space.

    Reads the module-level DataFrames ``_points`` (all words) and
    ``_points_kw`` (keywords). Draws onto a freshly created 18x10 inch figure,
    which remains the current pyplot figure so the caller can title and save it.

    Args:
        x_bounds: ``(low, high)`` inclusive limits on the x coordinate.
        y_bounds: ``(low, high)`` inclusive limits on the y coordinate.
        label_all: when equal to 1, every word in the region gets a text
            label; otherwise only the keywords are labelled.

    Returns:
        None.
    """
    def _in_region(frame):
        # Rows of `frame` whose (x, y) lie inside the rectangle, bounds included.
        inside = (
            (x_bounds[0] <= frame.x) &
            (frame.x <= x_bounds[1]) &
            (y_bounds[0] <= frame.y) &
            (frame.y <= y_bounds[1])
        )
        return frame[inside]

    # Create the axes explicitly so the function still yields a figure when a
    # selection is empty (pandas may refuse to scatter-plot an empty frame).
    fig, ax = plt.subplots(figsize=(18, 10))

    # plot neighborhood
    neighborhood = _in_region(_points)
    if not neighborhood.empty:
        neighborhood.plot.scatter("x", "y", marker=".", s=10, c="skyblue", ax=ax)

    # label neighborhood; remember what was labelled to avoid duplicate text
    labelled_words = set()
    if label_all == 1:
        for point in neighborhood.itertuples(index=False):
            ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)
            labelled_words.add(point.word)

    # plot keywords
    keywords = _in_region(_points_kw)
    if not keywords.empty:
        keywords.plot.scatter("x", "y", marker="*", s=20, c="red", ax=ax)

    # label keywords that did not already get a neighborhood label
    for point in keywords.itertuples(index=False):
        if point.word not in labelled_words:
            ax.text(point.x, point.y, point.word, fontsize=11)


In [None]:
# --- Overview plot of the whole 2-D word space ------------------------------
print("\tPlotting 2D-vector state space...")
iTime = time.time()

# Output file: the settings-derived prefix plus the SVG extension.
W2V_USERARG_PATH = "".join([w2v_viz_filename(), ".SVG"])

# Span the full extent of the point cloud; label_all=0 labels keywords only.
plot_region(
    x_bounds=(min(_points.x), max(_points.x)),
    y_bounds=(min(_points.y), max(_points.y)),
    label_all=0,
)
plt.title('Word2Vec 2D State Space')
plt.savefig(W2V_USERARG_PATH)
print("\tINFO: Saved to ", W2V_USERARG_PATH)

print(
    "\tCompleted in",
    round(time.time() - iTime, 3),
    "seconds",
)

In [None]:
print("\tPlotting keyword-specific regions in state space...")
iTime = time.time()

W2V_USERARG_PATH=w2v_viz_filename() + ".SVG"

# One zoomed-in plot per keyword. Failures are handled per keyword so a single
# bad entry no longer aborts the remaining plots, and the cause is reported.
for _idx, pt in _points_kw.iterrows():
    try:
        # NOTE(review): the prefix already ends in ".SVG", so files are named
        # "<prefix>.SVG_<word>.SVG". Kept unchanged in case existing reports
        # rely on these names -- confirm before simplifying.
        _kw_path = W2V_USERARG_PATH + "_" + pt.word + ".SVG"
        plot_region( x_bounds=(pt.xLB, pt.xUB), y_bounds=(pt.yLB, pt.yUB), label_all=1)
        # The title has to be set before saving, otherwise the written file
        # does not contain it.
        plt.title("w2v_" + pt.word)
        plt.savefig(_kw_path)
        print("\tINFO: Saved to",_kw_path)
    except Exception as err:
        # `except Exception` lets KeyboardInterrupt / SystemExit propagate.
        print("\tWARNING: Failed to zoom into keyword", pt.word, "-", repr(err))

print("\tCompleted in", round(time.time() - iTime,3),"seconds")

In [None]:
# Progress marker: announces the end of the word2vec visualization stage.
print("END visualize smart-filer_word2vec")