In [1]:
# Ignore warnings
# NOTE(review): this silences every warning category for the whole session,
# so library deprecation warnings will not be shown either.
import warnings
warnings.filterwarnings('ignore')

# Data path
# NOTE(review): absolute, machine-specific location -- adjust when running elsewhere.
import pathlib
DATA_FOLDER = pathlib.Path("/home/jovyan/work/Dan/data")

# Add the `src` directory to the module search path
# NOTE(review): absolute, machine-specific location as well.
import sys
sys.path.append('/home/jovyan/work/Dan/upload/src/')

# Math stuff
import numpy as np
from numpy import ndarray

# Data manipulation
import pandas as pd
from pandas import DataFrame

# Vectorization algorithm
from gensim.models import Word2Vec

# Multidimensional space visualizer
from sklearn.manifold import TSNE

# Graphic visualization libraries
# Bokeh --
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

# Plotly
import plotly.express as px


In [2]:
# Location of the pickled DataFrame inside the data folder
PATH = DATA_FOLDER.joinpath('pwdb/pickle/df_columns_labels.pkl')
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source
df = pd.read_pickle(PATH)
# Column used as the training corpus for the word2vec model
columns = df['Concatenated Data (clean)']
columns

0      [hardship, case, fund, safety, net, selfemploy...
1      [state, support, tourism, access, financeas, t...
2      [bank, guarantees, smes, oneperson, enterprise...
3      [emergency, measures, relating, shorttime, wor...
4      [airbus, agreement, making, unworked, hours, p...
                             ...                        
930    [taxation, measures, enterprisesas, consequenc...
931    [winwinloanthe, winwin, loan, system, encourag...
932    [support, measures, businesses, closed, since,...
933    [new, loans, available, enterprises, brusselst...
934    [financial, support, teleworkers, increase, te...
Name: Concatenated Data (clean), Length: 935, dtype: object

In [3]:
# Train word2vec on the selected column:
#   window=5     -> maximum distance between the current and the predicted word
#   min_count=10 -> words with a total frequency below 10 are ignored
#   size=300     -> every word is embedded as one 300-dimensional vector
# NOTE: `size` and `index2word` are gensim 3.x names; gensim 4 renamed them to
# `vector_size` and `index_to_key`.
dfVec = Word2Vec(columns, window=5, min_count=10, size=300)
# Embedding matrix with one row per vocabulary word; `wv.vectors` is the
# replacement for the deprecated `wv.syn0` alias.
w2v = dfVec.wv.vectors
# Word -> vector lookup (rows of `w2v` follow the order of `index2word`)
w2v_dict = dict(zip(dfVec.wv.index2word, w2v))

In [4]:
# Test the result: words whose vectors are most similar to 'covid'.
# The query goes through `.wv`, like the other similarity queries in this
# notebook, because calling `most_similar` on the model object is deprecated.
dfVec.wv.most_similar('covid')


[('pandemic', 0.9931578040122986),
 ('crisis', 0.9928868412971497),
 ('measures', 0.9882986545562744),
 ('economic', 0.9727696776390076),
 ('affected', 0.9532006978988647),
 ('government', 0.9519243836402893),
 ('health', 0.947557806968689),
 ('emergency', 0.9286361336708069),
 ('spread', 0.9258859753608704),
 ('coronavirus', 0.918663740158081)]

In [5]:
# Words close to 'covid' and 'health' while being far from 'economic'
dfVec.wv.most_similar_cosmul(
    positive=['covid', 'health'],
    negative=['economic'],
)

[('crisis', 0.9850180149078369),
 ('measures', 0.983648419380188),
 ('pandemic', 0.9828519821166992),
 ('affected', 0.9613243937492371),
 ('government', 0.9601290822029114),
 ('public', 0.9562671780586243),
 ('sector', 0.9558254480361938),
 ('spread', 0.9493266344070435),
 ('act', 0.9464930295944214),
 ('emergency', 0.9463182687759399)]

In [6]:
# Ask the model which of the given words does not go with the others
odd_one_out = dfVec.wv.doesnt_match("covid costs legal".split())
print(odd_one_out)

covid


## Visualizing Word Embeddings using t-SNE


In [7]:
# Set words and their vectors in two different, parallel lists.
# Vectors are read through `dfVec.wv[...]`; indexing the model object directly
# (`dfVec[word]`) is deprecated.
# Word list
labels = list(dfVec.wv.vocab)
# Vectors list, in the same order as `labels`
tokens = [dfVec.wv[word] for word in labels]

### Fit t-SNE model for all words

In [8]:
"""
:perplexity: Related to the number of nearest neighbors that is used in other manifold learning algorithms
:n_components: Dimension of the embedded space
:init: Initialization of embedding
:n_iter: Maximum number of iterations for the optimization (!!! At least 250 !!!)
:random_state: Determines the random number generator
"""
# Project every word vector onto a 2-D plane with t-SNE
tsne_model_all_words = TSNE(
    n_components=2,
    perplexity=40,
    init='pca',
    n_iter=2500,
    random_state=32,
)
fit_all_words = tsne_model_all_words.fit_transform(tokens)

#### Bokeh

In [9]:
# Render Bokeh output inside the notebook
output_notebook()

# 2-D t-SNE coordinates of every word, plus the word itself for the tooltip
df_all_words = pd.DataFrame(fit_all_words, columns=['x', 'y'])
df_all_words['words'] = list(dfVec.wv.vocab)

# Figure without axes, with pan / zoom / reset / hover tools
bokeh_figure = bp.figure(
    title="Words' plot from PWDB data",
    plot_width=700,
    plot_height=600,
    tools="pan, wheel_zoom, box_zoom, reset, hover",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# One marker per word
bokeh_figure.scatter(x='x', y='y', source=df_all_words)

# Show the word when the pointer hovers over its marker
hover = bokeh_figure.select(dict(type=HoverTool))
hover.tooltips = {"word": "@words"}

show(bokeh_figure)

### Fit t-SNE model for specific words

In [10]:
# key words we will use as base
key_words = ['president', 'covid', 'economic',
             'country', 'workers', 'health',
             'crisis', 'tax', 'law',
             'costs', 'legal', 'companys',
             'manufacturing', 'property']

# vectors of each word selected (one inner list per key word)
embedding_clusters = []
# words selected, based on key words (one inner list per key word)
word_clusters = []

# For every key word collect its 30 most similar words and their vectors.
# All lookups go through `dfVec.wv`, because `most_similar` and `[...]` on the
# model object itself are deprecated.
for word in key_words:
    # most_similar yields (word, similarity) pairs; only the word is needed
    words = [similar_word for similar_word, _ in dfVec.wv.most_similar(word, topn=30)]
    embedding = [dfVec.wv[similar_word] for similar_word in words]

    embedding_clusters.append(embedding)
    word_clusters.append(words)



In [11]:
# Stack the per-key-word lists of vectors into one NumPy array
embedding_clusters = np.array(embedding_clusters)
# Sizes of the three axes of that array
axis_0, axis_1, axis_2 = embedding_clusters.shape
# Merge the first two axes so that every row is a single word vector
flat_embeddings = embedding_clusters.reshape(axis_0 * axis_1, axis_2)

# t-SNE settings:
#   perplexity   - related to the number of nearest neighbors that is used
#                  in other manifold learning algorithms
#   n_components - dimension of the embedded space
#   init         - initialization of the embedding
#   n_iter       - maximum number of iterations for the optimization (at least 250)
#   random_state - determines the random number generator
tsne_model_specific_words = TSNE(
    n_components=2,
    perplexity=15,
    init='pca',
    n_iter=3500,
    random_state=32,
)
fit_specific_words = tsne_model_specific_words.fit_transform(flat_embeddings)


In [61]:
# Pair every key word with the list of words found to be similar to it
data = dict(words=key_words, clusters=word_clusters)
df_specific_words = pd.DataFrame(data, columns=['words', 'clusters'])
# One row per (key word, similar word) pair; the key word's index label is repeated
df_specific_words_exploded = df_specific_words.explode(column='clusters')

df_specific_words_exploded

Unnamed: 0,words,clusters
0,president,established
0,president,cabinet
0,president,parliament
0,president,amended
0,president,published
...,...,...
13,property,process
13,property,à
13,property,parts
13,property,documents


In [67]:
# Scatter plot of the 2-D t-SNE coordinates: colour identifies the key word,
# the hover title shows the similar word the point stands for
px.scatter(
    fit_specific_words,
    x=0,
    y=1,
    color=df_specific_words_exploded['words'],
    hover_name=df_specific_words_exploded['clusters'],
    labels={'color': 'words'},
)

In [14]:
# Save our trained model
# dfVec.save('/home/jovyan/work/Dan/data/pwdb/word2vec/df.model')

In [14]:
"""
1. for each cluster create 2 plots:
    - zoom in cluster
    - one that places the cluster in the context of the entire word embedding space (
        determine the min/max extent on the y axis over all word embeddings )

"""