## Helper functions shared by all models are defined here, to avoid code redundancy

In [25]:
#!pip install scipy

In [26]:
import numpy as np
import pandas as pd

In [27]:
# EVALUATE RESULTING EMBEDDINGS:
# Visually: reduce dimension with PCA and then plot specific set of words

from sklearn.decomposition import PCA
from gensim.models import Phrases
from scipy import spatial

In [28]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import plotly.graph_objs as go
from sklearn.preprocessing import minmax_scale

#### Plot a given number of words for the vocabulary for a specific model

In [29]:
def plot_words(X, nr_words, model):
    """Scatter-plot the first ``nr_words`` embeddings after a 2D PCA projection.

    Args:
        X: embedding matrix handed to ``PCA.fit_transform``; its rows are
            assumed to follow the same order as the model vocabulary
            (TODO confirm against the caller).
        nr_words: number of leading rows/words to draw and annotate.
        model: object exposing its keyed vectors as ``model.wv``.

    The word labels are read from ``model.wv.index_to_key`` when that
    attribute exists (gensim 4 naming, see the gensim migration notes) and
    from ``model.wv.vocab`` otherwise, so both API generations work.
    """
    # Two principal components provide the x and y coordinates of the plot.
    projected = PCA(n_components=2).fit_transform(X)
    some_words = projected[:nr_words]

    plt.figure(figsize=(20,10))
    plt.scatter(some_words[:,0],some_words[:,1],linewidths=10,color='blue')
    plt.xlabel("x",size=15)
    plt.ylabel("y",size=15)
    plt.title("Word Embedding Space",size=20)

    keyed_vectors = model.wv
    ordered_keys = getattr(keyed_vectors, "index_to_key", None)
    if ordered_keys is None:
        # Older gensim releases only expose the vocabulary mapping.
        ordered_keys = list(keyed_vectors.vocab)

    # zip() stops at the shorter sequence, so a matrix with fewer rows than
    # requested labels no longer raises IndexError while annotating.
    for word, point in zip(ordered_keys[:nr_words], some_words):
        plt.annotate(word, xy=(point[0], point[1]))

#### Print a given number of closest words for a given list of words

In [30]:
def print_closest_words(model, words, nr):
    """Print the ``nr`` results of ``model.wv.most_similar`` for ``words``.

    Args:
        model: object exposing ``model.wv.most_similar(positive=..., topn=...)``.
        words: list of words passed as the ``positive`` argument.
        nr: number of results requested via ``topn``.

    Output is a header line, a dashed rule, one line per result and a
    trailing blank line.
    """
    separator = '---------------------------------'

    print('Closest words for word list:', words)
    print(separator)

    neighbours = model.wv.most_similar(positive=words, topn=nr)
    for neighbour in neighbours:
        print(neighbour)

    print()

#### Print a given number of closest words for a given list of words with cosmul

In [31]:
def print_closest_word_cosmul(model, words, nr):
    """Print the ``nr`` results of ``model.wv.most_similar_cosmul`` for ``words``.

    Args:
        model: object exposing
            ``model.wv.most_similar_cosmul(positive=..., topn=...)``.
        words: list of words passed as the ``positive`` argument.
        nr: number of results requested via ``topn``.

    Output is a header line, a dashed rule, one line per result and a
    trailing blank line.
    """
    separator = '---------------------------------'

    print('Closest cosmul words for word list:', words)
    print(separator)

    neighbours = model.wv.most_similar_cosmul(positive=words, topn=nr)
    for neighbour in neighbours:
        print(neighbour)

    print()

#### Print most similar words for a given word for a specific model

In [32]:
def print_most_similar_words(model, list_of_words):
    """Print the results of ``model.wv.most_similar`` for each word in turn.

    Args:
        model: object exposing ``model.wv.most_similar(word)``.
        list_of_words: iterable of query words; each one gets its own
            header, dashed rule, result lines and trailing blank line.
    """
    separator = '---------------------------------'

    for query in list_of_words:
        print('Most similar words for word:', query)
        print(separator)

        neighbours = model.wv.most_similar(query)
        for neighbour in neighbours:
            print(neighbour)

        print()

#### Print the closest words for a fixed set of word lists - we use the same set for each model so we can compare the models more easily

In [33]:
def print_closest_words_for_word_list(model):
    """Run ``print_closest_words`` on a fixed set of benchmark word lists.

    The same queries are used for every model so the printed results can be
    compared side by side.

    Args:
        model: model forwarded unchanged to ``print_closest_words``.
    """
    # (query words, number of closest words to print)
    benchmark_queries = (
        (['boy', 'family'], 1),
        (['great', 'amazing', 'cool'], 3),
        (['movie', 'actor'], 5),
        (['bad', 'awful'], 5),
    )

    for query_words, how_many in benchmark_queries:
        print_closest_words(model, query_words, how_many)

In [34]:
def print_closest_words_for_word_list_cosmul(model):
    """Run ``print_closest_word_cosmul`` on a fixed set of benchmark word lists.

    The same queries are used for every model so the printed results can be
    compared side by side.

    Args:
        model: model forwarded unchanged to ``print_closest_word_cosmul``.
    """
    # (query words, number of closest words to print)
    benchmark_queries = (
        (['boy', 'family'], 1),
        (['great', 'amazing', 'cool'], 3),
        (['movie', 'actor'], 5),
        (['bad', 'awful'], 5),
    )

    for query_words, how_many in benchmark_queries:
        print_closest_word_cosmul(model, query_words, how_many)

#### Find a given number of closest words for a given GloVe word embedding - we will use this to compare with our models

In [35]:
def find_closest_embeddings(embedding, glove_vectors, number_of_words):
    """Return the words whose vectors are nearest to ``embedding``.

    All keys of ``glove_vectors`` are ranked by Euclidean distance to
    ``embedding``. The first ranked key is skipped (presumably the query
    word itself when ``embedding`` comes from ``glove_vectors``) and the
    next ``number_of_words`` keys are returned.

    Args:
        embedding: query vector.
        glove_vectors: mapping from word to vector.
        number_of_words: how many neighbours to return.

    Returns:
        List of words, closest first.
    """
    def distance_to_query(candidate):
        return spatial.distance.euclidean(glove_vectors[candidate], embedding)

    ranked = sorted(glove_vectors.keys(), key=distance_to_query)
    return ranked[1:number_of_words + 1]

#### Print a vector representation of a specific word from glove and our own three models - unigram, bigram and trigram model

In [36]:
def print_embeddings_glove(glove_vectors, model_1, model_1_2, model_1_3, word):
    """Print the vector of ``word`` from GloVe and from three word2vec models.

    Args:
        glove_vectors: mapping with a ``get`` method returning an array
            (the result is converted with ``astype(float)`` before printing).
        model_1: unigram model, indexed as ``model_1[word]``.
        model_1_2: bigram model, indexed as ``model_1_2[word]``.
        model_1_3: trigram model, indexed as ``model_1_3[word]``.
        word: the word to look up in every source.
    """
    print("GloVe vectors for word: ", word)
    print(glove_vectors.get(word).astype(float))

    word2vec_sections = (
        ("Word2vec vectors with unigram model for word: ", model_1),
        ("Word2vec vectors with bigram model for word: ", model_1_2),
        ("Word2vec vectors with trigram model for word: ", model_1_3),
    )
    for heading, vectors in word2vec_sections:
        # A blank line separates each section from the previous one.
        print()
        print(heading, word)
        print(vectors[word])

#### Print closest glove embeddings for a specific list of words

In [37]:
def print_closest_glove_embeddings(glove_vectors, list_of_words, number_of_words):
    """Print the nearest GloVe neighbours of every word in ``list_of_words``.

    Args:
        glove_vectors: mapping from word to vector; each query word is
            looked up with ``glove_vectors[word]``.
        list_of_words: iterable of query words.
        number_of_words: neighbour count forwarded to
            ``find_closest_embeddings``.
    """
    separator = "----------------------------------------------------------"

    for query in list_of_words:
        print("Closest GloVe embeddings for word: ", query)
        print(separator)
        neighbours = find_closest_embeddings(glove_vectors[query], glove_vectors, number_of_words)
        print(neighbours)
        print()

#### Plot closest words (in 2D) for each word from a given list of words, as input we will use a number of similar words we want to plot and a dimensionality method we want to use - TSNE or PCA

In [38]:
def display_word_embedding_2D(dim_reduction_method, model, user_input=None, words=None, label=None, color_map=None, perplexity = 0, learning_rate = 0, iteration = 0, topn=5, sample=10):
    """Plot word embeddings in 2D with plotly, one trace per input word.

    ``words`` is read as ``len(user_input)`` consecutive groups of ``topn``
    words (one group per entry of ``user_input``), followed by any remaining
    words, which are drawn as a black trace named 'input words'.

    Args:
        dim_reduction_method: 'TSNE' selects t-SNE; any other value uses PCA.
        model: word-to-vector lookup indexed as ``model[w]``. When ``words``
            is omitted, the vocabulary is taken from the keys of a ``dict``
            model or from ``model.vocab`` otherwise.
        user_input: names of the per-group traces; ``None`` means no groups.
        words: words to plot. When ``None``, ``sample`` random vocabulary
            entries are used if ``sample > 0``, otherwise the whole vocabulary.
        label: accepted for interface compatibility; not used.
        color_map: accepted for interface compatibility; not used.
        perplexity: forwarded to ``TSNE``.
        learning_rate: forwarded to ``TSNE``.
        iteration: forwarded to ``TSNE`` as ``n_iter``.
            NOTE(review): the defaults of 0 are passed through unchanged, so
            callers selecting 'TSNE' should presumably supply real values.
            NOTE(review): newer scikit-learn releases may name this argument
            differently - confirm against the installed version.
        topn: number of words in each per-input group.
        sample: number of random words drawn when ``words`` is ``None``.
    """
    # Identity check: `words == None` is evaluated element-wise for numpy
    # arrays and then fails inside `if`.
    if words is None:
        # Plain dict models (GloVe) have no `.vocab`; their keys are the vocabulary.
        if isinstance(model, dict):
            vocabulary = list(model)
        else:
            vocabulary = list(model.vocab.keys())
        words = np.random.choice(vocabulary, sample) if sample > 0 else vocabulary

    # Without input words there are no groups; everything goes to the final trace.
    group_names = [] if user_input is None else user_input

    word_vectors = np.array([model[w] for w in words])

    if dim_reduction_method == 'TSNE':
        reducer = TSNE(n_components = 2, random_state=0, perplexity = perplexity, learning_rate = learning_rate, n_iter = iteration)
    else:
        reducer = PCA(random_state=0)
    two_dim = reducer.fit_transform(word_vectors)[:,:2]

    data = []

    # `count` is the offset of the current group inside `words` / `two_dim`.
    count = 0
    for group_name in group_names:
        group = slice(count, count + topn)
        trace = go.Scatter(
            x = two_dim[group,0],
            y = two_dim[group,1],
            text = words[group],
            name = group_name,
            textposition = "top center",
            textfont_size = 20,
            mode = 'markers+text',
            marker = {
                'size': 10,
                'opacity': 0.8,
                'color': 2
            }
        )
        data.append(trace)
        count = count + topn

    # Everything after the grouped neighbours is plotted as the input words.
    trace_input = go.Scatter(
        x = two_dim[count:,0],
        y = two_dim[count:,1],
        text = words[count:],
        name = 'input words',
        textposition = "top center",
        textfont_size = 20,
        mode = 'markers+text',
        marker = {
            'size': 10,
            'opacity': 1,
            'color': 'black'
        }
    )
    data.append(trace_input)

    # Configure the layout
    layout = go.Layout(
        margin = {'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font = dict(
            family = " Courier New ",
            size = 15),
        autosize = False,
        width = 1000,
        height = 1000
    )

    plot_figure = go.Figure(data = data, layout = layout)
    plot_figure.show()

#### Plot closest words (in 3D) for each word from a given list of words, as input we will use a number of similar words we want to plot and a dimensionality method we want to use - TSNE or PCA

In [39]:
def display_word_embedding_3D(dim_reduction_method, model, user_input=None, words=None, label=None, color_map=None, perplexity = 0, learning_rate = 0, iteration = 0, topn=5, sample=10):
    """Plot word embeddings in 3D with plotly, one trace per input word.

    ``words`` is read as ``len(user_input)`` consecutive groups of ``topn``
    words (one group per entry of ``user_input``), followed by any remaining
    words, which are drawn as a black trace named 'input words'.

    Args:
        dim_reduction_method: 'TSNE' selects t-SNE; any other value uses PCA.
        model: word-to-vector lookup indexed as ``model[w]``. When ``words``
            is omitted, the vocabulary is taken from the keys of a ``dict``
            model or from ``model.vocab`` otherwise.
        user_input: names of the per-group traces; ``None`` means no groups.
        words: words to plot. When ``None``, ``sample`` random vocabulary
            entries are used if ``sample > 0``, otherwise the whole vocabulary.
        label: accepted for interface compatibility; not used.
        color_map: accepted for interface compatibility; not used.
        perplexity: forwarded to ``TSNE``.
        learning_rate: forwarded to ``TSNE``.
        iteration: forwarded to ``TSNE`` as ``n_iter``.
            NOTE(review): the defaults of 0 are passed through unchanged, so
            callers selecting 'TSNE' should presumably supply real values.
            NOTE(review): newer scikit-learn releases may name this argument
            differently - confirm against the installed version.
        topn: number of words in each per-input group.
        sample: number of random words drawn when ``words`` is ``None``.
    """
    # Identity check: `words == None` is evaluated element-wise for numpy
    # arrays and then fails inside `if`.
    if words is None:
        # Plain dict models (GloVe) have no `.vocab`; their keys are the vocabulary.
        if isinstance(model, dict):
            vocabulary = list(model)
        else:
            vocabulary = list(model.vocab.keys())
        words = np.random.choice(vocabulary, sample) if sample > 0 else vocabulary

    # Without input words there are no groups; everything goes to the final trace.
    group_names = [] if user_input is None else user_input

    word_vectors = np.array([model[w] for w in words])

    if dim_reduction_method == 'TSNE':
        reducer = TSNE(n_components = 3, random_state=0, perplexity = perplexity, learning_rate = learning_rate, n_iter = iteration)
    else:
        reducer = PCA(random_state=0)
    three_dim = reducer.fit_transform(word_vectors)[:,:3]

    data = []

    # `count` is the offset of the current group inside `words` / `three_dim`.
    count = 0
    for group_name in group_names:
        group = slice(count, count + topn)
        trace = go.Scatter3d(
            x = three_dim[group,0],
            y = three_dim[group,1],
            z = three_dim[group,2],
            text = words[group],
            name = group_name,
            textposition = "top center",
            textfont_size = 20,
            mode = 'markers+text',
            marker = {
                'size': 10,
                'opacity': 0.8,
                'color': 2
            }
        )
        data.append(trace)
        count = count + topn

    # Everything after the grouped neighbours is plotted as the input words.
    trace_input = go.Scatter3d(
        x = three_dim[count:,0],
        y = three_dim[count:,1],
        z = three_dim[count:,2],
        text = words[count:],
        name = 'input words',
        textposition = "top center",
        textfont_size = 20,
        mode = 'markers+text',
        marker = {
            'size': 10,
            'opacity': 1,
            'color': 'black'
        }
    )
    data.append(trace_input)

    # Configure the layout
    layout = go.Layout(
        margin = {'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font = dict(
            family = " Courier New ",
            size = 15),
        autosize = False,
        width = 1000,
        height = 1000
    )

    plot_figure = go.Figure(data = data, layout = layout)
    plot_figure.show()

#### Append a word with its similar words

In [40]:
def append_list(sim_words, words):
    """Tag every entry of ``sim_words`` with ``words``.

    Args:
        sim_words: sequence of iterables, e.g. ``(word, score)`` pairs.
        words: value appended as the last element of every entry.

    Returns:
        A new list of tuples, each being the original entry followed by
        ``words``.
    """
    return [tuple(entry) + (words,) for entry in sim_words]

#### Get a list of similar words for each word from input_words and call a function to plot them in 2D

In [41]:
def plot_similar_words_in_2D(input_words, model, number_of_words, dim_reduction_method):
    """Collect the neighbours of each input word and plot them in 2D.

    Args:
        input_words: comma-separated string of query words; surrounding
            whitespace is stripped and empty entries (e.g. from a trailing
            comma) are ignored.
        model: either a ``dict`` of GloVe vectors (neighbours come from
            ``return_embedded_glove_tuple``) or an object exposing
            ``model.wv.most_similar``.
        number_of_words: neighbours requested per query word; also passed on
            as the group size (``topn``) of the plot.
        dim_reduction_method: forwarded to ``display_word_embedding_2D``.
    """
    # Skip empty tokens so "a, b," does not trigger a lookup of ''.
    user_input = [token.strip() for token in input_words.split(',') if token.strip()]
    result_word = []

    for query in user_input:
        # GloVe models are plain dictionaries; isinstance also accepts dict
        # subclasses such as OrderedDict.
        if isinstance(model, dict):
            neighbours = return_embedded_glove_tuple(model, query, number_of_words)
        else:
            neighbours = model.wv.most_similar(query, topn = number_of_words)

        # Each entry becomes (neighbour, score, query word).
        result_word.extend(append_list(neighbours, query))

    # Neighbours first, then the query words themselves: the layout the
    # display function slices into per-query groups plus a trailing trace.
    similar_word = [entry[0] for entry in result_word]
    similar_word.extend(user_input)
    labels = [entry[2] for entry in result_word]
    # dict.fromkeys keeps first-seen order, so label ids are stable across
    # runs (set iteration order is not).
    label_dict = {name: idx for idx, name in enumerate(dict.fromkeys(labels), start=1)}
    color_map = [label_dict[name] for name in labels]

    # Positional arguments after color_map: perplexity, learning_rate, iteration, topn.
    display_word_embedding_2D(dim_reduction_method, model, user_input, similar_word, labels, color_map, 5, 500, 10000, number_of_words)

#### Get a list of similar words for each word from input_words and call a function to plot them in 3D

In [42]:
def plot_similar_words_in_3D(input_words, model, number_of_words, dim_reduction_method):
    """Collect the neighbours of each input word and plot them in 3D.

    Args:
        input_words: comma-separated string of query words; surrounding
            whitespace is stripped and empty entries (e.g. from a trailing
            comma) are ignored.
        model: either a ``dict`` of GloVe vectors (neighbours come from
            ``return_embedded_glove_tuple``) or an object exposing
            ``model.wv.most_similar``.
        number_of_words: neighbours requested per query word; also passed on
            as the group size (``topn``) of the plot.
        dim_reduction_method: forwarded to ``display_word_embedding_3D``.
    """
    # Skip empty tokens so "a, b," does not trigger a lookup of ''.
    user_input = [token.strip() for token in input_words.split(',') if token.strip()]
    result_word = []

    for query in user_input:
        # GloVe models are plain dictionaries; isinstance also accepts dict
        # subclasses such as OrderedDict.
        if isinstance(model, dict):
            neighbours = return_embedded_glove_tuple(model, query, number_of_words)
        else:
            neighbours = model.wv.most_similar(query, topn = number_of_words)

        # Each entry becomes (neighbour, score, query word).
        result_word.extend(append_list(neighbours, query))

    # Neighbours first, then the query words themselves: the layout the
    # display function slices into per-query groups plus a trailing trace.
    similar_word = [entry[0] for entry in result_word]
    similar_word.extend(user_input)
    labels = [entry[2] for entry in result_word]
    # dict.fromkeys keeps first-seen order, so label ids are stable across
    # runs (set iteration order is not).
    label_dict = {name: idx for idx, name in enumerate(dict.fromkeys(labels), start=1)}
    color_map = [label_dict[name] for name in labels]

    # Positional arguments after color_map: perplexity, learning_rate, iteration, topn.
    display_word_embedding_3D(dim_reduction_method, model, user_input, similar_word, labels, color_map, 5, 500, 10000, number_of_words)

#### Load gloVe vectors with 50d, 100d and 200d to compare to our models

In [43]:
def load_glove_vectors(file, glove_vectors):
    """Read GloVe vectors from an open text file into ``glove_vectors``.

    Each non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as a ``float32`` array.

    Args:
        file: open, iterable text file; it is always closed before this
            function returns, including when parsing fails.
        glove_vectors: dict updated in place with ``word -> vector``.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    try:
        for line in file:
            values = line.split()
            # Blank lines (e.g. a trailing newline) carry no word or vector.
            if not values:
                continue

            word = values[0]
            glove_vectors[word] = np.asarray(values[1:], "float32")
    finally:
        # Callers rely on this function to release the file handle.
        file.close()

#### Get the closest words for a given word, together with their distances from it, and return them as a list of tuples (word, distance_from_given_word)

In [44]:
def return_embedded_glove_tuple(model, words, number_of_words):
    """Return the nearest GloVe neighbours of ``words`` with their distances.

    Args:
        model: mapping from word to vector.
        words: the query word (a single key of ``model``).
        number_of_words: neighbour count forwarded to
            ``find_closest_embeddings``.

    Returns:
        List of ``(neighbour, euclidean_distance_to_query)`` tuples in the
        order returned by ``find_closest_embeddings``.
    """
    query_vector = model[words]
    neighbours = find_closest_embeddings(query_vector, model, number_of_words)

    # Pair every neighbour with its distance to the query word.
    return [
        (neighbour, spatial.distance.euclidean(model[neighbour], query_vector))
        for neighbour in neighbours
    ]

#### Plot in 2D the same word for 2 different models (This is used in other scripts to compare the vector representation between our word2vec model and glove vector models) - Vector normalization is applied with minmax_scale, and the dimensionality reduction model which we want to apply is passed on as an argument. It can be 'TSNE' or 'PCA'

In [45]:
def display_word_embedding_form_2_models_2D(dim_reduction_method, model1, model2, word, perplexity = 0, learning_rate = 0, iteration = 0, sample=10):
    """Plot the vector of ``word`` from two models side by side in 2D.

    Both vectors are min-max scaled to [-1, 1], reduced to two dimensions
    and drawn as two plotly traces named 'word2Vec' (``model1``) and
    'GloVe' (``model2``).

    Args:
        dim_reduction_method: 'TSNE' selects t-SNE; any other value uses PCA.
        model1: word-to-vector lookup indexed as ``model1[word]``.
        model2: word-to-vector lookup indexed as ``model2[word]``; the two
            vectors are stacked into one array, so they presumably need the
            same length (TODO confirm with callers).
        word: the word to compare; also used as the point label.
        perplexity: forwarded to ``TSNE``.
        learning_rate: forwarded to ``TSNE``.
        iteration: forwarded to ``TSNE`` as ``n_iter``.
            NOTE(review): the defaults of 0 are passed through unchanged, so
            callers selecting 'TSNE' should presumably supply real values.
        sample: accepted for interface compatibility; not used.
    """
    word_vectors = np.array([model1[word], model2[word]])

    # Normalizing the vectors in range -1 to 1.
    # axis=1 scales each vector on its own. The default axis=0 scales every
    # column across the two rows, which maps each column to just -1 and 1
    # and throws away the actual embedding values.
    emb_scaled = minmax_scale(word_vectors, feature_range=(-1, 1), axis=1)
    model_names = ['word2Vec', 'GloVe']

    if dim_reduction_method == 'TSNE':
        reducer = TSNE(n_components = 2, random_state=0, perplexity = perplexity, learning_rate = learning_rate, n_iter = iteration)
    else:
        reducer = PCA(random_state=0)
    two_dim = reducer.fit_transform(emb_scaled)[:,:2]

    data = []

    # Row i of `two_dim` belongs to model_names[i].
    for i, model_name in enumerate(model_names):
        trace = go.Scatter(
            x = two_dim[i:i+1,0],
            y = two_dim[i:i+1,1],
            text = word,
            name = model_name,
            textposition = "top center",
            textfont_size = 20,
            mode = 'markers+text',
            marker = {
                'size': 10,
                'opacity': 0.8,
                'color': i+1
            }
        )
        data.append(trace)

    # Configure the layout
    layout = go.Layout(
        margin = {'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font = dict(
            family = " Courier New ",
            size = 15),
        autosize = False,
        width = 1000,
        height = 1000
    )

    plot_figure = go.Figure(data = data, layout = layout)
    plot_figure.show()

In [46]:
def display_word_embedding_form_2_models_3D(dim_reduction_method, model1, model2, word, perplexity = 0, learning_rate = 0, iteration = 0, sample=10):
    """Plot the vector of ``word`` from two models side by side in 3D.

    Both vectors are min-max scaled to [-1, 1], reduced to three dimensions
    and drawn as two plotly traces named 'word2Vec' (``model1``) and
    'GloVe' (``model2``).

    Args:
        dim_reduction_method: 'TSNE' selects t-SNE; any other value uses PCA.
        model1: word-to-vector lookup indexed as ``model1[word]``.
        model2: word-to-vector lookup indexed as ``model2[word]``; the two
            vectors are stacked into one array, so they presumably need the
            same length (TODO confirm with callers).
        word: the word to compare; also used as the point label.
        perplexity: forwarded to ``TSNE``.
        learning_rate: forwarded to ``TSNE``.
        iteration: forwarded to ``TSNE`` as ``n_iter``.
            NOTE(review): the defaults of 0 are passed through unchanged, so
            callers selecting 'TSNE' should presumably supply real values.
        sample: accepted for interface compatibility; not used.
    """
    word_vectors = np.array([model1[word], model2[word]])

    # Normalizing the vectors in range -1 to 1.
    # axis=1 scales each vector on its own. The default axis=0 scales every
    # column across the two rows, which maps each column to just -1 and 1
    # and throws away the actual embedding values.
    emb_scaled = minmax_scale(word_vectors, feature_range=(-1, 1), axis=1)
    model_names = ['word2Vec', 'GloVe']

    if dim_reduction_method == 'TSNE':
        reducer = TSNE(n_components = 3, random_state=0, perplexity = perplexity, learning_rate = learning_rate, n_iter = iteration)
    else:
        reducer = PCA(random_state=0)
    three_dim = reducer.fit_transform(emb_scaled)[:,:3]

    # PCA on two samples cannot return three components (see the PCA
    # n_components documentation); pad the missing axes with zeros so the
    # z coordinate exists instead of raising IndexError below.
    missing_axes = 3 - three_dim.shape[1]
    if missing_axes > 0:
        three_dim = np.pad(three_dim, ((0, 0), (0, missing_axes)), mode="constant")

    data = []

    # Row i of `three_dim` belongs to model_names[i].
    for i, model_name in enumerate(model_names):
        trace = go.Scatter3d(
            x = three_dim[i:i+1,0],
            y = three_dim[i:i+1,1],
            z = three_dim[i:i+1,2],
            text = word,
            name = model_name,
            textposition = "top center",
            textfont_size = 20,
            mode = 'markers+text',
            marker = {
                'size': 10,
                'opacity': 0.8,
                'color': i+1
            }
        )
        data.append(trace)

    # Configure the layout
    layout = go.Layout(
        margin = {'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font = dict(
            family = " Courier New ",
            size = 15),
        autosize = False,
        width = 1000,
        height = 1000
    )

    plot_figure = go.Figure(data = data, layout = layout)
    plot_figure.show()