In [2]:
import warnings
import numpy as np
from keras.layers import Input, Dense, Lambda
from keras.layers.merge import concatenate as concat
from keras.models import Model
from keras import backend as K
from keras.datasets import mnist
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore')
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
import keras

## Data import

### original embeddings

In [4]:
def get_original_embeddings(file_name):
    txt = open(file_name,"r") .read().split('\n')
    txt.pop(15000) # remove the last empty object

    tokens = []
    embeddings = []

    for i, line in enumerate(txt):
        tk = line.strip().split()
        tokens.append(tk[0])
        embeddings.append([float(i) for i in tk[1:]])

    embeddings = np.array(embeddings)
    tokens = np.array(tokens)
    return embeddings, tokens

In [5]:
w2v_embeddings, w2v_tokens = get_original_embeddings("../data/external/word2vec_original_15k_300d_train.txt")

In [6]:
glove_embeddings, glove_tokens = get_original_embeddings("../data/external/glove_original_15k_300d_train.txt")

In [7]:
spine_w2v_embeddings, spine_w2v_tokens = get_original_embeddings("../data/external/SPINE_word2vec.txt")

In [8]:
spine_glove_embeddings, spine_glove_tokens = get_original_embeddings("../data/external/SPINE_glove.txt")

In [9]:
train_size = int(15000 * .8)
test_size = int(15000 - train_size)

### shuffle indices

In [10]:
np.random.seed(33)

def shuffle_indices(cond_embeddings, cond_tokens, og_embeddings, og_tokens, train_size):
    # Train index
    train_inx = np.random.choice(list(range(cond_embeddings.shape[0])), size=train_size, replace=False)
    train_word_embed = cond_embeddings[train_inx, ]
    train_word_token = cond_tokens[train_inx]

    # Test index
    test_inx = list(set(np.arange(cond_embeddings.shape[0])) - set(train_inx))
    test_word_embed = cond_embeddings[test_inx, ]
    test_word_token = cond_tokens[test_inx]

    ## Train word2vec original
    train_w2v_embed = og_embeddings[train_inx, ]
    train_w2v_token = og_tokens[train_inx]
    
    ## Test word2vec original
    test_w2v_embed = og_embeddings[test_inx, ]
    test_w2v_token = og_tokens[test_inx]
    return train_inx, test_inx, train_word_embed, train_word_token, test_word_embed, test_word_token, train_w2v_embed, train_w2v_token, test_w2v_embed, test_w2v_token 

## conditions

In [11]:
def create_conditional_train_test(train_word_token, test_word_token, wordnet_conditioning, wordnet_cond_label):
    # train set
    train_wordnet_cond_matrix = []
    for word in train_word_token:
        if word in wordnet_conditioning:
            train_wordnet_cond_matrix.append(wordnet_conditioning[word])
        else:
            train_wordnet_cond_matrix.append([0]*len(wordnet_cond_label))
    train_wordnet_cond_matrix = np.array(train_wordnet_cond_matrix)
    
    train_wordnet_cond_matrix_df = pd.DataFrame(train_wordnet_cond_matrix)
    train_wordnet_cond_matrix_df.columns = wordnet_cond_label
    train_wordnet_cond_matrix_df.index = train_word_token

    # test set
    test_wordnet_cond_matrix = []
    for word in test_word_token:
        if word in wordnet_conditioning:
            test_wordnet_cond_matrix.append(wordnet_conditioning[word])
        else:
            test_wordnet_cond_matrix.append([0]*len(wordnet_cond_label))
    test_wordnet_cond_matrix = np.array(test_wordnet_cond_matrix)

    test_wordnet_cond_matrix_df = pd.DataFrame(test_wordnet_cond_matrix)
    test_wordnet_cond_matrix_df.columns = wordnet_cond_label
    test_wordnet_cond_matrix_df.index = test_word_token
    return train_wordnet_cond_matrix_df, test_wordnet_cond_matrix_df

### -> word2vec

In [12]:
# spine_w2v_train_inx, spine_w2v_test_inx, 
#     train_word_embed,e train_word_token, 
#     test_word_embed, test_word_token, 
#     train_w2v_embed, train_w2v_token, 
#     test_w2v_embed, test_w2v_token =  shuffle_indices(
#                                             spine_w2v_embeddings, 
#                                             spine_w2v_tokens, 
#                                             w2v_embeddings, 
#                                             w2v_tokens,
#                                             train_size
#                                         )
spine_w2v_train_inx, spine_w2v_test_inx, train_word_embed, train_word_token, test_word_embed, test_word_token, train_w2v_embed, train_w2v_token,test_w2v_embed, test_w2v_token =  shuffle_indices(spine_w2v_embeddings, spine_w2v_tokens, w2v_embeddings, w2v_tokens, train_size)

### -> glove

In [13]:
spine_glove_train_inx, spine_glove_test_inx, sg_train_word_embed, sg_train_word_token, sg_test_word_embed, sg_test_word_token, train_glove_embed, train_glove_token, test_glove_embed, test_glove_token =  shuffle_indices(spine_glove_embeddings, spine_glove_tokens, glove_embeddings, glove_tokens, train_size)

### WordNet Domains

In [14]:
import pickle
import pandas as pd

In [15]:
wordnet_conditioning = pickle.load(open("../data/raw/one-hot-categories-spine-word2vec.p", "rb" ) )
wordnet_cond_label = pickle.load(open("../data/raw/category_labels.p", "rb" ) )

In [16]:
w2v_train_wordnet_cond_matrix_df, w2v_test_wordnet_cond_matrix_df = create_conditional_train_test(train_word_token, test_word_token, wordnet_conditioning, wordnet_cond_label)

In [17]:
w2v_train_wordnet_cond_matrix_df.head()

Unnamed: 0,acoustics,administration,agriculture,anatomy,animal_husbandry,animals,anthropology,applied_science,archaeology,archery,...,time_period,topography,tourism,town_planning,transport,university,vehicles,veterinary,volleyball,wrestling
framework,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dancing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
needy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
algae,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
missions,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
glove_train_wordnet_cond_matrix_df, glove_test_wordnet_cond_matrix_df = create_conditional_train_test(sg_train_word_token, sg_test_word_token, wordnet_conditioning, wordnet_cond_label)

### Part of Speech

In [19]:
pos = pickle.load(open( "../data/raw/one-hot-pos.p", "rb" ) )
pos_labels = pickle.load(open( "../data/raw/one-hot-pos_labels.p", "rb" ) )

In [20]:
w2v_train_pos_cond_matrix_df, w2v_test_pos_cond_matrix_df = create_conditional_train_test(train_word_token, test_word_token, pos, pos_labels)

In [21]:
glove_train_pos_cond_matrix_df, glove_test_pos_cond_matrix_df = create_conditional_train_test(sg_train_word_token, sg_test_word_token, pos, pos_labels)

### Sentiment

In [22]:
sentiment = pickle.load(open( "../data/raw/one-hot-sentiments.p", "rb" ) )
sentiment_labels = pickle.load(open( "../data/raw/one-hot-sentiments_labels.p", "rb" ) )

In [23]:
w2v_train_sentiment_cond_matrix_df, w2v_test_sentiment_cond_matrix_df = create_conditional_train_test(train_word_token, test_word_token, sentiment, sentiment_labels)

In [24]:
glove_train_sentiment_cond_matrix_df, glove_test_sentiment_cond_matrix_df = create_conditional_train_test(sg_train_word_token, sg_test_word_token, sentiment, sentiment_labels)

### Named Entity Type

In [25]:
entities = pickle.load(open( "../data/raw/one-hot-entities.p", "rb" ) )
entities_labels = pickle.load(open( "../data/raw/one-hot-entities_labels.p", "rb" ) )

In [26]:
w2v_train_entities_cond_matrix_df, w2v_test_entities_cond_matrix_df = create_conditional_train_test(train_word_token, test_word_token, entities, entities_labels)

In [27]:
glove_train_entities_cond_matrix_df, glove_test_entities_cond_matrix_df = create_conditional_train_test(sg_train_word_token, sg_test_word_token, entities, entities_labels)

## CVAE

In [28]:
def sample_z(args):
    mu, l_sigma = args
    eps = K.random_normal(shape=(m, n_z), mean=0., stddev=1.)
    return mu + K.exp(l_sigma / 2) * eps

def vae_loss(x, x_decoded_mean):
    """Defines the VAE loss functions as a combination of MSE and KL-divergence loss."""
    # -ELBO = MSE + KL
    mse_loss = K.mean(keras.losses.mse(x, x_decoded_mean), axis=-1)
    kl_loss = - 0.5 * K.mean(1 + l_sigma - K.square(mu) - K.exp(l_sigma), axis=-1)
    return mse_loss + kl_loss

In [34]:
def find_closest_word(xhat, train_w2v_embed = train_w2v_embed, train_w2v_token= train_w2v_token, most_similar_n = 1):
    """Use cosine distance to find the most similar word to the decoder output"""
    # xhat = decoder.predict(sample_word_3)
    cos_sim = abs(cosine_similarity(xhat, train_w2v_embed)).flatten() # calculate dist
    inx = np.argsort(cos_sim)[::-1][:most_similar_n] # the most similar, index
    return train_w2v_token[inx] # most similar word

In [29]:
# define variables
m = 30 # batch size
n_z = 75 # latent space size
encoder_dim1 = 128 # dim of encoder hidden layer
decoder_dim = 128 # dim of decoder hidden layer
activ = 'relu'
optim = Adam(lr=0.001)
n_epoch = 50

### word2vec

spine

In [151]:
### PARAMETERS ###
decoder_out_dim = train_w2v_embed.shape[1] # dim of decoder output layer
n_x = train_w2v_embed.shape[1]
n_y = train_word_embed.shape[1]
train_embed = train_w2v_embed
train_word = train_word_embed
test_embed = test_w2v_embed
test_word = test_word_embed
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 1, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_46 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_47 (InputLayer)           (None, 1000)         0                                            
__________________________________________________________________________________________________
concatenate_31 (Concatenate)    (None, 1300)         0           input_46[0][0]                   
                                                                 input_47[0][0]                   
__________________________________________________________________________________________________
dense_76 (Dense)                (None, 128)          166528      concatenate_31[0][0]             
__________

In [85]:
def decode_words(word, dimension, train_w2v_embed, train_w2v_token, train_word_embed, cond_type="spine"):
    # get word index
    print("WORD: " + str(word))
    word_inx = np.where(train_w2v_token == word)[0][0]
    print(word_inx)
#     math_w2v = train_w2v_embed[word_inx]
    
    if cond_type == "spine":
        math_encoded = encoder.predict([np.repeat(np.expand_dims(train_w2v_embed[word_inx], axis = 0), m, axis=0), 
                                        np.repeat(np.expand_dims(train_word_embed[word_inx], axis = 0), m, axis=0)],
                                   batch_size = m)
    else:
        math_encoded = encoder.predict([np.repeat(np.expand_dims(train_w2v_embed[word_inx], axis = 0), m, axis=0), 
                                        np.repeat(np.expand_dims(train_word_embed.iloc[word_inx], axis = 0), m, axis=0)],
                                   batch_size = m)
    values_to_try = np.linspace(0,5,num=10)
    for val in values_to_try:
        print("VALUE: " + str(val))
        if cond_type == "spine":
            new_condition = train_word_embed[word_inx].copy()
        else:
            new_condition = train_word_embed.iloc[word_inx].copy()
        print(new_condition.shape)
        new_condition[dimension] = val # strengthen the condition, value needs to be higher than 1

        decoder_input_z2 = np.concatenate((math_encoded, 
                                            np.repeat(np.expand_dims(new_condition, axis=0), m, axis=0)),
                                          axis=1)
        z_decoded2 = decoder.predict(decoder_input_z2)

        print(find_closest_word(np.expand_dims(z_decoded2[0], axis=0), most_similar_n=10))
    return

In [152]:
find_top_words(train_w2v_embed, train_w2v_token, test_w2v_embed, test_w2v_token, train_word_embed, "word2vec", "spine")

WORD: mathematics
11730
VALUE: 0.0
(1000,)
['mathematics' 'math' 'physics' 'mathematician' 'linguistics' 'science'
 'sociology' 'algebra' 'economics' 'anthropology']
VALUE: 0.5555555555555556
(1000,)
['mathematics' 'math' 'algebra' 'science' 'physics' 'linguistics'
 'anthropology' 'sociology' 'economics' 'biology']
VALUE: 1.1111111111111112
(1000,)
['mathematics' 'math' 'algebra' 'anthropology' 'science' 'linguistics'
 'sociology' 'physics' 'biology' 'calculus']
VALUE: 1.6666666666666667
(1000,)
['mathematics' 'algebra' 'math' 'anthropology' 'linguistics' 'science'
 'curriculum' 'sociology' 'calculus' 'biology']
VALUE: 2.2222222222222223
(1000,)
['algebra' 'mathematics' 'math' 'anthropology' 'curriculum' 'science'
 'linguistics' 'calculus' 'biology' 'sociology']
VALUE: 2.7777777777777777
(1000,)
['algebra' 'mathematics' 'math' 'curriculum' 'anthropology' 'calculus'
 'biology' 'science' 'linguistics' 'classes']
VALUE: 3.3333333333333335
(1000,)
['algebra' 'mathematics' 'math' 'curriculu

['icons' 'icon' 'button' 'buttons' 'chat' 'click' 'pop' 'drag' 'slides'
 'aol']
VALUE: 3.8888888888888893
(1000,)
['icons' 'icon' 'button' 'buttons' 'chat' 'click' 'drag' 'pop' 'slides'
 'beaver']
VALUE: 4.444444444444445
(1000,)
['icons' 'icon' 'button' 'buttons' 'drag' 'click' 'chat' 'slides' 'beaver'
 'pop']
VALUE: 5.0
(1000,)
['icons' 'icon' 'button' 'buttons' 'drag' 'click' 'chat' 'slides' 'beaver'
 'lester']
WORD: internet
503
VALUE: 0.0
(1000,)
['musicians' 'artists' 'bands' 'concerts' 'ensembles' 'musician' 'music'
 'composers' 'violinist' 'pianist']
VALUE: 0.5555555555555556
(1000,)
['musicians' 'artists' 'musician' 'music' 'bands' 'composers' 'concerts'
 'artist' 'gigs' 'pianist']
VALUE: 1.1111111111111112
(1000,)
['artists' 'musicians' 'musician' 'music' 'composers' 'blogs' 'gigs'
 'artist' 'bands' 'concerts']
VALUE: 1.6666666666666667
(1000,)
['musician' 'artists' 'musicians' 'blogs' 'music' 'websites' 'celebrities'
 'chat' 'gigs' 'myspace']
VALUE: 2.2222222222222223
(1000,

In [50]:
decode_words("mathematics", 954, train_w2v_embed, train_w2v_token)

WORD: mathematics
11730
VALUE: 0.0
['mathematics' 'math' 'physics' 'economics' 'mathematician' 'science'
 'theorem' 'linguistics' 'sociology' 'psychology']
VALUE: 0.5555555555555556
['mathematics' 'math' 'algebra' 'science' 'linguistics' 'economics'
 'physics' 'anthropology' 'sociology' 'theology']
VALUE: 1.1111111111111112
['mathematics' 'math' 'algebra' 'science' 'linguistics' 'anthropology'
 'economics' 'biology' 'calculus' 'physics']
VALUE: 1.6666666666666667
['mathematics' 'algebra' 'math' 'science' 'linguistics' 'curriculum'
 'anthropology' 'biology' 'calculus' 'economics']
VALUE: 2.2222222222222223
['mathematics' 'algebra' 'math' 'curriculum' 'science' 'biology'
 'anthropology' 'linguistics' 'vocational' 'calculus']
VALUE: 2.7777777777777777
['algebra' 'mathematics' 'math' 'curriculum' 'biology' 'vocational'
 'anthropology' 'science' 'linguistics' 'calculus']
VALUE: 3.3333333333333335
['algebra' 'mathematics' 'math' 'curriculum' 'biology' 'vocational'
 'anthropology' 'science' '

In [51]:
decode_words("mathematics", 427, train_w2v_embed, train_w2v_token)

WORD: mathematics
11730
VALUE: 0.0
['mathematics' 'math' 'algebra' 'linguistics' 'anthropology' 'sociology'
 'science' 'economics' 'theology' 'psychology']
VALUE: 0.5555555555555556
['mathematics' 'math' 'algebra' 'physics' 'science' 'economics'
 'linguistics' 'theorem' 'mathematician' 'anthropology']
VALUE: 1.1111111111111112
['mathematics' 'math' 'algebra' 'physics' 'science' 'theorem' 'economics'
 'mathematician' 'theory' 'calculus']
VALUE: 1.6666666666666667
['mathematics' 'math' 'physics' 'algebra' 'theorem' 'science'
 'mathematician' 'economics' 'theory' 'physicist']
VALUE: 2.2222222222222223
['mathematics' 'math' 'physics' 'theorem' 'algebra' 'science'
 'mathematician' 'theory' 'economics' 'physicist']
VALUE: 2.7777777777777777
['mathematics' 'math' 'physics' 'theorem' 'science' 'mathematician'
 'algebra' 'theory' 'irs' 'physicist']
VALUE: 3.3333333333333335
['mathematics' 'physics' 'math' 'theorem' 'science' 'mathematician'
 'theory' 'irs' 'algebra' 'physicist']
VALUE: 3.888888

In [52]:
decode_words("mathematics", 206, train_w2v_embed, train_w2v_token)

WORD: mathematics
11730
VALUE: 0.0
['mathematics' 'math' 'algebra' 'science' 'economics' 'physics'
 'linguistics' 'grammar' 'theorem' 'anthropology']
VALUE: 0.5555555555555556
['mathematics' 'math' 'algebra' 'linguistics' 'science' 'economics'
 'physics' 'anthropology' 'sociology' 'theology']
VALUE: 1.1111111111111112
['mathematics' 'math' 'linguistics' 'sociology' 'economics' 'anthropology'
 'physics' 'science' 'thesis' 'psychology']
VALUE: 1.6666666666666667
['mathematics' 'sociology' 'linguistics' 'anthropology' 'economics'
 'undergraduate' 'math' 'thesis' 'physics' 'psychology']
VALUE: 2.2222222222222223
['mathematics' 'sociology' 'linguistics' 'anthropology' 'undergraduate'
 'psychology' 'economics' 'thesis' 'science' 'professors']
VALUE: 2.7777777777777777
['sociology' 'mathematics' 'linguistics' 'anthropology' 'undergraduate'
 'psychology' 'professors' 'thesis' 'science' 'theology']
VALUE: 3.3333333333333335
['sociology' 'anthropology' 'linguistics' 'undergraduate' 'mathematics'

In [54]:
decode_words("remote", 948, train_w2v_embed, train_w2v_token)

WORD: remote
7807
VALUE: 0.0
['remote' 'northern' 'southern' 'western' 'aol' 'northwestern' 'northeast'
 'neighboring' 'weird' 'hannah']
VALUE: 0.5555555555555556
['remote' 'northwestern' 'northern' 'northeastern' 'southwestern'
 'southern' 'western' 'rural' 'central' 'northeast']
VALUE: 1.1111111111111112
['remote' 'northeastern' 'northwestern' 'southwestern' 'rural' 'northern'
 'western' 'southern' 'central' 'mountainous']
VALUE: 1.6666666666666667
['remote' 'northeastern' 'rural' 'southwestern' 'northwestern'
 'mountainous' 'northern' 'western' 'countryside' 'highlands']
VALUE: 2.2222222222222223
['northeastern' 'rural' 'southwestern' 'northwestern' 'remote'
 'mountainous' 'countryside' 'western' 'mountain' 'northern']
VALUE: 2.7777777777777777
['northeastern' 'rural' 'southwestern' 'northwestern' 'mountainous'
 'remote' 'countryside' 'mountain' 'highlands' 'wilderness']
VALUE: 3.3333333333333335
['northeastern' 'southwestern' 'rural' 'northwestern' 'mountainous'
 'mountain' 'countr

In [55]:
decode_words("remote", 473, train_w2v_embed, train_w2v_token)

WORD: remote
7807
VALUE: 0.0
['remote' 'southern' 'northern' 'aol' 'drift' 'polar' 'northwestern'
 'accidentally' 'iceland' 'lunar']
VALUE: 0.5555555555555556
['remote' 'northern' 'southern' 'western' 'northwestern' 'northeastern'
 'northeast' 'southwestern' 'isolated' 'northwest']
VALUE: 1.1111111111111112
['northern' 'remote' 'southern' 'northwestern' 'western' 'outlying'
 'isolated' 'northeastern' 'northeast' 'southwestern']
VALUE: 1.6666666666666667
['northern' 'outlying' 'southern' 'northeastern' 'northwestern' 'western'
 'remote' 'isolated' 'northeast' 'populated']
VALUE: 2.2222222222222223
['northern' 'outlying' 'southern' 'northeastern' 'northwestern' 'western'
 'northeast' 'isolated' 'neighboring' 'populated']
VALUE: 2.7777777777777777
['northern' 'outlying' 'southern' 'northeastern' 'northwestern' 'western'
 'northeast' 'neighboring' 'isolated' 'bordering']
VALUE: 3.3333333333333335
['northern' 'outlying' 'northeastern' 'southern' 'northwestern' 'western'
 'neighboring' 'nort

In [56]:
decode_words("remote", 777, train_w2v_embed, train_w2v_token)

WORD: remote
7807
VALUE: 0.0
['remote' 'northern' 'southern' 'northwestern' 'western' 'southwestern'
 'northeastern' 'northeast' 'polar' 'northwest']
VALUE: 0.5555555555555556
['remote' 'northern' 'southern' 'icons' 'hop' 'drag' 'drift' 'aol'
 'button' 'icon']
VALUE: 1.1111111111111112
['icons' 'button' 'remote' 'icon' 'hop' 'buttons' 'lets' 'flash' 'aol'
 'drag']
VALUE: 1.6666666666666667
['button' 'icons' 'buttons' 'icon' 'hop' 'lets' 'chat' 'armenian' 'ping'
 'remote']
VALUE: 2.2222222222222223
['button' 'icons' 'buttons' 'icon' 'hop' 'lets' 'chat' 'aol' 'armenian'
 'ping']
VALUE: 2.7777777777777777
['button' 'icons' 'icon' 'buttons' 'hop' 'lets' 'chat' 'aol' 'settings'
 'flash']
VALUE: 3.3333333333333335
['button' 'icons' 'icon' 'hop' 'buttons' 'aol' 'flash' 'lets' 'baltic'
 'chat']
VALUE: 3.8888888888888893
['button' 'icons' 'icon' 'hop' 'buttons' 'aol' 'baltic' 'flash' 'lets'
 'instantly']
VALUE: 4.444444444444445
['button' 'icons' 'icon' 'hop' 'buttons' 'aol' 'baltic' 'flash' 's

In [62]:
decode_words("internet", 995, test_w2v_embed, test_w2v_token)

WORD: internet
503
VALUE: 0.0
['musicians' 'artists' 'ensembles' 'concerts' 'bands' 'pianist'
 'violinist' 'composers' 'music' 'dancers']
VALUE: 0.5555555555555556
['musicians' 'artists' 'concerts' 'music' 'bands' 'musician' 'ensembles'
 'composers' 'pianist' 'painters']
VALUE: 1.1111111111111112
['artists' 'musicians' 'music' 'musician' 'bands' 'concerts' 'galleries'
 'gigs' 'painters' 'folk']
VALUE: 1.6666666666666667
['artists' 'musicians' 'websites' 'music' 'blogs' 'galleries'
 'contemporary' 'surf' 'musician' 'surfing']
VALUE: 2.2222222222222223
['artists' 'websites' 'blogs' 'contemporary' 'surf' 'chat' 'galleries'
 'surfing' 'music' 'musicians']
VALUE: 2.7777777777777777
['chat' 'websites' 'blogs' 'surf' 'artists' 'contemporary' 'hacker'
 'surfing' 'galleries' 'myspace']
VALUE: 3.3333333333333335
['chat' 'blogs' 'surf' 'websites' 'hacker' 'contemporary' 'surfing'
 'artists' 'cafe' 'cafes']
VALUE: 3.8888888888888893
['chat' 'surf' 'blogs' 'hacker' 'websites' 'surfing' 'contemporar

In [63]:
decode_words("internet", 555, test_w2v_embed, test_w2v_token)

WORD: internet
503
VALUE: 0.0
['musicians' 'artists' 'ensembles' 'concerts' 'bands' 'pianist'
 'violinist' 'composers' 'music' 'dancers']
VALUE: 0.5555555555555556
['musicians' 'artists' 'concerts' 'ensembles' 'bands' 'music' 'composers'
 'band' 'choral' 'folk']
VALUE: 1.1111111111111112
['musicians' 'music' 'artists' 'concerts' 'bands' 'ensembles' 'band'
 'choral' 'rehearsals' 'folk']
VALUE: 1.6666666666666667
['music' 'musicians' 'concerts' 'bands' 'ensembles' 'artists' 'choral'
 'band' 'rehearsals' 'instruments']
VALUE: 2.2222222222222223
['music' 'musicians' 'ensembles' 'concerts' 'instruments' 'bands' 'choral'
 'rehearsals' 'rituals' 'band']
VALUE: 2.7777777777777777
['music' 'rituals' 'scientific' 'instruments' 'networks' 'ensembles'
 'musicians' 'ncaa' 'manipulating' 'choral']
VALUE: 3.3333333333333335
['music' 'scientific' 'rituals' 'fitzgerald' 'jealous' 'networks' 'ncaa'
 'grange' 'manipulating' 'media']
VALUE: 3.8888888888888893
['music' 'scientific' 'fitzgerald' 'jealous' '

In [64]:
decode_words("internet", 76, test_w2v_embed, test_w2v_token)

WORD: internet
503
VALUE: 0.0
['musicians' 'artists' 'ensembles' 'concerts' 'bands' 'pianist'
 'violinist' 'composers' 'music' 'dancers']
VALUE: 0.5555555555555556
['musicians' 'artists' 'ensembles' 'concerts' 'composers' 'dancers'
 'comedians' 'pianist' 'music' 'bands']
VALUE: 1.1111111111111112
['musicians' 'artists' 'blogs' 'comedians' 'composers' 'concerts'
 'ensembles' 'performances' 'ambassadors' 'canucks']
VALUE: 1.6666666666666667
['blogs' 'website' 'blog' 'twitter' 'comedians' 'musicians' 'journalists'
 'websites' 'ambassadors' 'eurovision']
VALUE: 2.2222222222222223
['blogs' 'blog' 'website' 'websites' 'twitter' 'commentaries' 'facebook'
 'journalists' 'newspapers' 'gossip']
VALUE: 2.7777777777777777
['blogs' 'blog' 'twitter' 'websites' 'website' 'commentaries' 'facebook'
 'newspapers' 'gossip' 'journalists']
VALUE: 3.3333333333333335
['blogs' 'blog' 'twitter' 'websites' 'website' 'commentaries' 'facebook'
 'newspapers' 'gossip' 'columns']
VALUE: 3.8888888888888893
['blogs' '

wordnet domains

In [65]:
### PARAMETERS ###
decoder_out_dim = train_w2v_embed.shape[1] # dim of decoder output layer
n_x = train_w2v_embed.shape[1]
n_y = w2v_train_wordnet_cond_matrix_df.shape[1]
train_embed = train_w2v_embed
train_word = w2v_train_wordnet_cond_matrix_df
test_embed = test_w2v_embed
test_word = w2v_test_wordnet_cond_matrix_df
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 2, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 169)          0                                            
__________________________________________________________________________________________________
concatenate_11 (Concatenate)    (None, 469)          0           input_16[0][0]                   
                                                                 input_17[0][0]                   
__________________________________________________________________________________________________
dense_26 (Dense)                (None, 128)          60160       concatenate_11[0][0]             
__________

In [72]:
print(sum(train_w2v_token == "mathematics"))
print(sum(test_w2v_token == "mathematics"))

1
0


In [91]:
w2v_train_wordnet_cond_matrix_df.head()

Unnamed: 0,acoustics,administration,agriculture,anatomy,animal_husbandry,animals,anthropology,applied_science,archaeology,archery,...,time_period,topography,tourism,town_planning,transport,university,vehicles,veterinary,volleyball,wrestling
framework,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dancing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
needy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
algae,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
missions,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
np.where(np.array(wordnet_cond_label) == 'computer_science')[0][0]

37

In [161]:
entities_labels[3]

'ORG'

In [117]:
np.where(np.array(entities_labels) == 'PRODUCT')[0][0]

6

In [154]:
pos_labels[8]

'NOUN'

In [133]:
np.where(np.array(sentiment_labels) == 'compound')[0][0]

3

In [131]:
sentiment_labels

['neg', 'neu', 'pos', 'compound']

In [125]:
def find_top_words(train_w2v_embed, train_w2v_token, test_w2v_embed, test_w2v_token, train_word_embed, embed_type, cond_type):
    words = ["mathematics", "remote", "internet"]
    if embed_type == "word2vec" and cond_type == "spine":
        dimensions = [[954,427,206],[948,473,777],[995,555,400]]
    elif embed_type == "glove" and cond_type == "spine":
        dimensions = [[635,486,513],[873,793,484],[737,403,125]]
    elif cond_type == "wordnet":
        dimensions = [[89], [17, 90, 94], [37]]
    elif cond_type == "pos":
        dimensions = [[8],[0, 8],[8]]
    elif cond_type == "entity":
        dimensions = [[5, 6, 10],[5, 6],[2, 3, 5, 6]]
    elif cond_type == "sentiment":
        dimensions = [[0,1,2,3], [0,1,2,3], [0,1,2,3]]
    else:
        print("error")
    for index in range(len(words)):
        for d in dimensions[index]:
            if sum(train_w2v_token == words[index]) == 1:
                decode_words(words[index], d, train_w2v_embed, train_w2v_token, train_word_embed, cond_type)
            else:
                decode_words(words[index], d, test_w2v_embed, test_w2v_token, train_word_embed, cond_type)
    return 

In [119]:
find_top_words(train_w2v_embed, train_w2v_token, test_w2v_embed, test_w2v_token, w2v_train_wordnet_cond_matrix_df, "word2vec", "wordnet")

WORD: mathematics
11730
VALUE: 0.0
(169,)
['nikki' 'whitman' 'lindsey' 'greer' 'mcmahon' 'rupert' 'colbert'
 'catholicism' 'hyde' 'williamson']
VALUE: 0.5555555555555556
(169,)
['nikki' 'whitman' 'lindsey' 'greer' 'mcmahon' 'rupert' 'colbert'
 'catholicism' 'hyde' 'williamson']
VALUE: 1.1111111111111112
(169,)
['calculation' 'variables' 'logic' 'strategy' 'hypothetical'
 'calculations' 'calculating' 'equals' 'mathematical' 'calculated']
VALUE: 1.6666666666666667
(169,)
['calculation' 'variables' 'logic' 'strategy' 'hypothetical'
 'calculations' 'calculating' 'equals' 'mathematical' 'calculated']
VALUE: 2.2222222222222223
(169,)
['calculation' 'variables' 'logic' 'methodology' 'calculations'
 'measurement' 'parameter' 'parameters' 'algorithms' 'mathematical']
VALUE: 2.7777777777777777
(169,)
['calculation' 'variables' 'logic' 'methodology' 'calculations'
 'measurement' 'parameter' 'parameters' 'algorithms' 'mathematical']
VALUE: 3.3333333333333335
(169,)
['calculation' 'methodology' 'pa

pos

In [120]:
### PARAMETERS ###
decoder_out_dim = train_w2v_embed.shape[1] # dim of decoder output layer
n_x = train_w2v_embed.shape[1]
n_y = w2v_train_pos_cond_matrix_df.shape[1]
train_embed = train_w2v_embed
train_word = w2v_train_pos_cond_matrix_df
test_embed = test_w2v_embed
test_word = w2v_test_pos_cond_matrix_df
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 2, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 19)           0                                            
__________________________________________________________________________________________________
concatenate_13 (Concatenate)    (None, 319)          0           input_19[0][0]                   
                                                                 input_20[0][0]                   
__________________________________________________________________________________________________
dense_31 (Dense)                (None, 128)          40960       concatenate_13[0][0]             
__________

In [121]:
find_top_words(train_w2v_embed, train_w2v_token, test_w2v_embed, test_w2v_token, w2v_train_pos_cond_matrix_df, "word2vec", "pos")

WORD: mathematics
11730
VALUE: 0.0
(19,)
['lindsey' 'nikki' 'arnold' 'armstrong' 'meyers' 'williamson' 'pete'
 'nigel' 'gerald' 'carlson']
VALUE: 0.5555555555555556
(19,)
['lindsey' 'nikki' 'arnold' 'armstrong' 'meyers' 'williamson' 'pete'
 'nigel' 'gerald' 'carlson']
VALUE: 1.1111111111111112
(19,)
['colbert' 'nikki' 'arnold' 'rupert' 'williamson' 'lindsey' 'mcmahon'
 'norman' 'greer' 'pete']
VALUE: 1.6666666666666667
(19,)
['colbert' 'nikki' 'arnold' 'rupert' 'williamson' 'lindsey' 'mcmahon'
 'norman' 'greer' 'pete']
VALUE: 2.2222222222222223
(19,)
['catholicism' 'colbert' 'filipinos' 'rupert' 'debates' 'bills'
 'politicians' 'teachings' 'buddha' 'stories']
VALUE: 2.7777777777777777
(19,)
['catholicism' 'colbert' 'filipinos' 'rupert' 'debates' 'bills'
 'politicians' 'teachings' 'buddha' 'stories']
VALUE: 3.3333333333333335
(19,)
['legislators' 'bills' 'politicians' 'polls' 'pornography' 'auctions'
 'teachings' 'proposals' 'legislature' 'debates']
VALUE: 3.8888888888888893
(19,)
['leg

sentiment

In [149]:
### PARAMETERS ###
decoder_out_dim = train_w2v_embed.shape[1] # dim of decoder output layer
n_x = train_w2v_embed.shape[1]
n_y = w2v_train_sentiment_cond_matrix_df.shape[1]
train_embed = train_w2v_embed
train_word = w2v_train_sentiment_cond_matrix_df
test_embed = test_w2v_embed
test_word = w2v_test_sentiment_cond_matrix_df
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 2, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_43 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_44 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
concatenate_29 (Concatenate)    (None, 304)          0           input_43[0][0]                   
                                                                 input_44[0][0]                   
__________________________________________________________________________________________________
dense_71 (Dense)                (None, 128)          39040       concatenate_29[0][0]             
__________

In [155]:
sentiment_labels

['neg', 'neu', 'pos', 'compound']

In [150]:
find_top_words(train_w2v_embed, train_w2v_token, test_w2v_embed, test_w2v_token, w2v_train_sentiment_cond_matrix_df, "word2vec", "sentiment")

WORD: mathematics
11730
VALUE: 0.0
(4,)
['nikki' 'whitman' 'greer' 'williamson' 'lindsey' 'colbert' 'hyde' 'pete'
 'mcmahon' 'hilary']
VALUE: 0.5555555555555556
(4,)
['foolish' 'stupid' 'whitman' 'hate' 'ought' 'immoral' 'silly' 'ugly'
 'embarrassed' 'horrible']
VALUE: 1.1111111111111112
(4,)
['embarrassed' 'foolish' 'cynical' 'embarrassing' 'infuriated' 'unhappy'
 'punish' 'sad' 'stupid' 'afraid']
VALUE: 1.6666666666666667
(4,)
['unhappy' 'infuriated' 'embarrassed' 'cynical' 'foolish' 'fearful'
 'angered' 'punish' 'embarrassing' 'unacceptable']
VALUE: 2.2222222222222223
(4,)
['unhappy' 'infuriated' 'cynical' 'fearful' 'embarrassed' 'angered'
 'unacceptable' 'wary' 'punish' 'foolish']
VALUE: 2.7777777777777777
(4,)
['unhappy' 'infuriated' 'cynical' 'fearful' 'angered' 'embarrassed' 'wary'
 'unacceptable' 'punish' 'concerned']
VALUE: 3.3333333333333335
(4,)
['unhappy' 'infuriated' 'cynical' 'fearful' 'angered' 'wary'
 'unacceptable' 'embarrassed' 'concerned' 'punish']
VALUE: 3.888888888

['cynical' 'unhappy' 'infuriated' 'arrogant' 'believe' 'deserved'
 'alienated' 'critics' 'angered' 'believing']
VALUE: 4.444444444444445
(4,)
['cynical' 'infuriated' 'unhappy' 'arrogant' 'deserved' 'believe'
 'already' 'critics' 'bowie' 'alienated']
VALUE: 5.0
(4,)
['cynical' 'infuriated' 'unhappy' 'already' 'bowie' 'deserved' 'a1' 'ms'
 'arrogant' 'critics']
WORD: remote
7807
VALUE: 0.0
(4,)
['nikki' 'whitman' 'greer' 'williamson' 'lindsey' 'colbert' 'hyde' 'pete'
 'mcmahon' 'hilary']
VALUE: 0.5555555555555556
(4,)
['nikki' 'lovely' 'guinness' 'howell' 'ought' 'rowe' 'jonas' 'disneyland'
 'norman' 'baird']
VALUE: 1.1111111111111112
(4,)
['lovely' 'brilliant' 'beautiful' 'characteristic' 'splendid' 'virtues'
 'glorious' 'liked' 'fascinating' 'madison']
VALUE: 1.6666666666666667
(4,)
['brilliant' 'lovely' 'splendid' 'characteristic' 'beautiful'
 'fascinating' 'inspiring' 'fantastic' 'amazing' 'glorious']
VALUE: 2.2222222222222223
(4,)
['brilliant' 'splendid' 'characteristic' 'inspiring'

named entity

In [123]:
### PARAMETERS ###
decoder_out_dim = train_w2v_embed.shape[1] # dim of decoder output layer
n_x = train_w2v_embed.shape[1]
n_y = w2v_train_entities_cond_matrix_df.shape[1]
train_embed = train_w2v_embed
train_word = w2v_train_entities_cond_matrix_df
test_embed = test_w2v_embed
test_word = w2v_test_entities_cond_matrix_df
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 2, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           (None, 17)           0                                            
__________________________________________________________________________________________________
concatenate_17 (Concatenate)    (None, 317)          0           input_25[0][0]                   
                                                                 input_26[0][0]                   
__________________________________________________________________________________________________
dense_41 (Dense)                (None, 128)          40704       concatenate_17[0][0]             
__________

In [130]:
find_top_words(train_w2v_embed, train_w2v_token, test_w2v_embed, test_w2v_token, w2v_train_entities_cond_matrix_df, "word2vec", "entity")

WORD: mathematics
11730
VALUE: 0.0
(17,)
['nikki' 'greer' 'lindsey' 'whitman' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'pete' 'carlson']
VALUE: 0.5555555555555556
(17,)
['nikki' 'greer' 'lindsey' 'whitman' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'pete' 'carlson']
VALUE: 1.1111111111111112
(17,)
['nikki' 'greer' 'lindsey' 'whitman' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'hyde' 'pete']
VALUE: 1.6666666666666667
(17,)
['nikki' 'greer' 'lindsey' 'whitman' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'hyde' 'pete']
VALUE: 2.2222222222222223
(17,)
['nikki' 'greer' 'whitman' 'lindsey' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'hyde' 'pete']
VALUE: 2.7777777777777777
(17,)
['nikki' 'greer' 'whitman' 'lindsey' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'hyde' 'pete']
VALUE: 3.3333333333333335
(17,)
['nikki' 'greer' 'whitman' 'lindsey' 'mcmahon' 'colbert' 'peyton' 'hyde'
 'jonas' 'williamson']
VALUE: 3.8888888888888893
(17,)
['nikki' 'greer' 'whitman' 'lindsey' 'mcmahon' 'colbert' 'peyton' 'hyde'
 'jonas' 'williamson

['nikki' 'greer' 'whitman' 'lindsey' 'mcmahon' 'carlson' 'hyde' 'colbert'
 'peyton' 'williamson']
WORD: internet
503
VALUE: 0.0
(17,)
['nikki' 'greer' 'lindsey' 'whitman' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'pete' 'carlson']
VALUE: 0.5555555555555556
(17,)
['nikki' 'greer' 'lindsey' 'whitman' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'pete' 'carlson']
VALUE: 1.1111111111111112
(17,)
['nikki' 'greer' 'lindsey' 'whitman' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'hyde' 'pete']
VALUE: 1.6666666666666667
(17,)
['nikki' 'greer' 'lindsey' 'whitman' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'hyde' 'pete']
VALUE: 2.2222222222222223
(17,)
['nikki' 'greer' 'whitman' 'lindsey' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'hyde' 'pete']
VALUE: 2.7777777777777777
(17,)
['nikki' 'greer' 'whitman' 'lindsey' 'colbert' 'mcmahon' 'jonas' 'peyton'
 'hyde' 'pete']
VALUE: 3.3333333333333335
(17,)
['nikki' 'greer' 'whitman' 'lindsey' 'mcmahon' 'colbert' 'peyton' 'hyde'
 'jonas' 'williamson']
VALUE: 3.8888888888888893
(17,)


### glove

In [134]:
### PARAMETERS ###
decoder_out_dim = train_glove_embed.shape[1] # dim of decoder output layer
n_x = train_glove_embed.shape[1]
n_y = sg_train_word_embed.shape[1]
train_embed = train_glove_embed
train_word = sg_train_word_embed
test_embed = test_glove_embed
test_word = sg_test_word_embed
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 1, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_29 (InputLayer)           (None, 1000)         0                                            
__________________________________________________________________________________________________
concatenate_19 (Concatenate)    (None, 1300)         0           input_28[0][0]                   
                                                                 input_29[0][0]                   
__________________________________________________________________________________________________
dense_46 (Dense)                (None, 128)          166528      concatenate_19[0][0]             
__________

In [136]:
find_top_words(train_glove_embed, train_glove_token, test_glove_embed, test_glove_token, sg_train_word_embed, "glove","spine")

WORD: mathematics
8341
VALUE: 0.0
(1000,)
['disability' 'trouble' 'accident' 'situations' 'thread' 'grenade'
 'dragons' 'mistake' 'ricky' 'reeves']
VALUE: 0.5555555555555556
(1000,)
['disability' 'trouble' 'situations' 'accident' 'reeves' 'thread' 'ricky'
 'ghost' 'everybody' 'mike']
VALUE: 1.1111111111111112
(1000,)
['situations' 'disability' 'trouble' 'reeves' 'everybody' 'ricky' 'mike'
 'accident' 'ghost' 'somebody']
VALUE: 1.6666666666666667
(1000,)
['situations' 'disability' 'trouble' 'reeves' 'everybody' 'somebody'
 'mike' 'ricky' 'camera' 'ghost']
VALUE: 2.2222222222222223
(1000,)
['situations' 'trouble' 'disability' 'everybody' 'camera' 'reeves'
 'somebody' 'lights' 'mike' 'guy']
VALUE: 2.7777777777777777
(1000,)
['situations' 'camera' 'trouble' 'lights' 'disability' 'somebody'
 'everybody' 'guy' 'printer' 'reeves']
VALUE: 3.3333333333333335
(1000,)
['situations' 'camera' 'lights' 'guy' 'trouble' 'somebody' 'everybody'
 'disability' 'screen' 'chip']
VALUE: 3.8888888888888893
(1

['flag' 'picked' 'gum' 'whistle' 'implicated' 'locate' 'alerted' 'marshal'
 'tagged' 'identified']
VALUE: 3.8888888888888893
(1000,)
['flag' 'picked' 'gum' 'whistle' 'implicated' 'locate' 'marshal' 'alerted'
 'tagged' 'find']
VALUE: 4.444444444444445
(1000,)
['flag' 'picked' 'gum' 'whistle' 'find' 'implicated' 'marshal' 'tagged'
 'locate' 'identified']
VALUE: 5.0
(1000,)
['flag' 'picked' 'gum' 'find' 'whistle' 'recognize' 'marshal' 'dug'
 'tagged' 'identified']
WORD: internet
6647
VALUE: 0.0
(1000,)
['flag' 'retrieved' 'volunteered' 'ribbons' 'vital' 'gum' 'cigarettes'
 'manning' 'cigarette' 'revision']
VALUE: 0.5555555555555556
(1000,)
['flag' 'swami' 'ribbons' 'revision' 'locate' 'vital' 'wording'
 'retrieved' 'assigned' 'volunteered']
VALUE: 1.1111111111111112
(1000,)
['flag' 'swami' 'revision' 'locate' 'ribbons' 'refugees' 'krishna' 'share'
 'kyoto' 'wording']
VALUE: 1.6666666666666667
(1000,)
['flag' 'swami' 'disciple' 'refugees' 'krishna' 'revision' 'kyoto'
 'locate' 'astronomers

wordnet domains

In [139]:
### PARAMETERS ###
decoder_out_dim = train_glove_embed.shape[1] # dim of decoder output layer
n_x = train_glove_embed.shape[1]
n_y = glove_train_wordnet_cond_matrix_df.shape[1]
train_embed = train_glove_embed
train_word = glove_train_wordnet_cond_matrix_df
test_embed = test_glove_embed
test_word = glove_test_wordnet_cond_matrix_df
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 2, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_32 (InputLayer)           (None, 169)          0                                            
__________________________________________________________________________________________________
concatenate_21 (Concatenate)    (None, 469)          0           input_31[0][0]                   
                                                                 input_32[0][0]                   
__________________________________________________________________________________________________
dense_51 (Dense)                (None, 128)          60160       concatenate_21[0][0]             
__________

In [141]:
find_top_words(train_glove_embed, train_glove_token, test_glove_embed, test_glove_token, glove_train_wordnet_cond_matrix_df, "glove", "wordnet")

WORD: mathematics
8341
VALUE: 0.0
(169,)
['lau' 'software' 'expulsion' 'immoral' 'developments' 'cleaning' 'prices'
 'freeing' 'freed' 'contamination']
VALUE: 0.5555555555555556
(169,)
['lau' 'software' 'expulsion' 'immoral' 'developments' 'cleaning' 'prices'
 'freeing' 'freed' 'contamination']
VALUE: 1.1111111111111112
(169,)
['rumor' 'rumors' 'lectured' 'navigator' 'coding' 'tactical' 'auction'
 'generic' 'impaired' 'soon']
VALUE: 1.6666666666666667
(169,)
['rumor' 'rumors' 'lectured' 'navigator' 'coding' 'tactical' 'auction'
 'generic' 'impaired' 'soon']
VALUE: 2.2222222222222223
(169,)
['navigator' 'rumor' 'rumors' 'drivers' 'lectured' 'coding' 'astronaut'
 'chief' 'supervised' 'competence']
VALUE: 2.7777777777777777
(169,)
['navigator' 'rumor' 'rumors' 'drivers' 'lectured' 'coding' 'astronaut'
 'chief' 'supervised' 'competence']
VALUE: 3.3333333333333335
(169,)
['navigator' 'chief' 'drivers' 'astronaut' 'coding' 'supervised'
 'commander' 'spelling' 'rumor' 'supervising']
VALUE: 3.

pos

In [143]:
### PARAMETERS ###
decoder_out_dim = train_glove_embed.shape[1] # dim of decoder output layer
n_x = train_glove_embed.shape[1]
n_y = glove_train_pos_cond_matrix_df.shape[1]
train_embed = train_glove_embed
train_word = glove_train_pos_cond_matrix_df
test_embed = test_glove_embed
test_word = glove_test_pos_cond_matrix_df
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 2, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_34 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_35 (InputLayer)           (None, 19)           0                                            
__________________________________________________________________________________________________
concatenate_23 (Concatenate)    (None, 319)          0           input_34[0][0]                   
                                                                 input_35[0][0]                   
__________________________________________________________________________________________________
dense_56 (Dense)                (None, 128)          40960       concatenate_23[0][0]             
__________

In [144]:
find_top_words(train_glove_embed, train_glove_token, test_glove_embed, test_glove_token, glove_train_pos_cond_matrix_df, "glove", "pos")

WORD: mathematics
8341
VALUE: 0.0
(19,)
['software' 'condemned' 'pie' 'improve' 'expelled' 'electrons' 'phenomena'
 'camp' 'liberation' 'capt']
VALUE: 0.5555555555555556
(19,)
['software' 'condemned' 'pie' 'improve' 'expelled' 'electrons' 'phenomena'
 'camp' 'liberation' 'capt']
VALUE: 1.1111111111111112
(19,)
['pie' 'competitive' 'bench' 'shipbuilding' 'auto' 'automotive' 'table'
 'massa' 'expertise' 'baker']
VALUE: 1.6666666666666667
(19,)
['pie' 'competitive' 'bench' 'shipbuilding' 'auto' 'automotive' 'table'
 'massa' 'expertise' 'baker']
VALUE: 2.2222222222222223
(19,)
['pie' 'massa' 'competitive' 'bench' 'shipbuilding' 'auto' 'automotive'
 'mccain' 'baker' 'table']
VALUE: 2.7777777777777777
(19,)
['pie' 'massa' 'competitive' 'bench' 'shipbuilding' 'auto' 'automotive'
 'mccain' 'baker' 'table']
VALUE: 3.3333333333333335
(19,)
['massa' 'mccain' 'pie' 'bench' 'competitive' 'fungus' 'shipbuilding'
 'obama' 'craft' 'gardening']
VALUE: 3.8888888888888893
(19,)
['massa' 'mccain' 'pie' 'b

sentiment

In [145]:
### PARAMETERS ###
decoder_out_dim = train_glove_embed.shape[1] # dim of decoder output layer
n_x = train_glove_embed.shape[1]
n_y = glove_train_sentiment_cond_matrix_df.shape[1]
train_embed = train_glove_embed
train_word = glove_train_sentiment_cond_matrix_df
test_embed = test_glove_embed
test_word = glove_test_sentiment_cond_matrix_df
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 2, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_38 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
concatenate_25 (Concatenate)    (None, 304)          0           input_37[0][0]                   
                                                                 input_38[0][0]                   
__________________________________________________________________________________________________
dense_61 (Dense)                (None, 128)          39040       concatenate_25[0][0]             
__________

In [146]:
find_top_words(train_glove_embed, train_glove_token, test_glove_embed, test_glove_token, glove_train_sentiment_cond_matrix_df, "glove", "sentiment")

WORD: mathematics
8341
VALUE: 0.0
(4,)
['share' 'pie' 'danish' 'kris' 'advanced' 'tim' 'hu' 'jobs' 'automotive'
 'ibm']
VALUE: 0.5555555555555556
(4,)
['share' 'kris' 'danish' 'pie' 'automotive' 'tim' 'hu' 'disability'
 'profession' 'jobs']
VALUE: 1.1111111111111112
(4,)
['ken' 'profession' 'kris' 'harley' 'stein' 'sophia' 'teaching' 'retiring'
 'danish' 'dedication']
VALUE: 1.6666666666666667
(4,)
['ken' 'harley' 'abd' 'sophia' 'kris' 'arabic' 'retiring' 'profession'
 'retired' 'teaching']
VALUE: 2.2222222222222223
(4,)
['abd' 'ken' 'arabic' 'harley' 'kris' 'sophia' 'kerr' 'stan' 'kmt'
 'mercedes']
VALUE: 2.7777777777777777
(4,)
['abd' 'ken' 'stan' 'syed' 'kerr' 'arabic' 'kmt' 'mercedes' 'hines' 'khan']
VALUE: 3.3333333333333335
(4,)
['abd' 'stan' 'ken' 'syed' 'khan' 'rico' 'excellence' 'kerr' 'mcgee'
 'ahmed']
VALUE: 3.8888888888888893
(4,)
['abd' 'stan' 'syed' 'khan' 'ken' 'mcgee' 'rubin' 'excellence' 'rico' 'ko']
VALUE: 4.444444444444445
(4,)
['abd' 'stan' 'khan' 'syed' 'ken' 'mcge

['kris' 'groin' 'concentration' 'artificial' 'gong' 'esteem' 'trophies'
 'evangelical' 'okay' 'europeans']
VALUE: 4.444444444444445
(4,)
['groin' 'concentration' 'kris' 'okay' 'artificial' 'gong' 'nerve'
 'esteem' 'evangelical' 'trophies']
VALUE: 5.0
(4,)
['groin' 'concentration' 'okay' 'nerve' 'gong' 'artificial' 'evangelical'
 'esteem' 'kris' 'mick']
WORD: remote
2616
VALUE: 0.0
(4,)
['share' 'pie' 'danish' 'kris' 'advanced' 'tim' 'hu' 'jobs' 'automotive'
 'ibm']
VALUE: 0.5555555555555556
(4,)
['shipbuilding' 'kris' 'competitive' 'shipyard' 'carlo' 'pie' 'share'
 'massa' 'automotive' 'danish']
VALUE: 1.1111111111111112
(4,)
['shipbuilding' 'carlo' 'massa' 'bench' 'leadership' 'competitive' 'stein'
 'kris' 'migratory' 'pie']
VALUE: 1.6666666666666667
(4,)
['shipbuilding' 'leadership' 'carlo' 'massa' 'competence' 'migratory'
 'bench' 'competitive' 'pie' 'stein']
VALUE: 2.2222222222222223
(4,)
['leadership' 'competence' 'shipbuilding' 'migratory' 'competitive'
 'carlo' 'massa' 'pie' 'he

named entity

In [147]:
### PARAMETERS ###
decoder_out_dim = train_glove_embed.shape[1] # dim of decoder output layer
n_x = train_glove_embed.shape[1]
n_y = glove_train_entities_cond_matrix_df.shape[1]
train_embed = train_glove_embed
train_word = glove_train_entities_cond_matrix_df
test_embed = test_glove_embed
test_word = glove_test_entities_cond_matrix_df
### PARAMETERS ###

# define encoder
X = Input(shape=(n_x,))
label = Input(shape=(n_y,))
inputs = concat([X, label])
encoder_h = Dense(encoder_dim1, activation=activ)(inputs)
mu = Dense(n_z, activation='linear')(encoder_h)
l_sigma = Dense(n_z, activation='linear')(encoder_h)

# sample latent space
z = Lambda(sample_z, output_shape = (n_z, ))([mu, l_sigma])
zc = concat([z, label])

# decoder
decoder_hidden = Dense(decoder_dim, activation=activ)
decoder_out = Dense(decoder_out_dim, activation='sigmoid')
h_p = decoder_hidden(zc)
outputs = decoder_out(h_p)

# define graphs
cvae = Model([X, label], outputs)
encoder = Model([X, label], mu)
d_in = Input(shape=(n_z+n_y,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

# print statistics
cvae.compile(optimizer=optim, loss=vae_loss)

cvae.summary()

cvae_hist = cvae.fit([train_embed, train_word], train_embed, verbose = 2, batch_size=m, 
                 epochs=n_epoch,
                 validation_data = ([test_embed, test_word], test_embed),
                 callbacks = [EarlyStopping(patience = 5)])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_40 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_41 (InputLayer)           (None, 17)           0                                            
__________________________________________________________________________________________________
concatenate_27 (Concatenate)    (None, 317)          0           input_40[0][0]                   
                                                                 input_41[0][0]                   
__________________________________________________________________________________________________
dense_66 (Dense)                (None, 128)          40704       concatenate_27[0][0]             
__________

In [148]:
find_top_words(train_glove_embed, train_glove_token, test_glove_embed, test_glove_token, glove_train_entities_cond_matrix_df, "glove", "entity")

WORD: mathematics
8341
VALUE: 0.0
(17,)
['share' 'lacking' 'kris' 'crashes' 'pie' 'constantine' 'automotive'
 'danish' 'arbor' 'aerospace']
VALUE: 0.5555555555555556
(17,)
['share' 'lacking' 'kris' 'crashes' 'pie' 'constantine' 'automotive'
 'danish' 'arbor' 'aerospace']
VALUE: 1.1111111111111112
(17,)
['share' 'lacking' 'kris' 'crashes' 'constantine' 'pie' 'automotive'
 'aerospace' 'danish' 'arbor']
VALUE: 1.6666666666666667
(17,)
['share' 'lacking' 'kris' 'crashes' 'constantine' 'pie' 'automotive'
 'aerospace' 'danish' 'arbor']
VALUE: 2.2222222222222223
(17,)
['share' 'lacking' 'automotive' 'kris' 'constantine' 'crashes' 'pie'
 'aerospace' 'ibm' 'arbor']
VALUE: 2.7777777777777777
(17,)
['share' 'lacking' 'automotive' 'kris' 'constantine' 'crashes' 'pie'
 'aerospace' 'ibm' 'arbor']
VALUE: 3.3333333333333335
(17,)
['share' 'automotive' 'kris' 'pie' 'lacking' 'constantine' 'aerospace'
 'crashes' 'ibm' 'danish']
VALUE: 3.8888888888888893
(17,)
['share' 'automotive' 'kris' 'pie' 'lacking'

['share' 'pie' 'automotive' 'kris' 'lacking' 'crashes' 'aerospace' 'jobs'
 'danish' 'constantine']
VALUE: 5.0
(17,)
['pie' 'share' 'automotive' 'kris' 'aerospace' 'danish' 'lacking' 'jobs'
 'crashes' 'hu']
WORD: internet
6647
VALUE: 0.0
(17,)
['share' 'lacking' 'kris' 'crashes' 'pie' 'constantine' 'automotive'
 'danish' 'arbor' 'aerospace']
VALUE: 0.5555555555555556
(17,)
['share' 'lacking' 'kris' 'crashes' 'pie' 'constantine' 'automotive'
 'danish' 'arbor' 'aerospace']
VALUE: 1.1111111111111112
(17,)
['share' 'lacking' 'kris' 'crashes' 'constantine' 'pie' 'automotive'
 'aerospace' 'danish' 'arbor']
VALUE: 1.6666666666666667
(17,)
['share' 'lacking' 'kris' 'crashes' 'constantine' 'pie' 'automotive'
 'aerospace' 'danish' 'arbor']
VALUE: 2.2222222222222223
(17,)
['share' 'lacking' 'automotive' 'kris' 'constantine' 'crashes' 'pie'
 'aerospace' 'ibm' 'arbor']
VALUE: 2.7777777777777777
(17,)
['share' 'lacking' 'automotive' 'kris' 'constantine' 'crashes' 'pie'
 'aerospace' 'ibm' 'arbor']
VAL

In [None]:
def find_closest_word(xhat, train_w2v_embed = train_w2v_embed, train_w2v_token= train_w2v_token, most_similar_n = 1):
    """Use cosine distance to find the most similar word to the decoder output"""
    # xhat = decoder.predict(sample_word_3)
    cos_sim = abs(cosine_similarity(xhat, train_w2v_embed)).flatten() # calculate dist
    inx = np.argsort(cos_sim)[::-1][:most_similar_n] # the most similar, index
    return train_w2v_token[inx] # most similar word