This replicates the tutorial at: https://kavita-ganesan.com/easily-access-pre-trained-word-embeddings-with-gensim/#.Xo4RAy-ZNTY

## Imports

In [33]:
import warnings
warnings.filterwarnings('ignore')

from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

## GloVe embeddings

In [5]:
# download the model and return as object ready for use
model_glove_twitter = api.load('glove-twitter-25')



In [35]:
# Similar words
model_glove_twitter.wv.most_similar("speed", topn = 10)



[('edge', 0.9273095726966858),
 ('power', 0.9033836126327515),
 ('switch', 0.8989549279212952),
 ('duty', 0.8903731107711792),
 ('led', 0.8883816003799438),
 ('heavy', 0.8850037455558777),
 ('machine', 0.8788420557975769),
 ('system', 0.8769183158874512),
 ('clear', 0.8725026249885559),
 ('remote', 0.8723142743110657)]

In [36]:
#what doesn't fit?
model_glove_twitter.wv.doesnt_match(["trump","bernie","obama","pelosi","orange"])

'orange'

In [37]:
# show weight vector for trump and obama
model_glove_twitter["trump"],model_glove_twitter['obama']

(array([-0.56174 ,  0.69419 ,  0.16733 ,  0.055867, -0.26266 , -0.6303  ,
        -0.28311 , -0.88244 ,  0.57317 , -0.82376 ,  0.46728 ,  0.48607 ,
        -2.1942  , -0.41972 ,  0.31795 , -0.70063 ,  0.060693,  0.45279 ,
         0.6564  ,  0.20738 ,  0.84496 , -0.087537, -0.38856 , -0.97028 ,
        -0.40427 ], dtype=float32),
 array([ 0.77126 ,  0.81259 , -0.5901  , -0.015908, -0.082797, -1.2261  ,
         0.098286,  0.087488,  0.012586, -0.35884 ,  0.80733 ,  0.12569 ,
        -4.0522  ,  0.14856 ,  0.6988  , -0.78948 , -0.77125 ,  0.49512 ,
         0.16366 , -0.9713  ,  0.95064 ,  0.19921 , -0.27903 , -1.6844  ,
        -0.79424 ], dtype=float32))

## Rank phrases by similarity

The goal here is given a query phrase, rank all other phrases by semantic similarity (using the glove twitter embeddings) and compare that with surface level similarity using jaccard similarity index

In [38]:
import pandas as pd
from sklearn.metrics import jaccard_similarity_score

phrases=["barrack obama","barrack h. obama","barrack hussein obama","michelle obama","donald trump","melania trump"]
query="barack hussain obama"

results_glove=[]
results_jaccard=[]

def compute_jaccard(t1,t2):
    
    intersect = [value for value in t1 if value in t2] 
    
    union=[]
    union.extend(t1)
    union.extend(t2)
    union=list(set(union))
    
    
    jaccard=(len(intersect))/(len(union)+0.01)
    return jaccard
    

for p in phrases:
    tokens_1=[t for t in p.split() if t in model.wv.vocab]
    tokens_2=[t for t in query.split() if t in model.wv.vocab]
    
    #compute jaccard similarity
    jaccard=compute_jaccard(tokens_1,tokens_2)
    results_jaccard.append([p,jaccard])
    
    #compute cosine similarity using word embedings 
    cosine=0
    if (len(tokens_1) > 0 and len(tokens_2)>0):
        cosine=model_glove_twitter.wv.n_similarity(tokens_1,tokens_2)
        results_glove.append([p,cosine])

print("Phrases most similar to '{0}' using glove word embeddings".format(query))
pd.DataFrame(results_glove,columns=["phrase","score"]).sort_values(by=["score"],ascending=False)

NameError: name 'model' is not defined

In [None]:
print("Phrases most similar to '{0}' using jaccard similarity".format(query))
pd.DataFrame(results_jaccard,columns=["phrase","score"]).sort_values(by=["score"],ascending=False)

# More from GloVe

In [None]:
#again, download and load the model
model_gigaword = api.load("glove-wiki-gigaword-100")

# find similarity
model_gigaword.wv.most_similar(positive=['dirty','grimy'],topn=10)

## Load a dataset and train a model

Instead of loading pre-trained embeddings, you can also load a corpus and train it on demand. This list of datasets that you can download can be found here: https://github.com/RaRe-Technologies/gensim-data#datasets

In [39]:
from gensim.models.word2vec import Word2Vec

# this loads the text8 dataset
corpus = api.load('text8')

# train a Word2Vec model
model_text8 = Word2Vec(corpus,iter=10,size=150, window=10, min_count=2, workers=10)  # train a model from the corpus

# similarity 
model_text8.wv.most_similar("shocked")

# similarity between two different words
model_text8.wv.similarity(w1="dirty",w2="smelly")

# Which one is the odd one out in this list?
model_text8.wv.doesnt_match(["cat","dog","france"])



'france'

[('surprised', 0.6910276412963867),
 ('outraged', 0.6755304932594299),
 ('offended', 0.6535800695419312),
 ('angered', 0.6412442922592163),
 ('overwhelmed', 0.6343708038330078),
 ('betrayed', 0.6204996705055237),
 ('disappointed', 0.6147398948669434),
 ('afraid', 0.6128906607627869),
 ('told', 0.6092778444290161),
 ('welcomed', 0.603984534740448)]