In [1]:
import gensim
import gensim.downloader
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Show the "models" section of the metadata returned by gensim.downloader.info():
# a dict keyed by model name, each value holding that model's details.
gensim.downloader.info()["models"]

{'fasttext-wiki-news-subwords-300': {'num_records': 999999,
  'file_size': 1005007116,
  'base_dataset': 'Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens)',
  'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/fasttext-wiki-news-subwords-300/__init__.py',
  'license': 'https://creativecommons.org/licenses/by-sa/3.0/',
  'parameters': {'dimension': 300},
  'description': '1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).',
  'read_more': ['https://fasttext.cc/docs/en/english-vectors.html',
   'https://arxiv.org/abs/1712.09405',
   'https://arxiv.org/abs/1607.01759'],
  'checksum': 'de2bb3a20c46ce65c9c131e1ad9a77af',
  'file_name': 'fasttext-wiki-news-subwords-300.gz',
  'parts': 1},
 'conceptnet-numberbatch-17-06-300': {'num_records': 1917247,
  'file_size': 1225497562,
  'base_dataset': 'ConceptNet, word2vec, GloVe, and OpenSubtitles 2016',
  'reader_code': 'https:/

In [3]:
# List only the model names (the keys of the "models" metadata dict).
gensim.downloader.info()["models"].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [4]:
# Load the pretrained Google News word2vec vectors through gensim's downloader.
MODEL_NAME = "word2vec-google-news-300"
word2vec_300 = gensim.downloader.load(MODEL_NAME)



In [5]:
# Word embeddings via get_vector.
# Keys in this Google News model are case-sensitive, hence the capitalised spellings.
vec1, vec2, vec3 = (
    word2vec_300.get_vector(word) for word in ("Superman", "Spiderman", "Superhero")
)

In [6]:
# Cosine similarity between "Superman" and "Spiderman" (about 0.55).
# cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
cosine_similarity(vec1[None, :], vec2[None, :])

array([[0.5477701]], dtype=float32)

In [7]:
# Cosine similarity between "Superman" and "Superhero" (about 0.55).
# cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
cosine_similarity(vec1[None, :], vec3[None, :])

array([[0.5475111]], dtype=float32)

In [8]:
# Ten nearest neighbours of the capitalised key "Capital";
# in the recorded run these are mostly finance-related names.
word2vec_300.most_similar(positive=["Capital"], topn=10)

[('Captial', 0.6502077579498291),
 ('Capital_Partners', 0.6405684351921082),
 ('Financial_Cp_COF', 0.581904411315918),
 ('Ltd_ACAS', 0.562832772731781),
 ('Edinburgh_Inspiring', 0.5587600469589233),
 ('LLC_Currency_Currents', 0.5564382076263428),
 ('www.internetcapital.com', 0.5522413849830627),
 ('Mitchelle_Stephenson', 0.5467348098754883),
 ('Development_Fund_UNCDF', 0.5438245534896851),
 ('LONG_BEACH_Mffais.com_Munder', 0.5372943878173828)]

In [9]:
# Ten nearest neighbours of the lower-case key "capital" (a different key from "Capital").
word2vec_300.most_similar(positive=["capital"], topn=10)

[('captial', 0.6443068981170654),
 ('worth_##mln_rub', 0.5211092829704285),
 ('worth_#.###bn_rub', 0.5162901282310486),
 ('worth_##.###bn_rub', 0.5028226971626282),
 ('Lima_Peruvians', 0.49432554841041565),
 ('thecapital', 0.4910687506198883),
 ('Bishkek_Otunbayeva', 0.4839523136615753),
 ('Andreessen_Horowitz_venture', 0.47973471879959106),
 ('EQUITY_Issued', 0.4725496470928192),
 ('liquidity', 0.4684329628944397)]

In [10]:
# Ten nearest neighbours of "king".
word2vec_300.most_similar(positive=["king"], topn=10)

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797567367553711),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422105193138123)]

In [11]:
# Sentence embeddings: get_mean_vector over the whitespace-separated tokens of each sentence.
sent1 = "The President is speaking to the media"
sent2 = "Barack Obama is addressing the press"

# NOTE: this rebinds vec1 and vec2, which held single-word vectors in an earlier cell.
vec1, vec2 = (
    word2vec_300.get_mean_vector(sentence.split()) for sentence in (sent1, sent2)
)

In [12]:
# Cosine similarity between the two sentence vectors (about 0.72).
# cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
cosine_similarity(vec1[None, :], vec2[None, :])

array([[0.7156141]], dtype=float32)

In [13]:
# For Text classification

import pandas as pd
import numpy as np

In [14]:
# Read spam.csv from the working directory, decoding it as latin-1.
# Later cells use its "Text" column.
spam_data = pd.read_csv("spam.csv", encoding='latin-1')

In [15]:
def _mean_vector_or_zeros(text):
    """Return the mean word2vec vector for the whitespace tokens of ``text``.

    ``get_mean_vector`` raises ``ValueError`` when given an empty key list,
    so a single empty or whitespace-only message would abort the whole
    ``apply``. Such messages get an all-zero vector of the model's
    dimensionality instead.
    NOTE(review): presumably this matches what gensim returns when none of
    the tokens are in the vocabulary; confirm.
    """
    tokens = text.split()
    if not tokens:
        return np.zeros(word2vec_300.vector_size, dtype=word2vec_300.vectors.dtype)
    return word2vec_300.get_mean_vector(tokens)


# One mean vector per message in the "Text" column.
output = spam_data["Text"].apply(_mean_vector_or_zeros)

In [16]:
# Preview the per-message embeddings: an object-dtype Series holding one array per row.
output

0       [0.017343024, 0.01556247, 0.002628992, 0.05181...
1       [-0.0421047, 0.028880663, 0.018837307, 0.02708...
2       [0.001205553, -0.027647695, -0.023975767, -0.0...
3       [-0.017826386, 0.015255115, 0.04653708, 0.0452...
4       [0.047918282, 0.027473427, 0.01856114, 0.05941...
                              ...                        
5567    [-0.0046107373, 0.011564551, 0.029548835, 0.03...
5568    [0.038656373, 0.04322333, 0.028277848, 0.05790...
5569    [0.02228614, 0.0064519444, 0.033777863, 0.0212...
5570    [0.032425433, 0.022816997, 0.013489032, 0.0376...
5571    [0.013331791, 0.0613884, 0.054488778, 0.020679...
Name: Text, Length: 5572, dtype: object

In [17]:
# Embedding of the message whose index label is 1.
output.loc[1]

array([-0.0421047 ,  0.02888066,  0.01883731,  0.02708547, -0.0238057 ,
        0.01182648, -0.03776784, -0.01704452, -0.01869419,  0.02374483,
       -0.03196044, -0.07632075, -0.10608982,  0.00310805, -0.04276608,
        0.06520871,  0.04272574,  0.01600952,  0.02860351, -0.04736773,
       -0.01502037, -0.01066177,  0.13683687, -0.00583687, -0.02046492,
        0.02294894, -0.06142147, -0.02458499, -0.00096316, -0.03443025,
       -0.00858044,  0.0346662 ,  0.01554852, -0.03529852, -0.04300201,
        0.00719438, -0.00493147,  0.0534094 , -0.01443013,  0.05284488,
       -0.02242807, -0.07494704,  0.1100094 ,  0.03379265,  0.04273374,
       -0.03027657,  0.0068957 , -0.05298407, -0.01162209,  0.04736043,
       -0.09307731,  0.03519025,  0.05872466,  0.05661248,  0.03705202,
        0.05284238, -0.06260754, -0.04092712,  0.01608422, -0.03271327,
        0.00946318,  0.00972249, -0.04190511, -0.00309412,  0.01432019,
       -0.05974102, -0.02545707,  0.00229635, -0.05444139,  0.01

In [18]:
# The Series is 1-D (one entry per message); each entry is itself an embedding array.
# NOTE(review): a classifier will likely need these stacked into a 2-D matrix,
# e.g. np.vstack(output); confirm against the downstream cells.
output.shape

(5572,)