# Word2vec in Gensim

Implement a simple word2vec estimator using [Gensim](https://radimrehurek.com/gensim/). Use the small Wikipedia corpus from 'enlang1.txt'.

In [0]:
# Fetch the small Wikipedia corpus used for training.
# -nc (no-clobber) skips the download when enlang1.txt already exists, so
# re-running this cell does not fetch the file again and save a duplicate copy.
!wget -nc https://raw.githubusercontent.com/mlcollege/deep-learning-rb/master/data/corpora/enlang1.txt

In [0]:
import gensim, logging
# Configure the root logger: show records at INFO level and above, each one
# rendered as "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Build the training corpus: every line of the file becomes one "sentence",
# represented as the list of its whitespace-separated tokens (an empty line
# yields an empty list, exactly as before).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes enlang1.txt is UTF-8 encoded -- confirm.
with open('enlang1.txt', encoding='utf-8') as f:
    # Iterating over the file object streams it line by line instead of
    # loading every line into memory with readlines() first.
    # str.split() with no arguments already ignores leading/trailing
    # whitespace, so a separate strip() is not needed.
    sentences = [line.split() for line in f]

# Train a word2vec model with 50-dimensional vectors; min_count=3 is passed
# through unchanged.
# Gensim 4.0 renamed the `size` keyword to `vector_size`, so passing `size`
# raises a TypeError there. Choose the keyword that matches the installed
# major version so this cell runs on both Gensim 3.x and 4.x.
_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
model = gensim.models.Word2Vec(sentences, min_count=3, **{_dim_kwarg: 50})

In [3]:
# Look up the learned vector for the token 'car' and print it.
car_vector = model.wv['car']
print(car_vector)


[-1.1598498  -0.25791538 -1.6610552  -1.4129301  -0.42355385 -0.19417267
 -2.405079   -0.18649796 -0.09387748  0.65342534  0.17040008  0.82497376
 -3.1298604   1.4710575   0.34527615 -1.8955553   1.3746815   1.6183246
  1.637082    2.1225793  -0.8191242   1.1500102  -1.4001088  -1.789148
  2.7371156   0.6125855   0.00738211 -0.6791236   0.37266815  1.9376935
 -0.77046585  3.1481955   2.0597782  -2.6471891   0.33094633  2.5554118
  1.2720813  -2.31104     0.41289717 -0.7712517  -1.4555893  -1.639989
 -1.0318247   0.3031134  -0.24505097 -1.9277464   0.5364946   0.6615622
 -0.26627937  0.8106867 ]


In [5]:
# Analogy query on the locally trained model:
# 'car' is to 'cars' as 'bus' is to ... ?
model.wv.most_similar(
    negative=['car'],
    positive=['cars', 'bus'],
)

  if np.issubdtype(vec.dtype, np.int):


[('buses', 0.905258297920227),
 ('routes', 0.8908281326293945),
 ('roads', 0.8641472458839417),
 ('platforms', 0.841183066368103),
 ('trains', 0.8157158493995667),
 ('facilities', 0.8042889833450317),
 ('viaducts', 0.8022995591163635),
 ('bikes', 0.8017879128456116),
 ('intercity', 0.8017406463623047),
 ('rail', 0.8009991645812988)]

# Import better models

Import word vectors trained on the [Common Crawl](https://fasttext.cc/docs/en/english-vectors.html) corpus (600B tokens) and play with them.

In [0]:
# Download and unpack the pre-trained vectors only when the unpacked file is
# missing. On a re-run the original two commands would download the archive
# again and then fail, because bunzip2 does not overwrite an existing
# crawl-300.vec. `&&` also keeps bunzip2 from running after a failed download.
![ -f crawl-300.vec ] || (wget -nc https://www.mlcollege.com/data/crawl-300.vec.bz2 && bunzip2 crawl-300.vec.bz2)

In [0]:
from gensim.models.keyedvectors import KeyedVectors
# Load the pre-trained vectors from the text-format word2vec file
# (binary=False selects the plain-text reader).
word_vectors = KeyedVectors.load_word2vec_format(
    'crawl-300.vec',
    binary=False,
)

In [19]:
# Analogy query: 'king' is to 'kings' as 'queen' is to ... ?
word_vectors.most_similar(
    negative=['king'],
    positive=['kings', 'queen'],
)

  if np.issubdtype(vec.dtype, np.int):


[('queens', 0.8387579917907715),
 ('queen.', 0.6004167795181274),
 ('monarchs', 0.5899761915206909),
 ('Queen', 0.5859926342964172),
 ('empresses', 0.577515184879303),
 ('princes', 0.5499585270881653),
 ('QUEEN', 0.5448766350746155),
 ('royals', 0.5442696809768677),
 ('princesses', 0.5383292436599731),
 ('royal', 0.5232110023498535)]

In [20]:
# Analogy query: 'man' is to 'woman' as 'husband' is to ... ?
word_vectors.most_similar(
    negative=['man'],
    positive=['woman', 'husband'],
)

  if np.issubdtype(vec.dtype, np.int):


[('wife', 0.7529045343399048),
 ('daughter', 0.6500851511955261),
 ('mother-in-law', 0.6470040082931519),
 ('spouse', 0.6457177996635437),
 ('husbands', 0.6331113576889038),
 ('mother', 0.6005340218544006),
 ('ex-husband', 0.5952433347702026),
 ('daughter-in-law', 0.5948172807693481),
 ('ex-wife', 0.5728636980056763),
 ('daughters', 0.5600825548171997)]

In [21]:
# Analogy query: 'France' is to 'Paris' as 'Spain' is to ... ?
word_vectors.most_similar(
    negative=['France'],
    positive=['Paris', 'Spain'],
)

  if np.issubdtype(vec.dtype, np.int):


[('Madrid', 0.8625079393386841),
 ('Barcelona', 0.7637038230895996),
 ('Sevilla', 0.6874053478240967),
 ('Seville', 0.6747831702232361),
 ('Malaga', 0.6494932174682617),
 ('Zaragoza', 0.645937442779541),
 ('Valencia', 0.6383105516433716),
 ('Alicante', 0.6115808486938477),
 ('Salamanca', 0.6041630506515503),
 ('Murcia', 0.6019026041030884)]

In [22]:
# Analogy query: 'Trump' is to 'Donald' as 'Putin' is to ... ?
word_vectors.most_similar(
    negative=['Trump'],
    positive=['Donald', 'Putin'],
)

  if np.issubdtype(vec.dtype, np.int):


[('Vladimir', 0.6446309089660645),
 ('Medvedev', 0.6112760901451111),
 ('Sergei', 0.5950402021408081),
 ('Dmitry', 0.5793238878250122),
 ('Oleg', 0.5696351528167725),
 ('Denis', 0.5639138221740723),
 ('Mikhail', 0.5574286580085754),
 ('Anatoly', 0.5540498495101929),
 ('Igor', 0.5533066987991333),
 ('Ivan', 0.5529454946517944)]