# Word2vec in Gensim

Train a simple word2vec model using [Gensim](https://radimrehurek.com/gensim/). Use the small Wikipedia corpus in `enlang1.txt`.

In [0]:
!wget https://raw.githubusercontent.com/mlcollege/natural-language-processing/master/data/corpora/enlang1.txt

In [0]:
import logging

import gensim

# Show INFO-level log messages, timestamped, while the model is built.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Each line of the corpus file becomes one whitespace-tokenized "sentence".
# The encoding is passed explicitly so the result does not depend on the
# platform default, and the file is streamed line by line instead of being
# loaded in full with readlines().
with open('enlang1.txt', encoding='utf-8') as f:
    sentences = [line.strip().split() for line in f]

# Train 50-dimensional vectors; min_count=3 drops words seen fewer than 3 times.
# NOTE: Gensim >= 4.0 renamed the `size` argument to `vector_size`.
model = gensim.models.Word2Vec(sentences, size = 50, min_count=3)

In [0]:
# Look up the learned embedding for a single word and print it.
car_vector = model.wv['car']
print(car_vector)


In [4]:
# Analogy query: which words are to 'bus' as 'cars' is to 'car'?
model.wv.most_similar(
    positive=['cars', 'bus'],
    negative=['car'],
)

2019-12-17 10:00:11,494 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('buses', 0.8988571763038635),
 ('routes', 0.8762962222099304),
 ('roads', 0.8647735714912415),
 ('trains', 0.825829803943634),
 ('airports', 0.8146514892578125),
 ('rail', 0.8137536644935608),
 ('transport', 0.8113287687301636),
 ('connecting', 0.8025506734848022),
 ('intercity', 0.8015417456626892),
 ('terminal', 0.7987492084503174)]

# Import better models

Import word vectors trained on the [Common Crawl](https://fasttext.cc/docs/en/english-vectors.html) corpus (600B tokens) and experiment with them.

In [5]:
!wget https://www.mlcollege.com/data/crawl-300.vec.bz2
!bunzip2 crawl-300.vec.bz2

--2019-12-17 10:00:12--  https://www.mlcollege.com/data/crawl-300.vec.bz2
Resolving www.mlcollege.com (www.mlcollege.com)... 91.239.200.57, 2a00:1ed0:2:0:1:5bef:c839:1
Connecting to www.mlcollege.com (www.mlcollege.com)|91.239.200.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 287960075 (275M) [application/x-bzip2]
Saving to: ‘crawl-300.vec.bz2’


2019-12-17 10:00:40 (10.4 MB/s) - ‘crawl-300.vec.bz2’ saved [287960075/287960075]



In [6]:
from gensim.models.keyedvectors import KeyedVectors

# Read the unpacked vectors; the file is in the text (non-binary) word2vec format.
VECTORS_PATH = 'crawl-300.vec'
word_vectors = KeyedVectors.load_word2vec_format(VECTORS_PATH, binary=False)

2019-12-17 10:01:40,234 : INFO : loading projection weights from crawl-300.vec
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-12-17 10:04:02,867 : INFO : loaded (500000, 300) matrix from crawl-300.vec


In [7]:
# Analogy query: which words are to 'queen' as 'kings' is to 'king'?
word_vectors.most_similar(
    positive=['kings', 'queen'],
    negative=['king'],
)

2019-12-17 10:04:02,874 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('queens', 0.8387579917907715),
 ('queen.', 0.6004167795181274),
 ('monarchs', 0.5899761915206909),
 ('Queen', 0.5859926342964172),
 ('empresses', 0.577515184879303),
 ('princes', 0.5499585270881653),
 ('QUEEN', 0.5448766350746155),
 ('royals', 0.5442696809768677),
 ('princesses', 0.5383292436599731),
 ('royal', 0.5232110023498535)]

In [8]:
# Analogy query: which words are to 'husband' as 'woman' is to 'man'?
word_vectors.most_similar(
    positive=['woman', 'husband'],
    negative=['man'],
)

  if np.issubdtype(vec.dtype, np.int):


[('wife', 0.7529045343399048),
 ('daughter', 0.6500851511955261),
 ('mother-in-law', 0.6470040082931519),
 ('spouse', 0.6457177996635437),
 ('husbands', 0.6331113576889038),
 ('mother', 0.6005340218544006),
 ('ex-husband', 0.5952433347702026),
 ('daughter-in-law', 0.5948172807693481),
 ('ex-wife', 0.5728636980056763),
 ('daughters', 0.5600825548171997)]

In [9]:
# Analogy query: which words are to 'Spain' as 'Paris' is to 'France'?
word_vectors.most_similar(
    positive=['Paris', 'Spain'],
    negative=['France'],
)

  if np.issubdtype(vec.dtype, np.int):


[('Madrid', 0.8625079393386841),
 ('Barcelona', 0.7637038230895996),
 ('Sevilla', 0.6874053478240967),
 ('Seville', 0.6747831702232361),
 ('Malaga', 0.6494932174682617),
 ('Zaragoza', 0.645937442779541),
 ('Valencia', 0.6383105516433716),
 ('Alicante', 0.6115808486938477),
 ('Salamanca', 0.6041630506515503),
 ('Murcia', 0.6019026041030884)]

In [10]:
# Analogy query: which words are to 'Putin' as 'Donald' is to 'Trump'?
word_vectors.most_similar(
    positive=['Donald', 'Putin'],
    negative=['Trump'],
)

  if np.issubdtype(vec.dtype, np.int):


[('Vladimir', 0.6446309089660645),
 ('Medvedev', 0.6112760901451111),
 ('Sergei', 0.5950402021408081),
 ('Dmitry', 0.5793238878250122),
 ('Oleg', 0.5696351528167725),
 ('Denis', 0.5639138221740723),
 ('Mikhail', 0.5574286580085754),
 ('Anatoly', 0.5540498495101929),
 ('Igor', 0.5533066987991333),
 ('Ivan', 0.5529454946517944)]