In [1]:
from pathlib import Path

import gensim.downloader as api
from gensim.models import (
    Word2Vec
)
from gensim.models.word2vec import (
    LineSentence
)

# Train Word2Vec on PTB Dataset

* [Penn Treebank Dataset](https://deepai.org/dataset/penn-treebank)

> The Penn Treebank (PTB) dataset is widely used in machine learning for NLP (Natural Language Processing) research. Word-level PTB does not contain capital letters, numbers, or punctuation, and the vocabulary is capped at 10k unique words. This is relatively small in comparison to most modern datasets and can result in a larger number of out-of-vocabulary tokens.

* Raw data files (this notebook expects `ptb.train.txt` to be saved under `~/.keras/datasets/`): https://raw.githubusercontent.com/tomsercu/lstm/master/data/

In [2]:
# Location of the PTB training split on local disk.
ptb_train_path = "~/.keras/datasets/ptb.train.txt"

# LineSentence iterates over the file lazily, yielding one sentence per line
# with tokens split on whitespace, so the corpus is not loaded into memory at once.
sentences = LineSentence(source=ptb_train_path)

In [3]:
# Hyperparameters are gathered in one mapping so they can be inspected or
# tweaked without touching the constructor call.
w2v_params = {
    "sg": 0,             # 0 selects CBOW; 1 would select skip-gram
    "window": 5,         # max distance between the current and the predicted word
    "negative": 5,       # number of negative ("noise") samples per positive example
    "vector_size": 100,  # dimensionality of the learned word vectors
    "min_count": 1,      # keep every word, including those that occur only once
    "workers": 4,        # number of training threads
}

# Supplying `sentences` to the constructor builds the vocabulary and trains
# the model in one step.
w2v = Word2Vec(sentences=sentences, **w2v_params)

In [4]:
# Create the output directory for saved models; does nothing if it already exists.
# pathlib is used instead of shelling out to `mkdir -p` so the cell also works
# on platforms without a POSIX shell (e.g. Windows).
Path("./model").mkdir(parents=True, exist_ok=True)

In [5]:
# Write the trained model to disk; the file name records the vector size used above.
w2v_model_path = "./model/gensim_w2v_vecsize_100"
w2v.save(w2v_model_path)

In [6]:
# Sanity check on the PTB-trained vectors: list the ten words whose vectors
# are closest to the query word, as (word, similarity) pairs.
query_word = 'cash'
w2v.wv.most_similar(query_word, topn=10)

[('amount', 0.9150373935699463),
 ('debt', 0.8811702132225037),
 ('value', 0.8553045392036438),
 ('payment', 0.8456965088844299),
 ('denominations', 0.8313669562339783),
 ('dividend', 0.828631579875946),
 ('face', 0.8225265741348267),
 ('minimum', 0.8147336840629578),
 ('assets', 0.8144380450248718),
 ('dividends', 0.8115537762641907)]

# Use the Word2Vec model pre-trained on the Google News dataset

* [Word2Vec Demo](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#word2vec-demo)

In [7]:
# Load the 300-dimensional Google News word vectors through gensim's downloader
# (`api` is `gensim.downloader`, imported in the notebook's import cell).
# The first call downloads the model, which is large, into gensim's local data
# cache; subsequent calls load it from that cache.
wv = api.load('word2vec-google-news-300')

In [8]:
# Word analogy: which words lie closest to the vector king - man + woman?
king, man, woman = (wv[word] for word in ("king", "man", "woman"))
analogy_vector = king - man + woman

# The query words themselves (in any letter case) tend to rank near the top,
# so they are filtered out of the neighbour list.
excluded = {"king", "man", "woman"}

# most_similar yields (word, similarity) pairs, best match first.
candidates: list = [
    (word, similarity)
    for word, similarity in wv.most_similar(analogy_vector)
    if word.lower() not in excluded
]

candidates[:3]

[('queen', 0.7300517559051514),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475)]

In [19]:
# Word analogy: real_madrid is to spain as ? is to italy.
spain, real_madrid, italy = (
    wv[word] for word in ("spain", "real_madrid", "italy")
)
analogy_vector = real_madrid - spain + italy

# Drop the query words themselves (in any letter case) from the neighbour list.
excluded = {"spain", "real_madrid", "italy"}

# most_similar yields (word, similarity) pairs, best match first.
candidates: list = [
    (word, similarity)
    for word, similarity in wv.most_similar(analogy_vector)
    if word.lower() not in excluded
]

candidates[:3]

[('juventus', 0.6757157444953918),
 ('juve', 0.6393407583236694),
 ('mancini', 0.6235371828079224)]