In [1]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import warnings

# Ignore every warning category for the rest of the session (presumably to keep cell output compact).
# NOTE(review): this blanket filter also hides deprecation notices — consider narrowing it.
warnings.filterwarnings("ignore")

__Gensim's word2vec family of algorithms__ uses highly optimized C routines, data streaming, and Pythonic interfaces.

The word2vec algorithms include skip-gram and CBOW models, using either hierarchical softmax or negative sampling.

__Other embeddings__: Doc2Vec, FastText...

For a tutorial on Gensim word2vec, with an interactive web app trained on GoogleNews, visit https://rare-technologies.com/word2vec-tutorial/.

## Train your own model & word2vec embeddings

Initialize a model, load the training text (`common_texts` here), and choose the parameters.

In [2]:
# Show the small tokenized sample corpus imported from gensim.test.utils.
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

__Model Parameters__

size : int, optional
    Dimensionality of the word vectors.
    
window : int, optional
    Maximum distance between the current and predicted word within a sentence.
    
min_count : int, optional
    Ignores all words with total frequency lower than this.
    
workers : int, optional
    Use these many worker threads to train the model (=faster training with multicore machines).
    
sg : {0, 1}, optional 
    Training algorithm: 1 for skip-gram; otherwise CBOW.


In [3]:
# Train a skip-gram (sg=1) Word2Vec model on the sample corpus and persist it to disk.
# NOTE(review): `path` is computed but never used below — the model is saved to
# "word2vec.model" relative to the current working directory instead.
path = get_tmpfile("word2vec.model")
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4, sg=1)
model.save("word2vec.model")

The training is streamed, meaning sentences can be a generator, reading input data from disk on-the-fly, without loading the entire corpus into RAM.

It also means you can continue training the model later:

In [2]:
# Reload the saved model and continue training it on additional sentences.
model = Word2Vec.load("word2vec.model")
more_sentences = [["hello", "world", "blabla"]]
# Words unseen during the initial training must be added to the vocabulary first;
# otherwise train() skips them and reports 0 effective words.
model.build_vocab(more_sentences, update=True)
model.train(more_sentences, total_examples=len(more_sentences), epochs=1)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


(0, 3)

The trained word vectors are stored in a KeyedVectors instance in model.wv:

In [75]:
# Look up the trained embedding for a single vocabulary word.
vector = model.wv['computer']

In [76]:
# Embedding dimensionality; matches the `size` the model was created with.
len(vector)

100

The reason for separating the trained vectors into KeyedVectors is that if you don’t need the full model state any more (don’t need to continue training), the state can be discarded, resulting in a much smaller and faster object that can be mmapped for lightning fast loading and sharing the vectors in RAM between processes:

In [9]:
# Save only the KeyedVectors (not the full trainable model) and reload them memory-mapped.
# Writing to a path from get_tmpfile avoids leaving a stray "model.wv" file in the working directory.
path = get_tmpfile("wordvectors.kv")
model.wv.save(path)
wv = KeyedVectors.load(path, mmap='r')  # mmap='r' opens the stored arrays read-only via memory mapping
vector = wv['computer']  # numpy vector of a word

In [4]:
# Keep a reference to the word vectors, then drop the full model object.
word_vectors = model.wv
del model

Note that there is a gensim.models.phrases module which lets you automatically detect phrases longer than one word. Using phrases, you can learn a word2vec model where “words” are actually multiword expressions, such as new_york_times or financial_crisis:

In [12]:
from gensim.models.phrases import Phrases, Phraser

# Build a phrase (bigram) detector from the sample corpus; min_count and threshold are both set to 1.
bigram_transformer = Phrases(common_texts, min_count=1, threshold=1)
# Train Word2Vec on the phrase-transformed corpus rather than on the raw token lists.
model = Word2Vec(bigram_transformer[common_texts], min_count=1)

In [13]:
# Wrap the trained Phrases model in a Phraser and apply it to one tokenized sentence.
bigram = Phraser(bigram_transformer)
bigram[['trees', 'graph', 'minors']]

['trees', 'graph_minors']

In [14]:
# Print every phrase-transformed sentence; the * passes each one as a separate print() argument.
print(*bigram_transformer[common_texts])

['human', 'interface', 'computer'] ['survey', 'user', 'computer', 'system', 'response_time'] ['eps', 'user', 'interface', 'system'] ['system', 'human', 'system', 'eps'] ['user', 'response_time'] ['trees'] ['graph', 'trees'] ['graph_minors', 'trees'] ['graph_minors', 'survey']


## Pretrained Word2Vec

In [17]:
from gensim.models import KeyedVectors
import numpy as np

# Location of the pretrained GoogleNews vectors and how many of them to read.
# Kept as named constants so they can be adjusted in one place.
GOOGLE_NEWS_PATH = "../GoogleNews-vectors-negative300.bin"
VOCAB_LIMIT = 50000  # cap on the number of word vectors read from the file

wv_embeddings = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_PATH, binary=True, limit=VOCAB_LIMIT)

In [18]:
# Membership test: is "dog" among the loaded vectors?
"dog" in wv_embeddings

True

In [19]:
# Analogy query (woman + king - man); report the best match and its score to four decimals.
result = wv_embeddings.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(result[0][0], result[0][1]))

queen: 0.7118


In [20]:
# Same query without the negative term: the five words closest to 'woman' and 'king' combined.
wv_embeddings.most_similar(positive=['woman', 'king'], topn=5)

[('man', 0.6628605723381042),
 ('queen', 0.64385586977005),
 ('girl', 0.61360764503479),
 ('princess', 0.6087509393692017),
 ('monarch', 0.5900578498840332)]

In [21]:
# Five nearest neighbours of 'data' and 'science' combined.
wv_embeddings.most_similar(positive=['data', 'science'], topn=5)

[('scientific', 0.6142281293869019),
 ('research', 0.565909206867218),
 ('Data', 0.5410994291305542),
 ('sciences', 0.5278131365776062),
 ('biology', 0.5095877647399902)]

In [22]:
# Pick the word that fits least with the others in the list.
wv_embeddings.doesnt_match(['breakfast', 'cereal', 'dinner', 'lunch'])

'cereal'

In [23]:
# Another odd-one-out query, this time over animal names.
wv_embeddings.doesnt_match(['mouse', 'rat', 'elephant', 'cat'])

'elephant'

In [24]:
# From the candidate list, return the word most similar to 'music'.
wv_embeddings.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])

'sound'

In [26]:
# From the candidate list, return the word most similar to 'pizza'.
wv_embeddings.most_similar_to_given('pizza', ['water', 'salad', 'cheese', 'sushi'])

'sushi'

In [27]:
# Similarity score between the vectors of the two words.
wv_embeddings.similarity('woman', 'man')

0.76640123

## FastText

In [28]:
from gensim.models import FastText

In [29]:
# Create an untrained FastText model with 4-dimensional vectors, then build its vocabulary and train it
# on the sample corpus.
model = FastText(size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train

In [30]:
# "computer" occurs in the training corpus, so it is part of the vocabulary.
"computer" in model.wv.vocab

True

In [33]:
# Vector for an in-vocabulary word (4 values, matching size=4).
model.wv["computer"]

array([-0.01146877,  0.05369632, -0.04157733, -0.0052736 ], dtype=float32)

In [34]:
# "comp" never appears as a token in the corpus, so it is not in the vocabulary.
"comp" in model.wv.vocab

False

In [35]:
# FastText still returns a vector for the out-of-vocabulary token "comp"
# (presumably composed from its character n-grams — confirm against the gensim FastText docs).
oov_vec = model.wv["comp"]
oov_vec

array([0.04221442, 0.02641019, 0.0447193 , 0.0286179 ], dtype=float32)

In [36]:
# Analogy-style query on the toy FastText model: computer + human - interface.
model.wv.most_similar(positive=['computer', 'human'], negative=['interface'])

[('graph', 0.344123899936676),
 ('time', 0.3010402321815491),
 ('survey', 0.20177102088928223),
 ('minors', 0.030677050352096558),
 ('trees', -0.2112969309091568),
 ('user', -0.24868673086166382),
 ('eps', -0.25328660011291504),
 ('system', -0.3587323725223541),
 ('response', -0.7912449240684509)]

In [37]:
# Odd-one-out query; note that "tree" (singular) is not a token in the training corpus.
model.wv.doesnt_match("human computer interface tree".split())

'tree'

In [38]:
# Similarity score between two in-vocabulary words.
model.wv.similarity('computer', 'human')

0.80814165

## Doc2Vec

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [4]:
# Show the sample corpus again before tagging it for Doc2Vec.
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [5]:
# Tag each tokenized text with its position in the corpus, then fit a small Doc2Vec model on the result.
documents = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(common_texts)
]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [6]:
# Inspect the tagged documents: each pairs a token list with a single integer tag.
documents

[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
 TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]),
 TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]),
 TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]),
 TaggedDocument(words=['user', 'response', 'time'], tags=[4]),
 TaggedDocument(words=['trees'], tags=[5]),
 TaggedDocument(words=['graph', 'trees'], tags=[6]),
 TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]),
 TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]

In [7]:
# Round-trip the Doc2Vec model through a file whose path comes from get_tmpfile.
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)
model = Doc2Vec.load(fname)  # you can continue training with the loaded model!

If you’re finished training a model (=no more updates, only querying, reduce memory usage), you can do:

In [8]:
# Drop training-only state while keeping the document-tag vectors and the ability to infer new vectors.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

Infer vector for a new document:

In [9]:
# The new document is passed as a list of tokens.
model.infer_vector(["system", "response"])

array([ 0.0775036 ,  0.00616904,  0.01824351,  0.03122561, -0.03967457],
      dtype=float32)

In [14]:
# Find the training documents most similar to the ones tagged 6 and 7 (passed together as the query),
# then print the best hit as a (tag, similarity) pair.
similar_doc = model.docvecs.most_similar([6,7])
print(similar_doc[0])

(4, 0.5975821018218994)


In [47]:
# Look up the learned vector of a training document by its tag (here tag 1).
print(model.docvecs[1])

[-0.06235855 -0.05433509 -0.01355862 -0.01756822 -0.06218205]
