In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def tokenize_text(text, stop_words, stemmer):
    def is_int(s):
        try: 
            int(s)
            return True
        except ValueError:
            return False
        
    tokens = word_tokenize(text.lower())
    tokens = filter(lambda t: len(t) > 2, tokens)
    tokens = filter(lambda t: t not in stop_words, tokens)
    tokens = filter(lambda t: is_int(t) is False, tokens)
    tokens = map(lambda t: stemmer.stem(t), tokens)
    return list(tokens)
    
docs = [
    'The Art of Computer Programming',
    'Computer Programming Learn Any Programming Language In 2 Hours',
    'The Self-Taught Programmer The Definitive Guide to Programming Professionally',
    'The Complete Software Developers Career Guide How to Learn Your Next Programming Language',
    'Cracking the Coding Interview 189 Programming Questions and Solutions',
    'The Economics Book Big Ideas Simply Explained',
    'Economics in One Lesson The Shortest and Surest Way to Understand Basic Economics',
    'Basic Economics',
    'Aftermath Seven Secrets of Wealth Preservation in the Coming Chaos',
    'Economics 101 From Consumer Behavior to Competitive Markets Everything You Need to Know About Economics'
]
tags = ('comp ' * 5).strip().split(' ') + ('econ ' * 5).strip().split(' ')
stop_words = set(stopwords.words('english')) 
stemmer = PorterStemmer()

documents = [TaggedDocument(tokenize_text(doc, stop_words, stemmer), [i]) 
                 for i, (doc, tag) in enumerate(zip(docs, tags))]

In [2]:
for doc in documents:
    print(doc)

TaggedDocument(['art', 'comput', 'program'], [0])
TaggedDocument(['comput', 'program', 'learn', 'program', 'languag', 'hour'], [1])
TaggedDocument(['self-taught', 'programm', 'definit', 'guid', 'program', 'profession'], [2])
TaggedDocument(['complet', 'softwar', 'develop', 'career', 'guid', 'learn', 'next', 'program', 'languag'], [3])
TaggedDocument(['crack', 'code', 'interview', 'program', 'question', 'solut'], [4])
TaggedDocument(['econom', 'book', 'big', 'idea', 'simpli', 'explain'], [5])
TaggedDocument(['econom', 'one', 'lesson', 'shortest', 'surest', 'way', 'understand', 'basic', 'econom'], [6])
TaggedDocument(['basic', 'econom'], [7])
TaggedDocument(['aftermath', 'seven', 'secret', 'wealth', 'preserv', 'come', 'chao'], [8])
TaggedDocument(['econom', 'consum', 'behavior', 'competit', 'market', 'everyth', 'need', 'know', 'econom'], [9])


In [3]:
import multiprocessing
from tqdm import tqdm

cpu_count = multiprocessing.cpu_count()
vector_size = 5
alpha = 0.025

model = Doc2Vec(vector_size=vector_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                workers=cpu_count,
                dm=1)

model.build_vocab([d for d in tqdm(documents)])

100%|██████████| 10/10 [00:00<00:00, 26749.39it/s]


In [20]:
from sklearn.utils import shuffle

max_epochs = 500

for epoch in range(max_epochs):
    model.train(shuffle([d for d in documents]), total_examples=model.corpus_count, epochs=model.epochs)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

In [21]:
test_data = word_tokenize('Elements of Programming Interviews in Python The Insiders Guide'.lower())
v1 = model.infer_vector(test_data, steps=20, alpha=0.025)
print("V1_infer", v1)

V1_infer [ 0.07805808 -0.00471411 -0.03416988  0.05626874 -0.02646747]


In [23]:
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

X = np.array([model.docvecs[d.tags[0]] for d in documents], dtype=np.double)
Y = np.array([model.infer_vector(d.words, steps=50) for d in documents], dtype=np.double)
print(cosine_similarity(X, Y))

for doc in documents:
    trained = model.docvecs[doc.tags[0]]
    inferred = model.infer_vector(doc.words, steps=20, alpha=0.025)
    print(cosine(trained, inferred))  # cosine similarity from scipy

[[-0.32205365 -0.91234583 -0.63008066  0.79771684 -0.99334589 -0.99121843
  -0.18140417  0.92323718 -0.92079499 -0.98197714]
 [-0.31968745 -0.90218232 -0.62383598  0.78749905 -0.99378482 -0.99358022
  -0.15476682  0.91378258 -0.92721101 -0.98421522]
 [-0.31422213 -0.89957183 -0.60021412  0.80191147 -0.99581106 -0.98857751
  -0.19580523  0.93262018 -0.92368952 -0.9873972 ]
 [-0.26873803 -0.90076222 -0.62245437  0.80570971 -0.9971991  -0.98724518
  -0.14962747  0.91921108 -0.91992753 -0.98167963]
 [-0.30895081 -0.89977726 -0.5946612   0.78127805 -0.99515865 -0.99278009
  -0.19653136  0.91851479 -0.93453697 -0.98470567]
 [-0.30027906 -0.90255925 -0.58933573  0.79777547 -0.99393494 -0.98531753
  -0.22599743  0.93245158 -0.92385817 -0.98224667]
 [-0.27294645 -0.89515616 -0.580418    0.80423611 -0.99462809 -0.9806002
  -0.21866061  0.93341334 -0.92069549 -0.98119631]
 [-0.31011314 -0.88911392 -0.55471435  0.78494702 -0.99190371 -0.98371297
  -0.24932135  0.9330635  -0.93104791 -0.98453912]
 

In [11]:
similar_doc = model.docvecs.most_similar(0)
print(similar_doc)

[(1, 0.9839052557945251), (3, 0.9119670391082764), (2, 0.9066317677497864), (4, 0.8704017996788025), (9, 0.837632417678833), (8, 0.8258793354034424), (5, 0.7376419305801392), (7, 0.7004565596580505), (6, 0.6726831793785095)]


In [12]:
print(model.docvecs[0])

[-1.2742983   0.9502279  -0.99983233 -2.6495306  -2.0976636 ]


# Links
* https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5