### Introduction

The goal here is to reproduce the results of [gensim's doc2vec-lee.ipynb](https://github.com/RaRe-Technologies/gensim/tree/master/docs/notebooks).

Download the data files [lee_background.cor and lee.cor from gensim](https://github.com/RaRe-Technologies/gensim/tree/master/gensim/test/test_data) and save them in the `data` folder.

In [1]:
import pandas as pd
from pandas import DataFrame
from gensim.models.doc2vec import Doc2Vec
from gs import tagdocs
from gs import evaluate

In [2]:
# Load the background corpus: one document per line of the file.
# Each line is kept verbatim (including its trailing newline) as `text`,
# and its zero-based line index, as a string, becomes the `label`.
with open('data/lee_background.cor', 'r') as fin:
    lines = fin.readlines()
rows = [{'label': str(idx), 'text': text} for idx, text in enumerate(lines)]
df = DataFrame(rows)

In [3]:
# Show two randomly chosen rows as a sanity check of the loaded frame.
# No random_state is given, so the rows shown differ from run to run.
df.sample(n=2)

Unnamed: 0,label,text
46,46,Seven yachts have been forced to retire from t...
275,275,Australian cricket coach John Buchanan says hi...


In [4]:
# Convert the rows of `df` into tagged documents with the project-local
# `gs.tagdocs` helper; the result is what both Doc2Vec models are trained on.
docs = tagdocs(df)

In [5]:
# Peek at the first two tagged documents to check how the text was tokenised.
docs[:2]

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [6]:
# First model: 50-dimensional vectors, words occurring fewer than 2 times are
# dropped from the vocabulary, and training runs for 55 iterations.
model = Doc2Vec(
    size=50,
    min_count=2,
    iter=55,
)
model.build_vocab(docs)
# Kept as the last expression so the value returned by train() is displayed.
model.train(docs)

2348429

In [7]:
# Evaluate the model with the project-local `gs.evaluate` helper and print
# the number of labels next to the number of truthy entries in `hits`.
labels, hits = evaluate(df, model)
n_hits = sum(1 for hit in hits if hit)
print(len(labels), n_hits)

300 300


In [8]:
# Second model, replacing the first one bound to `model`: dm=0 selects the
# distributed bag-of-words training mode, with larger (200-d) vectors,
# negative sampling, aggressive down-sampling of frequent words and
# 200 training iterations spread over 4 worker threads.
model = Doc2Vec(
    dm=0,
    size=200,
    window=5,
    min_count=1,
    sample=1e-5,
    negative=5,
    workers=4,
    iter=200,
)
model.build_vocab(docs)
# Kept as the last expression so the value returned by train() is displayed.
model.train(docs)

2750006

In [9]:
# Evaluate the current `model` with the project-local `gs.evaluate` helper.
# The result is unpacked into `labels` (not an unused `total`) so that the
# count printed below comes from THIS evaluation rather than from a stale
# `labels` variable left over from an earlier cell.
labels, hits = evaluate(df, model)
# Print the number of labels next to the number of truthy entries in `hits`.
print(len(labels), sum(1 for hit in hits if hit))

300 300


In [10]:
# Words most similar to 'cricket' according to the most recently trained model.
# NOTE(review): that model was built with dm=0 and the neighbours below look
# unrelated with low scores — possibly the word vectors are not trained in
# this mode unless dbow_words=1 is set; confirm against the gensim docs.
model.most_similar('cricket')

[('changes', 0.23447872698307037),
 ('recognised', 0.23290976881980896),
 ('prayer', 0.23000554740428925),
 ('glowing', 0.22968699038028717),
 ('listeners', 0.2282227873802185),
 ('modern', 0.2158946543931961),
 ('into', 0.21520249545574188),
 ('subsequently', 0.2091997116804123),
 ('rudeina', 0.20876652002334595),
 ('ever', 0.2062593698501587)]

In [11]:
# Words most similar to 'nearby' according to the most recently trained model.
# NOTE(review): as with other word queries on this dm=0 model, the neighbours
# may not be meaningful if word vectors were left untrained — confirm.
model.most_similar('nearby')

[('help', 0.2819034457206726),
 ('piston', 0.2769801914691925),
 ('weather', 0.25383925437927246),
 ('grenades', 0.23806816339492798),
 ('documents', 0.23014771938323975),
 ('enthusiastic', 0.22506560385227203),
 ('bellis', 0.22070059180259705),
 ('vastly', 0.21293874084949493),
 ('dinner', 0.21138420701026917),
 ('fast', 0.20821930468082428)]