In [25]:
import pandas as pd
import numpy as np

df = pd.read_csv(
    filepath_or_buffer='../../data-server/processed/hi_rws_0001_0256_processed.csv', 
    header=None, 
    names=['norm_tokens_doc'],
    nrows=1000
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
norm_tokens_doc    1000 non-null object
dtypes: object(1)
memory usage: 7.9+ KB


In [26]:
# file fix
from itertools import chain
from ast import literal_eval
df.norm_tokens_doc = df.norm_tokens_doc.apply(lambda x: literal_eval(x))
df['norm_tokens'] = df.norm_tokens_doc.apply(lambda x: list(chain(*x)))

In [28]:
df.norm_tokens.head()

0    [stumble, great, restaurant, overlook, ocean, ...
1    [excellent, view, ocean, sunset, excellent, fo...
2    [place, what, review, portray, starter, be, wa...
3    [be, excited, repeat, keoki, kauai, lovefest, ...
4    [look, tourist, spot, could, tell, other, revi...
Name: norm_tokens, dtype: object

In [29]:
df.norm_tokens_doc.head()

0    [(stumble, great, restaurant, overlook, ocean,...
1    [(excellent, view, ocean, sunset), (excellent,...
2    [(place, what, review, portray), (starter, be,...
3    [(be, excited, repeat, keoki, kauai, lovefest,...
4    [(look, tourist, spot), (could, tell, other, r...
Name: norm_tokens_doc, dtype: object

In [65]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = []
for i, norm_token in enumerate(df.norm_tokens):
    docs.append(TaggedDocument(norm_token, tags=[i]))
    
model = Doc2Vec(vector_size=5, window=5, min_count=5, workers=4, epochs=20)
model.build_vocab(docs)

for epoch in range(20):
    print('iteration {0}'.format(epoch), round(model.alpha, 4))
    model.train(docs,
                total_examples=model.corpus_count,
                epochs=epoch)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

iteration 0 0.03
iteration 1 0.02
iteration 2 0.02
iteration 3 0.02
iteration 4 0.02
iteration 5 0.02
iteration 6 0.02
iteration 7 0.02
iteration 8 0.02
iteration 9 0.02
iteration 10 0.02
iteration 11 0.02
iteration 12 0.02
iteration 13 0.02
iteration 14 0.02
iteration 15 0.02
iteration 16 0.02
iteration 17 0.02
iteration 18 0.02
iteration 19 0.02


In [51]:
v1 = model.infer_vector(df.norm_tokens[0])
print("V1_infer", v1)

V1_infer [-1.5833927  -0.15099673 -0.59771276  0.3375695  -0.15267925]


In [53]:
model.docvecs[1]

array([-3.656094 , -0.5240415, -1.1318668,  5.417684 ,  1.8410473],
      dtype=float32)

In [55]:
model.docvecs.most_similar(1)

[(120, 0.9888421297073364),
 (227, 0.9845942258834839),
 (346, 0.9814838171005249),
 (827, 0.9726826548576355),
 (847, 0.9725509881973267),
 (206, 0.9701384902000427),
 (557, 0.9696964621543884),
 (994, 0.9688946008682251),
 (772, 0.9659287929534912),
 (419, 0.9657495617866516)]

In [57]:
print(df.norm_tokens[1])
print(df.norm_tokens[227])

['excellent', 'view', 'ocean', 'sunset', 'excellent', 'food', 'fresh', 'fish', 'coconut', 'yuzu', 'husband', 'love', 'waitress', 'nice']
['bake', 'fish', 'pink', 'snapper', 'line', 'catch', 'good', 'go', 'recommend', 'place']
