In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv(
    filepath_or_buffer='../../data/processed/hi_rws_0001_0256_processed.csv', 
    header=None, 
    names=['norm_tokens_doc'],
    nrows=1000
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
norm_tokens_doc    1000 non-null object
dtypes: object(1)
memory usage: 7.9+ KB


In [2]:
# file fix
from itertools import chain
from ast import literal_eval
df.norm_tokens_doc = df.norm_tokens_doc.apply(lambda x: literal_eval(x))
df['norm_tokens'] = df.norm_tokens_doc.apply(lambda x: list(chain(*x)))

In [3]:
df.norm_tokens.head()

0    [stumble, great, restaurant, overlook, ocean, ...
1    [excellent, view, ocean, sunset, excellent, fo...
2    [place, what, review, portray, starter, walk, ...
3    [excited, repeat, keoki, kauai, lovefest, part...
4    [look, tourist, spot, could, tell, other, revi...
Name: norm_tokens, dtype: object

In [4]:
df.norm_tokens_doc.head()

0    [(stumble, great, restaurant, overlook, ocean,...
1    [(excellent, view, ocean, sunset), (excellent,...
2    [(place, what, review, portray), (starter, wal...
3    [(excited, repeat, keoki, kauai, lovefest, par...
4    [(look, tourist, spot), (could, tell, other, r...
Name: norm_tokens_doc, dtype: object

In [23]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = []
for i, norm_token in enumerate(df.norm_tokens):
    docs.append(TaggedDocument(norm_token, tags=[i]))
    
model = Doc2Vec(vector_size=8, window=5, min_count=5, workers=4, epochs=10)
model.build_vocab(docs)

for epoch in range(10):
    print('iteration {0}'.format(epoch), round(model.alpha, 4))
    model.train(docs,
                total_examples=model.corpus_count,
                epochs=epoch)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

iteration 0 0.025
iteration 1 0.0248
iteration 2 0.0246
iteration 3 0.0244
iteration 4 0.0242
iteration 5 0.024
iteration 6 0.0238
iteration 7 0.0236
iteration 8 0.0234
iteration 9 0.0232


In [24]:
X = []
for norm_token in df.norm_tokens[:20]:
    v = model.infer_vector(norm_token, alpha=model.alpha, steps=10)
    X.append(v)
    print(v)

[-0.0128735  -0.00175112 -0.14733122  0.04462452  0.10733464  0.211001
  0.3709563  -0.06415744]
[-0.02308503  0.09367393 -0.02145127  0.07479919  0.05140461  0.06081245
  0.06499862  0.00950072]
[-0.12083384  0.18723033 -0.36883876  0.02520585  0.3758736   0.31960443
  0.78572774 -0.01358689]
[-0.29434308  0.42585114 -0.28646222 -0.23193839  0.2992898   0.08091611
  0.30105683 -0.1269898 ]
[-0.1216435   0.3036373  -0.36133906  0.12667844  0.3099317   0.0971282
  0.54056406 -0.17312063]
[-0.05302937  0.18524486 -0.07554426  0.08786791  0.14252311  0.10472614
  0.19809969 -0.00528354]
[ 0.05424517  0.06343371 -0.04671352  0.08356301  0.09515156 -0.05661555
  0.07675306  0.0018847 ]
[-0.043973    0.12069135 -0.29421648  0.05756711  0.24247009  0.10671855
  0.42696905 -0.06563081]
[-0.17638639  0.16010065 -0.17442039 -0.09821399  0.08557218 -0.03781883
  0.33034962  0.02741913]
[-0.0757033   0.19272095 -0.27174577  0.12532946  0.28237864  0.04786527
  0.53450465 -0.03863899]
[-0.04818025 

In [36]:
model.vocabulary()

TypeError: 'Doc2VecVocab' object is not callable

In [26]:
model.docvecs.most_similar(1)

[(936, 0.9446977972984314),
 (675, 0.9426066875457764),
 (739, 0.931880533695221),
 (612, 0.9311423897743225),
 (556, 0.9275106191635132),
 (241, 0.9132927060127258),
 (847, 0.9102202653884888),
 (255, 0.9092434644699097),
 (158, 0.8943102955818176),
 (770, 0.8904882669448853)]

In [27]:
print(df.norm_tokens[1])
print(df.norm_tokens[241])

['excellent', 'view', 'ocean', 'sunset', 'excellent', 'food', 'have', 'fresh', 'fish', 'coconut', 'yuzu', 'husband', 'love', 'waitress', 'nice']
['food', 'amazing', 'great', 'service', 'atmosphere', 'have', 'lobster', 'crust', 'fish', 'skewer', 'shrimp', 'pasta', 'table', 'everyone', 'love', 'food', 'would', 'recommend', 'place', 'would', 'come']


In [19]:
from sklearn.cluster import Birch
 
brc = Birch(branching_factor=50, n_clusters=5, threshold=0.1, compute_labels=True)
brc.fit(X)
brc.predict(X)

array([4, 1, 2, 3, 2, 1, 1, 0, 1, 2, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0])

In [15]:
clusters

array([4, 1, 2, 3, 2, 1, 1, 0, 1, 2, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0])