### Load data

앞서 학습한 doc2vec model 을 읽어옵니다.

In [1]:
import warnings

import gensim

# Show which Gensim release this notebook is running against.
print('Gensim version = {}'.format(gensim.__version__))

# Suppress all warnings for the rest of the session to keep cell output clean.
warnings.filterwarnings(action='ignore')

Gensim version = 3.8.1


이전 (Gensim <= 3.6.x) 에는 document vectors 가 `Doc2Vec.docvecs.doctag_syn0` 에 있었습니다.

이 노트북에서 사용하는 Gensim 3.8.1 에서는 document vectors 가 `Doc2Vec.docvecs.vectors_docs` 에 있으며, Gensim >= 4.0.0 에서는 `Doc2Vec.dv.vectors` 로 옮겨집니다.

In [2]:
from lovit_textmining_dataset.navermovie_comments import (
    load_id_to_movie,
    load_trained_embedding,
)

# Re-key the id -> title lookup so that every key becomes a '#<id>' string.
tag_to_movie = {
    '#%s' % movie_id: movie_title
    for movie_id, movie_title in load_id_to_movie().items()
}

# Load the pre-trained doc2vec embedding.
doc2vec_model = load_trained_embedding(
    embedding='doc2vec',
    tokenize='soynlp_unsup',
    data_name='large',
)

# Sanity check: shape and container type of the document-vector matrix.
print(
    doc2vec_model.docvecs.vectors_docs.shape,
    type(doc2vec_model.docvecs.vectors_docs),
    sep='\n',
)

(172, 100)
<class 'numpy.ndarray'>


In [3]:
doctags = doc2vec_model.docvecs.doctags

# tag -> offset; the offset is used as a row index into the embedding
# when labelling points in the scatter plot.
tag_to_idx = {tag: doctags[tag].offset for tag in doctags}

# Inverse lookups, ordered by ascending offset.
idx_to_tag = sorted(tag_to_idx, key=tag_to_idx.get)
idx_to_movie = list(map(tag_to_movie.__getitem__, idx_to_tag))

# Preview the first three titles.
idx_to_movie[:3]

['고사 두 번째 이야기: 교생실습', '박쥐', '해무']

### t-SNE

In [4]:
%%time 

from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, metric='cosine')
y_tsne = tsne.fit_transform(doc2vec_model.docvecs.vectors_docs)

CPU times: user 984 ms, sys: 28 ms, total: 1.01 s
Wall time: 1.01 s


### preparing Bokeh

In [5]:
from bokeh.plotting import ColumnDataSource, figure, output_notebook, show

# Have Bokeh render its figures inline in the notebook.
output_notebook()

def draw_scatterplot(tags, title, embedding):
    """Scatter every row of ``embedding`` and put a text label on selected rows.

    Besides its arguments, this function reads the notebook-level lookups
    ``idx_to_movie`` (hover descriptions), ``tag_to_idx`` (tag -> row index)
    and ``tag_to_movie`` (tag -> label text).

    Parameters
    ----------
    tags : iterable
        Keys of ``tag_to_idx`` / ``tag_to_movie`` whose points are annotated.
    title : str
        Title passed to the Bokeh figure.
    embedding : array-like
        Indexed as ``embedding[:, 0]`` / ``embedding[row, 0]``; the first two
        columns are used as x and y coordinates.

    Returns
    -------
    The Bokeh figure; it is not shown here, the caller decides when to show it.
    """
    # Hover tooltip: cursor coordinates plus the per-point description.
    hover_fields = [
        ("(x,y)", "($x, $y)"),
        ("desc", "@desc"),
    ]

    # One record per embedding row.
    xs = embedding[:, 0].tolist()
    ys = embedding[:, 1].tolist()
    points = ColumnDataSource(data={'x': xs, 'y': ys, 'desc': idx_to_movie})

    # Base scatter plot on a white, grid-less canvas.
    fig = figure(plot_width=600, plot_height=600, tooltips=hover_fields, title=title)
    fig.grid.grid_line_color = None
    fig.background_fill_color = "white"
    fig.scatter(
        'x', 'y', source=points, marker="circle", size=5,
        line_color="navy", fill_color="orange", alpha=0.5,
    )

    # Annotate each requested tag at the position of its embedding row.
    for tag in tags:
        row = tag_to_idx[tag]
        label = tag_to_movie[tag]
        label_x, label_y = embedding[row, 0], embedding[row, 1]
        fig.text(
            label_x, label_y, text=[label], text_color="black",
            text_align="center", text_font_size="10pt",
        )

    return fig

In [6]:
# (display name, doctag) pairs of the movies to query.
queries = [
    ('라라랜드', '#134963'),
    ('관상', '#93728'),
]

for query_name, query_tag in queries:
    # Nearest neighbours of the query movie as (tag, similarity) pairs;
    # only the tags are needed for the annotations.
    similars = doc2vec_model.docvecs.most_similar(query_tag)
    similar_tags, _ = zip(*similars)

    # One plot per query: all movies as points, the neighbours labelled.
    p = draw_scatterplot(
        tags=similar_tags,
        title='"{}" 유사 영화'.format(query_name),
        embedding=y_tsne,
    )
    show(p)