In [91]:
import pickle
import numpy as np
import bokeh
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.models import ColumnDataSource, LinearColorMapper
from bokeh.io import export_png
from bokeh.palettes import magma
from bokeh.layouts import gridplot
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

def load_word2vec_model(path):
    """Load a pickled Word2Vec model and build vocabulary lookup tables.

    Parameters
    ----------
    path : str
        Path of the pickle file holding the model. The loaded object must
        expose ``.wv.syn0`` and ``.wv.vocab``, where each vocab entry has
        ``.index`` and ``.count`` attributes.

    Returns
    -------
    word2vec_model : object
        The unpickled model itself.
    wv : object
        ``word2vec_model.wv.syn0`` (the word-vector matrix).
    idx2vocab : list
        Vocabulary words ordered by their ``index`` attribute.
    vocab2idx : dict
        Maps each word to its position in ``idx2vocab``.
    idx2count : list
        ``count`` of each word, aligned with ``idx2vocab``.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only open model files that come from a trusted source.
    with open(path, 'rb') as f:
        word2vec_model = pickle.load(f)

    # word vector
    wv = word2vec_model.wv.syn0

    # Sort the vocabulary by index once and derive both aligned lists from
    # the same ordering (the original sorted the full vocabulary twice).
    vocab_by_index = sorted(word2vec_model.wv.vocab.items(), key=lambda x: x[1].index)
    idx2vocab = [vocab for vocab, _ in vocab_by_index]
    idx2count = [entry.count for _, entry in vocab_by_index]
    vocab2idx = {vocab: idx for idx, vocab in enumerate(idx2vocab)}

    return word2vec_model, wv, idx2vocab, vocab2idx, idx2count

## 네이버 영화 커멘트 데이터

논문과 반대야??

In [41]:
# Load the Naver movie-review Word2Vec model together with its vocabulary tables
word2vec_path = '/mnt/lovit/works/fastcampus_text_deeplearning/5th/data/comments_172movies/movie_review_word2vec_model_v3.1.pkl'
(word2vec_model,
 wv,
 idx2vocab,
 vocab2idx,
 idx2count) = load_word2vec_model(word2vec_path)

# Project the word vectors onto the first 10 whitened principal components
pca = PCA(
    n_components=10,
    svd_solver='full',
    whiten=True,
    copy=True,
)
z = pca.fit_transform(wv)

In [42]:
# Fraction of the total variance explained by each of the 10 fitted components.
# Values recorded from an earlier run, kept here for reference:
# array([ 0.04644043,  0.03611285,  0.0351846 ,  0.03198047,  0.02939169,
#         0.027771  ,  0.02581117,  0.02377452,  0.02230856,  0.02146147])
pca.explained_variance_ratio_

array([ 0.04644043,  0.03611285,  0.0351846 ,  0.03198047,  0.02939169,
        0.027771  ,  0.02581117,  0.02377452,  0.02230856,  0.02146147])

In [43]:
def mscatter(p, x, y, c, size=4):
    """Add borderless, semi-transparent circle markers to figure ``p``.

    ``x`` and ``y`` are the marker coordinates, ``c`` the fill colour(s)
    and ``size`` the marker size handed to ``p.scatter``.
    """
    marker_style = dict(
        marker='circle',
        fill_color=c,
        fill_alpha=0.4,
        size=size,
        line_color=None,
    )
    p.scatter(x, y, **marker_style)

In [44]:
def draw_scatters(boundaries, z, idx2count):
    """Scatter the first two columns of ``z``, coloured by word-frequency band.

    Parameters
    ----------
    boundaries : sequence of (low, high)
        Frequency bands. Both ends are inclusive; when a frequency matches
        more than one band (e.g. a shared endpoint such as 10 in
        ``(0, 10), (10, 30)``), the FIRST matching band wins.
    z : 2-D array
        Point coordinates; column 0 is used as x and column 1 as y.
    idx2count : sequence of numbers
        Frequency of each point, aligned with the rows of ``z``.

    Returns
    -------
    plots : list
        One 500x500 figure per band. Points of that band use the band colour;
        all other points are drawn in white.
    merged_plot : figure
        One 800x800 figure with every point coloured by its band. Points that
        match no band use the last palette colour.
    """
    color_map = magma(len(boundaries)+3)

    def get_band(frequency):
        # Index of the first band containing `frequency`; -1 if none does.
        for i, (low, high) in enumerate(boundaries):
            if low <= frequency <= high:
                return i
        return -1

    # Assign every word to exactly one band up front so the per-band plots and
    # the merged plot agree. Previously a word whose count sat on a shared
    # endpoint was highlighted in two sub-plots but in only one merged colour.
    bands = [get_band(v) for v in idx2count]

    plots = []
    for i, (low, high) in enumerate(boundaries):
        p = figure(title = 'frequency (%d, %d)' % (low, high),
                   plot_width=500,
                   plot_height=500
                  )
        colors = [color_map[i] if band == i else '#ffffff' for band in bands]
        mscatter(p, z[:,0], z[:,1], colors)
        plots.append(p)

    merged_plot = figure(title = '(PC1, PC2) encoded by frequency (dark means low frequency)',
                         plot_width=800, plot_height=800)
    # band == -1 selects color_map[-1], the same fallback colour as before.
    colors = [color_map[band] for band in bands]
    mscatter(merged_plot, z[:,0], z[:,1], colors)

    return plots, merged_plot

In [45]:
# Inclusive (low, high) word-frequency bands used to colour the scatter plots
boundaries = [
    (0, 10), (10, 30), (30, 50), (50, 100),
    (100, 500), (500, 2000), (2000, 8000), (8000, 10000000),
]

# Build one figure per band plus the merged figure, then route bokeh output to the notebook
plots, merged_plot = draw_scatters(boundaries, z, idx2count)
output_notebook()

In [74]:
#show(merged_plot)



'/mnt/lovit/git/lovit.github.io/jupyter/understanding_word2vec_naver_all.png'

In [76]:
#export_png(merged_plot, filename='understanding_word2vec_naver_all.png')
# print('saved merged plot')

# Write every per-band scatter plot to its own PNG file
for i, plot in enumerate(plots):
    png_name = 'understanding_word2vec_naver_sub{}.png'.format(i)
    export_png(plot, filename=png_name)
    print('saved plot {}'.format(i))



saved plot 0




saved plot 1




saved plot 2




saved plot 3




saved plot 4




saved plot 5




saved plot 6




saved plot 7


In [57]:
# Print the nearest neighbours of a few rare words, with each word's corpus frequency
for word in ['켄시로', '나우유씨', '클러버필드', '와일더']:
    # Query through `.wv`, consistent with the `.wv.syn0` / `.wv.vocab` access
    # used when loading; the model-level `most_similar` is a deprecated wrapper.
    similars = word2vec_model.wv.most_similar(word)
    word_count = idx2count[vocab2idx[word]]
    print('\nword = {} ({})'.format(word, word_count))
    for similar_word, sim in similars:
        similar_count = idx2count[vocab2idx[similar_word]]
        print('  - {} ({}) = {}'.format(similar_word, similar_count, sim))


word = 켄시로 (5)
  - 클러버필드 (7) = 0.8885868787765503
  - 디오디오디오디오디오 (8) = 0.8865270614624023
  - 역스 (5) = 0.8859496116638184
  - qf (5) = 0.8835954666137695
  - 숨도못쉴만큼 (5) = 0.8809152841567993
  - 좋갯다 (5) = 0.879935622215271
  - 구웃구웃 (9) = 0.8792446851730347
  - 굳ㅋ굳ㅋ굳ㅋ굳ㅋ굳ㅋ (5) = 0.8777256011962891
  - 마니마니마니 (7) = 0.8768800497055054
  - 유월에 (5) = 0.8764756917953491

word = 나우유씨 (5)
  - 씨미 (47) = 0.5544609427452087
  - 로보 (408) = 0.5406851768493652
  - 트레 (42) = 0.537074089050293
  - 뱅 (13) = 0.533500611782074
  - 죤 (19) = 0.5286658406257629
  - 썩시딩 (9) = 0.5260113477706909
  - 니이이 (6) = 0.5208901166915894
  - 피아 (469) = 0.5202763080596924
  - 빠이 (50) = 0.519188642501831
  - 합류하 (14) = 0.5077943801879883

word = 클러버필드 (7)
  - characters (5) = 0.9774893522262573
  - 미라클잼 (5) = 0.9760110378265381
  - 유월에 (5) = 0.9756873846054077
  - 디오디오디오디오디오 (8) = 0.9700965285301208
  - 잡잡잡잡 (5) = 0.9691967368125916
  - 내꼬야 (5) = 0.9688028693199158
  - qf (5) = 0.9663950204849243
  - 굳굿굳굿굳 (5) = 0.9661939

In [58]:
# Print the nearest neighbours of a few frequent words, with each word's corpus frequency
for word in '영화 관람객 재미 연기 관상 클로버필드'.split():
    # Query through `.wv`, consistent with the `.wv.syn0` / `.wv.vocab` access
    # used when loading; the model-level `most_similar` is a deprecated wrapper.
    similars = word2vec_model.wv.most_similar(word)
    word_count = idx2count[vocab2idx[word]]
    print('\nword = {} ({})'.format(word, word_count))
    for similar_word, sim in similars:
        similar_count = idx2count[vocab2idx[similar_word]]
        print('  - {} ({}) = {}'.format(similar_word, similar_count, sim))


word = 영화 (1412516)
  - 애니 (6075) = 0.7358444929122925
  - 애니메이션 (7456) = 0.6823039650917053
  - 작품 (39544) = 0.6504106521606445
  - 명화 (708) = 0.6343749761581421
  - 드라마 (16306) = 0.6164193749427795
  - 에니메이션 (577) = 0.5870471000671387
  - 엉화 (126) = 0.5800251364707947
  - 수작 (5048) = 0.5750956535339355
  - 양화 (164) = 0.574091374874115
  - 블록버스터 (5015) = 0.5722830295562744

word = 관람객 (585858)
  - 굿굿 (14681) = 0.4626150131225586
  - 그치만 (1616) = 0.45037755370140076
  - 이지만 (8276) = 0.44437551498413086
  - 유쾌하고 (2810) = 0.44243335723876953
  - but (809) = 0.43762609362602234
  - 그러나 (9951) = 0.43672603368759155
  - 듯하면서도 (72) = 0.4354483187198639
  - 아주 (24571) = 0.4312880337238312
  - 다만 (9957) = 0.42622146010398865
  - 였지만 (5319) = 0.4185895323753357

word = 재미 (344634)
  - 제미 (630) = 0.8922843933105469
  - 재이 (197) = 0.795185923576355
  - 잼이 (730) = 0.7624080181121826
  - 잼 (13098) = 0.7277223467826843
  - 짜임새 (3739) = 0.6783016920089722
  - 기다린보람이 (98) = 0.6516778469085693
  - 잼미 

## cosine kernel PCA

Kernel PCA는 모든 단어 쌍에 대한 pairwise distance 행렬을 계산해야 하므로 out of memory 오류가 발생함

In [13]:
# from sklearn.decomposition import KernelPCA
# kpca = KernelPCA(n_components=2, kernel='cosine')

# 64 Gb mem 
# z_kpca = kpca.fit_transform(word2vec_model.wv.syn0)

## axis variance

In [60]:
# Variance of each embedding dimension, plus the same values in descending order
variance = np.var(wv, axis=0)
sorted_variance = np.sort(variance)[::-1]

# x coordinates: one integer position per embedding dimension
x = list(range(variance.shape[0]))

In [77]:
from bokeh.plotting import figure, output_notebook, show

# output to static HTML file
#output_file("lines.html")

# route bokeh output to the notebook
output_notebook()

# line chart of the per-axis variance, sorted in descending order
p = figure(
    title="variance / axis in Word2Vec embedding (Naver movie comments)",
    x_axis_label='axis',
    y_axis_label='variance',
)
p.line(x, sorted_variance, legend="variance", line_width=2)

# the figure is exported to PNG rather than shown inline
# show(p)
export_png(p, filename='understanding_word2vec_naver_axis_variance.png')



'/mnt/lovit/git/lovit.github.io/jupyter/understanding_word2vec_naver_axis_variance.png'

In [92]:
# For every Reuters model in the (window, iteration) grid: fit a full-rank PCA
# and export a line chart of the explained-variance ratio per component.
for window in [2, 3, 5, 7]:
    for iteration in [3, 5, 10]:
        model_filehead = 'Reuters_w{}_m5_i{}'.format(window, iteration)
        model_name = 'Reuters window={}, min_count=5, iteration={}'.format(window, iteration)

        model_path = '/mnt/sdc2/word2vec_exploration/reuters-news-word2vec/models/word2vec_w{}_m5_i{}.pkl'.format(window, iteration)
        (word2vec_model,
         wv,
         idx2vocab,
         vocab2idx,
         idx2count) = load_word2vec_model(model_path)

        # PCA keeping as many components as there are embedding dimensions
        pca = PCA(
            n_components=wv.shape[1],
            svd_solver='full',
            whiten=True,
            copy=True,
        )
        z = pca.fit_transform(wv)

        # explained-variance ratio per component, in descending order
        variance = pca.explained_variance_ratio_
        sorted_variance = np.sort(variance)[::-1]
        x = list(range(variance.shape[0]))

        p = figure(
            title="variance / axis in Word2Vec embedding ({})".format(model_name),
            x_axis_label='axis',
            y_axis_label='variance',
        )
        p.line(x, sorted_variance, legend="variance", line_width=2)

        png_name = 'understanding_word2vec_{}_pca_variance.png'.format(model_filehead)
        export_png(p, filename=png_name)
        print('PCA explane {}'.format(model_name))

PCA explane Reuters window=2, min_count=5, iteration=3
PCA explane Reuters window=2, min_count=5, iteration=5
PCA explane Reuters window=2, min_count=5, iteration=10
PCA explane Reuters window=3, min_count=5, iteration=3
PCA explane Reuters window=3, min_count=5, iteration=5
PCA explane Reuters window=3, min_count=5, iteration=10
PCA explane Reuters window=5, min_count=5, iteration=3
PCA explane Reuters window=5, min_count=5, iteration=5
PCA explane Reuters window=5, min_count=5, iteration=10
PCA explane Reuters window=7, min_count=5, iteration=3
PCA explane Reuters window=7, min_count=5, iteration=5
PCA explane Reuters window=7, min_count=5, iteration=10


In [101]:
# For every Reuters model in the (window, iteration) grid: export the
# frequency-coloured PCA scatter plots and the per-axis variance chart.
for window in [2, 3, 5, 7]:
    for iteration in [3, 5, 10]:
        model_filehead = 'Reuters_w{}_m5_i{}'.format(window, iteration)
        model_name = 'Reuters window={}, min_count=5, iteration={}'.format(window, iteration)

        model_path = '/mnt/sdc2/word2vec_exploration/reuters-news-word2vec/models/word2vec_w{}_m5_i{}.pkl'.format(window, iteration)
        (word2vec_model,
         wv,
         idx2vocab,
         vocab2idx,
         idx2count) = load_word2vec_model(model_path)

        # PCA restricted to the first 10 components
        pca = PCA(
            n_components=10,
            svd_solver='full',
            whiten=True,
            copy=True,
        )
        z = pca.fit_transform(wv)
        print(model_name)
        print(pca.explained_variance_ratio_)

        # scatter plots: one merged figure plus one figure per frequency band
        plots, merged_plot = draw_scatters(boundaries, z, idx2count)

        merged_png = 'understanding_word2vec_{}_all.png'.format(model_filehead)
        export_png(merged_plot, filename=merged_png)
        print('saved merged plot')
        for i, plot in enumerate(plots):
            sub_png = 'understanding_word2vec_{}_sub{}.png'.format(model_filehead, i)
            export_png(plot, filename=sub_png)
            print('saved plot {}'.format(i))

        # variance of each raw embedding axis, in descending order
        variance = np.var(wv, axis=0)
        sorted_variance = np.sort(variance)[::-1]
        x = list(range(variance.shape[0]))

        p = figure(
            title="variance / axis in Word2Vec embedding ({})".format(model_name),
            x_axis_label='axis',
            y_axis_label='variance',
        )
        p.line(x, sorted_variance, legend="variance", line_width=2)
        variance_png = 'understanding_word2vec_{}_axis_variance.png'.format(model_filehead)
        export_png(p, filename=variance_png)

        print('\n\n')

Reuters window=2, min_count=5, iteration=3
[ 0.07251347  0.04826339  0.04012959  0.03158802  0.0284175   0.02682253
  0.02456756  0.02374059  0.0223418   0.02105032]
saved merged plot
saved plot 0
saved plot 1
saved plot 2
saved plot 3
saved plot 4
saved plot 5
saved plot 6
saved plot 7



Reuters window=2, min_count=5, iteration=5
[ 0.06546113  0.04603168  0.03927437  0.03003581  0.02794286  0.02543461
  0.02389497  0.02329861  0.02139375  0.02042051]
saved merged plot
saved plot 0
saved plot 1
saved plot 2
saved plot 3
saved plot 4
saved plot 5
saved plot 6
saved plot 7



Reuters window=2, min_count=5, iteration=10
[ 0.05833484  0.04200065  0.0376673   0.02835111  0.02736414  0.02345997
  0.02299153  0.02238982  0.02018784  0.01936296]
saved merged plot
saved plot 0
saved plot 1
saved plot 2
saved plot 3
saved plot 4
saved plot 5
saved plot 6
saved plot 7



Reuters window=3, min_count=5, iteration=3
[ 0.05911211  0.04619669  0.03855368  0.03227293  0.02818107  0.02613801
  0.025196

## Reuters similar words

In [95]:
# Shape of the word-vector matrix currently held in `wv`
# (i.e. of whichever model was loaded most recently).
wv.shape

(300678, 100)

In [96]:
# Peek at two slices of the index-ordered vocabulary: positions 510-519 and
# ten entries near the end of the list.
# NOTE(review): presumably lower indices are more frequent words — confirm.
print(idx2vocab[510:520])
print(idx2vocab[-200:-190])

['offer', 'source', 'point', 'game', 'clear', 'future', 'lost', 'know', 'taking', 'Department']
['Shellback', 'Reflektor', 'Lazaretto', 'Suchman', 'Kissin', 'Maccabees', 'Oppikoppi', 'Koopsta', 'Knicca', 'Gra']


In [100]:
# Print the nearest neighbours of the words at vocabulary positions 510-519,
# with each word's corpus frequency
for word in ['offer', 'source', 'point', 'game', 'clear', 'future', 'lost', 'know', 'taking', 'Department']:
    # Query through `.wv`, consistent with the `.wv.syn0` / `.wv.vocab` access
    # used when loading; the model-level `most_similar` is a deprecated wrapper.
    similars = word2vec_model.wv.most_similar(word)
    word_count = idx2count[vocab2idx[word]]
    print('\nword = {} ({})'.format(word, word_count))
    for similar_word, sim in similars:
        similar_count = idx2count[vocab2idx[similar_word]]
        print('  - {} ({}) = {}'.format(similar_word, similar_count, sim))


word = offer (70274)
  - offering (35315) = 0.6627539396286011
  - bid (56866) = 0.6070243120193481
  - purchase (20995) = 0.5987930297851562
  - proposal (37150) = 0.5755871534347534
  - buy (73425) = 0.5725027322769165
  - receive (22050) = 0.5671845078468323
  - offers (16412) = 0.5657945871353149
  - deal (208557) = 0.5439972877502441
  - appeal (23098) = 0.5417824983596802
  - unsolicited (1427) = 0.5362376570701599

word = source (70065)
  - sources (66331) = 0.8027632236480713
  - official (118857) = 0.7853236198425293
  - person (36569) = 0.6788262128829956
  - aide (10990) = 0.6749960780143738
  - diplomat (12020) = 0.6449186205863953
  - banker (7946) = 0.5791378617286682
  - staffer (834) = 0.5574541091918945
  - matter (46652) = 0.5555279850959778
  - participant (1081) = 0.5449809432029724
  - spokesperson (3471) = 0.5401448011398315

word = point (69646)
  - moment (19169) = 0.6767750978469849
  - points (94945) = 0.5941938757896423
  - juncture (650) = 0.565805435180664

In [99]:
# Print the nearest neighbours of ten words taken from near the end of the
# vocabulary list, with each word's corpus frequency
for word in ['Shellback', 'Reflektor', 'Lazaretto', 'Suchman', 'Kissin', 'Maccabees', 'Oppikoppi', 'Koopsta', 'Knicca', 'Gra']:
    # Query through `.wv`, consistent with the `.wv.syn0` / `.wv.vocab` access
    # used when loading; the model-level `most_similar` is a deprecated wrapper.
    similars = word2vec_model.wv.most_similar(word)
    word_count = idx2count[vocab2idx[word]]
    print('\nword = {} ({})'.format(word, word_count))
    for similar_word, sim in similars:
        similar_count = idx2count[vocab2idx[similar_word]]
        print('  - {} ({}) = {}'.format(similar_word, similar_count, sim))


word = Shellback (5)
  - keyboardist (82) = 0.6243799924850464
  - Frideric (7) = 0.607430636882782
  - Chick (48) = 0.5969504117965698
  - co-writer (146) = 0.5964188575744629
  - singer) (17) = 0.5938587188720703
  - Sings (15) = 0.5919713973999023
  - Ralph: (5) = 0.5872100591659546
  - saxophonist (56) = 0.5869432687759399
  - Menken; (5) = 0.5822124481201172
  - Amis (53) = 0.5791580080986023

word = Reflektor (5)
  - naveen (5) = 0.8889473676681519
  - Kaczorowski; (5) = 0.887866199016571
  - com/gen92k (8) = 0.8824543952941895
  - alonso (7) = 0.8821945190429688
  - Davis/Greg (5) = 0.8812547922134399
  - guttsman (5) = 0.8811526894569397
  - yoon (8) = 0.8794268369674683
  - LUZERNE (6) = 0.8785741329193115
  - SKOLKOVO (6) = 0.8781652450561523
  - 13-09173 (6) = 0.8780122399330139

word = Lazaretto (5)
  - MINISERIES/TELEVISION (6) = 0.759848952293396
  - Groupings (5) = 0.7597087621688843
  - Kinosis (5) = 0.7527410984039307
  - Davis/Greg (5) = 0.7436151504516602
  - 2017-2

## 네이버 영화 full PCA

In [85]:
# Reload the Naver movie-review model; the loops above left a different model in these names
word2vec_path = '/mnt/lovit/works/fastcampus_text_deeplearning/5th/data/comments_172movies/movie_review_word2vec_model_v3.1.pkl'
(word2vec_model,
 wv,
 idx2vocab,
 vocab2idx,
 idx2count) = load_word2vec_model(word2vec_path)

# PCA keeping as many components as there are embedding dimensions
pca = PCA(
    n_components=wv.shape[1],
    svd_solver='full',
    whiten=True,
    copy=True,
)
z = pca.fit_transform(wv)

In [86]:
# Explained-variance ratio of every principal component of the Naver model
print(pca.explained_variance_ratio_)

# plotting: one merged figure plus one figure per frequency band
plots, merged_plot = draw_scatters(boundaries, z, idx2count)

model_filehead = 'Naver_fullpca'
# Set the display name here: otherwise `model_name` still holds the value left
# by the last Reuters loop iteration and the variance figure below would be
# titled with a Reuters model.
model_name = 'Naver movie comments'
export_png(merged_plot, filename='understanding_word2vec_{}_all.png'.format(model_filehead))
print('saved merged plot')
for i, plot in enumerate(plots):
    export_png(plot, filename='understanding_word2vec_{}_sub{}.png'.format(model_filehead, i))
    print('saved plot {}'.format(i))

# Axis variance: variance of each raw embedding dimension, in descending order
variance = np.var(wv, axis=0)
sorted_variance = np.sort(variance)[::-1]
x = list(range(variance.shape[0]))
p = figure(title="variance / axis in Word2Vec embedding ({})".format(model_name),
           x_axis_label='axis', y_axis_label='variance')
p.line(x, sorted_variance, legend="variance", line_width=2)
# Last expression of the cell: export_png's return value is shown as the cell output
export_png(p, filename='understanding_word2vec_{}_axis_variance.png'.format(model_filehead))

[ 0.04644043  0.03611285  0.0351846   0.03198047  0.02939169  0.027771
  0.02581117  0.02377452  0.02230856  0.02146147  0.01984337  0.01756978
  0.01737837  0.01676816  0.01542087  0.0145461   0.01420028  0.01386659
  0.01325141  0.01304987  0.01208675  0.01174449  0.01163264  0.01144043
  0.01132122  0.0110087   0.01056165  0.01045729  0.01030297  0.01025839
  0.01011186  0.00992099  0.00940054  0.0093483   0.00920374  0.00889445
  0.00883293  0.00871238  0.0085994   0.00836486  0.0082335   0.00813713
  0.00804035  0.00792316  0.00784196  0.00775318  0.00754591  0.00745072
  0.00733453  0.00727622  0.0071425   0.00704949  0.00699183  0.00686726
  0.00676652  0.00667727  0.00660653  0.00650219  0.0064626   0.00641843
  0.00628546  0.00617355  0.00615048  0.00608294  0.00596566  0.00588608
  0.00586216  0.00581904  0.00578196  0.00573752  0.00565861  0.00557409
  0.00553298  0.00545536  0.00540291  0.00531776  0.00527867  0.00520243
  0.00513352  0.00499867  0.00495172  0.00490348  0.0

'/mnt/lovit/git/lovit.github.io/jupyter/understanding_word2vec_Naver_fullpca_axis_variance.png'