## 参考
- https://github.com/theeluwin/pytorch-sgns

In [1]:
# Query the NVIDIA driver to see whether a GPU is attached to this runtime.
# In the recorded run this printed a driver-communication error, and the notebook later selected 'cpu'.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [2]:
# Mount Google Drive at /content/drive; DATA_DIR and SAVE_DIR used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')
print('Mount OK')

Mounted at /content/drive
Mount OK


In [3]:
# Setup steps taken from https://qiita.com/jun40vn/items/78e33e29dce3d50c2df1

# Install the morphological analysis library MeCab and its dictionary (mecab-ipadic-NEologd).
# Standard output of every command is discarded; the NEologd installer also has stderr discarded.
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab > /dev/null
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git > /dev/null 
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n > /dev/null 2>&1
!pip install mecab-python3 > /dev/null

# Avoid an error by symlinking /etc/mecabrc to /usr/local/etc/mecabrc.
!ln -s /etc/mecabrc /usr/local/etc/mecabrc

Cloning into 'mecab-ipadic-neologd'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 75 (delta 5), reused 54 (delta 0), pack-reused 0[K
Unpacking objects: 100% (75/75), done.


In [4]:
import os
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import MeCab

from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [5]:
# Google Drive folders used by this notebook:
#   DATA_DIR - input files (review TSV, stop-word list)
#   SAVE_DIR - saved artifacts (vocabulary pickles, model weights)
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/word2vec_pytorch/input/"
SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/word2vec_pytorch/output/"
# Create the output folder when it does not exist yet; an existing folder is not an error.
os.makedirs(SAVE_DIR, exist_ok=True)

In [6]:
# Load the tab-separated Amazon Japanese review file from DATA_DIR and preview it.
# NOTE(review): the original comment here described a corpus built from the Japanese Wikipedia
# (https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/ja.text8.zip),
# which does not match the file actually read below - confirm which source is intended.
df = pd.read_csv(os.path.join(DATA_DIR, 'amazon_reviews_multilingual_JP_v1_00.tsv'), sep='\t')
print(df.shape)
df.head()

(262256, 15)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,JP,65317,R33RSUD4ZTRKT7,B000001GBJ,957145596,SONGS FROM A SECRET GARDE,Music,1,1,15,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…,2012-12-05
1,JP,65317,R2U1VB8GPZBBEH,B000YPWBQ2,904244932,鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel),Music,1,4,20,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…,2012-12-05
2,JP,65696,R1IBRCJPPGWVJW,B0002E5O9G,108978277,Les Miserables 10th Anniversary Concert,Music,5,2,3,N,Y,ドリームキャスト,素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。,2013-03-02
3,JP,67162,RL02CW5XLYONU,B00004SRJ5,606528497,It Takes a Nation of Millions to Hold Us Back,Music,5,6,9,N,Y,やっぱりマスト,専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...,2013-08-11
4,JP,67701,R2LA2SS3HU3A3L,B0093H8H8I,509738390,Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...,PC,4,2,4,N,Y,コスパ的には十分,今までの環境（Core2 Duo E4600)に比べれば十分に快適になりました。<br />...,2013-02-10


In [7]:
# Alternative dictionary location, kept for reference:
# tagger = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
# Build the module-level MeCab tagger, pointing "-d" at the mecab-ipadic-NEologd dictionary directory.
path = "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
tagger = MeCab.Tagger(f"-Ochasen {path}")
def tokenize(text, target=['名詞', '動詞']):
    """Split ``text`` into tokens with MeCab and keep selected parts of speech.

    Args:
        text: String handed to the module-level ``tagger``.
        target: Part-of-speech labels to keep; a node is kept when the first
            comma-separated field of its feature string is in this container.
            The default list is never mutated.

    Returns:
        list[str]: For every kept node, field 6 of its feature string (the
        base form), or the node's surface string when that field is ``'*'``
        or absent.
    """
    # parseToNode returns the head of a linked list of nodes.
    node = tagger.parseToNode(text)

    result = []
    while node:
        # Split once per node instead of once per field access.
        features = node.feature.split(",")
        if features[0] in target:
            # Entries with fewer than seven fields have no base-form slot;
            # treat them like '*' instead of raising IndexError.
            base_form = features[6] if len(features) > 6 else '*'
            # The base form can be '*' (e.g. for alphabetic words); fall back
            # to the surface form in that case.
            result.append(base_form if base_form != '*' else node.surface)
        node = node.next

    return result

In [8]:
from bs4 import BeautifulSoup
def clean_html(text, strip=True):
    """Return the text content of ``text`` with its HTML markup removed.

    The input is parsed with BeautifulSoup's ``'html.parser'`` backend and
    ``strip`` is forwarded unchanged to ``get_text``.
    """
    return BeautifulSoup(text, 'html.parser').get_text(strip=strip)

In [9]:
import re
def nornalize_number(text):
    """Collapse every run of consecutive digits in ``text`` into a single ``'0'``."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('0', text)

In [10]:
def remove_symbol(text):
    """
    Replace every symbol character in ``text`` with a single space.

    Approach taken from https://ohke.hateblo.jp/entry/2019/02/09/141500
    """
    substitutions = (
        # Half-width (ASCII) symbols.
        (r'[!-/:-@[-`{-~]', r' '),
        # Full-width symbols: only the 0x25A0 - 0x266F block is removed here.
        (u'[■-♯]', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [11]:
# Load the stop-word list (one entry per line) from DATA_DIR into a set.
# The encoding is given explicitly so reading the Japanese text does not depend on the
# platform's default locale (assumes the file is UTF-8 - confirm if it comes from another source).
with open(os.path.join(DATA_DIR, 'stopwords_slothlib.txt'), 'r', encoding='utf-8') as f:
    stopwords = {w.strip() for w in f}
# Blank lines in the file would otherwise leave an empty-string "stop word" in the set.
stopwords.discard('')

# Extra stop words merged into the file-based set below: the '*' placeholder, single hiragana
# characters, the normalized digit tokens '0' and '1', single accented Latin characters,
# 'of' / 'the', and every single ASCII letter in both cases.
# ('å' is listed twice; duplicates in a set literal collapse to one entry.)
add_stopwords = {
    '*',
    'あ','い','う','え','お',
    'か','き','く','け','こ',
    'さ','し','す','せ','そ',
    'た','ち','つ','て','と',
    'な','に','ぬ','ね','の',
    'は','ひ','ふ','へ','ほ',
    'ま','み','む','め','も',
    'や','ゆ','よ',
    'わ' ,'を','ん',
    '0', '1', 
    'ã',
    'å',
    'å',
    'ä',
    'ï', 
    'è',
    'é',
    'æ',
    'ç',
    'of',
    'the',
    'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
    'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
}
# Union of the stop words read from the file and the extra ones above.
stopwords = stopwords | add_stopwords

def remove_stopwords(words):
    """Return a new list with the items of ``words`` that are not in the module-level ``stopwords`` set.

    The relative order of the remaining items is preserved.
    """
    return list(filter(lambda w: w not in stopwords, words))

In [12]:
# Raw input for preprocessing: the 'review_body' column of the review table.
X = df['review_body']

In [13]:
%%time
# 前処理+tokenization
X = [tokenize(remove_symbol(nornalize_number(clean_html(text))), target=['名詞']) for text in X]

# stopwordsの除去
X_rm_stopwords = []
for tokens in X:
    X_rm_stopwords.append([w for w in tokens if w not in stopwords])

X = X_rm_stopwords
# # スペース区切りの分かち書きの状態にする
# X = [' '.join(tokens) for tokens in X_rm_stopwords]

  ' that document to Beautiful Soup.' % decoded_markup


CPU times: user 3min 23s, sys: 3.17 s, total: 3min 26s
Wall time: 3min 26s


## word2vec

In [14]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# To force CPU execution, override here:
#device = 'cpu'
print(device)

cpu


In [15]:
import pickle
def _load_pickle(filename):
    """Unpickle ``filename`` from SAVE_DIR, closing the file handle afterwards.

    SECURITY: unpickling can execute arbitrary code; only load files that this
    project wrote itself.
    """
    with open(os.path.join(SAVE_DIR, filename), 'rb') as f:
        return pickle.load(f)


# Vocabulary artifacts read back from SAVE_DIR.
# NOTE(review): the cells that write these files are not part of this notebook view - confirm
# they exist before relying on Restart & Run All.
word_count = _load_pickle('word_count.pkl')
vocab = _load_pickle('vocab.pkl')
word2idx = _load_pickle('word2idx.pkl')
idx2word = _load_pickle('idx2word.pkl')

In [16]:
class Word2Vec(nn.Module):
    """Two embedding tables for word2vec: one for input words, one for output words.

    Input words and output words are each converted to vectors by their own
    ``nn.Embedding``; the embedding weights are the learnable parameters.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_size: Dimension of every word vector.
        padding_idx: Index of the padding row, passed to ``nn.Embedding``.
            That row is initialised to zeros. When ``None``, row 0 is still
            zero-initialised, matching the previous behaviour of this class.
    """

    def __init__(self, vocab_size=20000, embedding_size=300, padding_idx=0):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.ivectors = nn.Embedding(self.vocab_size, self.embedding_size, padding_idx=padding_idx)
        self.ovectors = nn.Embedding(self.vocab_size, self.embedding_size, padding_idx=padding_idx)
        # nn.Embedding stores a normalised (non-negative) padding index; reuse it so the
        # zero row lands on the row the layer actually treats as padding.
        zero_row_index = self.ivectors.padding_idx
        if zero_row_index is None:
            zero_row_index = 0
        # nn.Parameter has requires_grad=True by default, so no explicit flag is needed.
        self.ivectors.weight = nn.Parameter(self._initial_weight(zero_row_index))
        self.ovectors.weight = nn.Parameter(self._initial_weight(zero_row_index))

    def _initial_weight(self, zero_row_index):
        """Build a (vocab_size, embedding_size) matrix: uniform rows plus one zero row.

        Values are drawn from U(-0.5 / embedding_size, 0.5 / embedding_size) for
        ``vocab_size - 1`` rows, and a zero row is inserted at ``zero_row_index``.
        """
        bound = 0.5 / self.embedding_size
        rows = torch.empty(self.vocab_size - 1, self.embedding_size, dtype=torch.float32).uniform_(-bound, bound)
        zero_row = torch.zeros(1, self.embedding_size)
        return torch.cat([rows[:zero_row_index], zero_row, rows[zero_row_index:]])

    @staticmethod
    def _as_indices(data, table):
        """Convert ``data`` to an int64 index tensor on the device that holds ``table``'s weight."""
        # Following the weight's device keeps the lookup valid wherever the module was
        # moved, without relying on a notebook-level ``device`` variable.
        return torch.as_tensor(data, dtype=torch.long, device=table.weight.device)

    def forward(self, data):
        """Return the input-side vectors for ``data`` (same as ``forward_i``)."""
        return self.forward_i(data)

    def forward_i(self, data):
        """Look up the input-word vectors for the word indices in ``data``."""
        return self.ivectors(self._as_indices(data, self.ivectors))

    def forward_o(self, data):
        """Look up the output-word vectors for the word indices in ``data``."""
        return self.ovectors(self._as_indices(data, self.ovectors))

In [19]:
# Rebuild the model with one embedding row per entry of idx2word and 128-dimensional vectors.
# NOTE(review): embedding_size presumably has to match the saved checkpoint - confirm against
# the training run that produced 'word2vec.pt'.
vocab_size = len(idx2word)
embedding_size = 128

model = Word2Vec(vocab_size=vocab_size, embedding_size=embedding_size).to(device)
# Load the saved weights from SAVE_DIR; map_location maps the stored tensors to the CPU while reading.
model.load_state_dict(torch.load(os.path.join(SAVE_DIR, 'word2vec.pt'), map_location=torch.device('cpu')))

<All keys matched successfully>

In [36]:
# Store the learned input-side word vectors in a pd.DataFrame (row i = vector of word index i).
# NOTE: this rebinds `df`, which held the review table earlier in the notebook.
with torch.no_grad():  # plain lookup; no autograd graph is needed
    vals = model(torch.arange(len(idx2word)))
# .cpu() keeps the NumPy conversion working when the model lives on a GPU.
df = pd.DataFrame(vals.cpu().numpy())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.106793,-0.152478,0.174593,-0.078396,-0.133794,0.099618,0.074074,0.375492,-0.049113,-0.236578,0.235441,0.161749,0.089418,0.008814,0.175541,-0.107391,-0.591829,-0.393635,-0.023725,0.061424,-0.05712,0.114692,-0.055774,-0.056635,-0.108502,0.15703,0.205292,0.046451,0.126172,-0.025308,-0.143746,-0.214405,-0.198731,-0.398823,-0.00631,0.209876,-0.167805,-0.010938,0.04396,-0.102676,...,0.042056,-0.046562,0.074522,0.115885,-0.305021,0.372427,-0.096901,0.127295,0.059206,0.070374,0.110966,0.153938,0.059198,0.09453,0.102932,-0.00631,0.006225,-0.04163,0.112263,-0.137726,0.01456,0.07556,-0.138243,0.084384,-0.054973,0.107172,0.078156,-0.016336,-0.160795,-0.002251,-0.036207,0.275698,0.113353,-0.006903,0.215475,0.028386,0.193406,0.473533,0.026671,-0.037469
2,-0.241506,0.512962,-0.136076,0.461085,-0.057002,-0.654072,-0.101083,0.159774,-0.26762,-0.237275,0.359544,-0.088536,0.181655,0.334509,-0.271405,0.083225,0.101763,0.564471,-0.052027,0.298668,-0.086432,0.261878,-0.074687,1.155048,0.251029,0.09161,0.929972,0.294393,0.305607,0.60952,-0.37818,0.182596,-0.386493,0.334132,0.28757,-0.500872,0.05231,0.888738,0.101156,-0.335891,...,-0.026062,-0.147273,-0.370162,-0.077841,-0.289502,-0.545902,0.140753,-0.070779,-0.901623,0.589658,0.235909,0.331418,-0.317092,0.156893,0.497307,-0.29259,0.140456,-0.468713,0.217002,0.330954,-0.367236,0.58699,-0.152602,0.448376,0.544645,-0.025061,-0.043619,0.339292,-0.132702,-0.2232,0.018514,-0.474079,-0.788808,-0.308466,-0.020856,-0.14046,0.63683,0.265056,0.078004,-0.453766
3,0.453557,0.398644,0.056489,-0.619565,-0.606274,-0.156992,0.078922,0.446199,-0.146416,0.02279,-0.165884,-0.959759,0.202783,0.009399,-0.107456,-0.247783,-0.002709,-0.189059,0.104591,0.165224,0.810089,0.289198,-0.200411,0.123212,0.501037,0.993115,0.046015,-0.335713,-0.829718,0.405127,-0.419746,-0.37448,0.542596,0.166895,-0.344825,0.10775,-0.274986,0.184914,0.04223,0.403844,...,-0.783196,0.141197,-0.542827,-0.101611,-0.135889,-0.303222,0.519968,-0.011878,-0.169235,0.24273,-0.386186,-0.022197,-0.274242,0.288909,-0.494699,0.079662,-0.422453,0.009897,0.14678,-0.420504,-0.770643,0.035113,-0.112189,-0.755263,-0.438742,0.318651,0.03698,0.524817,-0.198226,-0.113962,0.238495,-0.561382,-0.317718,-0.063798,-0.419158,-0.107482,0.348031,-0.115069,0.029396,0.236423
4,-0.071922,-0.03642,-0.07191,0.208959,-0.18852,-0.707835,0.090304,-0.096504,0.302876,-0.034359,-0.284622,-0.098711,-0.588943,-0.484908,-0.07934,-0.090356,-0.034514,0.411405,-0.014368,-0.124904,-0.318052,-0.169159,-0.678239,-0.503626,-0.462733,0.129259,0.078144,0.538628,0.107989,0.165444,-0.413219,0.159881,-0.224595,0.319907,0.147839,-0.935304,-0.357932,-0.678712,-0.433295,-0.159361,...,-0.069861,-0.285853,-0.29276,-0.183892,-0.105043,0.095826,-0.685411,0.479705,-0.460079,0.05105,-0.255674,0.258285,-0.156222,0.206922,0.192945,0.216316,0.47805,0.026836,-0.554015,0.002822,-0.153591,0.398148,0.128454,0.102531,0.074492,-0.557177,0.529949,-0.357455,0.150763,0.563142,0.504888,0.682642,-0.404463,0.321455,0.010462,-0.089817,-0.300076,0.233856,0.189977,0.513066


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1068, -0.1525,  0.1746,  ...,  0.4735,  0.0267, -0.0375],
        [-0.2415,  0.5130, -0.1361,  ...,  0.2651,  0.0780, -0.4538],
        ...,
        [-0.2304, -0.0641,  0.2690,  ..., -0.5462, -0.0945, -0.8044],
        [-0.1340, -0.0192,  0.0369,  ..., -0.4042,  0.5964, -0.5951],
        [ 0.2453, -0.1276,  0.0532,  ...,  0.5346,  0.5866, -0.2343]],
       grad_fn=<EmbeddingBackward>)

In [38]:
from sklearn.neighbors import NearestNeighbors
# Nearest-neighbour index over the word-vector rows of df: cosine metric, brute-force search,
# 20 neighbours by default.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(df)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

array([[ 7.22714663e-02,  9.46157500e-02, -6.49613589e-02,
        -1.66363344e-01, -2.01720774e-01, -7.95711130e-02,
        -2.27984235e-01, -2.01148912e-01,  2.25153074e-01,
         4.66657132e-01, -4.80752319e-01, -3.15253615e-01,
        -9.71156433e-02, -9.70034823e-02, -2.86173940e-01,
         1.44153103e-01, -1.71036780e-01,  4.04208392e-01,
        -1.78045213e-01,  2.18572274e-01,  2.19562769e-01,
        -3.83157134e-01,  3.35806042e-01,  2.03913763e-01,
         1.12704761e-01,  2.97145337e-01,  3.14466830e-04,
         1.75006449e-01, -1.20509557e-01, -1.64142638e-01,
         2.30236456e-01, -1.98494673e-01,  2.70854264e-01,
        -3.15107346e-01, -1.54416025e-01, -7.40623549e-02,
        -3.50594819e-01, -1.43485606e-01,  7.71483928e-02,
         2.22622603e-01, -4.15431648e-01,  4.71714251e-02,
         4.06728595e-01,  2.05869004e-02, -4.04450186e-02,
         1.33015037e-01,  1.72402352e-01,  5.72291110e-03,
        -2.83082813e-01,  1.38139069e-01, -2.32121557e-0

In [55]:
def show_similar_words(idx, n_neighbors=20):
    """Print the word at vocabulary index ``idx`` followed by its nearest neighbours.

    The vector in row ``idx`` of ``df`` is used to query ``model_knn``, and the
    words of the ``n_neighbors`` returned indices are printed one per line.
    In the recorded outputs the query word itself is the first entry.

    Args:
        idx: Row of ``df`` / key of ``idx2word`` to look up.
        n_neighbors: Number of neighbours requested from ``model_knn``.
    """
    print('着目単語: ', idx2word[idx])
    # kneighbors expects a 2-D array, hence the reshape to a single-row matrix.
    query = df.iloc[idx].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)
    # Header for the list of most similar words.
    print('='*20)
    print(f'似ている単語top{n_neighbors}:')
    print('='*20)
    for i in indices[0]:
        print(idx2word[i])


idx = 100
show_similar_words(idx)

着目単語:  番組
似ている単語top20:
番組
NHK
テレビ番組
FM
ラジオ番組
ラジオ
放送
毎週
BS
録画
音楽番組
オンエア
放映
YouTube
エアチェック
局
特集
紹介
TV番組
深夜


In [56]:
# Look up the 20 nearest neighbours of the word at vocabulary index 200.
idx = 200
print('着目単語: ', idx2word[idx])
distances, indices = model_knn.kneighbors(
    df.iloc[idx].values.reshape(1,-1),
    n_neighbors=20,
)
# Header for the top-20 similar words, framed by two separator lines.
print('='*20, '似ている単語top20:', '='*20, sep='\n')
for i in indices[0]:
    print(idx2word[i])

着目単語:  一位
似ている単語top20:
一位
ランキング
位
Billboard
ランクイン
順位
チャート
初登場
全米
ヒット
首位
ナンバーワン
売上
売り上げ
全英
認定
上位
歴代
ヒットチャート
全世界


In [57]:
# Look up the 20 nearest neighbours of the word at vocabulary index 1000.
idx = 1000
print('着目単語: ', idx2word[idx])
distances, indices = model_knn.kneighbors(
    df.iloc[idx].values.reshape(1,-1),
    n_neighbors=20,
)
# Header for the top-20 similar words, framed by two separator lines.
print('='*20, '似ている単語top20:', '='*20, sep='\n')
for i in indices[0]:
    print(idx2word[i])

着目単語:  のり
似ている単語top20:
のり
ノリ
ノリノリ
アクセント
ゴリ
ロン・ウッド
いいね
メロー
GUHROOVY
ブギー
ば
Hey
ちょ
アップテンポ
Been
ご機嫌
Yeah
ティッシュ。
ラップ
脚気
