## 参考
- https://github.com/theeluwin/pytorch-sgns

In [1]:
# Show the GPU attached to this runtime
!nvidia-smi

Sun Jul  4 22:46:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Mount Google Drive under /content/drive; DATA_DIR and SAVE_DIR used later live there.
from google.colab import drive
drive.mount('/content/drive')
print('Mount OK')

Mounted at /content/drive
Mount OK


In [3]:
# https://qiita.com/jun40vn/items/78e33e29dce3d50c2df1

# Install the MeCab morphological analyzer and its dictionary (mecab-ipadic-NEologd)
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab > /dev/null
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git > /dev/null 
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n > /dev/null 2>&1
!pip install mecab-python3 > /dev/null

# Work around an error by symlinking mecabrc to the location under /usr/local/etc
!ln -s /etc/mecabrc /usr/local/etc/mecabrc

Cloning into 'mecab-ipadic-neologd'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 75 (delta 5), reused 54 (delta 0), pack-reused 0[K
Unpacking objects: 100% (75/75), done.


In [4]:
import os
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import MeCab

from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [5]:
# Input and output folders on the mounted Google Drive.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/word2vec_pytorch/input/"
SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/word2vec_pytorch/output/"
# Create the output folder if it does not exist yet (no error when it already does).
os.makedirs(SAVE_DIR, exist_ok=True)

In [6]:
# Amazon customer reviews (Japanese marketplace), read as a tab-separated file from DATA_DIR.
# NOTE(review): the previous comment described a corpus built from Japanese Wikipedia
# (https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/ja.text8.zip),
# which does not match the file that is actually loaded below.
df = pd.read_csv(os.path.join(DATA_DIR, 'amazon_reviews_multilingual_JP_v1_00.tsv'), sep='\t')
print(df.shape)
df.head()

(262256, 15)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,JP,65317,R33RSUD4ZTRKT7,B000001GBJ,957145596,SONGS FROM A SECRET GARDE,Music,1,1,15,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…,2012-12-05
1,JP,65317,R2U1VB8GPZBBEH,B000YPWBQ2,904244932,鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel),Music,1,4,20,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…,2012-12-05
2,JP,65696,R1IBRCJPPGWVJW,B0002E5O9G,108978277,Les Miserables 10th Anniversary Concert,Music,5,2,3,N,Y,ドリームキャスト,素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。,2013-03-02
3,JP,67162,RL02CW5XLYONU,B00004SRJ5,606528497,It Takes a Nation of Millions to Hold Us Back,Music,5,6,9,N,Y,やっぱりマスト,専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...,2013-08-11
4,JP,67701,R2LA2SS3HU3A3L,B0093H8H8I,509738390,Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...,PC,4,2,4,N,Y,コスパ的には十分,今までの環境（Core2 Duo E4600)に比べれば十分に快適になりました。<br />...,2013-02-10


In [7]:
# Earlier variant pointing at a different dictionary directory, kept for reference:
# tagger = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
# `-d` selects the dictionary directory (here the NEologd dictionary path).
path = "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
# Module-level tagger shared by tokenize() below.
tagger = MeCab.Tagger(f"-Ochasen {path}")
def tokenize(text, target=['名詞', '動詞']):
    """Tokenize ``text`` with the module-level MeCab ``tagger``.

    Args:
        text: String to analyse.
        target: Parts of speech to keep, compared against the first
            comma-separated field of each node's feature string.

    Returns:
        list of str: One entry per kept node, in order of appearance. The
        entry is the seventh feature field (the base form) unless that field
        is ``'*'``, in which case the node's surface string is used instead.
    """
    tokens = []
    # parseToNode returns the head of a linked list; follow .next until it ends
    node = tagger.parseToNode(text)
    while node:
        fields = node.feature.split(",")
        if fields[0] in target:
            base_form = fields[6]
            # The base form can be '*' (e.g. for alphabetic words); fall back to the surface
            tokens.append(node.surface if base_form == '*' else base_form)
        node = node.next
    return tokens

In [8]:
from bs4 import BeautifulSoup
def clean_html(text, strip=True):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its plain text.

    Args:
        text: Markup (or plain text) to parse.
        strip: Forwarded to ``get_text(strip=...)``.

    Returns:
        str: The text content extracted by ``get_text``.
    """
    return BeautifulSoup(text, 'html.parser').get_text(strip=strip)

In [9]:
import re
def nornalize_number(text):
    """Replace every run of one or more digits in ``text`` with a single ``'0'``.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with each digit run collapsed to ``'0'``.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('0', text)

In [10]:
def remove_symbol(text):
    """
    Replace symbol characters in ``text`` with a space (one space per character).

    Based on https://ohke.hateblo.jp/entry/2019/02/09/141500

    Args:
        text: Input string.

    Returns:
        str: ``text`` with the matched symbols replaced by spaces.
    """
    symbol_classes = (
        # half-width (ASCII) symbols
        r'[!-/:-@[-`{-~]',
        # full-width symbols (only the 0x25A0 - 0x266F block is removed here)
        u'[■-♯]',
    )
    for symbol_class in symbol_classes:
        text = re.sub(symbol_class, r' ', text)
    return text

In [11]:
# Load the stop-word file into a set: every line, stripped of surrounding whitespace,
# becomes one entry. The encoding is passed explicitly so that reading the Japanese
# text does not depend on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(os.path.join(DATA_DIR, 'stopwords_slothlib.txt'), 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}

# Extra stop words added on top of the list loaded from the file.
add_stopwords = {
    '*',
    # single hiragana characters
    'あ','い','う','え','お',
    'か','き','く','け','こ',
    'さ','し','す','せ','そ',
    'た','ち','つ','て','と',
    'な','に','ぬ','ね','の',
    'は','ひ','ふ','へ','ほ',
    'ま','み','む','め','も',
    'や','ゆ','よ',
    'わ' ,'を','ん',
    # NOTE(review): '0' is what nornalize_number() rewrites every digit run to
    '0', '1', 
    # NOTE(review): these look like mojibake fragments — confirm where they come from
    'ã',
    'å',
    'å',
    'ä',
    'ï', 
    'è',
    'é',
    'æ',
    'ç',
    # English function words
    'of',
    'the',
    # single ASCII letters, lower and upper case
    'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
    'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
}
# Union of the file-based list and the extra entries; this is the set used for filtering
stopwords = stopwords | add_stopwords

def remove_stopwords(words):
    """Return the entries of ``words`` that are not in the module-level ``stopwords``.

    Args:
        words: Iterable of tokens.

    Returns:
        list: The remaining tokens, in their original order.
    """
    kept = []
    for word in words:
        if word not in stopwords:
            kept.append(word)
    return kept

In [12]:
# The review body column is the text used as the corpus
X = df['review_body']

In [13]:
%%time
# Preprocessing + tokenization:
# strip HTML -> normalize digit runs -> replace symbols -> keep noun tokens only
X = [
    tokenize(remove_symbol(nornalize_number(clean_html(text))), target=['名詞'])
    for text in X
]

# Remove stop words from every token list
X_rm_stopwords = [remove_stopwords(tokens) for tokens in X]

X = X_rm_stopwords
# # スペース区切りの分かち書きの状態にする
# X = [' '.join(tokens) for tokens in X_rm_stopwords]

  ' that document to Beautiful Soup.' % decoded_markup


CPU times: user 2min 36s, sys: 2.42 s, total: 2min 39s
Wall time: 2min 38s


## word2vec

### 1. データセット作成
- 各センテンス（wordのリスト）に対して、各単語(input word)ごとに, windowサイズ前後分の単語(output words)を、ペアで揃える

In [14]:
# Unknown-word token and window size
unk = '<UNK>'
window = 5

In [15]:
# Pick a single sentence and build the pairs by hand as a trial
sentence = X[10]
sentence

['無料', '範囲', '普通に', '潰し', 'もってこい', '飽き', 'ゲーム', 'Galaxy', 'イメージ']

In [16]:
i=4  # position of the word inside `sentence`
input_word = sentence[i]
left = sentence[max(i-window, 0): i]  # up to `window` words before the i-th word
left

['無料', '範囲', '普通に', '潰し']

In [17]:
right = sentence[i+1: i+1+window]  # up to `window` words after the i-th word
right

['飽き', 'ゲーム', 'Galaxy', 'イメージ']

In [18]:
print(input_word)
# Words within `window` before and after the focus word (padded with unk when fewer than `window` exist)
[unk for _ in range(window-len(left))] + left + right + [unk for _ in range(window-len(right))]

もってこい


['<UNK>', '無料', '範囲', '普通に', '潰し', '飽き', 'ゲーム', 'Galaxy', 'イメージ', '<UNK>']

In [19]:
# The steps above, wrapped into a function
def skipgram(sentence, i):
    """Return the word at position ``i`` together with its padded context.

    Uses the module-level ``window`` (context size on each side) and ``unk``
    (padding token).

    Args:
        sentence: List of words.
        i: Position of the input word in ``sentence``.

    Returns:
        tuple: ``(iword, context)`` where ``context`` is the list of words up
        to ``window`` positions before and after ``i``; each side is padded
        with ``unk`` (on the left for the preceding words, on the right for
        the following words) so that the list has ``2 * window`` entries.
    """
    iword = sentence[i]
    before = sentence[max(i - window, 0): i]
    after = sentence[i+1: i+1+window]
    left_pad = [unk] * (window - len(before))
    right_pad = [unk] * (window - len(after))
    return iword, left_pad + before + after + right_pad

In [20]:
# Flatten all sentences into one list of words (used for counting below)
word_list = [word for sent in X for word in sent]

In [21]:
from collections import Counter
# Number of occurrences of every word in the corpus
word_count = Counter(word_list)

In [22]:
# Keep the `max_vocab` most frequent words; stored as a set for membership tests
max_vocab = 20000
vocab = {word for word, _ in word_count.most_common(n=max_vocab)}

In [23]:
# Word <-> index lookup tables. Index 0 is reserved for the unknown token.
# `vocab` is a set of strings, whose iteration order can change from one Python
# process to the next (string hashing is randomized), so the words are sorted
# first: the same vocabulary then always yields the same indices, which keeps
# saved embeddings and a rebuilt mapping consistent across kernel restarts.
word2idx = {unk: 0}
word2idx.update({x: i+1 for i, x in enumerate(sorted(vocab))})  # enumerate starts at 0 and 0 is taken by unk
idx2word = {v: k for k, v in word2idx.items()}

In [24]:
%%time
# Build the training pairs: (input word id, array of context word ids) for every position
data = []
for sent in tqdm(X):
    # Replace out-of-vocabulary words with unk.
    # `vocab` must be a set: with a list this membership test becomes extremely slow.
    sent_limit = [word if word in vocab else unk for word in sent]
    for i in range(len(sent_limit)):
        input_word, output_words = skipgram(sent_limit, i)
        output_ids = np.array([word2idx[oword] for oword in output_words])
        data.append((word2idx[input_word], output_ids))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=262256.0), HTML(value='')))


CPU times: user 53.5 s, sys: 1.4 s, total: 54.9 s
Wall time: 55.2 s


In [25]:
# Inspect the first (input word id, context word ids) pair
data[0]

(2257, array([   0,    0,    0,    0,    0, 6359,    0, 8891, 4746,    0]))

In [26]:
# Save the vocabulary artifacts to SAVE_DIR
import pickle

# Each file is opened in a `with` block so the handle is closed (and its buffer
# flushed) as soon as the dump finishes, instead of being left open.
for pkl_name, pkl_obj in (
    ('word_count.pkl', word_count),
    ('vocab.pkl', vocab),
    ('word2idx.pkl', word2idx),
    ('idx2word.pkl', idx2word),
):
    with open(os.path.join(SAVE_DIR, pkl_name), 'wb') as pkl_file:
        pickle.dump(pkl_obj, pkl_file)

### 2. モデル

In [27]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Uncomment to force the CPU:
#device = 'cpu'
print(device)

cuda


In [28]:
class Word2Vec(nn.Module):
    """
    Two embedding tables that turn word indices into vectors.

    ``ivectors`` embeds input words and ``ovectors`` embeds output (context)
    words; the embedding weights are the parameters that get trained.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_size: Dimension of each word vector.
        padding_idx: Forwarded to ``nn.Embedding`` as ``padding_idx``.
    """
    def __init__(self, vocab_size=20000, embedding_size=300, padding_idx=0):
        super(Word2Vec, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.ivectors = nn.Embedding(self.vocab_size, self.embedding_size, padding_idx=padding_idx)
        self.ovectors = nn.Embedding(self.vocab_size, self.embedding_size, padding_idx=padding_idx)
        # nn.Parameter has requires_grad=True by default, so no extra flag is needed
        self.ivectors.weight = nn.Parameter(self._initial_weight())
        self.ovectors.weight = nn.Parameter(self._initial_weight())

    def _initial_weight(self):
        """Build a (vocab_size, embedding_size) matrix: row 0 is zero, the rest is U(-0.5/E, 0.5/E)."""
        bound = 0.5 / self.embedding_size
        rows = torch.empty(self.vocab_size - 1, self.embedding_size).uniform_(-bound, bound)
        return torch.cat([torch.zeros(1, self.embedding_size), rows])

    @staticmethod
    def _as_indices(data, table):
        """Convert ``data`` to a LongTensor on the device that holds ``table``'s weight.

        Using the weight's own device (instead of a notebook-level global) keeps
        the module usable wherever it has been moved with ``.to(...)``.
        """
        return torch.as_tensor(data, dtype=torch.long, device=table.weight.device)

    def forward(self, data):
        """Same as :meth:`forward_i`."""
        return self.forward_i(data)

    def forward_i(self, data):
        """Look up input-word vectors for the indices in ``data`` (tensor, array or list)."""
        return self.ivectors(self._as_indices(data, self.ivectors))

    def forward_o(self, data):
        """Look up output-word vectors for the indices in ``data`` (tensor, array or list)."""
        return self.ovectors(self._as_indices(data, self.ovectors))

In [29]:
class SGNS(nn.Module):
    """
    Skip-gram negative-sampling loss.

    The loss is a binary classification objective (positive vs. negative
    samples): the sum of ``oloss`` for the positive (context) words and
    ``nloss`` for the negative samples. When ``weights`` is given, negative
    samples are drawn from that distribution raised to the 0.75 power.

    Args:
        embedding: Object exposing ``forward_i(indices)`` and
            ``forward_o(indices)`` that return word vectors.
        vocab_size: Vocabulary size; used for uniform negative sampling when
            ``weights`` is None.
        n_negs: Number of negative samples per positive sample.
        weights: Optional per-index word frequencies. They are raised to 0.75
            and normalised here, so pass the plain frequencies.
    """
    def __init__(self, embedding, vocab_size=20000, n_negs=20, weights=None):
        super(SGNS, self).__init__()
        self.embedding = embedding
        self.vocab_size = vocab_size
        self.n_negs = n_negs  # number of negative samples per positive sample
        self.weights = None
        if weights is not None:
            # The original paper uses the word frequency raised to the 0.75 power
            wf = np.power(weights, 0.75)
            wf = wf / wf.sum()
            self.weights = torch.as_tensor(wf, dtype=torch.float32)

    def forward(self, iword, owords):
        """Return the scalar SGNS loss for one batch.

        Args:
            iword: Tensor of input word indices, one per batch row.
            owords: Context word indices with one row per batch row and one
                column per context word.

        Returns:
            torch.Tensor: 0-dim tensor, the negated mean of ``oloss + nloss``.
        """
        owords = torch.as_tensor(owords)
        batch_size = iword.size()[0]
        # The context width is the second dimension of owords; len(owords) would
        # be the batch size and make the number of negatives depend on it.
        context_size = owords.size()[1]
        n_samples = context_size * self.n_negs
        if self.weights is not None:
            nwords = torch.multinomial(self.weights, batch_size * n_samples, replacement=True).view(batch_size, -1)
        else:
            # randint's upper bound is exclusive, so every index 0..vocab_size-1 can be drawn
            nwords = torch.randint(0, self.vocab_size, (batch_size, n_samples))
        ivectors = self.embedding.forward_i(iword).unsqueeze(2)
        ovectors = self.embedding.forward_o(owords)
        nvectors = self.embedding.forward_o(nwords).neg()  # .neg() multiplies by -1
        # squeeze(2) removes only the trailing singleton dimension, so a batch
        # with a single row keeps its batch dimension.
        # logsigmoid is log(sigmoid(x)) computed without underflowing to -inf.
        oscores = torch.bmm(ovectors, ivectors).squeeze(2)
        nscores = torch.bmm(nvectors, ivectors).squeeze(2)
        oloss = nn.functional.logsigmoid(oscores).mean(1)
        nloss = nn.functional.logsigmoid(nscores).view(-1, context_size, self.n_negs).sum(2).mean(1)
        return -(oloss + nloss).mean()

In [30]:
# Word distribution raised to the 0.75 power
# NOTE(review): SGNS.__init__ also applies np.power(weights, 0.75) to the array it
# receives, so passing this already-powered array as `weights` applies the exponent twice.
word_flequency = np.array([word_count[idx2word[i]] for i in range(len(idx2word))])  # word counts in index order, starting at idx=0
word_flequency = np.power(word_flequency, 0.75)
word_flequency = word_flequency / word_flequency.sum()
print(word_flequency.sum())
word_flequency

1.0000000000000002


array([0.00000000e+00, 1.20218153e-04, 4.21448542e-05, ...,
       1.27212589e-05, 5.43866388e-05, 1.12960505e-04])

In [31]:
weights = word_flequency
batch_size = 100
context_size = 5  # number of neighbouring words
n_negs = 20

# negative samples:
# torch.multinomial
# returns a tensor holding num_samples indices drawn from the multinomial distribution given by the input weights
# each input has context_size positive samples
# each positive sample gets n_negs negative samples, hence batch_size * context_size * n_negs draws
nwords = torch.multinomial(input=torch.FloatTensor(weights), num_samples=batch_size*context_size*n_negs, replacement=True).view(batch_size, -1)
print(nwords.shape)

torch.Size([100, 100])


### 3. 学習

In [32]:
vocab_size = len(idx2word)
embedding_size = 128
# SGNS raises `weights` to the 0.75 power and normalises it itself, so it must be
# given the plain word counts (in index order). Passing the pre-powered
# `word_flequency` would apply the exponent twice (0.75 * 0.75 = 0.5625).
weights = np.array([word_count[idx2word[i]] for i in range(vocab_size)], dtype=np.float64)
n_negs = 5
batch_size = 128

model = Word2Vec(vocab_size=vocab_size, embedding_size=embedding_size).to(device)
sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=n_negs, weights=weights).to(device)

In [33]:
# Shuffled mini-batches over the (input id, context ids) pairs
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
# Adam with its default settings over the parameters of `sgns`
optimizer = Adam(sgns.parameters())

In [34]:
# Sanity check: print the tensor sizes of the first batch, then stop
for iword, owords in tqdm(dataloader):
    print(iword.size())
    print(owords.size())
    break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=74791.0), HTML(value='')))

torch.Size([128])
torch.Size([128, 10])


In [35]:
# Training: EPOCHS full passes over the dataloader.
EPOCHS= 5
loss_list = []  # loss of every batch, appended in training order
for epoch in range(EPOCHS):
    print(f'EPOCH {epoch+1}')
    for iword, owords in tqdm(dataloader):
        loss = sgns(iword, owords)
        # Clear gradients left from the previous step before back-propagating this batch
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())

EPOCH 1


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=74791.0), HTML(value='')))


EPOCH 2


HBox(children=(FloatProgress(value=0.0, max=74791.0), HTML(value='')))


EPOCH 3


HBox(children=(FloatProgress(value=0.0, max=74791.0), HTML(value='')))


EPOCH 4


HBox(children=(FloatProgress(value=0.0, max=74791.0), HTML(value='')))


EPOCH 5


HBox(children=(FloatProgress(value=0.0, max=74791.0), HTML(value='')))




In [36]:
# Save the state dicts of the loss module, the embedding model and the optimizer to SAVE_DIR
for ckpt_name, ckpt_source in (
    ('sgns.pt', sgns),
    ('word2vec.pt', model),
    ('optimizer.pt', optimizer),
):
    torch.save(ckpt_source.state_dict(), os.path.join(SAVE_DIR, ckpt_name))