In [5]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import string
import requests
import collections
import io
import tarfile
import gzip
import nltk
from nltk.corpus import stopwords
from tensorflow.python.framework import ops
# Reset the default TensorFlow graph before any ops are created below.
ops.reset_default_graph()

# Graph-mode session (TensorFlow 1.x API) used by every sess.run() call in this notebook.
sess = tf.Session()

In [6]:
batch_size = 100         # number of word pairs trained on per step
embedding_size = 100    #  size of the embedding learned for each word
vocabulary_size = 2000 # number of words kept in the training vocabulary
generations = 100000    #  number of training iterations
print_loss_every = 1000  # report the loss every 1000 iterations

num_sampled = int(batch_size/2) # number of negative samples used by the NCE loss
window_size = 5         # number of words considered on each side of the target word

In [7]:
# Download the NLTK stop-word corpus (if needed) and load the English stop words.
nltk.download('stopwords')
stops = stopwords.words('english')

# Pick five test words, hoping that training surfaces synonyms for them;
# their nearest neighbours are printed every `print_valid_every` iterations.
print_valid_every = 10000
valid_words = ['cliche', 'love', 'hate', 'silly', 'sad']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
def _read_ascii_lines(path):
    """Return the lines of ``path`` with non-ASCII characters and trailing whitespace removed."""
    with open(path, 'r', encoding='latin-1') as f:
        return [line.encode('ascii', errors='ignore').decode().rstrip() for line in f]


def load_movie_data():
    """Load the Cornell rt-polarity movie-review sentences, downloading them on first use.

    The data is cached under ``temp/rt-polaritydata``; the archive is only
    downloaded and extracted when that folder does not exist yet.

    Returns:
        tuple: ``(texts, target)`` where ``texts`` is the list of review lines
        (positive reviews first, then negative ones) and ``target`` is the
        parallel list of labels (1 = positive, 0 = negative).

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    save_folder_name = 'temp'
    data_dir = os.path.join(save_folder_name, 'rt-polaritydata')
    pos_file = os.path.join(data_dir, 'rt-polarity.pos')
    neg_file = os.path.join(data_dir, 'rt-polarity.neg')

    # Only download when the extracted data folder is missing.
    if not os.path.exists(data_dir):
        movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
        archive_path = os.path.join(save_folder_name, 'temp_movie_review_temp.tar.gz')

        # The cache folder does not exist on a fresh checkout; create it before writing.
        os.makedirs(save_folder_name, exist_ok=True)

        # Stream the tar.gz archive to disk.
        req = requests.get(movie_data_url, stream=True)
        # Fail here rather than later with a confusing tarfile error on an HTML error page.
        req.raise_for_status()
        with open(archive_path, 'wb') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

        # Extract into the cache folder. The archive arrives over plain HTTP, so
        # use the safe 'data' extraction filter whenever this Python provides it.
        with tarfile.open(archive_path, "r:gz") as tar:
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(path=save_folder_name, filter='data')
            else:
                tar.extractall(path=save_folder_name)

    pos_data = _read_ascii_lines(pos_file)
    neg_data = _read_ascii_lines(neg_file)

    texts = pos_data + neg_data
    target = [1]*len(pos_data) + [0]*len(neg_data)

    return(texts, target)
# Load the corpus: `texts` holds the review lines, `target` the parallel 1/0 polarity labels.
texts, target = load_movie_data()

In [9]:
# Text normalisation
def normalize_text(texts, stops):
    """Lower-case each text and strip punctuation, digits, stop words and extra whitespace.

    Args:
        texts: iterable of strings to normalise.
        stops: iterable of stop words; a word is dropped when, after
            lower-casing and punctuation/digit removal, it equals one of them.

    Returns:
        list[str]: one normalised string per input text, words separated by
        single spaces.
    """
    # One translation table removes punctuation and digits in a single pass.
    drop_chars = str.maketrans('', '', string.punctuation + '0123456789')
    # Set membership is O(1); testing against the raw list is O(len(stops)) per word.
    stop_set = set(stops)

    normalized = []
    for text in texts:
        cleaned = text.lower().translate(drop_chars)
        # split()/join also collapses any run of whitespace to a single space.
        kept_words = [word for word in cleaned.split() if word not in stop_set]
        normalized.append(' '.join(kept_words))

    return normalized
    
# Normalise the corpus. This rebinds `texts`, so re-running the cell operates on already-normalised text.
texts = normalize_text(texts, stops)

In [10]:
# Keep only reviews that still have more than two words after normalisation.
# `target` must be filtered first, while `texts` still lines up with it index-for-index.
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

In [11]:
def build_dictionary(sentences, vocabulary_size):
    """Map the most frequent words of ``sentences`` to integer indices.

    Index 0 is reserved for the placeholder token ``'RARE'`` (used for every
    out-of-vocabulary word); the ``vocabulary_size - 1`` most frequent words
    receive indices 1, 2, ... in order of decreasing frequency.

    Args:
        sentences: iterable of whitespace-separated strings.
        vocabulary_size: maximum number of entries in the dictionary,
            including the ``'RARE'`` placeholder.

    Returns:
        dict: ``{word: index}``.
    """
    rare_token = 'RARE'

    # Count every word except the reserved placeholder. If a literal 'RARE'
    # in the corpus were counted, it would be re-assigned away from index 0
    # and the following word would receive a duplicate index.
    word_counts = collections.Counter(
        word
        for sentence in sentences
        for word in sentence.split()
        if word != rare_token
    )

    # Each word gets the dictionary's size at insertion time as its index.
    word_dict = {rare_token: 0}
    for word, _ in word_counts.most_common(vocabulary_size - 1):
        word_dict[word] = len(word_dict)

    return word_dict

In [12]:
def text_to_numbers(sentences, word_dict):
    """Encode each sentence as a list of vocabulary indices.

    Sentences are tokenised with ``str.split()`` — the same tokenisation
    ``build_dictionary`` uses — so repeated or leading/trailing spaces do not
    produce empty tokens.

    Args:
        sentences: iterable of whitespace-separated strings.
        word_dict: ``{word: index}`` mapping; words missing from it are
            encoded as 0, the index of the ``'RARE'`` placeholder.

    Returns:
        list[list[int]]: one list of indices per input sentence.
    """
    rare_ix = 0
    return [
        [word_dict.get(word, rare_ix) for word in sentence.split()]
        for sentence in sentences
    ]

In [13]:
# Build the vocabulary (word -> index), its inverse (index -> word),
# and the index-encoded version of the corpus.
word_dictionary = build_dictionary(texts, vocabulary_size)
word_dictionary_rev = {index: word for word, index in word_dictionary.items()}
text_data = text_to_numbers(texts, word_dictionary)

In [14]:
# Vocabulary indices of the validation words; raises KeyError if a word is not in the vocabulary.
valid_examples = [word_dictionary[x] for x in valid_words]

In [15]:
def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    """Sample one batch of (input, label) word-index pairs.

    Sentences are drawn uniformly at random; for every word of a drawn
    sentence, the words up to ``window_size`` positions before and after it
    form its context. Pairs are accumulated until ``batch_size`` is reached.

    Args:
        sentences: sequence of sentences, each a list of word indices.
        batch_size: number of pairs to return.
        window_size: number of words considered on each side of a target word.
        method: ``'skip_gram'`` yields (target word, context word) pairs;
            ``'cbow'`` yields (context word, target word) pairs.

    Returns:
        tuple: ``(batch_data, label_data)`` as NumPy arrays of shape
        ``(batch_size,)`` and ``(batch_size, 1)``.

    Raises:
        ValueError: if ``method`` is unknown, if ``sentences`` is empty, or if
            a drawn sentence yields no pairs (fewer than two words, or
            ``window_size`` of 0).
    """
    if method not in ('skip_gram', 'cbow'):
        raise ValueError('Method {} not implemented yet.'.format(method))

    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        # Draw a random sentence by index. np.random.choice() needs a 1-D
        # array, so passing a list of (ragged or equal-length) sentences to it
        # is either deprecated or an outright error.
        rand_sentence = sentences[np.random.randint(len(sentences))]

        # Pair every word with each word inside its surrounding window.
        pairs = []
        for ix, target_word in enumerate(rand_sentence):
            context = (rand_sentence[max(ix - window_size, 0):ix]
                       + rand_sentence[ix + 1:ix + window_size + 1])
            if method == 'skip_gram':
                pairs.extend((target_word, context_word) for context_word in context)
            else:  # 'cbow'
                pairs.extend((context_word, target_word) for context_word in context)

        if not pairs:
            raise ValueError('Sampled sentence yields no word pairs; '
                             'each sentence needs at least two words.')

        # Split the pairs into inputs and labels.
        batch, labels = [list(column) for column in zip(*pairs)]
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])

    # Trim the overshoot from the last sentence.
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]

    # Convert to NumPy arrays; labels become a column vector.
    batch_data = np.array(batch_data)
    label_data = np.array(label_data).reshape(-1, 1)

    return(batch_data, label_data)

In [16]:
# Trainable embedding matrix with one row of `embedding_size` values per
# vocabulary word, initialised uniformly between -1.0 and 1.0.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# Placeholders for one batch of input word indices and their label indices,
# plus a constant holding the indices of the validation words.
x_inputs = tf.placeholder(tf.int32, shape=[batch_size])
y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Look up the embedding vectors of the input words.
embed = tf.nn.embedding_lookup(embeddings, x_inputs)

In [18]:
# Parameters of the NCE (noise-contrastive estimation) loss:
# one weight vector and one bias per vocabulary word.
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                               stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Loss: mean NCE loss over the batch, using `num_sampled` negative samples.
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=y_target,
                                     inputs=embed,
                                     num_sampled=num_sampled,
                                     num_classes=vocabulary_size))

In [19]:
# Cosine similarity between each validation word and every vocabulary word:
# scale each embedding row to unit L2 norm, then take dot products.
# NOTE(review): `keep_dims` is the older spelling of `keepdims` — confirm it is
# still accepted by the installed TensorFlow version.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
# Result has one row per validation word and one column per vocabulary word.
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [20]:
# Training op: plain gradient descent on the NCE loss with learning rate 1.0.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

# Initialise all variables (embeddings, NCE weights and biases) in the session.
init = tf.global_variables_initializer()
sess.run(init)

In [21]:
# Train the embeddings, periodically recording the loss and printing the
# nearest neighbours of the validation words.
loss_vec = []      # recorded loss values
loss_x_vec = []    # iteration numbers at which the losses were recorded
top_k = 5          # number of nearest neighbours printed per validation word
for i in range(generations):
    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)
    feed_dict = {x_inputs : batch_inputs, y_target : batch_labels}

    # Run one training step.
    sess.run(optimizer, feed_dict=feed_dict)

    # Record and report the loss on the batch that was just trained on.
    if (i+1) % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(i+1)
        print("Loss at step {} : {}".format(i+1, loss_val))

    # Validation: print the top_k words most similar to each validation word.
    if (i+1) % print_valid_every == 0:
        sim = sess.run(similarity)
        for j, valid_ix in enumerate(valid_examples):
            valid_word = word_dictionary_rev[valid_ix]
            # Sort by descending similarity; position 0 is skipped because it
            # is expected to be the validation word itself.
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            neighbours = ''.join(' %s,' % word_dictionary_rev[ix] for ix in nearest)
            print("Nearest to {}:".format(valid_word) + neighbours)

Loss at step 1000 : 4.279643535614014
Loss at step 2000 : 4.697796821594238
Loss at step 3000 : 4.405325412750244
Loss at step 4000 : 3.9387598037719727
Loss at step 5000 : 3.9294633865356445
Loss at step 6000 : 3.9107372760772705
Loss at step 7000 : 3.900785207748413
Loss at step 8000 : 3.169133186340332
Loss at step 9000 : 4.299883842468262
Loss at step 10000 : 3.3558096885681152
Nearest to cliche: sad, soap, giant, eight, insights,
Nearest to love: capture, ill, strong, RARE, cynical,
Nearest to hate: RARE, like, mike, menace, darkly,
Nearest to silly: fit, many, escapism, crazy, depressing,
Nearest to sad: cliche, imagery, human, rap, parents,
Loss at step 11000 : 3.7676165103912354
Loss at step 12000 : 3.0968637466430664
Loss at step 13000 : 4.227167129516602
Loss at step 14000 : 3.942843437194824
Loss at step 15000 : 4.367527961730957
Loss at step 16000 : 3.495479106903076
Loss at step 17000 : 3.448532819747925
Loss at step 18000 : 4.329442501068115
Loss at step 19000 : 3.9101390