<p>Twitterのプロフィールで興味・関心キーワードを抜き出したものをcorpus.pklに保存されている</p>
<p> それらのキーワードを1つ隠し、残りのキーワードから隠した1つを予測し、学習することで興味ベクトルを作る</p>

In [None]:
import pickle
with open('data/corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)
with open('data/dictionary.pkl', 'rb') as f:
    dictionary = pickle.load(f)

In [21]:
reverse_dictionary = {v: k for k, v in dictionary.items()}

In [10]:
data_index = 0
import numpy as np
from copy import copy
import random

def generate_batch(batch_size):
    global data_index
    assert batch_size % 2 == 0
#     batch = np.ndarray(shape=(batch_size, len), dtype=np.int32)
    batch = []
    labels = []
#     labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    batch_count = 0
    
    while(batch_count < batch_size):
        fun_ids = copy(corpus[data_index])
        # 興味が１つしかない場合は予測できないので無視
        if len(fun_ids) >= 2:
            label = random.sample(fun_ids, 1)
            fun_ids.remove(label[0])
            for fun_id in fun_ids:
                batch.append([batch_count, fun_id])
            labels.append(label)
            batch_count += 1
        data_index = (data_index + 1) % len(corpus)
        if batch_count >= batch_size:
            break

    return batch, labels

_batch, _labels = generate_batch(batch_size=10)
print(' batch:', _batch)
print(' labels:', _labels)

 batch: [[0, 5], [0, 0], [0, 2], [0, 1], [0, 4], [1, 9405], [1, 2420], [2, 2403], [2, 4699], [2, 7999], [2, 3190], [3, 6430], [3, 5552], [3, 53937], [3, 2571], [3, 5186], [3, 352], [3, 3875], [3, 2991], [3, 455], [3, 20664], [3, 391], [4, 30324], [4, 13857], [5, 2994], [5, 7378], [5, 18559], [5, 14151], [5, 8423], [5, 72], [5, 4975], [5, 4699], [5, 10165], [5, 1961], [5, 77], [5, 41895], [6, 6596], [7, 5710], [7, 53], [7, 3617], [7, 52], [7, 51], [8, 26476], [8, 1862], [8, 1869], [8, 1866], [8, 554], [8, 4699], [8, 2570], [9, 23551], [9, 1756], [9, 2372], [9, 31205], [9, 6271], [9, 47541], [9, 2791]]
 labels: [[2302], [30234], [9338], [8163], [13848], [2505], [33417], [2420], [9899], [554]]


In [23]:
# Train cbow model
import tensorflow as tf
import math
import numpy as np
import random

BATCH_SIZE = 500
EMBEDDING_SIZE = 200
NUM_SAMPLED = 15
VOCAB_SIZE = len(dictionary)
SAMPLE_SIZE = 15
# 辞書の単語IDは0から順に振られているのでrangeでもOK
sample_indices = np.array([[idx, vocab_id] for idx, vocab_id in enumerate(random.sample(range(VOCAB_SIZE), SAMPLE_SIZE))])

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # input data

    # sparse matrix(shape=batch_size*vocab_size)にindicesだけ0でない興味が入り、valuesはすべて0
    train_indices = tf.placeholder(tf.int64) # 0でない場所を指す
    train_dataset = tf.SparseTensor(train_indices, values=tf.ones([tf.shape(train_indices)[0]], tf.float32), dense_shape=[BATCH_SIZE, VOCAB_SIZE])
    train_labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])
    sample_dataset = tf.SparseTensor(sample_indices, values=tf.ones([tf.shape(sample_indices)[0]], tf.float32), dense_shape=[SAMPLE_SIZE, VOCAB_SIZE])
    
    # Variables
    # 一様分布で興味ベクトル初期化
    embeddings = tf.Variable(
        tf.random_uniform([len(dictionary), EMBEDDING_SIZE], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([len(dictionary), EMBEDDING_SIZE],
                           stddev=1.0/math.sqrt(EMBEDDING_SIZE)))
    softmax_biases = tf.Variable(tf.zeros([len(dictionary)]))

    # 該当ユーザーのtrainingデータの興味ベクトルをすべて足す
    embed = tf.sparse_tensor_dense_matmul(train_dataset, embeddings)

    # softmax loss with negative sampling
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(
            weights=softmax_weights,
            biases=softmax_biases,
            inputs=embed, # batch_size * embedding_size
            labels=train_labels,
            num_sampled=NUM_SAMPLED,
            num_classes=len(dictionary)))
    
    optimizer = tf.train.AdagradOptimizer(0.8).minimize(loss)
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    # サンプルデータのベクトル
    sample_embed = tf.sparse_tensor_dense_matmul(sample_dataset, normalized_embeddings)
    # サンプルデータとの類似度
    similarity = tf.matmul(sample_embed, tf.transpose(normalized_embeddings))

In [None]:
num_steps = 10000

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    average_loss = 0
    for step in range(1, num_steps+1):
        batch_data, batch_labels = generate_batch(BATCH_SIZE)
        feed_dict = {
            train_indices: batch_data,
            train_labels: batch_labels
        }
        _, I = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += I
#         print(step)
        if step % 1000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
                print('Average loss at step %d: %f' % (step, average_loss))
                average_loss = 0
        if step % 10000 == 0:
            sim = similarity.eval()
            TOP_K = 8
            for i in range(SAMPLE_SIZE):
                sample_fun = reverse_dictionary[samples[i]]
                nearest = (-sim[i, :]).argsort()[1:TOP_K+1]
                log = 'Nearest to %s:' % sample_fun
                for k in range(TOP_K):
                    close_fun = reverse_dictionary[nearest[k]]
                    log = '%s %s ' % (log, close_fun)
                print(log)
    final_embeddings = normalized_embeddings.eval()

Average loss at step 1000: 1.057525
Average loss at step 2000: 0.856630
