<p>Twitterのプロフィールから抜き出した興味・関心キーワードがcorpus.pklに保存されている</p>
<p>それらのキーワードのうち1つを隠し、残りのキーワードから隠した1つを予測するように学習することで興味ベクトルを作る</p>

In [None]:
import pickle
def _load_pickle(path):
    """Return the object unpickled from the binary file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Per-profile keyword data and the keyword dictionary used by the cells below.
corpus = _load_pickle('data/corpus.pkl')
dictionary = _load_pickle('data/dictionary.pkl')

In [None]:
# Largest value stored in the dictionary (shown as the cell output).
max(dictionary.values())

In [None]:
# Cursor into `corpus`. generate_batch() advances it on every entry it reads
# and wraps around at the end, so successive calls continue where the
# previous batch stopped.
data_index = 0
import numpy as np
from copy import copy
import random


def generate_batch(batch_size):
    """Build one training batch by hiding one keyword ID per corpus entry.

    Walks `corpus` starting at the module-level `data_index`. For each entry
    holding at least two IDs, one ID is picked at random as the label and the
    remaining IDs become the input row. Entries with fewer than two IDs are
    skipped because nothing is left to predict from. The corpus itself is
    never modified (each entry is copied before the label is removed).

    Args:
        batch_size: Number of (input row, label) pairs to produce. Must be
            even (kept from the original assertion).

    Returns:
        A tuple ``(batch, labels)`` where ``batch`` is a list of
        ``batch_size`` ID collections and ``labels`` is a list of
        ``batch_size`` one-element lists. Rows in ``batch`` may differ in
        length, since each row is a corpus entry minus one ID.

    Raises:
        ValueError: If a full pass over the corpus finds no entry with at
            least two IDs (the original looped forever in that case).
    """
    global data_index
    assert batch_size % 2 == 0
    batch = []
    labels = []
    # Consecutive entries skipped since the last usable one; once this
    # reaches len(corpus) every entry has been tried without success.
    skipped_in_a_row = 0

    while len(batch) < batch_size:
        fun_ids = copy(corpus[data_index])
        data_index = (data_index + 1) % len(corpus)

        # An entry with a single interest cannot be predicted from the rest.
        if len(fun_ids) < 2:
            skipped_in_a_row += 1
            if skipped_in_a_row >= len(corpus):
                raise ValueError(
                    'corpus has no entry with at least two keyword IDs')
            continue
        skipped_in_a_row = 0

        label = random.sample(fun_ids, 1)
        fun_ids.remove(label[0])
        batch.append(fun_ids)
        labels.append(label)

    return batch, labels

# Smoke test: draw a small batch and show the inputs next to their labels.
_batch, _labels = generate_batch(10)
for _caption, _rows in ((' batch:', _batch), (' labels:', _labels)):
    print(_caption, _rows)

In [None]:
# Train cbow model
import tensorflow as tf
import math
import numpy as np
import random

# Hyper-parameters consumed by the graph-building and training cells.
EMBEDDING_SIZE = 200  # width of each embedding / softmax weight row
BATCH_SIZE = 500      # rows per training step (placeholder batch dimension)
NUM_SAMPLED = 15      # passed as num_sampled to the sampled softmax loss

# Number of dictionary IDs drawn for the nearest-neighbour report.
SAMPLE_SIZE = 15

# Dictionary word IDs are assigned consecutively from 0, so drawing from
# range(len(dictionary)) is fine.
samples = np.array(
    random.sample(range(len(dictionary)), k=SAMPLE_SIZE)
)

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # Input data: one row of keyword IDs per example plus the hidden ID.
    # NOTE(review): the placeholder is rectangular ([BATCH_SIZE, None]), so
    # every row of a fed batch must have the same length. generate_batch()
    # returns each corpus entry minus one ID, so rows may differ in length;
    # confirm that batches are padded/equal-length before feeding.
    train_dataset = tf.placeholder(tf.int32, shape=[BATCH_SIZE, None])
    train_labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])
    sample_dataset = tf.constant(samples, dtype=tf.int32)
    sample_dataset = tf.reshape(sample_dataset, [SAMPLE_SIZE, 1])

    # Variables
    # Interest vectors, initialised from a uniform distribution on [-1, 1).
    embeddings = tf.Variable(
        tf.random_uniform([len(dictionary), EMBEDDING_SIZE], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([len(dictionary), EMBEDDING_SIZE],
                            stddev=1.0 / math.sqrt(EMBEDDING_SIZE)))
    softmax_biases = tf.Variable(tf.zeros([len(dictionary)]))

    # Sum the interest vectors of all input keywords of each example.
    # One batched lookup followed by a reduction over the keyword axis
    # (axis 1) replaces the original per-row tf.foldl, which added
    # BATCH_SIZE separate loops to the graph for the same result.
    embed = tf.reduce_sum(
        tf.nn.embedding_lookup(embeddings, train_dataset), 1)

    # Softmax loss with negative sampling.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(
            weights=softmax_weights,
            biases=softmax_biases,
            inputs=embed,  # batch_size * embedding_size
            labels=train_labels,
            num_sampled=NUM_SAMPLED,
            num_classes=len(dictionary)))

    optimizer = tf.train.AdagradOptimizer(0.8).minimize(loss)

    # Row-wise L2 normalisation, so the matmul below yields cosine
    # similarities between unit vectors.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm

    # Vectors of the sample keywords. sample_dataset is [SAMPLE_SIZE, 1],
    # so the sum over axis 1 just drops the singleton keyword axis.
    sample_embed = tf.reduce_sum(
        tf.nn.embedding_lookup(normalized_embeddings, sample_dataset), 1)
    # Similarity of every sample keyword to every dictionary entry.
    similarity = tf.matmul(sample_embed, tf.transpose(normalized_embeddings))

In [None]:
num_steps = 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()

    # Number of nearest neighbours listed per sample keyword.
    TOP_K = 8
    average_loss = 0

    for step in range(1, num_steps + 1):
        batch_data, batch_labels = generate_batch(BATCH_SIZE)
        _, step_loss = session.run(
            [optimizer, loss],
            feed_dict={train_dataset: batch_data, train_labels: batch_labels})
        average_loss += step_loss
        print(step)

        # Mean loss over the last 2000 steps. With num_steps = 100 this
        # interval (and the 10000-step one below) is never reached.
        if step > 0 and step % 2000 == 0:
            average_loss = average_loss / 2000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0

        if step % 10000 != 0:
            continue

        # List the TOP_K most similar dictionary entries for each sample.
        sim = similarity.eval()
        for sample_id, scores in zip(samples, sim):
            sample_fun = dictionary[sample_id]
            # Descending similarity; position 0 is skipped (expected to be
            # the sample keyword itself).
            nearest = (-scores).argsort()[1:TOP_K + 1]
            log = 'Nearest to %s:' % sample_fun
            for neighbour_id in nearest:
                log = '%s %s ' % (log, dictionary[neighbour_id])
            print(log)

    # Snapshot of the unit-length embeddings, taken while the session is open.
    final_embeddings = normalized_embeddings.eval()