<p>Twitterのプロフィールで興味・関心キーワードを抜き出したものをcorpus.pklに保存されている</p>
<p> それらのキーワードを1つ隠し、残りのキーワードから隠した1つを予測し、学習することで興味ベクトルを作る</p>

In [1]:
import pickle
with open('data/corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)
with open('data/dictionary.pkl', 'rb') as f:
    dictionary = pickle.load(f)

In [2]:
reverse_dictionary = {v: k for k, v in dictionary.items()}

In [3]:
data_index = 0
import numpy as np
from copy import copy
import random

def generate_batch(batch_size):
    global data_index
    assert batch_size % 2 == 0
#     batch = np.ndarray(shape=(batch_size, len), dtype=np.int32)
    batch = []
    labels = []
#     labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    batch_count = 0
    
    while(batch_count < batch_size):
        fun_ids = copy(corpus[data_index])
        # 興味が１つしかない場合は予測できないので無視
        if len(fun_ids) >= 2:
            label = random.sample(fun_ids, 1)
            fun_ids.remove(label[0])
            for fun_id in fun_ids:
                batch.append([batch_count, fun_id])
            labels.append(label)
            batch_count += 1
        data_index = (data_index + 1) % len(corpus)
        if batch_count >= batch_size:
            break

    return batch, labels

_batch, _labels = generate_batch(batch_size=10)
print(' batch:', _batch)
print(' labels:', _labels)

 batch: [[0, 5], [0, 2302], [0, 2], [0, 1], [0, 4], [1, 9405], [1, 30234], [2, 2403], [2, 4699], [2, 7999], [2, 3190], [3, 8163], [3, 6430], [3, 5552], [3, 53937], [3, 2571], [3, 5186], [3, 352], [3, 3875], [3, 2991], [3, 455], [3, 20664], [4, 30324], [4, 13848], [5, 2505], [5, 2994], [5, 7378], [5, 18559], [5, 14151], [5, 8423], [5, 72], [5, 4975], [5, 4699], [5, 10165], [5, 1961], [5, 41895], [6, 33417], [7, 5710], [7, 53], [7, 3617], [7, 51], [7, 2420], [8, 9899], [8, 26476], [8, 1862], [8, 1869], [8, 1866], [8, 554], [8, 2570], [9, 23551], [9, 2372], [9, 554], [9, 31205], [9, 6271], [9, 47541], [9, 2791]]
 labels: [[0], [2420], [9338], [391], [13857], [77], [6596], [52], [4699], [1756]]


In [15]:
# Train cbow model
import tensorflow as tf
import math
import numpy as np
import random

BATCH_SIZE = 500
EMBEDDING_SIZE = 200
NUM_SAMPLED = int(BATCH_SIZE/2)
VOCAB_SIZE = len(dictionary)
SAMPLE_SIZE = len([16038,892,3655,2259,4055,1604,4569])
# 辞書の単語IDは0から順に振られているのでrangeでもOK
sample_indices = np.array([[idx, vocab_id] for idx, vocab_id in enumerate([16038,892,3655,2259,4055,1604,4569])])

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # input data
    # sparse matrix(shape=batch_size*vocab_size)にtrain_indicesの部分に0でない興味が入り、valuesはすべて1
    train_indices = tf.placeholder(tf.int64) # 0でない場所を指す
    train_dataset = tf.SparseTensor(train_indices, values=tf.ones([tf.shape(train_indices)[0]], tf.float32), dense_shape=[BATCH_SIZE, VOCAB_SIZE])
    train_labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])
    sample_dataset = tf.SparseTensor(sample_indices, values=tf.ones([tf.shape(sample_indices)[0]], tf.float32), dense_shape=[SAMPLE_SIZE, VOCAB_SIZE])
    
    # Variables
    # 一様分布で興味ベクトル初期化
    embeddings = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBEDDING_SIZE], -1.0, 1.0))
    softmax_weights = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBEDDING_SIZE], stddev=1.0/math.sqrt(EMBEDDING_SIZE)))
    softmax_biases = tf.Variable(tf.zeros([VOCAB_SIZE]))

    # 該当ユーザーのtrainingデータの興味ベクトルをすべて足す
    embed = tf.sparse_tensor_dense_matmul(train_dataset, embeddings)

    # softmax loss with negative sampling
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(
            weights=softmax_weights,
            biases=softmax_biases,
            inputs=embed, # batch_size * embedding_size
            labels=train_labels,
            num_sampled=NUM_SAMPLED,
            num_classes=VOCAB_SIZE))
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdagradOptimizer(0.8).minimize(loss)

    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    # サンプルデータのベクトル
    sample_embed = tf.sparse_tensor_dense_matmul(sample_dataset, normalized_embeddings)
    # サンプルデータとの類似度
    similarity = tf.matmul(sample_embed, tf.transpose(normalized_embeddings))
    # tensorboard summary
    merged_summary = tf.summary.merge_all()

In [18]:
NUM_STEPS = 2000
SUMMARY_DIR = 'data/tensorboard'
TOP_K = 8

with tf.Session(graph=graph) as sess:
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
    tf.global_variables_initializer().run()
    average_loss = 0
    for step in range(1, NUM_STEPS+1):
        batch_data, batch_labels = generate_batch(BATCH_SIZE)
        feed_dict = {
            train_indices: batch_data,
            train_labels: batch_labels
        }
        summary, I = sess.run([merged_summary, loss], feed_dict=feed_dict, options=options, run_metadata=run_metadata)
        average_loss += I
        if step % 100 == 0:
            average_loss = average_loss / 1000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
            writer.add_summary(summary, step)
        if step % 1000 == 0:
            sim = similarity.eval()
            for i in range(SAMPLE_SIZE):
                sample_fun = reverse_dictionary[sample_indices[i][1]]
                nearest = (-sim[i, :]).argsort()[1:TOP_K+1]
                log = 'Nearest to %s:' % sample_fun
                for k in range(TOP_K):
                    close_fun = reverse_dictionary[nearest[k]]
                    log = '%s %s ' % (log, close_fun)
                print(log)
    final_embeddings = normalized_embeddings.eval()

Average loss at step 100: 0.665660
Average loss at step 200: 0.660416
Average loss at step 300: 0.650846
Average loss at step 400: 0.643520
Average loss at step 500: 0.640894
Average loss at step 600: 0.641348
Average loss at step 700: 0.639553
Average loss at step 800: 0.639181
Average loss at step 900: 0.636717
Average loss at step 1000: 0.637034
Nearest to 機械学習: 今日から俺は  胎盤  貘  北村匠海  梨菜  智香  マジガルディズ  三越 
Nearest to プログラミング: かい  ピスト  ジェジュン  日にち  ゴローちゃん  文献学  出茂  ガングレイヴ 
Nearest to 将棋: 極黒のブリュンヒルデ  赤さん  具志堅用高  戦旗  博物館  ネイサン  コンヒーロ  METAL MAX 
Nearest to ビリヤード: トニジャ  品田  残像  東京ヴェルディ  斉藤和義  ももクロちゃん  速水  婆裟羅 
Nearest to ダーツ: 依田  申告  作りました  小宮有紗  ハヤシ  GRADIUS  茹  结衣 
Nearest to 旅行: 異性装  田崎敬浩  たばこ  共和主義  国交断絶  専念  鼻血  ガンダムブレイカー 
Nearest to 温泉: 痴呆症  メシア  猫と金魚  ブリーフ  自治会  無修正動画  今野杏南  福浦 
Average loss at step 1100: 0.631687
Average loss at step 1200: 0.629109
Average loss at step 1300: 0.628437
Average loss at step 1400: 0.664179
Average loss at step 1500: 0.661092
Average loss at step 1600: 0

In [6]:
print(dictionary['機械学習'])
print(dictionary['プログラミング'])
print(dictionary['将棋'])
print(dictionary['ビリヤード'])
print(dictionary['ダーツ'])
print(dictionary['旅行'])
print(dictionary['温泉'])

16038
892
3655
2259
4055
1604
4569
