[View in Colaboratory](https://colab.research.google.com/github/muik/notebooks/blob/master/experiments/article_recommend_training_extreme_multiple_classification.ipynb)

In [3]:
# Authenticate the current Colab user before any gcloud/gsutil command is run.
from google.colab import auth
auth.authenticate_user()

# Select the GCP project for the gcloud CLI. `{project_id}` is interpolated
# from the Python variable by IPython before the shell command runs.
# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'towneers'
!gcloud config set project {project_id}

Updated property [core/project].


In [4]:
%%bash
# Copy the gzipped training shards (train*.gz) and the article id list from
# Cloud Storage into /tmp/, where the later cells read them.
gsutil -m cp gs://daangn-tmp/ml/data/user_article_links/train*.gz  /tmp/
gsutil -m cp gs://daangn-tmp/ml/data/user_article_links/article_ids.txt  /tmp/

Copying gs://daangn-tmp/ml/data/user_article_links/train1.tfrecord.gz...
Copying gs://daangn-tmp/ml/data/user_article_links/train0.tfrecord.gz...
Copying gs://daangn-tmp/ml/data/user_article_links/train2.tfrecord.gz...
Copying gs://daangn-tmp/ml/data/user_article_links/train3.tfrecord.gz...
/ [0/4 files][    0.0 B/148.3 MiB]   0% Done                                    / [0/4 files][    0.0 B/148.3 MiB]   0% Done                                    / [0/4 files][    0.0 B/148.3 MiB]   0% Done                                    / [0/4 files][    0.0 B/148.3 MiB]   0% Done                                    -- [0/4 files][  6.7 MiB/148.3 MiB]   4% Done                                    \\ [1/4 files][ 55.9 MiB/148.3 MiB]  37% Done                                    |// [2/4 files][104.8 MiB/148.3 MiB]  70% Done                                    / [3/4 files][111.9 MiB/148.3 MiB]  75% Done                                    -- [3/4 files][114.2 MiB/148.3 MiB]  77% Done      

In [5]:
# Load the article ids (one integer per line). The line position of an id is
# its index in `article_id_to_idx`.
# NOTE: despite its name, `article_id_set` is a list; order is significant.
with open('/tmp/article_ids.txt', 'r') as ids_file:
  # The context manager closes the file handle once the ids are read.
  article_id_set = [int(line.rstrip()) for line in ids_file]
article_id_to_idx = {
    article_id: idx for idx, article_id in enumerate(article_id_set)
}
print('articles count:', len(article_id_set))

articles count: 297197


In [0]:
import multiprocessing

import tensorflow as tf
from tensorflow.python.lib.io import file_io

def read_examples(input_files, batch_size, shuffle=True, num_epochs=None):
  """Creates readers and queues for reading GZIP-compressed example protos.

  Args:
    input_files: Iterable of strings. Each string may hold several
      comma-separated file patterns; every pattern is expanded with
      `file_io.get_matching_files`.
    batch_size: Number of records per returned batch. Also used as the
      maximum number of records fetched per `read_up_to` call.
    shuffle: If true, file names are shuffled and records are drawn through
      `tf.train.shuffle_batch`; otherwise `tf.train.batch` is used.
    num_epochs: Number of passes over the files. `None` or `0` means the
      file name queue cycles without an epoch limit.

  Returns:
    The result of `tf.train.shuffle_batch` / `tf.train.batch` applied to
    `[example_id, encoded_example]`, i.e. batched record keys and batched
    serialized examples.

  Raises:
    ValueError: If none of the patterns in `input_files` match a file.
  """
  files = [
      matched_file
      for entry in input_files
      for pattern in entry.split(',')
      for matched_file in file_io.get_matching_files(pattern)
  ]
  if not files:
    # Fail fast with the offending patterns instead of letting the file name
    # queue reject an empty list with a message that does not mention them.
    raise ValueError(
        'No input files matched the given patterns: %r' % (input_files,))

  thread_count = multiprocessing.cpu_count()

  # The minimum number of instances in a queue from which examples are drawn
  # randomly. The larger this number, the more randomness at the expense of
  # higher memory requirements.
  min_after_dequeue = 1000

  # When batching data, the queue's capacity is larger than batch_size by
  # some factor: the number of enqueue threads (one per CPU here) plus a
  # small safety margin.
  queue_size_multiplier = thread_count + 3

  # Convert num_epochs == 0 -> num_epochs is None, if necessary
  num_epochs = num_epochs or None

  # Build a queue of the filenames to be read.
  filename_queue = tf.train.string_input_producer(
      files, num_epochs=num_epochs, shuffle=shuffle)

  options = tf.python_io.TFRecordOptions(
      compression_type=tf.python_io.TFRecordCompressionType.GZIP)
  reader = tf.TFRecordReader(options=options)
  # read_up_to yields up to batch_size (key, value) pairs per call, hence
  # enqueue_many=True in the batching calls below.
  example_id, encoded_example = reader.read_up_to(filename_queue, batch_size)

  if shuffle:
    return tf.train.shuffle_batch(
        [example_id, encoded_example],
        batch_size,
        capacity=min_after_dequeue + queue_size_multiplier * batch_size,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=True,
        num_threads=thread_count)

  return tf.train.batch(
      [example_id, encoded_example],
      batch_size,
      capacity=queue_size_multiplier * batch_size,
      enqueue_many=True,
      num_threads=thread_count)

In [0]:
def dot_mode(latent_codes_1, latent_codes_2):
    """Returns the dot product of the two inputs along their last axis."""
    elementwise_product = latent_codes_1 * latent_codes_2
    return tf.reduce_sum(elementwise_product, axis=-1)

def cos_mode(latent_codes_1, latent_codes_2):
    """Returns the cosine similarity of the two inputs along their last axis.

    Computed as dot(a, b) / sqrt(|a|^2 * |b|^2). There is no epsilon guard,
    so an all-zero vector on either side divides zero by zero.
    """
    squared_norm_1 = tf.reduce_sum(latent_codes_1 ** 2, axis=-1)
    squared_norm_2 = tf.reduce_sum(latent_codes_2 ** 2, axis=-1)
    numerator = dot_mode(latent_codes_1, latent_codes_2)
    denominator = tf.sqrt(squared_norm_1 * squared_norm_2)
    return numerator / denominator

def l2_mode(latent_codes_1, latent_codes_2):
  """Returns the L2 norm of (latent_codes_1 - latent_codes_2) over the last axis."""
  difference = latent_codes_1 - latent_codes_2
  return tf.norm(difference, ord=2, axis=-1)

In [0]:
# NOTE(review): this cell looks like leftover scratch code and is not runnable
# as written. `my_input_fn` and `FILE_TRAIN` are not defined anywhere in the
# visible notebook, and `tf.estimator.Estimator` is constructed with no
# arguments (presumably it needs at least a `model_fn` -- TODO confirm).
# The training that produced the logged output uses `build_model` with
# `tf.train.MonitoredTrainingSession` instead; consider deleting this cell.
classifier = tf.estimator.Estimator()
classifier.train(input_fn=lambda: my_input_fn(FILE_TRAIN, True, 500))

In [30]:
from time import time
import tensorflow as tf
import numpy as np

from tensorflow.contrib import layers
from tensorflow.python.keras.layers import Embedding

# Number of known articles: the embedding table has one row per article and
# negative ids are sampled uniformly from [0, DICT_SIZE).
DICT_SIZE = len(article_id_set)
# Width of each article embedding and of the user vector produced by the
# fully connected layer in build_model.
EMB_SIZE = 32
# Number of random negative article ids drawn per example in build_model.
NEGATIVE_SAMPLE_SIZE = 16

def build_model(examples, training=True, top_k=100, learning_rate=0.001):
  """Builds the sampled-softmax style recommendation graph and its train op.

  A user vector is computed from the score-weighted average of the embeddings
  of the user's articles plus a batch-normalised `age` feature. It is scored
  by dot product against the embedding of the target article and of
  NEGATIVE_SAMPLE_SIZE uniformly sampled article ids; the loss is the softmax
  cross entropy with the target at position 0.

  Args:
    examples: Batch of serialized `tf.Example` protos with the features
      `article_ids`, `article_scores`, `age`, `target_id` and `target_score`.
    training: Passed to `tf.layers.batch_normalization` as its `training`
      flag. The train op is built regardless of this value.
    top_k: `k` for the logged metric: the fraction of the batch whose target
      is among the `k` highest logits over all DICT_SIZE articles. Defaults
      to the previously hard-coded 100.
    learning_rate: Adam learning rate. Defaults to the previously hard-coded
      0.001.

  Returns:
    A tuple `(train_op, logging_tensors, debug_tensors)` where
    `logging_tensors` is a dict with keys 'global_step', 'loss' and 'eval',
    and `debug_tensors` is the list `[logits, prediction, user]`.
  """
  feature_map = {
    'article_ids': tf.VarLenFeature(dtype=tf.int64),
    'article_scores': tf.VarLenFeature(dtype=tf.float32),
    'age': tf.FixedLenFeature(shape=[], dtype=tf.int64),
    'target_id': tf.FixedLenFeature(shape=[], dtype=tf.int64),
    # Parsed so the example schema is unchanged; not used by the model.
    'target_score': tf.FixedLenFeature(shape=[], dtype=tf.float32),
  }
  parsed = tf.parse_example(examples, features=feature_map)
  # Variable-length features are padded with id 0 / score 0 per batch.
  article_ids = tf.sparse_tensor_to_dense(parsed['article_ids'], default_value=0)
  article_scores = tf.sparse_tensor_to_dense(parsed['article_scores'], default_value=0.)
  age = parsed['age'] # example age
  target_id = parsed['target_id']

  batch_size_tensor = tf.shape(age)[0]
  # Negatives are drawn uniformly over all articles; they are not filtered
  # against the target id.
  negative_ids = tf.random_uniform([batch_size_tensor, NEGATIVE_SAMPLE_SIZE],
                                   maxval=DICT_SIZE, dtype=tf.int64)

  initializer = tf.random_uniform_initializer(-1., 1.)
  article_embedding = Embedding(DICT_SIZE, EMB_SIZE,
                                embeddings_initializer=initializer)

  pre_embeddings = article_embedding(article_ids)
  pre_score_sums = tf.expand_dims(tf.reduce_sum(article_scores, 1), 1)
  # A row whose scores sum to zero (e.g. a user with no articles in a padded
  # batch) would otherwise produce 0/0 = NaN and poison the loss. Dividing by
  # 1 instead leaves such a row with all-zero weights.
  safe_score_sums = tf.where(tf.equal(pre_score_sums, 0.),
                             tf.ones_like(pre_score_sums),
                             pre_score_sums)
  expanded_article_scores = tf.expand_dims(article_scores / safe_score_sums, -1)
  # Score-weighted average of the user's article embeddings.
  pre_embedding = tf.reduce_sum(expanded_article_scores * pre_embeddings, 1)

  expanded_age = tf.expand_dims(age, 1)
  expanded_age = tf.to_float(expanded_age)
  expanded_age = tf.layers.batch_normalization(expanded_age, training=training)

  user = tf.concat([pre_embedding, expanded_age], 1)
  user = layers.fully_connected(user, EMB_SIZE, activation_fn=tf.nn.tanh)

  # Candidate 0 is the true target, candidates 1.. are the sampled negatives,
  # so the correct class label is always 0.
  target_ids = tf.concat([tf.expand_dims(target_id, 1), negative_ids], 1)
  target_embeddings = article_embedding(target_ids)

  logits = tf.matmul(tf.expand_dims(user, 1), target_embeddings, transpose_b=True)
  logits = tf.squeeze(logits, 1)
  prediction = tf.argmax(logits, 1)
  labels = tf.zeros([batch_size_tensor], dtype=tf.int32)

  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=labels, name='xentropy')
  loss_op = tf.reduce_mean(cross_entropy, name='xentropy_mean')

  # Metric: score the user vector against every article embedding and check
  # whether the target is among the top_k.
  all_logits = tf.matmul(user,
                         article_embedding.variables[0],
                         transpose_b=True)
  is_in_top_k = tf.nn.in_top_k(all_logits, target_id, top_k)
  accuracy = tf.reduce_mean(tf.to_float(is_in_top_k))

  # Kept as a plain variable named 'global_step' so existing checkpoints
  # still restore with the same variable name and dtype.
  global_step = tf.Variable(0, name='global_step', trainable=False)

  # Run the batch-norm moving-average updates together with each train step.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(
        loss_op, global_step)

  return train_op, {
      'global_step': global_step, 'loss': loss_op, 'eval': accuracy
  }, [logits, prediction, user]

# Uncomment to discard existing checkpoints and train from scratch:
#!rm -rf /tmp/train
with tf.Graph().as_default():
  checkpoint_dir = '/tmp/train'
  max_steps = 3000000
  batch_size = 128
  # Set to a positive N (e.g. 1000) to print the first two rows of each debug
  # tensor every N loop iterations. 0 disables debug printing.
  debug_every_n_steps = 0
  keys, examples = read_examples(['/tmp/train*.tfrecord.gz'], batch_size, num_epochs=10)

  train_op, logging_tensors, debug_tensors = build_model(examples)

  logging_hook = tf.train.LoggingTensorHook(
          tensors=logging_tensors,
          every_n_iter=5000)
  hooks=[logging_hook, tf.train.StopAtStepHook(last_step=max_steps)]

  # Resumes from the latest checkpoint in checkpoint_dir if one exists and
  # saves a new checkpoint every 300 seconds.
  with tf.train.MonitoredTrainingSession(
      checkpoint_dir=checkpoint_dir, hooks=hooks, log_step_count_steps=10000,
      save_checkpoint_secs=300) as session:
    local_step = 0
    while not session.should_stop():
      if debug_every_n_steps and local_step % debug_every_n_steps == 0:
        # Fetch the debug tensors in the same run call as the train step so
        # they describe the batch being trained on rather than pulling an
        # extra batch from the input queue.
        _, debug_values = session.run([train_op, debug_tensors])
        # Separate loop names keep the step counter from being overwritten.
        for tensor_idx, value in enumerate(debug_values):
          print(tensor_idx, value[:2])
      else:
        session.run(train_op)
      local_step += 1

#!gsutil -m cp -r /tmp/train gs://daangn-tmp/ml/data/user_article_links/train

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/train/model.ckpt-1822556
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1822557 into /tmp/train/model.ckpt.
INFO:tensorflow:global_step = 1822557, loss = 0.008787343, eval = 0.4375
INFO:tensorflow:global_step = 1827557, loss = 0.052560907, eval = 0.5625 (53.080 sec)
INFO:tensorflow:global_step/sec: 94.3097
INFO:tensorflow:global_step = 1832557, loss = 0.009392569, eval = 0.5703125 (52.739 sec)
INFO:tensorflow:global_step = 1837557, loss = 0.03137827, eval = 0.484375 (52.626 sec)
INFO:tensorflow:global_step/sec: 94.8469
INFO:tensorflow:global_step = 1842557, loss = 0.030319199, eval = 0.4375 (52.802 sec)
INFO:tensorflow:global_step = 1847557, loss = 0.003398628, eval = 0.5390625 (52.857 sec)
INFO:tensorflow:Saving checkpoints for 1850939 into /tmp/train/model.ckpt.
INFO:tensorflow:global

INFO:tensorflow:Saving checkpoints for 1964190 into /tmp/train/model.ckpt.
INFO:tensorflow:global_step = 1967557, loss = 0.054718792, eval = 0.53125 (53.390 sec)
INFO:tensorflow:global_step/sec: 93.5659
INFO:tensorflow:global_step = 1972557, loss = 0.013845446, eval = 0.4921875 (53.483 sec)
INFO:tensorflow:global_step = 1977557, loss = 0.00697367, eval = 0.5234375 (53.287 sec)
INFO:tensorflow:global_step/sec: 93.7477
INFO:tensorflow:global_step = 1982557, loss = 0.018910915, eval = 0.5 (53.384 sec)
INFO:tensorflow:global_step = 1987557, loss = 0.019925067, eval = 0.546875 (53.329 sec)
INFO:tensorflow:Saving checkpoints for 1992306 into /tmp/train/model.ckpt.
INFO:tensorflow:global_step/sec: 93.6378
INFO:tensorflow:global_step = 1992557, loss = 0.015852392, eval = 0.4609375 (53.464 sec)
INFO:tensorflow:global_step = 1997557, loss = 0.007155015, eval = 0.53125 (53.469 sec)
INFO:tensorflow:global_step/sec: 93.5984
INFO:tensorflow:global_step = 2002557, loss = 0.077112734, eval = 0.5390625

In [29]:
# Recursively copy the local checkpoint directory /tmp/train to Cloud Storage.
# NOTE(review): this cell's execution count (29) precedes the training cell's
# (30), so the upload shown here ran before the training output logged above;
# re-run it after training to upload the newer checkpoints.
!gsutil -m cp -r /tmp/train gs://daangn-tmp/ml/data/user_article_links/train

Copying file:///tmp/train/model.ckpt-1747834.index [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1804325.meta [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1776135.index [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1804325.data-00000-of-00001 [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1719444.meta [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1747834.meta [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1804325.index [Content-Type=application/octet-stream]...
Copying file:///tmp/train/checkpoint [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1822556.meta [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1747834.data-00000-of-00001 [Content-Type=application/octet-stream]...
Copying file:///tmp/train/model.ckpt-1822556.data