# Loading data

In [None]:
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
! unzip ml-1m.zip -d .

In [1]:
import pandas as pd
import numpy as np
import tensorflow

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
ratings = (pd.read_csv('./ml-1m/ratings.dat', engine='python', sep='::', names=['user', 'item', 'rating', 'timestamp'])
    .assign(timestamp=lambda df:pd.to_datetime(df.timestamp * 1000000000))
          )

movies = (pd.read_csv('./ml-1m/movies.dat', engine='python', sep='::', names=['item', 'title', 'genres'])
          .assign(genres=lambda df:df.genres.str.split('|').values)
          .set_index('item', drop=False))

# See http://files.grouplens.org/datasets/movielens/ml-1m-README.txt for more details
users = pd.read_csv('./ml-1m/users.dat', engine='python', sep='::', 
                    names=['user', 'gender', 'age', 'occupation', 'zipcode'])\
    .set_index('user', drop=False)

In [3]:
ratings = ratings.assign(feedback=lambda df: 2 * (df.rating >= 4) - 1)

ratings.head()

Unnamed: 0,user,item,rating,timestamp,feedback
0,1,1193,5,2000-12-31 22:12:40,1
1,1,661,3,2000-12-31 22:35:09,-1
2,1,914,3,2000-12-31 22:32:48,-1
3,1,3408,4,2000-12-31 22:04:35,1
4,1,2355,5,2001-01-06 23:38:11,1


## Train/test split

 * Ideally time based split
 * For the sake of simplicity, let's just sample ratings uniformly (breaking the time machine rule)

In [4]:
test = ratings.sample(n=100000, random_state=0)
train_ratings_mask = ~ratings.index.isin(test.index)
train = ratings.loc[train_ratings_mask]

test_user_items = test[['user', 'item']]

print(train.shape)
print(test.shape)

test.head()

(900209, 5)
(100000, 5)


Unnamed: 0,user,item,rating,timestamp,feedback
324271,1922,2094,4,2000-11-20 04:34:27,1
818637,4918,2808,1,2000-07-08 19:29:05,-1
148677,957,1660,4,2000-11-25 05:28:13,1
778790,4653,914,5,2000-11-29 21:22:43,1
525489,3245,3324,1,2000-09-07 06:33:31,-1


## Refresher on cross entropy loss

In [5]:
import tensorflow as tf
sess = tf.InteractiveSession()

### Cross entroy loss for softmax (multi class) classification 

In [7]:
BATCH_SIZE = 4
N_CLASSES = 3

logits_values = np.array([[10, 1, 1], [10, 1, 1], [1, 2, 1], [1, 2, 1]], dtype=float)
logits = tf.constant(logits_values, shape=(BATCH_SIZE, N_CLASSES))

logits.eval()

array([[ 10.,   1.,   1.],
       [ 10.,   1.,   1.],
       [  1.,   2.,   1.],
       [  1.,   2.,   1.]])

In [8]:
labels_values = np.array([0, 1, 1, 2])
labels = tf.constant(labels_values)

one_hot_values = np.array([[1, 0, 0], [0, 1, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float)
one_hot_labels = tf.constant(one_hot_values)

print('class labels:')
print(tf.reshape(labels, (-1, 1)).eval())

print('one-hot encoded class labels:')
print(one_hot_labels.eval())

class labels:
[[0]
 [1]
 [1]
 [2]]
one-hot encoded class labels:
[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]


In [9]:
# tf.nn.sparse_softmax_cross_entropy_with_logits (a la word2vec)
# Measures the probability error in discrete classification tasks in which the
# classes are mutually exclusive (each entry is in exactly one class).

loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
loss.eval()

array([  2.46789153e-04,   9.00024679e+00,   5.51444714e-01,
         1.55144471e+00])

In [10]:
# step by step numpy equivalent
raw_softmax_values = np.exp(logits_values)
probabilities = (raw_softmax_values.T / np.sum(raw_softmax_values, axis=1)).T
probabilities

array([[  9.99753241e-01,   1.23379352e-04,   1.23379352e-04],
       [  9.99753241e-01,   1.23379352e-04,   1.23379352e-04],
       [  2.11941558e-01,   5.76116885e-01,   2.11941558e-01],
       [  2.11941558e-01,   5.76116885e-01,   2.11941558e-01]])

In [12]:
cross_entropy_losses = -np.log(probabilities)[[0, 1, 2, 3], labels_values]
cross_entropy_losses

array([  2.46789153e-04,   9.00024679e+00,   5.51444714e-01,
         1.55144471e+00])

### Binary cross entropy loss

In [16]:
# tf.nn.sigmoid_cross_entropy_with_logits (like in the CF auto-encoder article)
# Measures the probability error in discrete classification tasks in which each
# class is independent and not mutually exclusive.
tf.nn.sigmoid_cross_entropy_with_logits(logits, targets=one_hot_labels).eval()

array([[  4.53988992e-05,   1.31326169e+00,   1.31326169e+00],
       [  1.00000454e+01,   3.13261688e-01,   1.31326169e+00],
       [  1.31326169e+00,   1.26928011e-01,   1.31326169e+00],
       [  1.31326169e+00,   2.12692801e+00,   3.13261688e-01]])

In [13]:
# step by step numpy equivalent
binary_probabilities = 1 / (1 + np.exp(-logits_values))
binary_probabilities

array([[ 0.9999546 ,  0.73105858,  0.73105858],
       [ 0.9999546 ,  0.73105858,  0.73105858],
       [ 0.73105858,  0.88079708,  0.73105858],
       [ 0.73105858,  0.88079708,  0.73105858]])

In [18]:
- (one_hot_values * np.log(binary_probabilities) + (1 - one_hot_values) * np.log(1 - binary_probabilities))

array([[  4.53988992e-05,   1.31326169e+00,   1.31326169e+00],
       [  1.00000454e+01,   3.13261688e-01,   1.31326169e+00],
       [  1.31326169e+00,   1.26928011e-01,   1.31326169e+00],
       [  1.31326169e+00,   2.12692801e+00,   3.13261688e-01]])

## TensorFlow multi-class classification

http://stackoverflow.com/questions/37671974/tensorflow-negative-sampling

Can use:
 * `sparse_softmax_cross_entropy_with_logits` or `tf.nn.sampled_softmax_loss` (multi-class classsification a la word2vec)
 * but we could try [`tf.nn.sigmoid_cross_entropy_with_logits`](https://www.tensorflow.org/versions/r0.11/api_docs/python/nn.html#sigmoid_cross_entropy_with_logits) like in [Collaborative Denoising Auto-Encoders for Top-N Recommender Systems](http://yaowu.co/docs/wsdm16cdae.pdf)

In [21]:
import tensorflow as tf

import datetime as dt

BATCH_SIZE = 1024
N_ITER = 100
LOG_DIR = '/tmp/tflearn_logs'

N_ITEMS = ratings.item.max() + 1
N_USERS = ratings.user.max() + 1

N_USERS, N_ITEMS

(6041, 3953)

## Evaluating on a topn metric

In [28]:
count_by_items = train.groupby('item').size()
item_count_ranks = (count_by_items
                           .to_frame('score')
                           .reset_index()
                           .assign(rank=lambda df:np.arange(df.shape[0])+1))
item_count_ranks.head()

Unnamed: 0,item,score,rank
0,1,1871,1
1,2,624,2
2,3,429,3
3,4,147,4
4,5,277,5


In [29]:
pop_ranked_user_item = (pd.merge(
    left=pd.DataFrame.from_dict({'user': np.arange(N_USERS)}).assign(key=lambda df:np.ones_like(df.values)),
    right=item_count_ranks.assign(key=lambda df:np.ones_like(df.item)),
    on='key')
        .drop(['key'], axis=1))

pop_ranked_user_item.head()

Unnamed: 0,user,item,score,rank
0,0,1,1871,1
1,0,2,624,2
2,0,3,429,3
3,0,4,147,4
4,0,5,277,5


In [30]:
def recall_precision_at_k(ranked_user_item, test, topk=1):
    topk_ranked_user_item = ranked_user_item.query('rank <= {}'.format(topk))
    in_test_mask = topk_ranked_user_item.user.isin(test.user.unique())
    topk_in_test_ranked_user_item = topk_ranked_user_item[in_test_mask]

    recs_and_test = pd.merge(
        test,
        topk_in_test_ranked_user_item,
        on=['user', 'item'], how='outer')
    
    topk_predictions_mask = ~recs_and_test.score.isnull()
    ground_truth_mask = recs_and_test.feedback == 1
    n_hits = np.sum(ground_truth_mask & topk_predictions_mask).astype('float')
    n_ground_truth = np.sum(ground_truth_mask)
    n_predictions = np.sum(topk_predictions_mask)
    return n_hits / n_ground_truth, n_hits / n_predictions

recall_precision_at_k(pop_ranked_user_item, test)

(0.002738483542934887, 0.026426527520619424)

In [35]:
# the auto-encoder net will produce scores for N_USERS x N_ITEMS 
all_user_item_scores = np.random.uniform(low=0, high=1, size=(N_USERS, N_ITEMS))


def scores_to_frame(user_item_score_array, topk=1):
    user_as_rows = np.arange(N_USERS)
    item_as_cols = np.arange(N_ITEMS)

    all_users_item_ranks = np.argsort(np.argsort(-user_item_score_array, axis=1), axis=1).ravel() + 1
    only_topn_mask = all_users_item_ranks <= topk
    return pd.DataFrame.from_dict({
        'user': np.repeat(user_as_rows, N_ITEMS)[only_topn_mask],
        'item': np.tile(item_as_cols, N_USERS)[only_topn_mask],
        'score': all_users_item_ranks.ravel()[only_topn_mask],
        'rank': all_users_item_ranks[only_topn_mask]
    })

def scores_to_top1_frame(user_item_score_array):
    top1_items = np.argmax(user_item_score_array, axis=1)
    user_as_rows = np.arange(N_USERS)
    return pd.DataFrame.from_dict({
        'user': user_as_rows,
        'item': top1_items,
        'score': user_item_score_array[user_as_rows,top1_items],
        'rank': np.ones_like(top1_items)
    })


%timeit scores_to_frame(all_user_item_scores)

%timeit scores_to_top1_frame(all_user_item_scores)

1 loop, best of 3: 2.81 s per loop
10 loops, best of 3: 46.1 ms per loop


## Training

In [37]:
class UserAndItem2Vec:
    def __init__(self, dimensionality=50, batch_size=None):
        
        with tf.name_scope('user_embeddings'):
            user_embeddings = tf.Variable(tf.random_normal([N_USERS, dimensionality], 
                                                           stddev=0.01, mean=0), name='users')
            tf.histogram_summary('user_embeddings', user_embeddings)

        with tf.name_scope('item_biases'):
            item_biases = tf.Variable(tf.random_normal([N_ITEMS], stddev=0.01, mean=0), name='items')

        with tf.name_scope('item_embeddings'):
            item_embeddings = tf.Variable(tf.random_normal([N_ITEMS, dimensionality], stddev=0.01, mean=0), name='items')
            tf.histogram_summary('item_embeddings', item_embeddings)

        self.user_embeddings = user_embeddings
        self.item_embeddings = item_embeddings
        self.item_biases = item_biases
    
        self.input_user_ids = tf.placeholder(
            tf.int32, shape=[batch_size], name='user_ids')
        self.input_positive_item_ids = tf.placeholder(
            tf.int32, shape=[batch_size], name='positive_item_ids')

    def user_to_all_item_logits(self):
        """ This is the model described at equation (22) from http://yaowu.co/docs/wsdm16cdae.pdf
        y_ui = \sigma{W_i^t V_u + b_i}
        The architecture is made of:
         * input nodes for the user_id, and associated weights or embeddings V_u
         * internal weights W_i and biases b_i for each items
        """
        return tf.add(
            self.item_biases, 
            tf.matmul(
                tf.nn.embedding_lookup(self.user_embeddings, self.input_user_ids),
                tf.transpose(self.item_embeddings)),
            name='user_to_all_item_logits')

    def sampled_loss(self):
        with tf.name_scope('loss'):
            sample_losses = tf.nn.sampled_softmax_loss(
                biases=self.item_biases,
                inputs=tf.nn.embedding_lookup(self.user_embeddings, self.input_user_ids),
                labels=tf.reshape(self.input_positive_item_ids, (-1, 1)),
                weights=self.item_embeddings,
                num_classes=N_ITEMS,
                num_sampled=10, num_true=1)
            return tf.reduce_mean(sample_losses)
        
    def exact_loss(self):
        with tf.name_scope('loss'):
            cross_entropy_sum = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.user_to_all_item_logits(),
                    labels=self.input_positive_item_ids))
        return cross_entropy_sum

def training(loss, learning_rate=0.01):
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return train_step


with tf.Graph().as_default():
    model = UserAndItem2Vec()
    loss = model.exact_loss()    #.sampled_loss()
    
    tf.scalar_summary('batch_ll', loss)
    summary = tf.merge_all_summaries()
    test_summary = tf.scalar_summary('test_ll', loss)
    train_step = training(loss)
    
    def perform_step(step, train, test, summary_writer):
        positive_sample_ratings = train.query('feedback == 1').sample(BATCH_SIZE)
        _, loss_value, summary_value = sess.run(
            fetches=[train_step, loss, summary], 
            feed_dict={
                model.input_user_ids: positive_sample_ratings.user.values,
                model.input_positive_item_ids: positive_sample_ratings.item.values
            })
        summary_writer.add_summary(summary_value, global_step=step)

        if step % 10 == 0:
            test_loss_value, test_summary_value = sess.run(
                fetches=[loss, test_summary],
                feed_dict={
                    model.input_user_ids: test.query('feedback == 1').user.values,
                    model.input_positive_item_ids: test.query('feedback == 1').item.values
                })

            all_users_item_logits = model.user_to_all_item_logits().eval(
                feed_dict={model.input_user_ids: np.arange(N_USERS)})

            print('Step {:2d}: batch/test cross-entropy loss = {:.2f}/{:.2f}'.format(step, loss_value, test_loss_value))
            print('         test recall/precision@1 = {:.4f}/{:.4f}'.format(
                *recall_precision_at_k(scores_to_top1_frame(all_users_item_logits), test)))
            summary_writer.add_summary(test_summary_value, global_step=step)

        summary_writer.flush()
        
    with tf.Session() as sess:

        summary_writer = tf.train.SummaryWriter(LOG_DIR + '/{:%Y%m%d%H%M%S}'.format(dt.datetime.now()), sess.graph)

        sess.run(tf.initialize_all_variables())
        
        for step in range(N_ITER):
            perform_step(step, train, test, summary_writer)
        

Step  0: batch/test cross-entropy loss = 8.28/8.27
         test recall/precision@1 = 0.0001/0.0008
Step 10: batch/test cross-entropy loss = 8.20/8.19
         test recall/precision@1 = 0.0021/0.0202
Step 20: batch/test cross-entropy loss = 8.04/8.02
         test recall/precision@1 = 0.0022/0.0215
Step 30: batch/test cross-entropy loss = 7.74/7.71
         test recall/precision@1 = 0.0034/0.0328
Step 40: batch/test cross-entropy loss = 7.47/7.46
         test recall/precision@1 = 0.0036/0.0343
Step 50: batch/test cross-entropy loss = 7.38/7.38
         test recall/precision@1 = 0.0044/0.0428
Step 60: batch/test cross-entropy loss = 7.32/7.34
         test recall/precision@1 = 0.0041/0.0392
Step 70: batch/test cross-entropy loss = 7.29/7.31
         test recall/precision@1 = 0.0042/0.0401
Step 80: batch/test cross-entropy loss = 7.26/7.30
         test recall/precision@1 = 0.0041/0.0396
Step 90: batch/test cross-entropy loss = 7.24/7.29
         test recall/precision@1 = 0.0043/0.0419


In [39]:
# Compared to pop 
recall_precision_at_k(pop_ranked_user_item, test)

# In the collaborative auto-encoder paper they report a recall@1/map@1 on the movielense 10M of: 
# * (0.01, 0.19) for pop
# * (0.04, 0.4) for their proposal
# Can the fact that we use the ML 1M explain this difference?

(0.002738483542934887, 0.026426527520619424)