In [1]:
import pandas as pd
import numpy as np
from scipy import sparse

from lightfm.datasets import fetch_movielens

# Load MovieLens through LightFM, requesting both per-item indicator features
# and genre features.
# NOTE(review): the first positional argument is presumably the local data
# directory used by fetch_movielens — confirm against the LightFM docs.
data = fetch_movielens('movielens', indicator_features=True, genre_features=True)

# Show the raw rating values and matrix shapes before any transformation.
print('original train')
print(np.unique(data['train'].data))
print(data['train'].__repr__())
print('original test')
print(np.unique(data['test'].data))
print(data['test'].__repr__())

# binarizing training examples as in the original lightfm paper to use the logistic loss
# Ratings >= 4 become +1, everything else becomes -1 (the boolean mask is
# turned into a 0/1 index into the lookup array [-1, 1]).
data['train'].data = np.array([-1, 1])[1 * (data['train'].data >= 4)]
data['test'].data = np.array([-1, 1])[1 * (data['test'].data >= 4)]

# should keep only positive test interactions
# The -1 entries are mapped to 0 and then dropped by eliminate_zeros(), so only
# the +1 interactions remain stored in this copy.
data['test_positive_only'] = data['test'].copy()
data['test_positive_only'].data = 1 *(data['test_positive_only'].data>=1)
data['test_positive_only'].eliminate_zeros()

# Short aliases used by the rest of the notebook.
train = data['train']
test = data['test']
test_positives = data['test_positive_only']

# Show the binarized values and shapes.
print('train')
print(np.unique(data['train'].data))
print(data['train'].__repr__())
print('test')
print(np.unique(data['test'].data))
print(data['test'].__repr__())
print('test_positive_only')
print(np.unique(data['test_positive_only'].data))
print(data['test_positive_only'].__repr__())

# Item-feature matrix (one row per item) and the label of each feature column.
item_features = data['item_features']
tag_labels = data['item_feature_labels']
print('There are %s distinct item features, with values like %s.' % (item_features.shape[1], tag_labels[:3].tolist()))

# indicator only user features
# One non-zero entry at (u, u) for every user id u that appears in train, i.e.
# each such user gets a single feature that identifies only that user.
unique_user_ids = np.unique(train.row)
user_features = sparse.csr_matrix((np.ones_like(unique_user_ids), (unique_user_ids, unique_user_ids)))
print('There are %s distinct user features.' % user_features.shape[1])



original train
[1 2 3 4 5]
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 90570 stored elements in COOrdinate format>
original test
[1 2 3 4 5]
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 9430 stored elements in COOrdinate format>
train
[-1  1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 90570 stored elements in COOrdinate format>
test
[-1  1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 9430 stored elements in COOrdinate format>
test_positive_only
[1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 5469 stored elements in COOrdinate format>
There are 1701 distinct item features, with values like ['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)'].
There are 943 distinct user features.


In [2]:
def mean_reciprocal_rank(predicted_ranks_df):
    """Return the mean over users of each user's best reciprocal rank.

    Args:
        predicted_ranks_df: DataFrame with a ``user`` column and a zero-based
            ``rank`` column (rank 0 is the top prediction).

    Returns:
        The average, across users, of ``max(1 / (rank + 1))`` for that user.
        The input frame is not modified.
    """
    # Ranks are zero-based, hence the +1 before taking the reciprocal.
    reciprocal_ranks = 1 / (predicted_ranks_df['rank'] + 1)
    scored = predicted_ranks_df.assign(rec_rank=reciprocal_ranks)
    best_per_user = scored.groupby('user')['rec_rank'].max()
    return best_per_user.mean()

## TensorFlow model

In [44]:
import tensorflow as tf
import datetime as dt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

# Print the version of the imported TensorFlow package.
print(tf.__version__)

1.10.0


## train/test

In [6]:
# Flatten the sparse train/test matrices into long-format frames: one row per
# stored entry, with its row index as `user`, column index as `item` and stored
# value as `rating`.
train_df = pd.DataFrame.from_dict({
        'user': train.row,
        'item': train.col,
        'rating': train.data,
    })

test_df = pd.DataFrame.from_dict({
        'user': test.row,
        'item': test.col,
        'rating': test.data,
    })

print(train_df.shape)
# NOTE: this bare expression is not the last statement of the cell, so its
# result is not displayed.
train_df.head()

# Distinct user ids seen in test / train, and the ids of all items that have at
# least one stored entry in the item-feature matrix.
test_user_ids = test_df.user.unique()
all_user_ids = train_df.user.unique()
all_item_ids = np.unique(data['item_features'].tocoo().row)

def to_all_user_items(user_ids, item_ids):
    """Build the full cross product of the given users and items.

    Args:
        user_ids: 1-D sequence of user ids.
        item_ids: 1-D sequence of item ids.

    Returns:
        DataFrame with columns ``user`` and ``item`` holding
        ``len(user_ids) * len(item_ids)`` rows, ordered user-major: every item
        for the first user, then every item for the second user, and so on.
    """
    n_users, n_items = len(user_ids), len(item_ids)
    # Each user id is repeated once per item ...
    user_column = np.repeat(user_ids, n_items)
    # ... while the whole item list is cycled once per user.
    item_column = np.tile(item_ids, n_users)
    return pd.DataFrame({'user': user_column, 'item': item_column})

# Every (user, item) combination over the training users and the known items;
# the shapes are printed as a sanity check and the first rows are displayed.
all_user_items = to_all_user_items(all_user_ids, all_item_ids)
print(all_user_items.shape)
print(test_user_ids.shape)
all_user_items.head()

(90570, 3)
(1586126, 2)
(943,)


Unnamed: 0,user,item
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4


In [46]:
# Inspect the stored user-feature values for the users of the last few training
# interactions (one indicator value per selected row).
user_features[train_df.tail().user.values,:].data

array([1, 1, 1, 1, 1], dtype=int32)

In [43]:
def intra_row_index(row_indexes):
    """Number the entries of each row 0, 1, 2, ... in order of appearance.

    Given the row index of every stored entry of a sparse matrix, return for
    each entry its position among the entries that share the same row.

    The input does not have to be sorted by row: entries of the same row are
    numbered in the order in which they appear in ``row_indexes``.

    Args:
        row_indexes: 1-D array-like of non-negative integer row indexes.

    Returns:
        1-D integer array of the same length as ``row_indexes``.
    """
    row_indexes = np.asarray(row_indexes)
    count_by_row = np.bincount(row_indexes)
    # shift_by_row[r] is the number of entries belonging to rows < r, i.e. the
    # position of row r's first entry once entries are grouped by row.
    shift_by_row = np.concatenate([[0], np.cumsum(count_by_row)])

    # Fast path: entries already grouped by ascending row, so the global
    # position minus the row's start offset is the answer.
    if np.all(row_indexes[1:] >= row_indexes[:-1]):
        return np.arange(len(row_indexes)) - shift_by_row[row_indexes]

    # General case: compute positions in (stably) row-sorted order, then
    # scatter them back to the original entry order. 'mergesort' is stable, so
    # entries of one row keep their order of appearance.
    order = np.argsort(row_indexes, kind='mergesort')
    positions_sorted = np.arange(len(row_indexes)) - shift_by_row[row_indexes[order]]
    positions = np.empty_like(positions_sorted)
    positions[order] = positions_sorted
    return positions

# Adapted from https://github.com/tensorflow/tensorflow/issues/342#issuecomment-160354041
# The result is not a classic sparse matrix but a kind of jagged array: every
# batch sample (row) lists the ids of its active features, between 1 and
# N_FEATURES of them.
def sparse_features_to_tensor(batch_sparse_features):
    """Convert a scipy sparse feature matrix into a ``tf.SparseTensorValue`` of ids.

    Args:
        batch_sparse_features: scipy sparse matrix with one row per batch
            sample and one column per feature.

    Returns:
        ``tf.SparseTensorValue`` whose ``values`` are the column (feature)
        indexes of the stored entries, whose ``indices`` are
        ``(sample row, position of the entry within that row)`` pairs, and
        whose ``dense_shape`` is the shape of the input matrix.
    """
    coo = batch_sparse_features.tocoo()
    sample_rows = coo.row
    slot_within_sample = intra_row_index(sample_rows)
    return tf.SparseTensorValue(
        indices=np.column_stack((sample_rows, slot_within_sample)),
        values=coo.col,
        dense_shape=coo.shape,
    )

# Example: encode the user features of the last few training interactions.
sparse_features_to_tensor(user_features[train_df.tail().user.values,:])

SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0],
       [4, 0]]), values=array([942, 942, 942, 942, 942], dtype=int32), dense_shape=(5, 943))

In [99]:
def sample_batch(positives_df, batch_size, positive_ratio=.33):
    """Sample a batch mixing observed positives with random negatives.

    NOTE: this definition is overridden by the second ``sample_batch`` defined
    right after it in the same cell, so it is never the one that gets called.

    Args:
        positives_df: DataFrame of observed interactions to draw positives from.
        batch_size: total number of rows in the returned batch.
        positive_ratio: fraction of the batch taken from ``positives_df``.

    Returns:
        DataFrame made of ``int(batch_size * positive_ratio)`` rows sampled
        from ``positives_df`` followed by random (user, item) pairs with
        rating 0. The random pairs are drawn with replacement from the
        module-level ``all_user_ids`` / ``all_item_ids`` and are not checked
        against the observed interactions.
    """
    n_positives = int(batch_size * positive_ratio)
    n_negatives = batch_size - n_positives

    random_users = np.random.choice(all_user_ids, replace=True, size=n_negatives)
    random_items = np.random.choice(all_item_ids, replace=True, size=n_negatives)
    negatives = pd.DataFrame.from_dict({
        'user': random_users,
        'item': random_items,
        'rating': np.full(n_negatives, 0),
    })

    sampled_positives = positives_df.sample(n_positives)
    return pd.concat([sampled_positives, negatives], axis=0)

# Variant used when the frame already holds both positive and negative examples.
def sample_batch(positives_and_negatives_df, batch_size):
    """Draw ``batch_size`` random rows and clip their ratings at zero.

    Args:
        positives_and_negatives_df: DataFrame with a ``rating`` column.
        batch_size: number of rows to sample (without replacement).

    Returns:
        A new DataFrame with the sampled rows, where every negative rating has
        been replaced by 0; other ratings are unchanged.
    """
    sampled = positives_and_negatives_df.sample(batch_size)
    clipped_ratings = np.maximum(sampled.rating, 0)
    return sampled.assign(rating=clipped_ratings)

# Quick check of the sampler on a tiny batch of training interactions.
test_samples = sample_batch(train_df, batch_size=5)

## TF model

In [111]:
# Problem sizes: interaction-matrix dimensions and the number of feature
# columns on the item and user side (these size the model variables).
N_USERS, N_ITEMS = train.shape
N_ITEM_FEATURES = item_features.shape[1]
N_USER_FEATURES = user_features.shape[1]

class Placeholders:
    """Graph inputs of the model and the logic to feed them from a DataFrame."""

    def __init__(self, batch_size=None):
        """Create the sparse feature placeholders and the ratings placeholder.

        Args:
            batch_size: length of the ``ratings`` vector; ``None`` leaves it
                unconstrained.
        """
        self.user_sparse_features = tf.sparse_placeholder(tf.int32, name='user_features')
        self.item_sparse_features = tf.sparse_placeholder(tf.int32, name='item_features')
        self.ratings = tf.placeholder(tf.float32, shape=[batch_size], name='ratings')

    def to_feed_dict(self, user_items_df, with_ratings=False):
        """Build the feed dict for the (user, item) pairs of ``user_items_df``.

        Args:
            user_items_df: DataFrame with ``user`` and ``item`` columns, plus a
                ``rating`` column when ``with_ratings`` is true.
            with_ratings: also feed the ``ratings`` placeholder.

        Returns:
            Dict mapping placeholders to values. Feature rows are looked up in
            the module-level ``item_features`` / ``user_features`` matrices.
        """
        item_rows = item_features[user_items_df.item.values, :]
        user_rows = user_features[user_items_df.user.values, :]

        feed = {
            self.item_sparse_features: sparse_features_to_tensor(item_rows),
            self.user_sparse_features: sparse_features_to_tensor(user_rows),
        }
        if with_ratings:
            feed[self.ratings] = user_items_df.rating.values
        return feed
    
    
class UserItemFeaturesModel:
    """Feature-based factorization model with per-feature biases.

    Every user feature and every item feature owns a scalar bias and a latent
    vector of length ``dimensionality``. A sample is represented by the sum of
    the rows selected by its sparse feature ids. The logit of a (user, item)
    pair is the mean of the element-wise product of the two summed latent
    vectors, plus the summed user bias and the summed item bias.

    Variable sizes come from the module-level ``N_USER_FEATURES`` and
    ``N_ITEM_FEATURES``.
    """

    def __init__(self, dimensionality=30):
        """Create the bias and factor variables and their histogram summaries.

        Args:
            dimensionality: length of the latent vector of each feature.
        """
        self.dimensionality = dimensionality

        with tf.name_scope('BU'):
            self.user_features_biases = tf.Variable(
                tf.random_normal(shape=[N_USER_FEATURES, 1], stddev=0.01, mean=0))
            tf.summary.histogram('user_features_biases', self.user_features_biases)

        with tf.name_scope('BI'):
            self.item_features_biases = tf.Variable(
                tf.random_normal(shape=[N_ITEM_FEATURES, 1], stddev=0.01, mean=0))
            tf.summary.histogram('item_features_biases', self.item_features_biases)

        with tf.name_scope('Q'):
            self.user_features_factors = tf.Variable(
                tf.random_normal([N_USER_FEATURES, self.dimensionality], stddev=0.01, mean=0))
            tf.summary.histogram('user_features_factors', self.user_features_factors)

        with tf.name_scope('P'):
            self.item_features_factors = tf.Variable(
                tf.random_normal([N_ITEM_FEATURES, self.dimensionality], stddev=0.01, mean=0))
            tf.summary.histogram('item_features_factors', self.item_features_factors)

    @staticmethod
    def _sum_feature_rows(params, sparse_features):
        """Sum, per batch sample, the rows of ``params`` selected by its feature ids."""
        return tf.nn.embedding_lookup_sparse(
            params, sp_ids=sparse_features, sp_weights=None, combiner='sum')

    def user_bias(self, user_sparse_features):
        """Return one summed user-feature bias per batch sample (rank-1 tensor)."""
        with tf.name_scope('B_user'):
            # Squeeze only the trailing size-1 axis: an axis-less squeeze would
            # also drop the batch axis when the batch holds a single sample.
            return tf.squeeze(
                self._sum_feature_rows(self.user_features_biases, user_sparse_features),
                axis=1)

    def item_bias(self, item_sparse_features):
        """Return one summed item-feature bias per batch sample (rank-1 tensor)."""
        with tf.name_scope('B_item'):
            return tf.squeeze(
                self._sum_feature_rows(self.item_features_biases, item_sparse_features),
                axis=1)

    def user_item_features_product(self, user_sparse_features, item_sparse_features):
        """Return the interaction term of each (user, item) pair in the batch.

        The summed user and item latent vectors are multiplied element-wise and
        averaged over the latent axis, giving one value per batch sample.
        """
        # The lookups are kept rank-2 ([batch, dimensionality]); squeezing them
        # would break the axis-1 reduction below for a batch of one sample or a
        # dimensionality of one.
        with tf.name_scope('Q_user'):
            batch_user_factors = self._sum_feature_rows(
                self.user_features_factors, user_sparse_features)

        with tf.name_scope('P_item'):
            batch_item_factors = self._sum_feature_rows(
                self.item_features_factors, item_sparse_features)

        with tf.name_scope('dot'):
            # tf.mul was removed in TensorFlow 1.0; tf.multiply is its replacement.
            # NOTE(review): this is a mean over the latent axis, i.e. a dot
            # product scaled by 1/dimensionality — confirm that the scaling is
            # intended rather than a plain sum.
            factors_prediction = tf.reduce_mean(
                tf.multiply(batch_user_factors, batch_item_factors), axis=1)
        return factors_prediction

    def predictions(self, user_sparse_features, item_sparse_features):
        """Return the logits: interaction term + user bias + item bias."""
        with tf.name_scope('inference'):
            return tf.add(
                self.user_item_features_product(user_sparse_features, item_sparse_features),
                tf.add(self.user_bias(user_sparse_features), self.item_bias(item_sparse_features), name='biases'),
                name='logits')

def compute_loss(predictions, targets):
    """Return the mean sigmoid cross-entropy of the batch.

    Args:
        predictions: tensor of logits (pre-sigmoid scores).
        targets: tensor of the same shape holding the 0/1 labels.

    Returns:
        Scalar tensor: the cross-entropy averaged over all elements.
    """
    with tf.name_scope('loss'):
        # Since TensorFlow 1.0 the label argument is named `labels`; the former
        # `targets` keyword is no longer accepted.
        return tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=predictions))

In [112]:
def all_predictions_to_hits(all_user_items, all_predicted_values, ground_truth_user_items):
    """Attach the predicted score and per-user rank to each ground-truth pair.

    Args:
        all_user_items: DataFrame with ``user`` and ``item`` columns listing
            every scored pair.
        all_predicted_values: predicted scores, aligned row by row with
            ``all_user_items``.
        ground_truth_user_items: DataFrame with ``user`` and ``item`` columns
            (plus any other columns) holding the pairs to look up.

    Returns:
        ``ground_truth_user_items`` left-joined with the ``predicted_rating``
        and the zero-based ``rank`` of each pair among its user's scored items
        (rank 0 is the highest score; tied scores all get the worst rank of
        their group).
    """
    scored = all_user_items.assign(predicted_rating=all_predicted_values)
    # rank() is one-based; method='max' gives tied scores the largest rank.
    one_based_rank = scored.groupby('user')['predicted_rating'].rank(
        ascending=False, method='max')
    scored = scored.assign(rank=one_based_rank.values - 1)

    return ground_truth_user_items.merge(scored, on=['user', 'item'], how='left')

In [113]:
def all_predicted_hits(predict_function, ground_truth_df, split_size=1000):
    """Score every ground-truth user against every ground-truth item, in chunks.

    Users are processed in groups of at most ``split_size`` so that the full
    user x item cross product never has to be scored in one call.

    Args:
        predict_function: callable taking a DataFrame with ``user`` and
            ``item`` columns and returning one predicted score per row.
        ground_truth_df: DataFrame with ``user`` and ``item`` columns. Only the
            users and items that appear in it are scored and ranked.
        split_size: maximum number of users per chunk (positive integer).

    Returns:
        DataFrame with the rows of ``ground_truth_df`` (grouped by user chunk)
        extended with the ``predicted_rating`` and ``rank`` columns produced by
        ``all_predictions_to_hits``.
    """
    user_ids = ground_truth_df.user.unique()
    item_ids = ground_truth_df.item.unique()

    # Ceiling division, with at least one section: a plain `len / split_size`
    # truncates to 0 whenever there are fewer users than `split_size`, which
    # makes np.array_split raise, and otherwise rounds down so that chunks
    # exceed `split_size`. Ranks are computed per user over all items, so the
    # number of chunks does not affect them.
    n_splits = max(1, int(-(-len(user_ids) // split_size)))
    user_ids_splits = np.array_split(user_ids, n_splits)

    hits_for_user_splits = []
    for user_ids_split in user_ids_splits:
        split_user_items = to_all_user_items(user_ids_split, item_ids)
        hits_for_user_splits.append(all_predictions_to_hits(
            split_user_items,
            all_predicted_values=predict_function(split_user_items),
            ground_truth_user_items=ground_truth_df[ground_truth_df.user.isin(user_ids_split)]))
    return pd.concat(hits_for_user_splits)

In [114]:
import os

# Training configuration.
LEARNING_RATE = 0.01  # learning rate passed to the Adam optimizer
N_ITER = 101  # number of training steps (one sampled batch per step)
BATCH_SIZE = 1024  # number of interactions sampled per batch
N_STEP_SUMMARY = 20  # evaluate on test and print metrics every this many steps
LOG_DIR = '/tmp/tfrecs_logs'  # where summaries and the final checkpoint are written

# Build the model in a fresh graph, train it for N_ITER steps and save it.
with tf.Graph().as_default():
    model = UserItemFeaturesModel(dimensionality=10)
    inputs = Placeholders()

    logits = model.predictions(inputs.user_sparse_features, inputs.item_sparse_features)
    loss = compute_loss(logits, inputs.ratings)

    tf.summary.scalar('train_loss', loss)
    summary = tf.summary.merge_all()
    # Created after merge_all() on purpose: the test loss summary must not be
    # part of the merged summary written at every training step.
    test_summary = tf.summary.scalar('test_loss', loss)

    train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

    def predict_logits(user_items):
        """Return the logits of the given (user, item) pairs using the default session."""
        return logits.eval(feed_dict=inputs.to_feed_dict(user_items))

    def perform_step(step, train, test, summary_writer):
        """Run one training step and, every N_STEP_SUMMARY steps, evaluate.

        Args:
            step: index of the current step, used as summary global step.
            train: DataFrame of training interactions to sample the batch from.
            test: DataFrame of test interactions used for the periodic evaluation.
            summary_writer: ``tf.summary.FileWriter`` receiving the summaries.

        Uses the enclosing ``sess`` opened below.
        """
        batch_samples = sample_batch(train, BATCH_SIZE)

        _, loss_value, summary_value = sess.run(
            fetches=[train_step, loss, summary],
            feed_dict=inputs.to_feed_dict(batch_samples, with_ratings=True))

        summary_writer.add_summary(summary_value, global_step=step)

        if step % N_STEP_SUMMARY == 0:

            test_samples = sample_batch(test, BATCH_SIZE)
            test_loss_value, test_summary_value = sess.run(
                fetches=[loss, test_summary],
                feed_dict=inputs.to_feed_dict(test_samples, with_ratings=True))
            summary_writer.add_summary(test_summary_value, global_step=step)

            # Rank, for every user of each set, the items that appear in that set.
            train_hits = all_predicted_hits(predict_logits, train, split_size=200)
            test_hits = all_predicted_hits(predict_logits, test, split_size=200)

            print('Step %d: batch/test log loss = %.3f/%.3f, train/test MRR = %.3f/%.3f' % (
                    step, loss_value, test_loss_value,
                    mean_reciprocal_rank(train_hits),
                    mean_reciprocal_rank(test_hits)
                ))

        summary_writer.flush()

    with tf.Session() as sess:

        # One timestamped sub-directory per run.
        summary_writer = tf.summary.FileWriter(LOG_DIR + '/{:%Y%m%d%H%M%S}'.format(dt.datetime.now()), sess.graph)

        print('Starting training')
        sess.run(tf.global_variables_initializer())

        for step in range(N_ITER):
            perform_step(step, train_df, test_df, summary_writer)

        # Final hits, kept at module level for the analysis cell that follows.
        train_hits = all_predicted_hits(predict_logits, train_df, split_size=100)
        test_hits = all_predicted_hits(predict_logits, test_df, split_size=100)

        # `step` still holds the index of the last training step here.
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"), step)

Starting training
Step 0: batch/test log loss = 0.693/0.691, train/test MRR = 0.282/0.063
Step 20: batch/test log loss = 0.654/0.659, train/test MRR = 0.496/0.131
Step 40: batch/test log loss = 0.637/0.637, train/test MRR = 0.448/0.110
Step 60: batch/test log loss = 0.598/0.616, train/test MRR = 0.429/0.108
Step 80: batch/test log loss = 0.569/0.608, train/test MRR = 0.426/0.111
Step 100: batch/test log loss = 0.575/0.593, train/test MRR = 0.431/0.114


In [116]:
# Shape of the final hit tables and the mean reciprocal rank on train and test.
print(train_hits.shape)
print(test_hits.shape)
print(mean_reciprocal_rank(train_hits))
print(mean_reciprocal_rank(test_hits))

(90570, 5)
(9430, 5)
0.431066543434
0.114040851421
