In [1]:
import pandas as pd
import numpy as np
from lightfm.datasets import fetch_movielens

data = fetch_movielens('movielens', indicator_features=False, genre_features=True)

print('original train')
print(np.unique(data['train'].data))
print(data['train'].__repr__())
print('original test')
print(np.unique(data['test'].data))
print(data['test'].__repr__())

# binarizing traing examples as in the original lightfm paper to use the logistic loss
data['train'].data = np.array([-1, 1])[1 * (data['train'].data >= 4)]
data['test'].data = np.array([-1, 1])[1 * (data['test'].data >= 4)]

# should keep only positive test interactions
data['test_positive_only'] = data['test'].copy()
data['test_positive_only'].data = 1 *(data['test_positive_only'].data>=1)
data['test_positive_only'].eliminate_zeros()

train = data['train']
test = data['test']
test_positives = data['test_positive_only']

print('train')
print(np.unique(data['train'].data))
print(data['train'].__repr__())
print('test')
print(np.unique(data['test'].data))
print(data['test'].__repr__())
print('test_positive_only')
print(np.unique(data['test_positive_only'].data))
print(data['test_positive_only'].__repr__())

item_features = data['item_features']
tag_labels = data['item_feature_labels']
print('There are %s distinct item features, with values like %s.' % (item_features.shape[1], tag_labels[:3].tolist()))



original train
[1 2 3 4 5]
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 90570 stored elements in COOrdinate format>
original test
[1 2 3 4 5]
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 9430 stored elements in COOrdinate format>
train
[-1  1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 90570 stored elements in COOrdinate format>
test
[-1  1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 9430 stored elements in COOrdinate format>
test_positive_only
[1]
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 5469 stored elements in COOrdinate format>
There are 19 distinct item features, with values like ['genre:unknown', 'genre:Action', 'genre:Adventure'].


In [2]:
train_df = pd.DataFrame.from_dict({
        'user': train.row,
        'item': train.col,
        'rating': train.data,
    })

test_df = pd.DataFrame.from_dict({
        'user': test.row,
        'item': test.col,
        'rating': test.data,
    })

print(train_df.shape)
train_df.head()

test_user_ids = test_df.user.unique()
all_user_ids = train_df.user.unique()
all_item_ids = np.unique(data['item_features'].tocoo().row)

def to_all_user_items(user_ids, item_ids):
    return pd.DataFrame.from_dict(
        {'user': np.repeat(user_ids, len(item_ids)),
         'item': np.tile(item_ids, len(user_ids))})

all_user_items = to_all_user_items(all_user_ids, all_item_ids)
print(all_user_items.shape)
print(test_user_ids.shape)
all_user_items.head()

(90570, 3)
(1586126, 2)
(943,)


Unnamed: 0,user,item
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4


## Tensforflow model

In [3]:
import tensorflow as tf
import datetime as dt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

print(tf.__version__)

  from ._conv import register_converters as _register_converters


1.10.0


## Prediction

In [4]:
def all_predictions_to_hits(all_user_items, all_predicted_values, ground_truth_user_items):
    predicted_ratings = all_user_items.assign(predicted_rating=lambda _: all_predicted_values)
    predicted_ranks = predicted_ratings.groupby('user')['predicted_rating'].rank(ascending=False, method='max')
    predicted_ratings['rank'] = predicted_ranks.values - 1

    ground_truth_hits = pd.merge(
        left=ground_truth_user_items,
        right=predicted_ratings,
        on=['user', 'item'], how='left')
    return ground_truth_hits

In [5]:
def all_predicted_hits(predict_function, ground_truth_df, split_size=1000):
    user_ids = ground_truth_df.user.unique()
    item_ids = ground_truth_df.item.unique()
    user_ids_splits = np.array_split(user_ids, len(user_ids) / split_size)
    user_items_splits = (to_all_user_items(user_ids_split, item_ids) for user_ids_split in user_ids_splits)
    hits_for_user_splits = [all_predictions_to_hits(
            split_user_items, 
            all_predicted_values=predict_function(split_user_items),
            ground_truth_user_items=ground_truth_df[ground_truth_df.user.isin(split_user_items.user.unique())])
        for split_user_items in user_items_splits]
    return pd.concat(hits_for_user_splits)

In [6]:
def mean_reciprocal_rank(predicted_ranks_df):
    return predicted_ranks_df\
        .assign(rec_rank=lambda df:1 / (df['rank'] + 1))\
        .groupby('user')['rec_rank'].max()\


In [7]:
def sample_batch(positives_df, batch_size, positive_ratio=.33):
    n_positives = int(batch_size * positive_ratio)
    n_negatives = batch_size - n_positives
    negatives = pd.DataFrame.from_dict({
        'user': np.random.choice(all_user_ids, replace=True, size=n_negatives),
        'item': np.random.choice(all_item_ids, replace=True, size=n_negatives),
        'rating': np.repeat(0, n_negatives)
        })
    return pd.concat([positives_df.sample(n_positives), negatives], axis=0)

# if train has both positives and negatives
def sample_batch(positives_and_negatives_df, batch_size):
    batch_df = positives_and_negatives_df.sample(batch_size)
    return batch_df.assign(rating = lambda df: np.maximum(df.rating, 0))

test_samples = sample_batch(train_df, batch_size=5)

## Multi-class classification

In [29]:
N_USERS, N_ITEMS = train.shape


class Placeholders:
    
    def __init__(self, batch_size=None):
        self.user_ids = tf.placeholder(tf.int32, shape=[batch_size], name='user_ids')
        self.item_ids = tf.placeholder(tf.int32, shape=[batch_size], name='item_ids')
        self.ratings = tf.placeholder(tf.float32, shape=[batch_size], name='ratings')

    def to_feed_dict(self, user_items_df, with_ratings=False):
        features_dict = {
            self.user_ids: user_items_df.user.values,
            self.item_ids: user_items_df.item.values
        }
        
        if with_ratings:
            features_dict[self.ratings] = user_items_df.rating.values

        return features_dict

    
class User2MultiClassItemsModel:
    def __init__(self, dimensionality):
        self.dimensionality = dimensionality
            
        with tf.name_scope('B'):
            self.item_biases =  tf.Variable(tf.random_normal(shape=[N_ITEMS], stddev=0.01, mean=0), name='item_biases')
            tf.summary.histogram('item_biases', self.item_biases)

        with tf.name_scope('Q'):
            self.user_factors = tf.Variable(tf.random_normal([N_USERS, self.dimensionality], stddev=0.01, mean=0), name='users')
            tf.summary.histogram('user_factors', self.user_factors)
            
        with tf.name_scope('P'):
            self.item_factors = tf.Variable(tf.random_normal([N_ITEMS, self.dimensionality], stddev=0.01, mean=0), name='users')
            tf.summary.histogram('item_factors', self.item_factors)
       
    def predictions(self, user_ids):
        with tf.name_scope('inference'):
            with tf.name_scope('Q_user'):
                batch_user_factors = tf.nn.embedding_lookup(self.user_factors, user_ids)
            with tf.name_scope('all_items_logits'):
                return tf.matmul(batch_user_factors, tf.transpose(self.item_factors)) + self.item_biases

            
def cross_entropy_loss(logits, target_item_ids):
    with tf.name_scope('cross_entropy_loss'):
        return tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits,
                labels=target_item_ids))
    

LEARNING_RATE = 0.01
N_ITER = 401
BATCH_SIZE = 1024
N_STEP_SUMMARY = 20
LOG_DIR = '/tmp/tfrecs_logs'

with tf.Graph().as_default():
    inputs = Placeholders()
        
    model = User2MultiClassItemsModel(10)
    logits = model.predictions(inputs.user_ids)
    loss = cross_entropy_loss(logits, inputs.item_ids)

    tf.summary.scalar('train_loss', loss)

    summary = tf.summary.merge_all()
    test_summary = tf.summary.scalar('test_loss', loss)
    
    train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)
    
    def perform_step(step, train, test, summary_writer):
        batch_samples = train_df.query("rating == 1").sample(BATCH_SIZE)

        _, loss_value, summary_value = sess.run(
            fetches=[train_step, loss, summary], 
            feed_dict=inputs.to_feed_dict(batch_samples))
        
        summary_writer.add_summary(summary_value, global_step=step)

        if step % N_STEP_SUMMARY == 0:
            test_samples = sample_batch(test_df, BATCH_SIZE)
            test_loss_value, test_summary_value = sess.run(
                fetches=[loss, test_summary],
                feed_dict=inputs.to_feed_dict(test_samples))
            summary_writer.add_summary(test_summary_value, global_step=step)

            # predicting on all users and items
            all_prediction_values = logits.eval(feed_dict={inputs.user_ids: all_user_ids}).ravel()
            print('Step %d: batch/test log loss = %.3f/%.3f, train/test MRR = %.3f/%.3f' % (
                    step, loss_value, test_loss_value, 
                    mean_reciprocal_rank(all_predictions_to_hits(
                        all_user_items, all_prediction_values, train_df.query("rating > 0"))).mean(),
                    mean_reciprocal_rank(all_predictions_to_hits(
                        all_user_items, all_prediction_values, test_df.query("rating > 0"))).mean()
                ))

        summary_writer.flush()

    with tf.Session() as sess:

        now = dt.datetime.now()
        summary_writer = tf.summary.FileWriter(LOG_DIR + '/movielens-multiclass/train/{:%Y%m%d%H%M%S}'.format(now), sess.graph)
        test_summary_writer = tf.summary.FileWriter(LOG_DIR + '/movielens-multiclass/test/{:%Y%m%d%H%M%S}'.format(now), sess.graph)

        print('Starting training')
        sess.run(tf.global_variables_initializer())
        sess.run(
            tf.variables_initializer(tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics"))
        )

        for step in range(N_ITER):
            perform_step(step, train_df, test_df, summary_writer)

        all_prediction_values = logits.eval(feed_dict={inputs.user_ids: all_user_ids}).ravel()
        train_hits = all_predictions_to_hits(
            all_user_items, all_prediction_values,
            train_df.query("rating > 0"))

        test_hits = all_predictions_to_hits(
            all_user_items, all_prediction_values,
            test_df.query("rating > 0"))

Starting training
Starting training
test
Step 0: batch/test log loss = 7.428/7.420, train/test MRR = 0.139/0.048
test
Step 20: batch/test log loss = 7.154/7.196, train/test MRR = 0.476/0.127
test
Step 40: batch/test log loss = 6.630/6.834, train/test MRR = 0.508/0.146
test
Step 60: batch/test log loss = 6.443/6.720, train/test MRR = 0.560/0.164
test
Step 80: batch/test log loss = 6.418/6.694, train/test MRR = 0.579/0.169
test
Step 100: batch/test log loss = 6.450/6.691, train/test MRR = 0.589/0.179
test
Step 120: batch/test log loss = 6.464/6.690, train/test MRR = 0.592/0.182
test
Step 140: batch/test log loss = 6.403/6.620, train/test MRR = 0.614/0.196
test
Step 160: batch/test log loss = 6.344/6.627, train/test MRR = 0.638/0.201
test
Step 180: batch/test log loss = 6.306/6.511, train/test MRR = 0.655/0.209
test
Step 200: batch/test log loss = 6.235/6.454, train/test MRR = 0.678/0.214
test
Step 220: batch/test log loss = 6.238/6.474, train/test MRR = 0.710/0.214
test
Step 240: batch/t