In [1]:
#! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip --directory-prefix /Users/gui/Data/
#! unzip /Users/gui/Data/ml-1m.zip -d /Users/gui/Data/

--2017-02-11 16:13:22--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org... 128.101.34.146
Connecting to files.grouplens.org|128.101.34.146|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: '/Users/gui/Data/ml-1m.zip.1'


2017-02-11 16:13:26 (1.51 MB/s) - '/Users/gui/Data/ml-1m.zip.1' saved [5917549/5917549]

Archive:  /Users/gui/Data/ml-1m.zip
replace /Users/gui/Data/ml-1m/movies.dat? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

$$ \mathbf{Y}^\top = \mathbf{R} $$

In [2]:
ratings = (pd.read_csv('/Users/gui/Data/ml-1m/ratings.dat', engine='python', sep='::', names=['user', 'item', 'rating', 'timestamp'])
    .assign(timestamp=lambda df:pd.to_datetime(df.timestamp * 1000000000)))

movies = (pd.read_csv('/Users/gui/Data/ml-1m/movies.dat', engine='python', sep='::', names=['item', 'title', 'genres'])
          .assign(genres=lambda df:df.genres.str.split('|').values)
          .set_index('item', drop=False))

# See http://files.grouplens.org/datasets/movielens/ml-1m-README.txt for more details
users = pd.read_csv('/Users/gui/Data/ml-1m/users.dat', engine='python', sep='::', 
                    names=['user', 'gender', 'age', 'occupation', 'zipcode'])\
    .set_index('user', drop=False)

N_USERS, N_ITEMS = users.user.max() + 1, movies.item.max() + 1

In [3]:
test_df = ratings.sample(n=100000, random_state=0).sort_index()
test_sample_ids = test_df.index.values
train_ratings_mask = ~ratings.index.isin(test_sample_ids)
train_df = ratings.loc[train_ratings_mask]
train_sample_ids = train_df.index.values

print(train_df.shape)
print(test_df.shape)

test_user_ids = test_df.user.unique()
all_user_ids = train_df.user.unique()
all_item_ids = np.arange(N_ITEMS)

def to_all_user_items(user_ids, item_ids):
    return pd.DataFrame.from_dict(
        {'user': np.repeat(user_ids, len(item_ids)),
         'item': np.tile(item_ids, len(user_ids))})

all_user_items = to_all_user_items(all_user_ids, all_item_ids)
print(all_user_items.shape)
print(test_user_ids.shape)
all_user_items.head()

(900209, 4)
(100000, 4)
(23876120, 2)
(5941,)


Unnamed: 0,item,user
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


## User features

In [75]:
from itertools import chain
from scipy import sparse


def columns_to_key_feature_pairs(row, key_column, feature_columns):
    return [(row[key_column], '{}={}'.format(column, row[column])) for column in feature_columns]

feature_columns=['gender', 'age']#'user', 'zipcode']

user_features_df = pd.DataFrame.from_records(
    data=chain.from_iterable(
        columns_to_key_feature_pairs(row, key_column='user', feature_columns=feature_columns)
        for _, row in users.iterrows()),
    columns=['user', 'feature'])\
    .assign(feature=lambda df: df.feature.astype('category'))\
    .assign(encoded_feature=lambda df: df.feature.values.codes)

user_features = sparse.csr_matrix((np.ones_like(user_features_df.user), (user_features_df.user, user_features_df.encoded_feature)))
print('user features = %s' % user_features.__repr__())    

N_USER_FEATURES = user_features.shape[1]
user_features_df.head(5)

user features = <6041x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12080 stored elements in Compressed Sparse Row format>


Unnamed: 0,user,feature,encoded_feature
0,1,gender=F,7
1,1,age=1,0
2,2,gender=M,8
3,2,age=56,6
4,3,gender=M,8


In [101]:
user_ids = [1, 2, 6038]

pd.DataFrame(
    index=pd.Index(user_ids, name='user'),
    data=user_features[user_ids].toarray(),
    columns=user_features_df.feature.values.categories)

Unnamed: 0_level_0,age=1,age=18,age=25,age=35,age=45,age=50,age=56,gender=F,gender=M
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,1,0,1
6038,0,0,0,0,0,0,1,1,0


## User history

In [76]:
positive_views = train_df.query('rating > 3')
user_history_features = sparse.csr_matrix(
    (np.ones_like(positive_views.user), 
    (positive_views.user, positive_views.item)),
    shape=(N_USERS, N_ITEMS))

# To make sure every user has a feature
user_default_feature = sparse.coo_matrix(np.ones(N_USERS).reshape(-1, 1))
user_history_features = sparse.hstack([user_history_features, user_default_feature]).tocsr()

N_USER_HISTORY_FEATURES = user_history_features.shape[1]

print('user history features = %s' % user_history_features.__repr__())    

positive_views.head()

user history features = <6041x3954 sparse matrix of type '<class 'numpy.float64'>'
	with 523991 stored elements in Compressed Sparse Row format>


Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,2000-12-31 22:12:40
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11
6,1,1287,5,2000-12-31 22:33:59
7,1,2804,5,2000-12-31 22:11:59


In [104]:
user_ids = [1, 2, 6038]

pd.DataFrame(
    index=pd.Index(user_ids, name='user'),
    data=np.asarray(user_history_features[user_ids].toarray(), dtype=np.int64),
    columns=pd.Index(np.arange(user_history_features.shape[1]), name='item'))

item,0,1,2,3,4,5,6,7,8,9,...,3944,3945,3946,3947,3948,3949,3950,3951,3952,3953
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [108]:
train_df.set_index('user').loc[[1, 2, 6038]].reset_index().groupby('user').first()

Unnamed: 0_level_0,item,rating,timestamp
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1193,5,2000-12-31 22:12:40
2,1357,5,2000-12-31 21:38:29
6038,1419,4,2000-04-26 02:06:55


In [77]:
positive_views.groupby('user').size().describe()

count    6038.000000
mean       85.781716
std        94.441821
min         1.000000
25%        25.000000
50%        52.000000
75%       111.000000
max      1292.000000
dtype: float64

## Tensforflow model

In [78]:
import tensorflow as tf
import datetime as dt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

In [79]:
def intra_row_index(row_indexes):
    count_by_row = np.bincount(row_indexes)
    shift_by_row = np.concatenate([[0], np.cumsum(count_by_row)])
    return np.arange(len(row_indexes)) - shift_by_row[row_indexes]

# from https://github.com/tensorflow/tensorflow/issues/342#issuecomment-160354041
# not very sparse, but rather a kind of jagged array where every batch sample can have 1, N_FEATURES features
def sparse_features_to_tensor(batch_sparse_features):
    batch_features_as_coo = batch_sparse_features.tocoo()
    batch_features_sparse_tensor = tf.SparseTensorValue(
        indices=np.vstack([batch_features_as_coo.row, intra_row_index(batch_features_as_coo.row)]).T,
        values=batch_features_as_coo.col,
        shape=batch_features_as_coo.shape
    )
    return batch_features_sparse_tensor

sparse_features_to_tensor(user_history_features[21,:])

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 3],
       [0, 4],
       [0, 5]]), values=array([1210, 2700, 2800, 3000, 3704, 3953], dtype=int32), shape=(1, 3954))

## Prediction

In [80]:
def all_predictions_to_hits(all_user_items, all_predicted_values, ground_truth_user_items):
    predicted_ratings = all_user_items.assign(predicted_rating=lambda _: all_predicted_values)
    predicted_ranks = predicted_ratings.groupby('user')['predicted_rating'].rank(ascending=False, method='max')
    predicted_ratings['rank'] = predicted_ranks.values - 1

    ground_truth_hits = pd.merge(
        left=ground_truth_user_items,
        right=predicted_ratings,
        on=['user', 'item'], how='left')
    return ground_truth_hits

In [81]:
def mean_reciprocal_rank(predicted_ranks_df):
    return predicted_ranks_df\
        .assign(rec_rank=lambda df:1 / (df['rank'] + 1))\
        .groupby('user')['rec_rank'].max()

In [90]:

class Placeholders:
    
    def __init__(self, batch_size=None):
        self.user_sparse_features = tf.sparse_placeholder(tf.int32, name='user_features')
        self.user_sparse_history = tf.sparse_placeholder(tf.int32, name='user_history')
        self.item_ids = tf.placeholder(tf.int32, shape=[batch_size], name='item_ids')

    def to_feed_dict(self, user_items_df, with_output_item_ids=True):
        features_dict = {
            self.user_sparse_history: sparse_features_to_tensor(user_history_features[user_items_df.user.values,:]),
            self.user_sparse_features: sparse_features_to_tensor(user_features[user_items_df.user.values,:])
        }
        
        if with_output_item_ids:
            features_dict[self.item_ids] = user_items_df.item.values

        return features_dict


class UserFeatures2MultiClassItemsModel:
    def __init__(self, history_dim, demo_dim, items_dim):
        self.history_dim = history_dim
        self.demo_dim = demo_dim
        self.items_dim = items_dim
        
        with tf.name_scope('BI'):
            self.item_biases =  tf.Variable(tf.random_normal(shape=[N_ITEMS], stddev=0.01, mean=0), name='item_biases')
            tf.summary.histogram('item_biases', self.item_biases)

        with tf.name_scope('Q'):
            self.user_features_factors = tf.Variable(tf.random_normal([N_USER_FEATURES, self.demo_dim], stddev=0.01, mean=0), 
                                                     name='user_features_factors')
            tf.summary.histogram('user_features_factors', self.user_features_factors)

        with tf.name_scope('H'):
            self.user_history_factors = tf.Variable(tf.random_normal([N_USER_HISTORY_FEATURES, self.history_dim], stddev=0.01, mean=0),
                                                     name='user_history_factors')
            tf.summary.histogram('user_history_factors', self.user_history_factors)
       
        with tf.name_scope('P'):
            self.item_factors = tf.Variable(tf.random_normal([N_ITEMS, self.items_dim], stddev=0.01, mean=0), name='item_factors')
            tf.summary.histogram('item_factors', self.item_factors)
            
    def predictions(self, user_sparse_features, user_sparse_history):
        with tf.name_scope('inference'):
            with tf.name_scope('Q_user'):
                batch_user_features_factors = tf.squeeze(tf.nn.embedding_lookup_sparse(
                            self.user_features_factors,
                            sp_ids=user_sparse_features, sp_weights=None, combiner='sum'))
            with tf.name_scope('H_user'):
                batch_user_history_factors = tf.squeeze(tf.nn.embedding_lookup_sparse(
                            self.user_history_factors,
                            sp_ids=user_sparse_history, sp_weights=None, combiner='sum'))
            
            
            with tf.name_scope('QH_user'):
                #batch_user_factors = tf.add(batch_user_features_factors, batch_user_history_factors, name='added_factors')
                batch_concat_user_factors = tf.concat(1, (batch_user_features_factors, batch_user_history_factors),
                                               name='concatenated_factors')
                
            with tf.name_scope('ReLU_user'):
                fully_connected_weights = tf.Variable(tf.random_normal([self.history_dim + self.demo_dim, self.items_dim]))
                fully_connected_biases = tf.Variable(tf.random_normal([self.items_dim]))
                batch_user_factors = tf.nn.relu(
                    tf.add(tf.matmul(batch_concat_user_factors, fully_connected_weights), fully_connected_biases))

            with tf.name_scope('Logits_user_items'):
                return tf.matmul(batch_user_factors, tf.transpose(self.item_factors)) + self.item_biases

def cross_entropy_loss(logits, target_item_ids):
    with tf.name_scope('cross_entropy_loss'):
        return tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits,
                labels=target_item_ids))
    

LEARNING_RATE = 0.01
N_ITER = 201
BATCH_SIZE = 1024
N_STEP_SUMMARY = 20

LOG_DIR = '/tmp/tfrecs_logs'


with tf.Graph().as_default():
    inputs = Placeholders()
        
    model = UserFeatures2MultiClassItemsModel(history_dim=20, demo_dim=20, items_dim=20)
    logits = model.predictions(inputs.user_sparse_features, inputs.user_sparse_history)
    loss = cross_entropy_loss(logits, inputs.item_ids)
        
    tf.summary.scalar('train_loss', loss)
    summary = tf.summary.merge_all()
    test_summary = tf.summary.scalar('test_loss', loss)

    train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

    def perform_step(step, train, test, summary_writer):
        # positive only sampling
        batch_samples = train_df.query("rating > 3").sample(BATCH_SIZE)

        _, loss_value, summary_value = sess.run(
            fetches=[train_step, loss, summary], 
            feed_dict=inputs.to_feed_dict(batch_samples))
        
        summary_writer.add_summary(summary_value, global_step=step)

        if step % N_STEP_SUMMARY == 0:
            test_samples = train_df.query("rating > 3").sample(BATCH_SIZE)
            test_loss_value, test_summary_value = sess.run(
                fetches=[loss, test_summary],
                feed_dict=inputs.to_feed_dict(test_samples))
            summary_writer.add_summary(test_summary_value, global_step=step)

            # predicting on all users
            all_prediction_values = logits.eval(inputs.to_feed_dict(pd.DataFrame.from_dict({'user':all_user_ids}),
                                                                    with_output_item_ids=False)).ravel()
            
            print('Step %d: batch/test log loss = %.3f/%.3f, train/test MRR = %.3f/%.3f' % (
                    step, loss_value, test_loss_value, 
                    mean_reciprocal_rank(all_predictions_to_hits(
                        all_user_items, all_prediction_values, train_df.query("rating > 3"))).mean(),
                    mean_reciprocal_rank(all_predictions_to_hits(
                        all_user_items, all_prediction_values, test_df.query("rating > 3"))).mean()
                ))

        summary_writer.flush()

    with tf.Session() as sess:

        summary_writer = tf.summary.FileWriter(LOG_DIR + '/{:%Y%m%d%H%M%S}'.format(dt.datetime.now()), sess.graph)

        print('Starting training')
        sess.run(tf.global_variables_initializer())
        
        for step in range(N_ITER):
            perform_step(step, train_df, test_df, summary_writer)
        
        # would like to have the binding tensor - ids somewhere else
        item_factors_df = pd.DataFrame(index=np.arange(N_ITEMS), data=model.item_factors.eval())
        user_features_factors_df = pd.DataFrame(index=np.arange(N_USER_FEATURES), data=model.user_features_factors.eval())

Starting training
Step 0: batch/test log loss = 8.284/8.173, train/test MRR = 0.225/0.047
Step 20: batch/test log loss = 7.331/7.311, train/test MRR = 0.601/0.131
Step 40: batch/test log loss = 7.195/7.175, train/test MRR = 0.647/0.144
Step 60: batch/test log loss = 7.149/7.036, train/test MRR = 0.664/0.140
Step 80: batch/test log loss = 7.128/7.054, train/test MRR = 0.678/0.150
Step 100: batch/test log loss = 7.027/7.049, train/test MRR = 0.691/0.146
Step 120: batch/test log loss = 7.036/7.003, train/test MRR = 0.688/0.145
Step 140: batch/test log loss = 7.032/6.993, train/test MRR = 0.694/0.142
Step 160: batch/test log loss = 7.048/6.967, train/test MRR = 0.683/0.149
Step 180: batch/test log loss = 6.977/7.097, train/test MRR = 0.673/0.150
Step 200: batch/test log loss = 6.957/6.938, train/test MRR = 0.704/0.147


## Exporting embeddings for later visualisation

In [110]:
import os
import json

individual_user_features = pd.DataFrame(user_features_df.feature.values.categories, columns=['feature'])\
    .assign(title=lambda df:df.feature)
subset_individual_user_features = individual_user_features[~individual_user_features.feature.str.startswith('user=')]

subset_item_features = movies.assign(feature=lambda df: 'genre=' + df.genres.str[0])[['title', 'feature']]


# Cannot export embeddings for non tf.Variable tensors, so doing it manually
concatenated_metadata = pd.concat([
    subset_individual_user_features.assign(feature_type=lambda _: 'user_based'),
    subset_item_features.assign(feature_type=lambda _: 'item_based')],
    ignore_index=True)

concatenated_factors = pd.concat([
    user_features_factors_df.loc[subset_individual_user_features.index], 
    item_factors_df.loc[subset_item_features.index]],
    ignore_index=True)


class ProjectorConfig:
    root_url = 'https://raw.githubusercontent.com/pilipolio/tfrecs/master/embeddings/'
    local_root = '../embeddings'

    @classmethod
    def save_projector_config(cls, name, metadata_df, tensor_df):
        print(metadata_df.shape)
        print(tensor_df.shape)
        
        metadata_filename = name + '_metadata.tsv'
        metadata_df.to_csv(os.path.join(cls.local_root, metadata_filename), sep='\t', index=None)

        tensor_filename = name + '.tsv'
        tensor_df.to_csv(os.path.join(cls.local_root, tensor_filename), sep='\t', index=None, header=None)
        
        projector_config_dict = {
            'embeddings': [
            {
                "metadataPath": os.path.join(cls.root_url, metadata_filename),
                'tensorName': name,
                'tensorShape': tensor_df.shape,
                'tensorPath': os.path.join(cls.root_url, tensor_filename)
            }]
        }
        
        with open(os.path.join(cls.local_root, name + '_projector_config.json'), 'w') as f:
            json.dump(projector_config_dict, f)
    
ProjectorConfig.save_projector_config('ml1m_user_items', concatenated_metadata, concatenated_factors)


(3892, 3)
(3892, 10)


In [97]:
import collections

pd.DataFrame.from_records(list(collections.Counter(chain.from_iterable(pd.DataFrame(movies.genres.tolist()).values)).items()),
                         columns=['genre', 'count']).sort_values(by='count', ascending=False)

Unnamed: 0,genre,count
3,,16890
7,Drama,1603
2,Comedy,1200
8,Action,503
10,Thriller,492
6,Romance,471
11,Horror,343
4,Adventure,283
12,Sci-Fi,276
1,Children's,251


In [105]:
subset_individual_user_features

Unnamed: 0,feature
0,age=1
1,age=18
2,age=25
3,age=35
4,age=45
5,age=50
6,age=56
7,gender=F
8,gender=M
