# Loading data

In [None]:
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
! unzip ml-1m.zip -d .

In [1]:
import pandas as pd
import numpy as np
import tensorflow
import tflearn

In [2]:
ratings = (pd.read_csv('./ml-1m/ratings.dat', engine='python', sep='::', names=['user', 'item', 'rating', 'timestamp'])
    .assign(timestamp=lambda df:pd.to_datetime(df.timestamp * 1000000000))
          )

movies = (pd.read_csv('./ml-1m/movies.dat', engine='python', sep='::', names=['item', 'title', 'genres'])
          .assign(genres=lambda df:df.genres.str.split('|').values)
          .set_index('item', drop=False))

# See http://files.grouplens.org/datasets/movielens/ml-1m-README.txt for more details
users = pd.read_csv('./ml-1m/users.dat', engine='python', sep='::', 
                    names=['user', 'gender', 'age', 'occupation', 'zipcode'])\
    .set_index('user', drop=False)

In [3]:
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


## Train/test split

 * Ideally time based split
 * For the sake of simplicity, let's just sample ratings uniformly (breaking the time machine rule)

In [4]:
test = ratings.sample(n=100000, random_state=0)
train_ratings_mask = ~ratings.index.isin(test.index)
train = ratings.loc[train_ratings_mask]

test_user_items = test[['user', 'item']]

print(train.shape)
print(test.shape)

test.head()

(900209, 4)
(100000, 4)


Unnamed: 0,user,item,rating,timestamp
324271,1922,2094,4,2000-11-20 04:34:27
818637,4918,2808,1,2000-07-08 19:29:05
148677,957,1660,4,2000-11-25 05:28:13
778790,4653,914,5,2000-11-29 21:22:43
525489,3245,3324,1,2000-09-07 06:33:31


## Large scale LR model

Using TFlearn high-level wrapper
https://www.tensorflow.org/versions/r0.9/tutorials/linear/overview.html

See also  https://github.com/tflearn/tflearn/blob/master/examples/others/recommender_wide_and_deep.py
and https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/tutorials/wide_and_deep/index.md


In [5]:
ratings_and_features = (pd.merge(
    left=pd.merge(
        left=ratings[['user', 'item', 'rating']],
        right=users, on='user', how='left'),
    right=movies.assign(genre=lambda df:df.genres.str[0]), on='item', how='left')
    .sort_values('user')
)

train_ratings_and_features = ratings_and_features[ratings_and_features.index.isin(train.index)]
test_ratings_and_features = ratings_and_features[ratings_and_features.index.isin(test.index)]
print(train_ratings_and_features.shape)
print(ratings_and_features.shape)
ratings_and_features.head(10)

(900209, 10)
(1000209, 10)


Unnamed: 0,user,item,rating,gender,age,occupation,zipcode,title,genres,genre
0,1,1193,5,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),[Drama],Drama
29,1,745,3,F,1,10,48067,"Close Shave, A (1995)","[Animation, Comedy, Thriller]",Animation
30,1,2294,4,F,1,10,48067,Antz (1998),"[Animation, Children's]",Animation
31,1,3186,4,F,1,10,48067,"Girl, Interrupted (1999)",[Drama],Drama
32,1,1566,4,F,1,10,48067,Hercules (1997),"[Adventure, Animation, Children's, Comedy, Mus...",Adventure
33,1,588,4,F,1,10,48067,Aladdin (1992),"[Animation, Children's, Comedy, Musical]",Animation
34,1,1907,4,F,1,10,48067,Mulan (1998),"[Animation, Children's]",Animation
35,1,783,4,F,1,10,48067,"Hunchback of Notre Dame, The (1996)","[Animation, Children's, Musical]",Animation
36,1,1836,5,F,1,10,48067,"Last Days of Disco, The (1998)",[Drama],Drama
37,1,1022,5,F,1,10,48067,Cinderella (1950),"[Animation, Children's, Musical]",Animation


In [9]:
import tensorflow as tf
from tensorflow.contrib import learn

gender = tf.contrib.layers.sparse_column_with_hash_bucket(column_name='gender', hash_bucket_size=int(1e4))
genre = tf.contrib.layers.sparse_column_with_hash_bucket(column_name='genre', hash_bucket_size=int(1e4))

gender_genre = tf.contrib.layers.crossed_column(
    [gender, genre], hash_bucket_size=int(1e4))

def to_sparse_tensor(ratings_and_features, feature_colum):
    n_samples = ratings_and_features.shape[0]
    
    return tf.SparseTensor(
        indices=np.vstack([np.arange(n_samples), np.zeros(n_samples)]).T,
        values=ratings_and_features[feature_colum].values,
        shape=[n_samples, 1]
    )

def input_fn(ratings_and_features, categorical_cols):
    column_to_tensors = {c: to_sparse_tensor(ratings_and_features, c) for c in categorical_cols}
    ratings_as_target = tf.constant(ratings_and_features.rating.values)
    return (column_to_tensors, ratings_as_target)

## Using high-level TF.Learn

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/tutorials/wide/index.md

In [40]:
from functools import partial

train_fn = partial(input_fn, ratings_and_features=train_ratings_and_features, categorical_cols=['gender', 'genre'])
test_fn = partial(input_fn, ratings_and_features=test_ratings_and_features, categorical_cols=['gender', 'genre'])

lr = learn.LinearRegressor(
    feature_columns=[gender, genre, gender_genre])

lr.fit(input_fn=train_fn, steps=20)

# Evaluate for one step (one pass through the test data).
results = lr.evaluate(input_fn=test_fn, steps=1)

# Print the stats for the evaluation.
for key in sorted(results):
    print("{}: {}".format(key, results[key]))

  result_shape.insert(dim, 1)


global_step: 20
loss: 1.2786636352539062


## Using low level helpers with pure TF

https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/contrib/layers/python/layers/feature_column_ops.py#L289

In [11]:
from tensorflow.contrib.layers import feature_column_ops

categorical_cols = ['gender', 'genre']

weighted_features, _, __ = feature_column_ops.weighted_sum_from_feature_columns(
    columns_to_tensors = {c: to_sparse_tensor(ratings_and_features, c) for c in categorical_cols},
    feature_columns=[gender, genre],
    num_outputs=1)

weighted_features

<tf.Tensor 'weighted_sum_from_feature_columns/BiasAdd:0' shape=(?, 1) dtype=float32>

In [38]:
import tensorflow as tf
from tensorflow.contrib.layers import feature_column_ops
import datetime as dt

BATCH_SIZE = 512 * 10
N_ITER = 100
LOG_DIR = '/tmp/tflearn_logs'

def compute_loss(predictions, targets):
    with tf.name_scope('loss') as s:
        loss = tf.sqrt(tf.reduce_mean(tf.square(tf.sub(predictions, targets))), name='rmse')
    return loss

def training(loss, learning_rate=0.02):
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return train_step

import itertools

def to_feature_columns(column_names, cross=False):
    feature_columns = [
        tf.contrib.layers.sparse_column_with_hash_bucket(
            column_name, hash_bucket_size=int(1e2))
        for column_name in column_names]
    crossed_columns = [tf.contrib.layers.crossed_column(columns=[first, second], hash_bucket_size=int(1e3))
     for first, second in itertools.combinations(feature_columns, 2)]
    return crossed_columns + feature_columns 

def names_to_values_holders(column_names, batch_size=None):
    return {
        column_name: tf.placeholder(tf.string, shape=[batch_size], name='sample_' + column_name)
        for column_name in column_names}

def holders_to_named_tensors(named_values_holders, indices_holder, shape_holder):
    return {
        column_name: tf.SparseTensor(
            indices=indices_holder,
            values=values_holder,
            shape=shape_holder)
        for column_name, values_holder
        in named_values_holders.items()}

def holders_to_feed_values(ratings_and_features, named_values_holders):
    return {
        values_holder:ratings_and_features[column_name].values
        for column_name, values_holder
        in named_values_holders.items()}
                             
categorical_cols = ['gender', 'genre']

with tf.Graph().as_default():
    ratings_placeholder = ratings_placeholder = tf.placeholder(tf.float32, shape=[None], name='ratings')
    indices_holder = tf.placeholder(tf.int64, shape=[None, 2], name='sample_indices')
    shape_holder = tf.placeholder(tf.int64, name='samples_shape')
    
    names_to_values_holders = names_to_values_holders(categorical_cols)
    names_to_tensors = holders_to_named_tensors(names_to_values_holders, indices_holder, shape_holder)
    
    with tf.name_scope('weighted_features') as s:
        # see https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/contrib/layers/python/layers/feature_column_ops.py#L289
        weighted_sum_features, _, __ = feature_column_ops.weighted_sum_from_feature_columns(
            columns_to_tensors=names_to_tensors,
            feature_columns=to_feature_columns(categorical_cols),
            num_outputs=1, scope=s)

    loss = compute_loss(weighted_sum_features, ratings_placeholder)
    train_step = training(loss)
    
    tf.scalar_summary('batch_rmse', loss)
    summary = tf.merge_all_summaries()
    test_summary = tf.scalar_summary('test_rmse', loss)
    
    with tf.Session() as sess:
        summary_writer = tf.train.SummaryWriter(LOG_DIR + '/{:%Y%m%d%H%M%S}'.format(dt.datetime.now()), sess.graph)
        
        holders_to_batch_feed_values = {
            indices_holder: np.vstack([np.arange(BATCH_SIZE), np.zeros(BATCH_SIZE)]).T,
            shape_holder: [BATCH_SIZE, 1]
        }

        n_test_samples = 1000
        holders_to_test_feed_values = {
            indices_holder: np.vstack([np.arange(n_test_samples), np.zeros(n_test_samples)]).T,
            shape_holder: [n_test_samples, 1]
        }

        sess.run(tf.initialize_all_variables())
        for step in range(N_ITER):
            rs = train_ratings_and_features.sample(BATCH_SIZE)
            holders_to_batch_feed_values.update({ratings_placeholder: rs.rating.values})
            holders_to_batch_feed_values.update(holders_to_feed_values(rs, names_to_values_holders))
            _, loss_value, summary_value = sess.run(
                fetches=[train_step, loss, summary],
                feed_dict=holders_to_batch_feed_values)
            summary_writer.add_summary(summary_value, global_step=step)
            
            if step % 10 == 0:
                test_ratings = test_ratings_and_features.sample(n_test_samples)
                holders_to_test_feed_values.update({
                        ratings_placeholder: test_ratings.rating.values})
                holders_to_test_feed_values.update(holders_to_feed_values(test_ratings, names_to_values_holders))
                
                test_loss_value, test_summary_value = sess.run(
                    fetches=[loss, test_summary],
                    feed_dict=holders_to_test_feed_values)
                print('Step %d: batch/test loss = %.2f/%.2f' % (step, loss_value, test_loss_value))
                summary_writer.add_summary(test_summary_value, global_step=step)

        summary_writer.flush()

Step 0: batch/test loss = 3.68/3.66
Step 10: batch/test loss = 2.96/2.94
Step 20: batch/test loss = 2.30/2.21
Step 30: batch/test loss = 1.64/1.61
Step 40: batch/test loss = 1.22/1.24
Step 50: batch/test loss = 1.11/1.11
Step 60: batch/test loss = 1.12/1.17
Step 70: batch/test loss = 1.10/1.16
Step 80: batch/test loss = 1.12/1.11
Step 90: batch/test loss = 1.09/1.13
