In [1]:
!pip install jsonlines

Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/jsonlines-1.2.0-py2.py3-none-any.whl
Installing collected packages: jsonlines
Successfully installed jsonlines-1.2.0
[33mYou are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [22]:
import os
import sys
import csv, jsonlines
import numpy as np
import copy
import random
# Importing tensorflow
import tensorflow as tf
# Importing some more libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn import preprocessing

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

In [16]:
# Reading ratings file
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', 
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])
max_userid = ratings['user_id'].drop_duplicates().max()
max_movieid = ratings['movie_id'].drop_duplicates().max()

# Reading ratings file
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1', 
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading ratings file
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', 
                     usecols=['movie_id', 'title', 'genres'])

In [17]:
# You need an interaction matrix of ratings where column and row align to user_id and movie_id
ratings = ratings.drop('user_emb_id', axis=1).drop('movie_emb_id', axis=1)

In [20]:
num_movies = ratings.movie_id.nunique()
num_users = ratings.user_id.nunique()
print("USERS: {} MOVIES: {}".format(num_users, num_movies))

USERS: 6040 MOVIES: 3706


In [28]:
import collections
import csv
import os

Rating = collections.namedtuple('Rating', ['user_id', 'item_id', 'rating'])

class Dataset(collections.namedtuple('Dataset', ['users', 'items', 'ratings'])):

    #users: set[str]
    #items: set[str]
    #ratings: list[Rating]

    __slots__ = ()

    def __str__(self):
        out = 'Users: {:,d}\n'.format(self.n_users)
        out += 'Items: {:,d}\n'.format(self.n_items)
        out += 'Ratings: {:,d}\n'.format(self.n_ratings)
        return out
    
    @property
    def n_users(self):
        return len(self.users)
    
    @property
    def n_items(self):
        return len(self.items)
    
    @property
    def n_ratings(self):
        return len(self.ratings)
    
    def user_ratings(self, user_id):
        return list(r for r in self.ratings if r.user_id == user_id)

    def item_ratings(self, item_id):
        return list(r for r in self.ratings if r.item_id == item_id)

    def filter_ratings(self, users, items):
        return list(((r.user_id, r.item_id), r.rating)
                    for r in self.ratings
                    if r.user_id in users
                    and r.item_id in items)


def new_dataset(ratings):
    users = set(r.user_id for r in ratings)
    items = set(r.item_id for r in ratings)
    return Dataset(users, items, ratings)


small_dataset = new_dataset([Rating(x['user_id'], x['movie_id'], x['rating']) for i, x in ratings.iterrows()])

print('Dataset\n\n{}'.format(small_dataset))

Dataset

Users: 6,040
Items: 3,706
Ratings: 1,000,209



In [31]:
from random import shuffle

def split_randomly(dataset, train_ratio=0.80):
    ratings = dataset.ratings
    shuffle(ratings)
    size = int(len(ratings) * train_ratio)
    train_ratings = ratings[:size]
    test_ratings = ratings[size:]
    return new_dataset(train_ratings), \
            new_dataset(test_ratings)

train_valid_data, test_data = split_randomly(small_dataset)
train_data, valid_data = split_randomly(train_valid_data)

In [32]:
train_eval = list(((r.user_id, r.item_id), r.rating) for r in train_data.ratings)
print('Evaluation ratings for train: {:,d}'.format(len(train_eval)))

Evaluation ratings for train: 640,133


In [33]:
# only items in train will be available for validation
valid_items = train_data.items & valid_data.items
print('Items in train and validation: {:,d}'.format(len(valid_items)))

Items in train and validation: 3,392


In [34]:
# users from validation that has any item from train
valid_users = set(r.user_id for r in valid_data.ratings if r.item_id in train_data.items)
print('Users in validation with train items: {:,d}'.format(len(valid_users)))

Users in validation with train items: 6,032


In [35]:
# only users in train are available for validation
valid_users &= train_data.users
print('Users in train and validation: {:,d}'.format(len(valid_users)))

Users in train and validation: 6,032


In [36]:
valid_eval = valid_data.filter_ratings(valid_users, valid_items)
print('Evaluation ratings for validation: {:,d}'.format(len(valid_eval)))

Evaluation ratings for validation: 160,004


In [38]:
# Map User <-> index
# Map Item <-> index
IndexMapping = collections.namedtuple('IndexMapping', ['users_to_idx',
                                                       'users_from_idx',
                                                       'items_to_idx',
                                                       'items_from_idx'])

def map_index(values):
    values_from_idx = dict(enumerate(values))
    values_to_idx = dict((value, idx) for idx, value in values_from_idx.items())
    return values_to_idx, values_from_idx

def new_mapping(dataset):
    users_to_idx, users_from_idx = map_index(dataset.users)
    items_to_idx, items_from_idx = map_index(dataset.items)
    return IndexMapping(users_to_idx, users_from_idx, items_to_idx, items_from_idx)

In [39]:
import tensorflow as tf
import numpy as np

from tensorflow.contrib.factorization import WALSModel

class ALSRecommenderModel:
    
    def __init__(self, user_factors, item_factors, mapping):
        self.user_factors = user_factors
        self.item_factors = item_factors
        self.mapping = mapping
    
    def transform(self, x):
        for user_id, item_id in x:
            if user_id not in self.mapping.users_to_idx \
                or item_id not in self.mapping.items_to_idx:
                yield (user_id, item_id), 0.0
                continue
            i = self.mapping.users_to_idx[user_id]
            j = self.mapping.items_to_idx[item_id]
            u = self.user_factors[i]
            v = self.item_factors[j]
            r = np.dot(u, v)
            yield (user_id, item_id), r
    
    def recommend(self, user_id, num_items=10, items_exclude=set()):
        i = self.mapping.users_to_idx[user_id]
        u = self.user_factors[i]
        V = self.item_factors
        P = np.dot(V, u)
        rank = sorted(enumerate(P), key=lambda p: p[1], reverse=True)

        top = list()
        k = 0
        while k < len(rank) and len(top) < num_items:
            j, r = rank[k]
            k += 1

            item_id = self.mapping.items_from_idx[j]
            if item_id in items_exclude:
                continue

            top.append((item_id, r))

        return top        
    
class ALSRecommender:
    
    def __init__(self, num_factors=10, num_iters=10, reg=1e-1):
        self.num_factors = num_factors
        self.num_iters = num_iters
        self.regularization = reg

    def fit(self, dataset, verbose=False):
        with tf.Graph().as_default(), tf.Session() as sess:
            input_matrix, mapping = self.sparse_input(dataset)
            model = self.als_model(dataset)
            self.train(model, input_matrix, verbose)
            row_factor = model.row_factors[0].eval()
            col_factor = model.col_factors[0].eval()
            return ALSRecommenderModel(row_factor, col_factor, mapping)

    def sparse_input(self, dataset):
        mapping = new_mapping(dataset)

        indices = [(mapping.users_to_idx[r.user_id],
                    mapping.items_to_idx[r.item_id])
                   for r in dataset.ratings]
        values = [r.rating for r in dataset.ratings]
        shape = (dataset.n_users, dataset.n_items)

        return tf.SparseTensor(indices, values, shape), mapping
    
    def als_model(self, dataset):
        return WALSModel(
            dataset.n_users,
            dataset.n_items,
            self.num_factors,
            regularization=self.regularization,
            unobserved_weight=0)

    def train(self, model, input_matrix, verbose=False):
        rmse_op = self.rmse_op(model, input_matrix) if verbose else None

        row_update_op = model.update_row_factors(sp_input=input_matrix)[1]
        col_update_op = model.update_col_factors(sp_input=input_matrix)[1]

        model.initialize_op.run()
        model.worker_init.run()
        for _ in range(self.num_iters):
            # Update Users
            model.row_update_prep_gramian_op.run()
            model.initialize_row_update_op.run()
            row_update_op.run()
            # Update Items
            model.col_update_prep_gramian_op.run()
            model.initialize_col_update_op.run()
            col_update_op.run()

            if verbose:
                print('RMSE: {:,.3f}'.format(rmse_op.eval()))

    def approx_sparse(self, model, indices, shape):
        row_factors = tf.nn.embedding_lookup(
            model.row_factors,
            tf.range(model._input_rows),
            partition_strategy="div")
        col_factors = tf.nn.embedding_lookup(
            model.col_factors,
            tf.range(model._input_cols),
            partition_strategy="div")

        row_indices, col_indices = tf.split(indices,
                                            axis=1,
                                            num_or_size_splits=2)
        gathered_row_factors = tf.gather(row_factors, row_indices)
        gathered_col_factors = tf.gather(col_factors, col_indices)
        approx_vals = tf.squeeze(tf.matmul(gathered_row_factors,
                                           gathered_col_factors,
                                           adjoint_b=True))

        return tf.SparseTensor(indices=indices,
                               values=approx_vals,
                               dense_shape=shape)

    def rmse_op(self, model, input_matrix):
        approx_matrix = self.approx_sparse(model, input_matrix.indices, input_matrix.dense_shape)
        err = tf.sparse_add(input_matrix, approx_matrix * (-1))
        err2 = tf.square(err)
        n = input_matrix.values.shape[0].value
        return tf.sqrt(tf.sparse_reduce_sum(err2) / n)


als = ALSRecommender()
als_model = als.fit(train_data, verbose=True)

RMSE: 0.557
RMSE: 0.214
RMSE: 0.194
RMSE: 0.190
RMSE: 0.188
RMSE: 0.187
RMSE: 0.186
RMSE: 0.185
RMSE: 0.185
RMSE: 0.184


In [40]:
for k in range(10):
    x, y  = valid_eval[k]
    _,  y_hat = list(als_model.transform([x]))[0]
    print(*x, y, y_hat)

4277.0 2711.0 1.0 0.6988603
4740.0 2513.0 0.75 0.59964603
2119.0 2688.0 0.5 0.58921945
308.0 2144.0 0.75 0.6795177
3513.0 1917.0 0.5 0.21331201
1447.0 3863.0 1.0 0.7512944
4732.0 2141.0 0.5 0.6447932
5840.0 2268.0 1.0 0.7557846
3911.0 3653.0 1.0 0.5332837
3091.0 608.0 0.75 0.7642213


In [42]:
def _rmse(model, data):
    x, y = zip(*data)
    y_hat = list(r_hat for _, r_hat in model.transform(x))
    return np.sqrt(np.mean(np.square(np.subtract(y, y_hat))))

def eval_rmse(model):
    rmse = _rmse(model, train_eval)
    print('RMSE (train): {:,.3f}'.format(rmse))
    
    rmse = _rmse(model, valid_eval)
    print('RMSE (validation): {:,.3f}'.format(rmse))

eval_rmse(als_model)

RMSE (train): 0.184
RMSE (validation): 0.231


In [43]:
als = ALSRecommender(num_factors=10, num_iters=10, reg=0.1)
print('Training...\n')
als_model = als.fit(train_data, verbose=True)
print('\nEvaluation...\n')
eval_rmse(als_model)

Training...

RMSE: 0.559
RMSE: 0.218
RMSE: 0.197
RMSE: 0.191
RMSE: 0.189
RMSE: 0.188
RMSE: 0.187
RMSE: 0.186
RMSE: 0.185
RMSE: 0.185

Evaluation...

RMSE (train): 0.185
RMSE (validation): 0.231
