Recommender system
--------------------------------

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the tab-separated ratings file and discard the timestamp column,
# which is not used by the recommender.
ratings = (
    pd.read_csv('../datasets/u.data', sep='\t', header=None,
                names=['user_id','item_id','rating','timestamp'])
    .drop('timestamp', axis=1)
)

# Average rating given by each user, indexed by user_id.
user_means = ratings.groupby('user_id')['rating'].mean()

### Target user's profile

In [3]:
# Ratings supplied by the target user, one movie per row.
profile = pd.DataFrame(
    {
        'item_id': [355, 902, 1429, 263, 235, 249, 1052, 89, 97, 231],
        'rating':  [3,   5,   2,    1,   1,   5,   4,    5,  4,  3],
    },
    columns=['item_id', 'rating'],
)

# The target user's average rating.
profile_mean = profile['rating'].mean()

In [4]:
# Two views of the same ratings, each returned as a new frame by `set_index`:
ratings_ui = ratings.set_index(['user_id', 'item_id']) # indexed by user, then movie
ratings_iu = ratings.set_index(['item_id', 'user_id']) # indexed by movie, then user

# Index the profile by movie. The guard keeps this cell safe to re-run:
# once `item_id` is the index it is no longer a column to index on.
if profile.index.name != 'item_id':
    profile = profile.set_index('item_id')

print('ratings_ui: users, then movies')
print(ratings_ui.head())
print('ratings_iu: movies, then users')
print(ratings_iu.head())
print('ratings_iu: part of the index for movie 242')
print(ratings_iu.loc[242].head()) # `.loc` selects by index label (the old `.ix` accessor was removed from pandas)

ratings_ui: users, then movies
                 rating
user_id item_id        
196     242           3
186     302           3
22      377           1
244     51            2
166     346           1
ratings_iu: users, then movies
                 rating
item_id user_id        
242     196           3
302     186           3
377     22            1
51      244           2
346     166           1
ratings_iu: part of the index for movie 242
         rating
user_id        
196           3
63            3
226           5
154           3
306           5


Purely for convenience, we create [standard Python named tuples](https://pymotw.com/2/collections/namedtuple.html) to keep relevant objects together.

In [5]:
from collections import namedtuple

# Bundle of the raw ratings, both indexed views, and the per-user means.
Rating_Database = namedtuple('irrelevant_for_our_purposes',
                             ['raw', 'user_movie', 'movie_user', 'means'])
rating_db = Rating_Database(raw=ratings,
                            user_movie=ratings_ui,
                            movie_user=ratings_iu,
                            means=user_means)

# Bundle of the target user's ratings and their average rating.
User_Profile = namedtuple('still_irrelevant', ['ratings', 'mean'])
profile_info = User_Profile(ratings=profile, mean=profile_mean)

### Compute the similarity

In [6]:
def similarity (user_profile, other_user_id, rating_db):
    """Mean-centred correlation between the target user and another user.

    Only movies rated by both users contribute. Each rating is centred on
    its owner's overall mean (`user_profile.mean` for the target user,
    `rating_db.means` for the other user) before the correlation is taken.

    Parameters
    ----------
    user_profile : object with `ratings` and `mean`
        `ratings` is a frame indexed by movie id with a `rating` column;
        `mean` is the target user's average rating.
    other_user_id : label
        User id to compare against; must be present in `rating_db.means`
        and in the first level of `rating_db.user_movie`.
    rating_db : object with `means` and `user_movie`
        `means` maps user id to average rating; `user_movie` is a ratings
        frame indexed by (user id, movie id).

    Returns
    -------
    float
        The similarity, or 0.0 when the users share no movies or when either
        user's centred ratings are all zero on the shared movies.

    Raises
    ------
    KeyError
        If `other_user_id` is unknown to `rating_db`.
    """
    # `.loc` is label-based selection; the old `.ix` accessor no longer exists.
    Rv = rating_db.means.loc[other_user_id]
    Ru = user_profile.mean

    u_ratings = user_profile.ratings
    v_ratings = rating_db.user_movie.loc[other_user_id]

    # Inner join on the movie-id index keeps only the co-rated movies.
    common_movies = pd.merge(u_ratings, v_ratings,
                             left_index=True, right_index=True,
                             suffixes=('_u', '_v'),
                             how='inner')

    if common_movies.empty:
        return 0.0

    # Centre each user's ratings on that user's own mean.
    dev_u = common_movies['rating_u'] - Ru
    dev_v = common_movies['rating_v'] - Rv

    denom1 = (dev_u * dev_u).sum()
    denom2 = (dev_v * dev_v).sum()

    # A zero denominator means one side has no variation around its mean;
    # the correlation is undefined, so report "no similarity".
    if denom1 == 0 or denom2 == 0:
        return 0.0

    return (dev_u * dev_v).sum() / np.sqrt(denom1 * denom2)

### Predict the rating for a movie

In [7]:
def co_raters(movie_id, rating_db):
    """Return the ratings given to `movie_id`, indexed by user id.

    The users who rated the movie are read directly from the
    "movies-then-users" index (`rating_db.movie_user`). A copy is returned
    so that callers can add or modify columns without touching the database.

    Raises `KeyError` if `movie_id` is not present in the index.
    """
    # `.loc` is label-based selection; the old `.ix` accessor no longer exists.
    return rating_db.movie_user.loc[movie_id].copy()

def predict (target_movie_id, target_user_profile, rating_db):
    """Predict the target user's rating for a movie.

    The prediction is the target user's mean rating plus a
    similarity-weighted average of how far each co-rater's rating of the
    movie deviates from that co-rater's own mean.

    Parameters
    ----------
    target_movie_id : label
        Movie to predict; must exist in `rating_db.movie_user`.
    target_user_profile : object with `ratings` and `mean`
        The target user's ratings and average rating.
    rating_db : object with `movie_user`, `user_movie` and `means`
        The rating database used to find co-raters and their means.

    Returns
    -------
    float
        The predicted rating. When every co-rater has zero similarity to
        the target user there is nothing to weight by, so the target user's
        mean is returned instead of dividing by zero.
    """
    cr = co_raters(target_movie_id, rating_db)

    # Similarity between the target user and each user who rated the movie;
    # the index of `cr` holds the co-raters' user ids.
    similarities = pd.Series(
        [similarity(target_user_profile, user_id, rating_db) for user_id in cr.index],
        index=cr.index, dtype=float)

    # How far each co-rater's rating of this movie is from their own mean.
    deviations = cr['rating'] - rating_db.means.loc[cr.index]

    alpha = similarities.abs().sum()
    if alpha == 0:
        return target_user_profile.mean

    return target_user_profile.mean + (similarities * deviations).sum() / alpha


In [8]:
# Time a prediction for two different movies using the `%time` line magic.
# In the recorded output both timings are printed, but only the value of
# the cell's last expression (the prediction for movie 100) is shown.
movie_to_predict = 340
%time predict(movie_to_predict, profile_info, rating_db)

movie_to_predict = 100
%time predict(movie_to_predict, profile_info, rating_db)

CPU times: user 566 ms, sys: 14.3 ms, total: 580 ms
Wall time: 589 ms
CPU times: user 1.75 s, sys: 5.02 ms, total: 1.75 s
Wall time: 1.75 s


3.550112170273322