In [1]:
import numpy as np
import pandas as pd

In [14]:
# load the users and movies data
users = pd.read_table('data/movielens/users.dat',
                      sep='::', header=None, engine='python',
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'])
movies = pd.read_table('data/movielens/movies.dat',
                       sep='::', header=None, engine='python',
                       names=['movie_id', 'title', 'genres'])
ratings = pd.read_table('data/movielens/ratings.dat',
                       sep='::', header=None, engine='python',
                       names=['user_id', 'movie_id', 'rating', 'timestamp'])

In [19]:
import sqlite3
conn = sqlite3.connect('data/movielens/movielens.db')
conn.text_factory = str
users.to_sql('users', conn, if_exists='replace')
movies.to_sql('movies', conn, if_exists='replace')
ratings.to_sql('ratings', conn, if_exists='replace')

In [30]:
query = """
    select * 
    from users 
    limit 10;
    """
results = conn.cursor().execute(query).fetchall()
results

[(0, 1, 'F', 1, 10, '48067'),
 (1, 2, 'M', 56, 16, '70072'),
 (2, 3, 'M', 25, 15, '55117'),
 (3, 4, 'M', 45, 7, '02460'),
 (4, 5, 'M', 25, 20, '55455'),
 (5, 6, 'F', 50, 9, '55117'),
 (6, 7, 'M', 35, 1, '06810'),
 (7, 8, 'M', 25, 12, '11413'),
 (8, 9, 'M', 25, 17, '61614'),
 (9, 10, 'F', 35, 1, '95370')]

In [31]:
pd.read_sql_query(query, conn)

Unnamed: 0,index,user_id,gender,age,occupation,zip
0,0,1,F,1,10,48067
1,1,2,M,56,16,70072
2,2,3,M,25,15,55117
3,3,4,M,45,7,2460
4,4,5,M,25,20,55455
5,5,6,F,50,9,55117
6,6,7,M,35,1,6810
7,7,8,M,25,12,11413
8,8,9,M,25,17,61614
9,9,10,F,35,1,95370


In [23]:
data = pd.read_sql_table(users, conn)

NotImplementedError: read_sql_table only supported for SQLAlchemy connectable.

In [17]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 2. How to load the training and testing subsets

In [12]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0)
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0)

In [13]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [14]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [15]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [16]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [17]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [18]:
print 'RMSE for my estimate function: %s' % evaluate(my_estimate_func)

RMSE for my estimate function: 1.23237195265


In [19]:
def content_mean(user_id, movie_id):
    """ Simple content-filtering based on mean ratings. """
    
    user_condition = movielens_train.user_id == user_id
    return movielens_train.loc[user_condition, 'rating'].mean()

print 'RMSE for estimate1: %s' % evaluate(content_mean)

RMSE for estimate1: 1.23078247597


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [20]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    """ Simple content-filtering based on mean ratings. """
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_others.empty:
        return 3.0
    else:
        return ratings_others.rating.mean()
    
print 'RMSE for estimate1: %s' % evaluate(collab_mean)

# try it out for a user_id, movie_id pair
collab_mean(4653, 2648)

RMSE for estimate1: 1.1234279896


4.0

In [21]:
movielens_train.groupby('gender')['rating'].mean()

gender
F    3.594928
M    3.530507
Name: rating, dtype: float64

In [22]:
movielens_train.groupby(['gender', 'age'])['rating'].mean()

gender  age
F       1      3.500000
        18     3.528958
        25     3.548507
        35     3.730104
        45     3.581818
        50     3.617978
        56     3.725490
M       1      3.305556
        18     3.507712
        25     3.489764
        35     3.569591
        45     3.565574
        50     3.728125
        56     3.611111
Name: rating, dtype: float64

In [23]:
# transform the ratings frame into a ratings matrix
ratings_mtx_df = movielens_train.pivot_table(values='rating',
                                             index='user_id',
                                             columns='movie_id')
ratings_mtx_df.head(3)

movie_id,1,2,4,5,6,7,10,11,12,13,...,3928,3929,3930,3932,3943,3945,3947,3948,3949,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [24]:
ratings_mtx_df.loc[11:16, 1196:1200]

movie_id,1196,1197,1198,1199,1200
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13,,,,,
15,,,,,


In [25]:
movielens_train.pivot_table(values='rating', index='age', columns='gender', aggfunc='mean')

gender,F,M
age,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.5,3.305556
18,3.528958,3.507712
25,3.548507,3.489764
35,3.730104,3.569591
45,3.581818,3.565574
50,3.617978,3.728125
56,3.72549,3.611111


In [26]:
movielens_train.pivot_table(values='rating', index='age', columns='gender', aggfunc=[np.mean, np.std])

Unnamed: 0_level_0,mean,mean,std,std
gender,F,M,F,M
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3.5,3.305556,1.242757,1.335765
18,3.528958,3.507712,1.162283,1.151606
25,3.548507,3.489764,1.146094,1.101005
35,3.730104,3.569591,0.984159,1.112843
45,3.581818,3.565574,1.18385,1.082775
50,3.617978,3.728125,1.049953,1.009899
56,3.72549,3.611111,0.981396,1.073106


In [27]:
user_info = users.set_index('user_id')
user_info.head(5)

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [28]:
def collab_gender(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_gender = ratings_by_others.pivot_table('rating', index='movie_id', columns='gender')
    user_gender = user_info.loc[user_id, 'gender']
    if user_gender in means_by_gender.columns: 
        return means_by_gender.loc[movie_id, user_gender]
    else:
        return means_by_gender.loc[movie_id].mean()

print 'RMSE for collab_gender: %s' % evaluate(collab_gender)

RMSE for collab_gender: 1.17400824171


In [36]:
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_gender = movielens_train.pivot_table('rating', index='movie_id', columns='gender')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_gender.index: 
            return 3.0
        
        user_gender = user_info.loc[user_id, 'gender']
        if ~np.isnan(self.means_by_gender.loc[movie_id, user_gender]):
            return self.means_by_gender.loc[movie_id, user_gender]
        else:
            return self.means_by_gender.loc[movie_id].mean()

In [37]:
reco = CollabGenderReco()
reco.learn()
print 'RMSE for CollabGenderReco: %s' % evaluate(reco.estimate)

RMSE for CollabGenderReco: 1.17400824171


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [38]:
def collab_gender_age(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender and age. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_gender_age = ratings_by_others.pivot_table('rating', index='movie_id', columns=['gender', 'age'])
    user_gender = user_info.loc[user_id, 'gender']
    user_age = user_info.loc[user_id, 'age']
    gender_age = (user_gender, user_age)
    if gender_age in means_by_gender_age.columns: 
        if np.isnan(means_by_gender_age.loc[movie_id, gender_age]):
            return means_by_gender_age.loc[movie_id, user_gender].mean() 
        else:
            return means_by_gender_age.loc[movie_id, gender_age]
    else:
        return means_by_gender_age.loc[movie_id].mean()

In [39]:
print 'RMSE for collab_gender_age: %s' % evaluate(collab_gender_age)

RMSE for collab_gender_age: 1.20662302052


In [40]:
def collab_zip1(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender and first 2 digits of zip code. """
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    ratings_by_others['zip1'] = ratings_by_others['zip'].apply(lambda x: int(str(x)[:1]))
    
    means_by_zip1 = ratings_by_others.pivot_table('rating', index='movie_id', columns=['zip1'])
    user_zip = user_info.loc[user_id, 'zip']
    user_zip1 = int(str(user_zip)[:1])
    if user_zip1 in means_by_zip1.columns: 
        if np.isnan(means_by_zip1.loc[movie_id, user_zip1]): 
            means_by_zip1.loc[movie_id].mean()
        else:
            return means_by_zip1.loc[movie_id, user_zip1]
    else:
        return means_by_zip1.loc[movie_id].mean()

In [41]:
print 'RMSE for collab_zip1: %s' % evaluate(collab_zip1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


RMSE for collab_zip1: 1.22268590294


In [42]:
def collab_zip2(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender and first 2 digits of zip code. """
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    ratings_by_others['zip2'] = ratings_by_others['zip'].apply(lambda x: int(str(x)[:2]))
    
    means_by_zip2 = ratings_by_others.pivot_table('rating', index='movie_id', columns=['zip2'])
    user_zip = user_info.loc[user_id, 'zip']
    user_zip2 = int(str(user_zip)[:2])
    if user_zip2 in means_by_zip2.columns: 
        if np.isnan(means_by_zip2.loc[movie_id, user_zip2]): 
            means_by_zip2.loc[movie_id].mean()
        else:
            return means_by_zip2.loc[movie_id, user_zip2]
    else:
        return means_by_zip2.loc[movie_id].mean()

In [43]:
print 'RMSE for collab_zip2: %s' % evaluate(collab_zip2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


RMSE for collab_zip2: 1.15577068981


In [44]:
def collab_gender_zip2(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender and first 2 digits of zip code. """
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    ratings_by_others['zip2'] = ratings_by_others['zip'].apply(lambda x: int(str(x)[:2]))
    
    means_by_gender_zip = ratings_by_others.pivot_table('rating', index='movie_id', columns=['gender', 'zip2'])
    user_gender = user_info.loc[user_id, 'gender']
    user_zip = user_info.loc[user_id, 'zip']
    user_zip2 = int(str(user_zip)[:2])
    gender_zip2 = (user_gender, user_zip2)
    if gender_zip2 in means_by_gender_zip.columns: 
        if np.isnan(means_by_gender_zip.loc[movie_id, gender_zip2]):
            return means_by_gender_zip.loc[movie_id, user_gender].mean() 
        else:
            return means_by_gender_zip.loc[movie_id, gender_zip2]
    else:
        return means_by_gender_zip.loc[movie_id].mean()


In [45]:
print 'RMSE for collab_gender_zip2: %s' % evaluate(collab_gender_zip2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


RMSE for collab_gender_zip2: 1.14602421652


# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [48]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

def jaccard(s1, s2):
    dotp = np.sum(s1 * s2)
    return dotp / (np.sum(s1 ** 2) + np.sum(s2 ** 2) - dotp)

def binjaccard(s1, s2):
    dotp = (s1.index & s2.index).size
    return dotp / (s1.sum() + s2.sum() - dotp)

In [49]:
class CollabPearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)

In [50]:
reco = CollabPearsonReco()
reco.learn()
print 'RMSE for CollabPearsonReco: %s' % evaluate(reco.estimate)

  


RMSE for CollabPearsonReco: 1.12640340009


In [52]:
class CollabGenrePearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        
        my_condition =  movielens_test.user_id == user_id
        movie_genre_condition = movielens_test.movie_id == movie_id
        genre_df = movielens_test.loc[my_condition & movie_genre_condition, ['genres']]
        genre = genre_df.iloc[0, 0]
        genre_condition = movielens_train.genres == genre
        ratings_by_genre = movielens_train.loc[genre_condition]
        
        if ratings_by_others.empty:
            if ratings_by_genre.empty:
                return 3.0
            else:
                return ratings_by_genre['rating'].mean()
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)

In [53]:
reco = CollabGenrePearsonReco()
reco.learn()
print 'RMSE for CollabGenrePearsonReco: %s' % evaluate(reco.estimate)

  


RMSE for CollabGenrePearsonReco: 1.13524007024
