In [1]:
import numpy as np
import pandas as pd

In [2]:
ser = pd.Series([2,3,4])
ser

0    2
1    3
2    4
dtype: int64

In [3]:
values = np.array([2.0, 1.0, 5.0, 0.97, 3.0, 10.0, 0.0599, 8.0,])
labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',]
ser = pd.Series(data=values, index=labels)
ser.head()

A    2.00
B    1.00
C    5.00
D    0.97
E    3.00
dtype: float64

In [4]:
movie_rating = {
    'age': 1,
    'gender': 'F',
    'genres': 'Drama',
    'movie_id': 1193,
    'occupation': 10,
    'rating': 5,
    'timestamp': 978300760,
    'title': "One Flew Over the Cuckoo's Nest (1975)",
    'user_id': 1,
    'zip': '48067'
    }
ser = pd.Series(movie_rating)
ser

age                                                1
gender                                             F
genres                                         Drama
movie_id                                        1193
occupation                                        10
rating                                             5
timestamp                                  978300760
title         One Flew Over the Cuckoo's Nest (1975)
user_id                                            1
zip                                            48067
dtype: object

In [5]:
ser.loc['gender']

'F'

In [6]:
dd = ser.loc[['gender','rating']]
type(dd)

pandas.core.series.Series

In [7]:
users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None, 
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'], engine='python')

ratings = pd.read_table('data/ml-1m/ratings.dat',
                        sep='::', header=None, 
                        names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None, 
                       names=['movie_id', 'title', 'genres'], engine='python')

# show how one of them looks
ratings.head(5)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
movielens = pd.merge(pd.merge(ratings, users), movies)
movielens.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [9]:
#subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0, encoding='iso-8859-1')
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0, encoding='iso-8859-1')
movielens_test.shape

(2668, 11)

In [10]:
movie_rating = {
    'gender': 'F',
    'genres': 'Drama',
    'movie_id': 1193,
    'rating': 5,
    'timestamp': 978300760,
    'user_id': 1,
    }
ser_1 = pd.Series(movie_rating)
ser_2 = pd.Series(movie_rating)
df = pd.DataFrame({'r_1': ser_1, 'r_2': ser_2})
df.columns.name = 'rating_events'
df.index.name = 'rating_data'
df.columns.name

'rating_events'

In [11]:
movie_rating = {
    'gender': False,
    'genres': False,
    'movie_id': False,
    'rating': True,
    'timestamp': False,
    'user_id': False
    }
ser = pd.Series(movie_rating)
ser

gender       False
genres       False
movie_id     False
rating        True
timestamp    False
user_id      False
dtype: bool

In [12]:
df.loc[ser,'r_1']

rating_data
rating    5
Name: r_1, dtype: object

In [13]:
tuids = ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
#estimated = np.array([content_mean(u,i) for (u,i) in ids_to_estimate])
list(ids_to_estimate    )

[(4653, 2648),
 (2259, 1270),
 (3032, 1378),
 (3029, 2289),
 (4186, 2403),
 (2092, 3448),
 (3180, 2244),
 (1962, 292),
 (5042, 1425),
 (1150, 2331),
 (770, 1302),
 (4169, 388),
 (3200, 866),
 (5257, 1219),
 (4520, 1676),
 (1051, 552),
 (5281, 3363),
 (795, 1747),
 (1185, 2123),
 (5112, 2450),
 (5453, 1198),
 (1647, 2058),
 (2909, 1090),
 (2615, 2188),
 (5762, 1721),
 (3163, 553),
 (3266, 527),
 (53, 587),
 (5555, 1183),
 (5959, 3273),
 (3529, 1913),
 (980, 904),
 (5892, 318),
 (2232, 1653),
 (3625, 1625),
 (1842, 194),
 (3384, 1127),
 (4183, 1243),
 (1086, 1617),
 (1150, 2305),
 (5405, 1387),
 (3751, 2406),
 (5219, 3638),
 (1331, 1950),
 (2102, 1094),
 (1523, 1214),
 (3370, 1262),
 (2934, 2445),
 (3256, 110),
 (1586, 3060),
 (1632, 3167),
 (4397, 2396),
 (4793, 2959),
 (3859, 1681),
 (5826, 3252),
 (3942, 1924),
 (33, 1997),
 (1392, 2105),
 (2419, 186),
 (3167, 2395),
 (3993, 2082),
 (4053, 1228),
 (2627, 480),
 (3953, 589),
 (5831, 3753),
 (5869, 1214),
 (5164, 969),
 (26, 3686),
 (36

In [14]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [15]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)


In [17]:
def content_mean(user_id, movie_id):
    """ Simple content-filtering based on mean ratings. """
    
    user_condition = movielens_train.user_id == user_id
    return movielens_train.loc[user_condition, 'rating'].mean()

In [7]:
evaluate(content_mean)

1.2307824759704096

In [77]:
def coll_mean(user_id, movie_id):
    """ Simple content-filtering based on mean ratings. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    rating_by_others = movielens_train.loc[user_condition & movie_condition, 'rating']
    if rating_by_others.empty:
        return 'NO'
    else:
        #return rating_by_others
        return rating_by_others.mean()
    

In [61]:
evaluate(coll_mean)

1.1234279896011794

In [80]:
coll_mean(145, 2193)

3.3333333333333335

In [28]:
movielens_train.groupby(['gender','age'])['rating'].mean()

gender  age
F       1      3.500000
        18     3.528958
        25     3.548507
        35     3.730104
        45     3.581818
        50     3.617978
        56     3.725490
M       1      3.305556
        18     3.507712
        25     3.489764
        35     3.569591
        45     3.565574
        50     3.728125
        56     3.611111
Name: rating, dtype: float64

In [14]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [88]:
ratings_mtx_df = movielens_train.pivot_table(values='rating',index='user_id',columns='movie_id')
ratings_mtx_df.head()

movie_id,1,2,4,5,6,7,10,11,12,13,...,3928,3929,3930,3932,3943,3945,3947,3948,3949,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
15,,,,,,,,,,,...,,,,,,,,,,


In [180]:
movielens_train.pivot_table(values='rating', index='age', columns='gender', aggfunc='count')

gender,F,M
age,Unnamed: 1_level_1,Unnamed: 2_level_1
1,46,108
18,259,778
25,536,1905
35,289,855
45,110,366
50,89,320
56,51,126


In [194]:
df2 = movielens_train.pivot_table(values='rating', index='age', columns='gender', aggfunc=[np.mean, np.std])
df2[['mean','std']]

Unnamed: 0_level_0,mean,mean,std,std
gender,F,M,F,M
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3.5,3.305556,1.242757,1.335765
18,3.528958,3.507712,1.162283,1.151606
25,3.548507,3.489764,1.146094,1.101005
35,3.730104,3.569591,0.984159,1.112843
45,3.581818,3.565574,1.18385,1.082775
50,3.617978,3.728125,1.049953,1.009899
56,3.72549,3.611111,0.981396,1.073106


In [9]:
user_info = users.set_index('user_id')
user_info.head(5)

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [10]:
user_id = 3
user_info.loc[user_id,'gender']

'M'

In [108]:
def collab_gender(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_gender = ratings_by_others.pivot_table('rating', index='movie_id', columns='gender')
    
    user_gender = user_info.ix[user_id, 'gender']
    
    if user_gender in means_by_gender.columns: 
        return means_by_gender.loc[movie_id, user_gender]
    else:
        return means_by_gender.loc[movie_id].mean()

In [117]:
evaluate(collab_gender)

1.1740082417112805

In [118]:
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_gender = movielens_train.pivot_table('rating', index='movie_id', columns='gender')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_gender.index: 
            return 3.0
        
        user_gender = user_info.ix[user_id, 'gender']
        if ~np.isnan(self.means_by_gender.loc[movie_id, user_gender]):
            return self.means_by_gender.loc[movie_id, user_gender]
        else:
            return self.means_by_gender.loc[movie_id].mean()

reco = CollabGenderReco()
reco.learn()
evaluate(reco.estimate)

1.1740082417112805

In [120]:
class CollabAgeReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_age = movielens_train.pivot_table('rating', index='movie_id', columns='age')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_age.index: 
            return 3.0
        
        user_age = user_info.ix[user_id, 'age']
        if ~np.isnan(self.means_by_age.loc[movie_id, user_age]):
            return self.means_by_age.loc[movie_id, user_age]
        else:
            return self.means_by_age.loc[movie_id].mean()

reco = CollabAgeReco()
reco.learn()
evaluate(reco.estimate)

1.2052013344107597

In [121]:
class CollabZipReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_zip = movielens_train.pivot_table('rating', index='movie_id', columns='zip')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same zipcode. """
        
        if movie_id not in self.means_by_zip.index: 
            return 3.0
        
        user_zip = user_info.ix[user_id, 'zip']
        if ~np.isnan(self.means_by_zip.loc[movie_id, user_zip]):
            return self.means_by_zip.loc[movie_id, user_zip]
        else:
            return self.means_by_zip.loc[movie_id].mean()

reco = CollabZipReco()
reco.learn()
evaluate(reco.estimate)

1.1256640319176328

In [123]:
class CollabOccupationReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_occupation = movielens_train.pivot_table('rating', index='movie_id', columns='occupation')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same occupation. """
        
        if movie_id not in self.means_by_occupation.index: 
            return 3.0
        
        user_occupation = user_info.ix[user_id, 'occupation']
        if ~np.isnan(self.means_by_occupation.loc[movie_id, user_occupation]):
            return self.means_by_occupation.loc[movie_id, user_occupation]
        else:
            return self.means_by_occupation.loc[movie_id].mean()

reco = CollabOccupationReco()
reco.learn()
evaluate(reco.estimate)

1.202876964364191

In [205]:
movielens_train.pivot_table('rating', index='genres', columns='user_id').head()

user_id,5,8,10,13,15,18,19,24,25,26,...,6016,6018,6019,6021,6022,6025,6030,6031,6036,6037
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Action,,,,3.0,,,,,,,...,,,,,,,,,,
Action|Adventure,,,,,,,,,,,...,,,,,,,,,,
Action|Adventure|Animation,,,,,,,,,,,...,,,,,,,,,,
Action|Adventure|Animation|Children's|Fantasy,,,,,,,,,,,...,,,,,,,,,,
Action|Adventure|Children's|Comedy,,,,,,,,,,,...,,,,,,,,,,


In [130]:
movie_info = movies.set_index('movie_id').head()
movie_info

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [206]:
class ContentGenresReco:
    """ Content filtering using an implicit sim(i,i'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_geners = movielens_train.pivot_table('rating', index='user_id', columns='genres')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same occupation. """
        
        if user_id not in self.means_by_geners.index: 
            return 3.0
        
        movie_genre = movie_info.ix[movie_id, 'genres']
        if ~np.isnan(self.means_by_geners.loc[user_id, movie_genre]):
            return self.means_by_geners.loc[user_id, movie_genre]
        else:
            return self.means_by_geners.loc[user_id].mean()

reco = ContentGenresReco()
reco.learn()
reco.estimate(145, 2193)

AttributeError: 'ContentGenresReco' object has no attribute 'means_by_geners'

In [9]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [19]:
class CollabPearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabPearsonReco()
reco.learn()
reco.estimate(145, 2193)

2.6109152449013782

In [20]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

In [22]:
class CollabCosineReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: cosine(profile, user_profile), axis=0)
        
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabCosineReco()
reco.learn()
reco.estimate(145, 2193)

2.9651166824174418