# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [4]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [2]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df['col2'] = ['A', 'B', 4, 5]
df

Unnamed: 0,col1,col2
0,1,A
1,2,B
2,3,4
3,4,5


## 2. Deleting a row in a DataFrame

In [3]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df = df[df.index != 'd']
df

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [4]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
df = pd.DataFrame({'col1':ser_1,'col2':ser_2,'col3':ser_3})
df

Unnamed: 0,col1,col2,col3
0,-0.371619,1.676866,-1.739614
1,1.493604,0.492691,-0.395227
2,0.324302,0.630079,1.401328
3,-0.090019,1.253274,1.753865
4,0.083443,1.881352,-0.015278
5,0.747033,-0.081008,0.551983


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [5]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
#df.set_index='col_2'
#df
df.col_2

obs1     0.9
obs2     9.0
obs3    34.0
obs4    11.0
Name: col_2, dtype: float64

## 2. Label-based indexing

In [6]:
# using the same DataFrame, index into the row whose index is 'obs3'
# returns series where index is 'obs3' 
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [7]:
# using the same DataFrame, index into into its first row
# returns series where index is 0 
df.ix[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

In [8]:
pd.merge?

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [9]:
import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'],engine = 'python')

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'],engine = 'python')

## 2. How to load the training and testing subsets

In [5]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0, engine='python')
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0,engine='python')
#pd.read_csv?

In [31]:
movielens_train.head()
print(movielens_train.size)

64218


In [32]:
movielens_test.head()
print(movielens_test.size)

29348


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [6]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [7]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [35]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [37]:
print ('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

RMSE for my estimate function: 1.23237195265


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [48]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    movie_condition = movielens_train.movie_id == movie_id
    user_condition = movielens_train.user_id != user_id
    # second, compute the mean of those ratings
    rating_matrix = movielens_train.loc[movie_condition & user_condition]
    if rating_matrix.empty:
        return 3.0
    else:
        return rating_matrix.rating.mean()

    
# try it out for a user_id, movie_id pair
print(collab_mean(4653, 2648))

print ('RMSE for estimate1: %s' % evaluate(collab_mean))

4.0
RMSE for estimate1: 1.1234279896


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [52]:
movielens_train.groupby('gender').rating.mean()

gender
F    3.594928
M    3.530507
Name: rating, dtype: float64

In [53]:
movielens_train.groupby(['gender','age']).rating.mean()

gender  age
F       1      3.500000
        18     3.528958
        25     3.548507
        35     3.730104
        45     3.581818
        50     3.617978
        56     3.725490
M       1      3.305556
        18     3.507712
        25     3.489764
        35     3.569591
        45     3.565574
        50     3.728125
        56     3.611111
Name: rating, dtype: float64

In [54]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [56]:
pvt=movielens_train.pivot_table(values='rating',index='user_id',columns='movie_id')
pvt.head()

movie_id,1,2,4,5,6,7,10,11,12,13,...,3928,3929,3930,3932,3943,3945,3947,3948,3949,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
15,,,,,,,,,,,...,,,,,,,,,,


In [64]:
pvt.loc[10:16,1190:1201]

movie_id,1191,1192,1193,1194,1196,1197,1198,1199,1200,1201
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10,,,,,,,,,,
13,,,,,,,,,,
15,,,,,,,,,,


In [65]:
pvt2=movielens_train.pivot_table(values='rating',index='age',columns='gender',aggfunc='mean')
pvt2.head()

gender,F,M
age,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.5,3.305556
18,3.528958,3.507712
25,3.548507,3.489764
35,3.730104,3.569591
45,3.581818,3.565574


In [66]:
pvt3=movielens_train.pivot_table(values='rating',index='age',columns='gender',aggfunc=[np.mean,np.std])
pvt3.head()

Unnamed: 0_level_0,mean,mean,std,std
gender,F,M,F,M
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3.5,3.305556,1.242757,1.335765
18,3.528958,3.507712,1.162283,1.151606
25,3.548507,3.489764,1.146094,1.101005
35,3.730104,3.569591,0.984159,1.112843
45,3.581818,3.565574,1.18385,1.082775


In [69]:
user_info = users.set_index('user_id')
user_info.head(5)

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [77]:
means_by_gender = movielens_train.pivot_table('rating', index='movie_id', columns='gender',aggfunc='mean')
means_by_gender.head()

gender,F,M
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.0,4.428571
2,,2.666667
4,,3.0
5,3.0,3.0
6,,3.833333


In [74]:
user_gender = user_info.ix[11, 'gender']
user_gender

'F'

In [70]:
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_gender = movielens_train.pivot_table('rating', index='movie_id', columns='gender')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_gender.index: 
            return 3.0
        
        user_gender = user_info.ix[user_id, 'gender']
        if ~np.isnan(self.means_by_gender.ix[movie_id, user_gender]):
            return self.means_by_gender.ix[movie_id, user_gender]
        else:
            return self.means_by_gender.ix[movie_id].mean()

reco = CollabGenderReco()
reco.learn()
print ('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

RMSE for CollabGenderReco: 1.17400824171


In [81]:
means_by_zip = movielens_train.pivot_table('rating', index='movie_id', columns='zip')
means_by_zip.head()

zip,00606,00961,01002,01003,01036,01054,01060,01096,01375,01379,...,98765,98826,98908,99005,99016,99114,99163,99352,99353,99701
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,


In [83]:
class CollabzipReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_zip = movielens_train.pivot_table('rating', index='movie_id', columns='zip')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same zip. """
        
        if movie_id not in self.means_by_zip.index: 
            return 3.0
        
        user_zip = user_info.ix[user_id, 'zip']
        if ~np.isnan(self.means_by_zip.ix[movie_id, user_zip]):
            return self.means_by_zip.ix[movie_id, user_zip]
        else:
            return self.means_by_zip.ix[movie_id].mean()

reco = CollabzipReco()
reco.learn()
print ('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

RMSE for CollabGenderReco: 1.12566403192


In [8]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [88]:
class ContentGenreReco:
    """ Content filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_zip = movielens_train.pivot_table('rating', index='movie_id', columns='zip')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same zip. """
        
        if movie_id not in self.means_by_zip.index: 
            return 3.0
        
        user_zip = user_info.ix[user_id, 'zip']
        if ~np.isnan(self.means_by_zip.ix[movie_id, user_zip]):
            return self.means_by_zip.ix[movie_id, user_zip]
        else:
            return self.means_by_zip.ix[movie_id].mean()

reco = CollabzipReco()
reco.learn()
print ('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

RMSE for CollabGenderReco: 1.12566403192


In [None]:
# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [19]:
class CollabPearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        #print(ratings_by_others)
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        #print(ratings_by_others)
        their_ids = ratings_by_others.index
        #print (their_ids)
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        #print (their_profiles)
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabPearsonReco()
reco.learn()
print (reco.estimate(5,10))
#print ('RMSE for CollabPearsonReco: %s' % evaluate(reco.estimate))

user_id   4279  5531  3108  1680
movie_id                        
1          NaN   NaN   NaN   NaN
2          NaN   NaN   NaN   NaN
4          NaN   NaN   NaN   NaN
5          NaN   NaN   NaN   NaN
6          NaN   NaN   NaN   NaN
7          NaN   NaN   NaN   NaN
10         3.0   5.0   4.0   3.0
11         NaN   NaN   NaN   NaN
12         NaN   NaN   NaN   NaN
13         NaN   NaN   NaN   NaN
15         NaN   NaN   NaN   NaN
16         NaN   NaN   NaN   NaN
17         NaN   NaN   NaN   NaN
18         NaN   NaN   NaN   NaN
19         NaN   NaN   NaN   NaN
20         NaN   NaN   NaN   NaN
21         NaN   NaN   NaN   NaN
24         NaN   NaN   NaN   NaN
25         NaN   NaN   NaN   NaN
28         NaN   NaN   NaN   NaN
29         NaN   NaN   NaN   NaN
31         NaN   NaN   NaN   NaN
32         NaN   NaN   NaN   NaN
34         NaN   NaN   NaN   NaN
36         NaN   NaN   NaN   NaN
38         NaN   NaN   NaN   NaN
39         NaN   NaN   NaN   NaN
40         NaN   NaN   NaN   NaN
41        

In [92]:
all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

In [93]:
all_user_profiles.head()

user_id,5,8,10,13,15,18,19,24,25,26,...,6016,6018,6019,6021,6022,6025,6030,6031,6036,6037
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,


In [94]:
user_condition = movielens_train.user_id != 10
movie_condition = movielens_train.movie_id == 5

In [98]:
ratings_by_others = movielens_train.loc[user_condition & movie_condition]
ratings_by_others.set_index('user_id')

Unnamed: 0_level_0,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4568,5,3,964478586,F,25,4,90034,Father of the Bride Part II (1995),Comedy,False
2996,5,3,972767508,M,18,0,63011,Father of the Bride Part II (1995),Comedy,False


In [106]:
their_ids = ratings_by_others.index
their_ids

Int64Index([766786, 766708], dtype='int64')