In [1]:
import numpy as np
import pandas as pd

# set some print options
np.set_printoptions(precision=4)
np.set_printoptions(threshold=5)
np.set_printoptions(suppress=True)
pd.set_option('precision', 3, 'notebook_repr_html', True, )

# init random gen
np.random.seed(2)

In [2]:
users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      engine='python',
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'])

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       engine='python',
                       names=['movie_id', 'title', 'genres'])

ratings = pd.read_table('data/ml-1m/ratings.dat',
                        sep='::', header=None,
                        engine='python',
                        names=['user_id', 'movie_id', 'rating', 'timestamp'])

In [3]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     6040 non-null   int64 
 1   gender      6040 non-null   object
 2   age         6040 non-null   int64 
 3   occupation  6040 non-null   int64 
 4   zip         6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [4]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [6]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user_id    1000209 non-null  int64
 1   movie_id   1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [8]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Merge all three data sets. 
First on user_id, then on movie_id.

In [3]:
movielens = pd.merge(pd.merge(ratings,users),movies)

In [10]:
movielens.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   user_id     1000209 non-null  int64 
 1   movie_id    1000209 non-null  int64 
 2   rating      1000209 non-null  int64 
 3   timestamp   1000209 non-null  int64 
 4   gender      1000209 non-null  object
 5   age         1000209 non-null  int64 
 6   occupation  1000209 non-null  int64 
 7   zip         1000209 non-null  object
 8   title       1000209 non-null  object
 9   genres      1000209 non-null  object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


In [10]:
movielens.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,occupation
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,3025.0,1866.0,3.582,972200000.0,29.74,8.036
std,1728.0,1096.0,1.117,12150000.0,11.75,6.531
min,1.0,1.0,1.0,956700000.0,1.0,0.0
25%,1506.0,1030.0,3.0,965300000.0,25.0,2.0
50%,3070.0,1835.0,4.0,973000000.0,25.0,7.0
75%,4476.0,2770.0,4.0,975200000.0,35.0,14.0
max,6040.0,3952.0,5.0,1046000000.0,56.0,20.0


In [11]:
movielens.sample(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
758856,5567,2915,4,959349063,M,50,3,78704,Risky Business (1983),Comedy
345912,2050,1959,3,975522916,F,35,3,99504,Out of Africa (1985),Drama|Romance
586911,2841,1263,3,1022086809,M,50,12,98056,"Deer Hunter, The (1978)",Drama|War
424866,3578,1252,5,966711654,M,50,3,96714,Chinatown (1974),Film-Noir|Mystery|Thriller
538336,2995,2021,4,970683967,M,25,15,97333,Dune (1984),Fantasy|Sci-Fi


## Create a subset for speed reasons

In [4]:
movielens_sub = movielens.iloc[np.random.choice(movielens.index, size=10000, replace=False)]
print(movielens_sub.shape)
# Check the number of unique users and movies in the subset.
print(movielens_sub.user_id.nunique())
print(movielens_sub.movie_id.nunique())

(10000, 10)
3698
2275


Select users which have rated more than one movie.

In [5]:
# Boolean series indicating if a user_id has rated more than one movie.
user_ids_larger_1 = pd.value_counts(movielens_sub.user_id, sort=False) > 1
# Array of user_ids that have rated more than one movie.
user_ids_larger_1 = user_ids_larger_1[user_ids_larger_1].index

In [6]:
movielens_sub = movielens_sub.loc[movielens_sub.index.map(lambda x: movielens_sub.loc[x, 'user_id'] in user_ids_larger_1)]

In [7]:
movielens_sub.shape

(8442, 10)

In [8]:
# Double check that all users in the subset have more than one movie rating.
assert np.all(movielens_sub.user_id.value_counts() > 1)

## Generate Training and Test Sets

The test set will be comprised of 20% of each user's ratings.

In [9]:
def assign_to_set(df):
    """ 
        Assign 20% of a user's ratings to a test set.
        df.index returns a list of indices from a dataframe.
        Choose 20% of those indices without replacement.
        Set the column 'for_testing'=True for the 20% choosen.
    """
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

In [10]:
# Create new column and set as False for every row
movielens_sub.loc[:,'for_testing'] = False
# Group ratings by user_id and assign 20% of each user's ratings to the test set.
grouped = movielens_sub.groupby('user_id', group_keys=False).apply(assign_to_set)

df_train = movielens_sub[grouped.for_testing == False]
df_test = movielens_sub[grouped.for_testing == True]

print(movielens_sub.shape)
# Make sure the training and test sets are not empty.
print(df_train.shape)
print(df_test.shape)
assert len(df_train.index & df_test.index) == 0

(8442, 11)
(5801, 11)
(2641, 11)


In [11]:
# Save the training and test sets
df_train.to_csv('data/df_train.csv')
df_test.to_csv('data/df_test.csv')

## Load the MovieLens Subsample

In [4]:
df_train = pd.read_csv('data/df_train.csv', index_col=0)
df_test = pd.read_csv('data/df_test.csv', index_col=0)

In [23]:
df_train = pd.read_csv('data/movielens_train.csv', index_col=0)
df_test = pd.read_csv('data/movielens_test.csv', index_col=0)

In [3]:
df_train.sample(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
570002,3213,2883,1,968518282,F,25,1,83705,Mumford (1999),Comedy,False
603824,2315,1608,3,974479133,M,56,7,48114,Air Force One (1997),Action|Thriller,False
71519,1651,1210,3,1002181287,M,25,12,94520,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War,False
138823,2771,2359,3,973097213,M,18,1,92037,Waking Ned Devine (1998),Comedy,False
513372,3462,2003,4,967394093,F,25,4,73160,Gremlins (1984),Comedy|Horror,False


In [4]:
df_train.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,occupation
count,5801.0,5801.0,5801.0,5801.0,5801.0,5801.0
mean,3023.148,1873.502,3.527,972400000.0,29.521,7.938
std,1721.933,1088.238,1.113,12340000.0,11.289,6.621
min,8.0,1.0,1.0,956700000.0,1.0,0.0
25%,1510.0,1047.0,3.0,965300000.0,25.0,2.0
50%,3067.0,1904.0,4.0,973000000.0,25.0,6.0
75%,4446.0,2791.0,4.0,975300000.0,35.0,14.0
max,6040.0,3952.0,5.0,1046000000.0,56.0,20.0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5801 entries, 758856 to 775932
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      5801 non-null   int64 
 1   movie_id     5801 non-null   int64 
 2   rating       5801 non-null   int64 
 3   timestamp    5801 non-null   int64 
 4   gender       5801 non-null   object
 5   age          5801 non-null   int64 
 6   occupation   5801 non-null   int64 
 7   zip          5801 non-null   object
 8   title        5801 non-null   object
 9   genres       5801 non-null   object
 10  for_testing  5801 non-null   bool  
dtypes: bool(1), int64(6), object(4)
memory usage: 504.2+ KB


In [6]:
df_test.sample(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
330846,2306,597,5,974493350,F,18,4,2138,Pretty Woman (1990),Comedy|Romance,False
756612,3067,175,5,969999335,F,25,0,2148,Kids (1995),Drama,False
825706,5448,2932,3,960052606,M,45,19,60626,Days of Heaven (1978),Drama,False
324481,3087,2406,4,969682171,F,1,1,90802,Romancing the Stone (1984),Action|Adventure|Comedy|Romance,False
503143,3529,2863,3,966896486,F,45,1,97361,"Hard Day's Night, A (1964)",Comedy|Musical,False


In [7]:
df_test.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,occupation
count,2641.0,2641.0,2641.0,2641.0,2641.0,2641.0
mean,3015.204,1877.985,3.551,972100000.0,29.49,7.981
std,1732.383,1094.477,1.127,11980000.0,11.667,6.554
min,8.0,1.0,1.0,956700000.0,1.0,0.0
25%,1484.0,1073.0,3.0,965300000.0,25.0,2.0
50%,3054.0,1876.0,4.0,972700000.0,25.0,7.0
75%,4458.0,2795.0,4.0,975200000.0,35.0,14.0
max,6040.0,3949.0,5.0,1046000000.0,56.0,20.0


In [8]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2641 entries, 586911 to 284781
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      2641 non-null   int64 
 1   movie_id     2641 non-null   int64 
 2   rating       2641 non-null   int64 
 3   timestamp    2641 non-null   int64 
 4   gender       2641 non-null   object
 5   age          2641 non-null   int64 
 6   occupation   2641 non-null   int64 
 7   zip          2641 non-null   object
 8   title        2641 non-null   object
 9   genres       2641 non-null   object
 10  for_testing  2641 non-null   bool  
dtypes: bool(1), int64(6), object(4)
memory usage: 229.5+ KB


## Evaluation: performance criterion

RMSE: $\sqrt{\frac{\sum(\hat y - y)^2}{n}}$

In [29]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

## Evaluation: the 'evaluate' method

In [30]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(df_test.user_id, df_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = df_test.rating.values
    return compute_rmse(estimated, real)

In [31]:
def my_estimate_function(user_id, movie_id):
    return 3

In [32]:
print('RMSE for my estimate function: %s' % evaluate(my_estimate_function))

RMSE for my estimate function: 1.2323719526527521


On a scale of 1 to 5, let's take a rating of 3 to be a baseline and compute the RMSE based on that assumption. Now we'll have an idea if the following models are doing a "good" or "bad" job in predicting recommendations.

## Rec Engine v1.0: mean ratings

Content-based: Compute the mean of the user's ratings.

In [24]:
def content_mean(user_id, movie_id):
    """ Content-filtering based on mean ratings. """
    
    user_condition = df_train.user_id == user_id
    return df_train.loc[user_condition, 'rating'].mean()

print('RMSE for content1: %s' % evaluate(content_mean))

RMSE for content1: 1.2307824759704098


Collaborative-filtering: Compute the mean of all users that rated a movie.

In [25]:
def collab_mean(user_id, movie_id):
    user_condition = df_train.user_id != user_id
    movie_condition = df_train.movie_id == movie_id
    ratings_by_others = df_train.loc[user_condition & movie_condition]
    # Handling edge cases
    if ratings_by_others.empty:
        return 3.0
    else:
        return ratings_by_others.rating.mean()

    
# try it out for a user_id, movie_id pair
print(collab_mean(4653, 2648))
print('RMSE for collab1: %s' % evaluate(collab_mean))

4.0
RMSE for collab1: 1.1234279896011794


## Pivoting

Extract a ratings matrix

In [34]:
ratings_mtx_df = df_train.pivot_table(values='rating',
                                      index='user_id',
                                      columns='movie_id')
ratings_mtx_df.sample(5)

movie_id,1,2,3,4,5,6,7,10,11,14,...,3930,3932,3935,3938,3943,3945,3948,3949,3950,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3410,,,,,,,,,,,...,,,,,,,,,,
3694,,,,,,,,,,,...,,,,,,,,,,
3582,,,,,,,,,,,...,,,,,,,,,,
4150,,,,,,,,,,,...,,,,,,,,,,
3415,,,,,,,,,,,...,,,,,,,,,,


In [35]:
ratings_mtx_df.loc[11:16, 1196:1200]

movie_id,1196,1197,1198,1199,1200
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11,,,,,
13,5.0,,,,
15,,,,,


In [38]:
df_train.pivot_table(values='rating', index='age', columns='gender', aggfunc=[np.mean,np.std])

Unnamed: 0_level_0,mean,mean,std,std
gender,F,M,F,M
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3.605,3.384,1.285,1.303
18,3.425,3.438,1.196,1.149
25,3.613,3.431,1.082,1.143
35,3.648,3.614,1.019,1.039
45,3.549,3.552,1.01,1.073
50,3.671,3.759,1.138,1.038
56,4.069,3.697,0.961,1.086


## Rec engine v1.1: implicit sim functions

Set the user_id as the index for the users data frame for future convenience

In [17]:
user_info = users.set_index('user_id')
user_info.head(5)

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


### Collaborative filtering using implicit sim functions

Use an implicit `sim(u,u')` function to compare different users

In [14]:
def collab_gender(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender. """
    
    # Only use ratings from other users
    user_condition = df_train.user_id != user_id
    # Boolean panda series where movie_id=True and is set as False every where else
    movie_condition = df_train.movie_id == movie_id
    ratings_by_others = df_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty:
        return 3.0
    
    means_by_gender = ratings_by_others.pivot_table('rating', index='movie_id', columns='gender')
    user_gender = user_info.loc[user_id, 'gender']
    if user_gender in means_by_gender.columns:
        return means_by_gender.loc[movie_id, user_gender]
    else:
        return means_by_gender.loc[movie_id].mean()

print('RMSE for collab_gender: %s' % evaluate(collab_gender))

RMSE for collab_gender: 1.194027622310891


Filtering just based on gender appears to do worse than a simple collaborative filtering using both genders.

In [26]:
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_gender = df_train.pivot_table('rating', index='movie_id', columns='gender')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_gender.index: 
            return 3.0
        
        user_gender = user_info.loc[user_id, 'gender']
        if ~np.isnan(self.means_by_gender.loc[movie_id, user_gender]):
            return self.means_by_gender.loc[movie_id, user_gender]
        else:
            return self.means_by_gender.loc[movie_id].mean()

reco = CollabGenderReco()
reco.learn()
print('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

RMSE for CollabGenderReco: 1.1740082417112805


## Minimal reco engine v1.2: custom similarity functions

### A few similarity functions

These were all written to operate on two pandas Series, each one representing the rating history of two different users. You can also apply them to any two feature vectors that describe users or items. In all cases, the higher the return value, the more similar two Series are. You might need to add checks for edge cases, such as divisions by zero, etc.

- Euclidean 'similarity'

$$ sim(x,y) = \frac{1}{1 + \sqrt{\sum (x - y)^2}}$$

In [62]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

- Cosine similarity

$$ sim(x,y) = \frac{(x \cdot y)}{\sqrt{(x \cdot x) (y \cdot y)}} $$

In [63]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

- Pearson correlation

$$ sim(x,y) = \frac{(x - \bar x)\cdot(y - \bar y)}{\sqrt{(x - \bar x)\cdot(x - \bar x) * (y - \bar y)(y - \bar y)}} $$

In [27]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

- Jaccard similarity

$$ sim(x,y) = \frac{(x \cdot y)}{(x \cdot x) + (y \cdot y) - (x \cdot y)} $$

In [65]:
def jaccard(s1, s2):
    dotp = np.sum(s1 * s2)
    return dotp / (np.sum(s1 ** 2) + np.sum(s2 ** 2) - dotp)

def binjaccard(s1, s2):
    dotp = (s1.index & s2.index).size
    return dotp / (s1.sum() + s2.sum() - dotp)

### Collaborative-based filtering using custom sim functions

In [33]:
class CollabPearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = df_train.user_id != user_id
        movie_condition = df_train.movie_id == movie_id
        ratings_by_others = df_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        # A 'profile' is a pandas series with movie_id as the index and rating as the value
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        # Handle the edge cases
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabPearsonReco()
reco.learn()
print('RMSE for CollabPearsonReco: %s' % evaluate(reco.estimate))

RMSE for CollabPearsonReco: 1.1227767489967162


In [23]:
movielens.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
