In [4]:
import numpy as np
import pandas as pd

In [5]:
# Column names for u.user; they are supplied explicitly through `names=` below.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
# Read the pipe-delimited user file, decoding it as latin-1.
users = pd.read_csv("../movielens_data/u.user", sep="|", names=u_cols, encoding="latin-1")
# Preview the first rows.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
# Column names for u.item: movie metadata first, then one flag column per genre.
i_cols = ['movie_id', 'title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Read the pipe-delimited movie file with the names above, decoding it as latin-1.
movies = pd.read_csv("../movielens_data/u.item", sep="|", names=i_cols, encoding="latin-1")
movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


We see that this file gives us information regarding the movie's title, release date, IMDb
URL, and its genre(s). Since we are focused on building only collaborative filters in this
chapter, we do not require any of this information, apart from the movie title and its
corresponding ID:

In [7]:
# Keep only the ID and the title; the other metadata columns are not used by the collaborative filters.
movies = movies[['movie_id', 'title']]

In [8]:
# Column names for u.data; they are supplied explicitly through `names=` below.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# Read the tab-delimited ratings file, decoding it as latin-1.
ratings = pd.read_csv('../movielens_data/u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
# drop the timestamp column; errors='ignore' keeps this cell re-runnable
# after the column has already been removed by an earlier execution
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [10]:
# Show the first rows of the three working frames, one after the other.
display(users.head(), movies.head(), ratings.head())

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [11]:
from sklearn.model_selection import train_test_split

# The "features" are the full ratings frame (user_id, movie_id, rating).
X = ratings.copy()
# The "target" is user_id; it is passed to `stratify` below so the split is stratified by user.
y = ratings["user_id"]

# Hold out 25% of the ratings for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)




In [12]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between observed and predicted values.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values. Lists, Series and ndarrays are all
        accepted; both are converted with ``np.asarray`` before comparison.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y_true, y_pred)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Normalise to ndarrays first so that plain Python sequences expose `.shape`.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("Shape of y_true and y_pred must match")
    return np.sqrt(mean_squared_error(y_true, y_pred))

def baseline(user_id, movie_id):
    """Predict a constant rating of 3.0 for every (user_id, movie_id) pair.

    Both arguments are ignored; the function only exists as a reference
    model to compare the other filters against.
    """
    constant_rating = 3.0
    return constant_rating

In [30]:
X_test.head()

Unnamed: 0,user_id,movie_id,rating
53814,459,16,2
11263,389,429,4
41460,666,122,2
3853,7,162,5
40713,506,198,2


In [14]:
def score(cf_model):
    """Return the RMSE of ``cf_model`` over every row of ``X_test``.

    ``cf_model`` is called as ``cf_model(user_id, movie_id)`` once per test
    row and its predictions are compared with the ``rating`` column.
    """
    predictions = []
    for uid, mid in zip(X_test["user_id"], X_test["movie_id"]):
        predictions.append(cf_model(uid, mid))
    predicted = np.array(predictions)
    actual = np.array(X_test["rating"])
    return rmse(actual, predicted)

score(baseline)    

1.2488234462885457

## User-based collaborative filtering

In [12]:
# build the ratings matrix using pivot_table function:
# one row per user_id, one column per movie_id, cell = the training rating
# (the displayed frame shows NaN where a user has no rating for a movie)
r_matrix = X_train.pivot_table(values="rating", index="user_id", columns="movie_id")
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


### Mean
Let's first build one of the simplest collaborative filters possible. This simply takes
in user_id and movie_id and outputs the mean rating for the movie by all the users who
have rated it. No distinction is made between the users. In other words, the rating of each
user is assigned equal weight.

In [13]:
def cf_user_mean(user_id, movie_id):
    """Predict the rating as the mean of all training ratings given to movie_id.

    ``user_id`` is not used: every user who rated the movie carries equal
    weight. Movies that are not a column of ``r_matrix`` get 3.0.
    """
    if movie_id not in r_matrix:
        return 3.0
    return r_matrix[movie_id].mean()

score(cf_user_mean)

1.0300824802393536

We see that the score obtained for this model is lower and therefore better than the baseline.

### Weighted mean

In the previous model, we assigned equal weights to all the users. However, it makes
intuitive sense to give more preference to those users whose ratings are similar to the user
in question than the other users whose ratings are not.

In [14]:
# create a dummy ratings matrix with all null values imputed to 0
# (cosine_similarity is given this NaN-free copy; r_matrix itself keeps its NaNs)
r_matrix_dummy = r_matrix.copy().fillna(0)

from sklearn.metrics.pairwise import cosine_similarity

# pairwise similarity between the rows (users) of the dummy matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)
# wrap the raw array so rows and columns are both labelled by user_id
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)
cosine_sim.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.108361,0.046638,0.029577,0.245753,0.335853,0.344724,0.191582,0.057149,0.251979,...,0.257073,0.069412,0.231643,0.108093,0.176842,0.104799,0.232472,0.051528,0.129555,0.256333
2,0.108361,1.0,0.057613,0.130237,0.054918,0.190552,0.079399,0.076146,0.167992,0.147376,...,0.136993,0.252887,0.255454,0.285193,0.232751,0.149088,0.102807,0.062386,0.109143,0.107686
3,0.046638,0.057613,1.0,0.139805,0.0,0.032485,0.043869,0.080968,0.022263,0.059925,...,0.027402,0.0,0.17506,0.010343,0.105635,0.019052,0.127099,0.023917,0.060392,0.0
4,0.029577,0.130237,0.139805,1.0,0.0,0.04519,0.088586,0.199526,0.135013,0.026919,...,0.055392,0.049773,0.076549,0.139382,0.113886,0.0,0.130343,0.077357,0.15789,0.063911
5,0.245753,0.054918,0.0,0.0,1.0,0.176443,0.28186,0.132205,0.03879,0.1342,...,0.183969,0.019305,0.073714,0.041807,0.081088,0.029743,0.188392,0.068342,0.055557,0.207259
6,0.335853,0.190552,0.032485,0.04519,0.176443,1.0,0.394725,0.143385,0.125126,0.372679,...,0.328643,0.070809,0.135806,0.17167,0.125446,0.086464,0.230566,0.095478,0.197307,0.185268
7,0.344724,0.079399,0.043869,0.088586,0.28186,0.394725,1.0,0.215861,0.121224,0.378723,...,0.339853,0.110866,0.096055,0.10469,0.126108,0.075012,0.270071,0.020036,0.236086,0.266571
8,0.191582,0.076146,0.080968,0.199526,0.132205,0.143385,0.215861,1.0,0.116173,0.169088,...,0.150048,0.064242,0.118297,0.053969,0.168057,0.095736,0.164157,0.076269,0.089871,0.210995
9,0.057149,0.167992,0.022263,0.135013,0.03879,0.125126,0.121224,0.116173,1.0,0.152694,...,0.082819,0.0644,0.127051,0.069251,0.095673,0.0,0.131458,0.106763,0.089297,0.089583
10,0.251979,0.147376,0.059925,0.026919,0.1342,0.372679,0.378723,0.169088,0.152694,1.0,...,0.279849,0.087828,0.131888,0.111841,0.094423,0.080883,0.255758,0.063461,0.169309,0.181031


In [15]:
cosine_sim.info()

<class 'pandas.core.frame.DataFrame'>
Index: 943 entries, 1 to 943
Columns: 943 entries, 1 to 943
dtypes: float64(943)
memory usage: 6.8 MB


In [16]:
r_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 943 entries, 1 to 943
Columns: 1641 entries, 1 to 1682
dtypes: float64(1641)
memory usage: 11.8 MB


In [17]:
# Show the two inputs of the weighted-mean filter one after the other:
# user-to-user similarities and the user-by-movie ratings.
print("cosine similarity between users")
display(cosine_sim.head()) # contains 943 users * 943 users
print("ratings of movies by individual users")
display(r_matrix.head()) # contains 943 users * 1641 movies

cosine similarity between users


user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.108361,0.046638,0.029577,0.245753,0.335853,0.344724,0.191582,0.057149,0.251979,...,0.257073,0.069412,0.231643,0.108093,0.176842,0.104799,0.232472,0.051528,0.129555,0.256333
2,0.108361,1.0,0.057613,0.130237,0.054918,0.190552,0.079399,0.076146,0.167992,0.147376,...,0.136993,0.252887,0.255454,0.285193,0.232751,0.149088,0.102807,0.062386,0.109143,0.107686
3,0.046638,0.057613,1.0,0.139805,0.0,0.032485,0.043869,0.080968,0.022263,0.059925,...,0.027402,0.0,0.17506,0.010343,0.105635,0.019052,0.127099,0.023917,0.060392,0.0
4,0.029577,0.130237,0.139805,1.0,0.0,0.04519,0.088586,0.199526,0.135013,0.026919,...,0.055392,0.049773,0.076549,0.139382,0.113886,0.0,0.130343,0.077357,0.15789,0.063911
5,0.245753,0.054918,0.0,0.0,1.0,0.176443,0.28186,0.132205,0.03879,0.1342,...,0.183969,0.019305,0.073714,0.041807,0.081088,0.029743,0.188392,0.068342,0.055557,0.207259


ratings of movies by individual users


movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [18]:
def cf_user_wmean(user_id, movie_id):
    """Predict user_id's rating of movie_id as a similarity-weighted mean.

    Every training user who rated ``movie_id`` contributes their rating,
    weighted by their cosine similarity to ``user_id``.

    Parameters
    ----------
    user_id
        Label of the target user in ``cosine_sim``.
    movie_id
        Label of the target movie in the columns of ``r_matrix``.

    Returns
    -------
    float
        The weighted mean rating, or 3.0 when the movie or the user is absent
        from the training data or when all similarity weights are zero.
    """
    default_rating = 3.0

    # Cold start: nothing to work with if the movie or the user was never seen in training.
    if movie_id not in r_matrix or user_id not in cosine_sim:
        return default_rating

    # Ratings of the movie, restricted to the users who actually rated it.
    m_ratings = r_matrix[movie_id].dropna()
    # Similarity of the target user to exactly those raters, in the same order as m_ratings.
    sim_scores = cosine_sim.loc[m_ratings.index, user_id]

    weight_total = sim_scores.sum()
    if weight_total == 0:
        # No rater has any similarity to this user, so the weighted mean is undefined.
        return default_rating

    # The zero case is handled above, so no epsilon is needed in the denominator.
    return float(np.dot(sim_scores, m_ratings) / weight_total)

In [19]:
score(cf_user_wmean)

1.0237209914385483

### User Demographics
Unlike the previous models, these filters do not take into account the ratings given by all
users to a particular movie. Instead, they only look at those users that fit a certain
demographic.

Let's now build a gender demographic filter. All this filter does is identify the gender of a
user, compute the (weighted) mean rating of a movie by that particular gender, and return
that as the predicted value.


In [20]:
# Attach each rater's demographics to their training ratings.
# The join key is named explicitly so the merge does not depend on which
# column names the two frames happen to share.
merged_df = pd.merge(X_train, users, on="user_id")
merged_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,862,177,4,25,M,executive,13820
1,70,193,4,27,M,engineer,60067
2,666,527,4,44,M,administrator,61820
3,535,168,5,45,F,educator,80302
4,603,1240,5,21,M,programmer,47905


In [21]:
# mean training rating of every movie, split by the rater's sex
gender_mean = merged_df.groupby(['movie_id', 'sex'])['rating'].mean()
display(gender_mean.head())

movie_id  sex
1         F      3.797872
          M      3.888446
2         F      3.285714
          M      3.202703
3         F      2.916667
Name: rating, dtype: float64

In [22]:
# set the index of the users dataframe to the user_id
# (guarded so that re-running this cell does not fail once user_id is already the index)
if users.index.name != 'user_id':
    users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [23]:
# gender based collaborative filter using mean ratings
def cf_gender(user_id, movie_id):
    """Predict the rating as the mean rating of movie_id among users of the same sex.

    Falls back to 3.0 when the movie is not a column of ``r_matrix`` or when
    ``gender_mean`` has no entry for this movie and the user's sex.
    """
    if movie_id not in r_matrix:
        return 3.0

    sex = users.loc[user_id, 'sex']
    # sub-series of gender_mean for this movie, indexed by sex
    means_by_sex = gender_mean[movie_id]
    if sex in means_by_sex:
        return means_by_sex[sex]
    return 3.0

score(cf_gender)

1.0392906999935203

We see that this model actually performs worse than the standard mean ratings
collaborative filter. This indicates that a user's gender isn't the strongest indicator of their
taste in movies.

Let's try building one more demographic filter, but this time using both gender and
occupation:

In [24]:
# mean rating per movie (rows) for every occupation/sex combination (columns)
gen_occ_mean = merged_df.pivot_table(
    values="rating",
    index="movie_id",
    columns=["occupation", "sex"],
    aggfunc="mean",
)
gen_occ_mean.head()

occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,...,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
sex,F,M,F,M,M,F,M,F,M,F,...,F,M,F,M,F,M,F,M,F,M
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,3.9375,3.75,5.0,3.4,3.666667,3.25,3.884615,4.0,4.083333,4.0,...,,4.0,3.5,4.0,4.043478,3.796296,4.0,3.75,4.0,3.0
2,3.0,3.666667,,,,4.0,3.5,,3.066667,,...,,,,3.0,2.666667,3.277778,,2.714286,,2.333333
3,3.5,4.0,,,,,2.0,,3.777778,,...,,,,,3.0,3.391304,,4.25,,1.0
4,3.666667,3.6,,4.666667,3.0,2.5,3.8,4.0,3.65,,...,4.0,4.0,,3.4,3.25,3.777778,,3.333333,4.25,3.25
5,4.0,2.333333,,,,4.0,2.333333,,3.5,,...,,,,4.0,4.333333,3.111111,,3.333333,4.0,2.0


In [25]:
def cf_gen_occ(user_id, movie_id):
    """Predict the rating as the mean rating of movie_id among users who share
    the target user's occupation and sex.

    Returns 3.0 when the movie, the occupation or the sex has no entry in
    ``gen_occ_mean``, or when the matching cell is NaN.
    """
    fallback = 3.0

    # movie never rated in the training data
    if movie_id not in gen_occ_mean.index:
        return fallback

    profile = users.loc[user_id]
    sex = profile['sex']
    job = profile['occupation']

    movie_means = gen_occ_mean.loc[movie_id]
    # no column for this occupation
    if job not in movie_means:
        return fallback

    job_means = movie_means[job]
    # no column for this sex within the occupation
    if sex not in job_means:
        return fallback

    value = job_means[sex]
    return fallback if np.isnan(value) else value

score(cf_gen_occ)

1.1419651376788005

We see that this model performs the worst out of all the filters we've built so far, beating
only the baseline. This strongly suggests that tinkering with user demographic data may
not be the best way to go forward with the data that we are currently using.

### Item-based Collaborative Filtering
Item-based collaborative filtering is essentially user-based collaborative filtering where the
users now play the role that items played, and vice versa.

In item-based collaborative filtering, we compute the pairwise similarity of every item in
the inventory. Then, given user_id and movie_id, we compute the weighted mean of the
ratings given by the user to all the items they have rated. The basic idea behind this model
is that a particular user is likely to rate two items that are similar to each other similarly.

Building an item-based collaborative filter is left as an exercise to the reader. The steps
involved are exactly the same except now, as mentioned earlier, the movies and users have
swapped places.

In [27]:
X_test.head()

Unnamed: 0,user_id,movie_id,rating
53814,459,16,2
11263,389,429,4
41460,666,122,2
3853,7,162,5
40713,506,198,2


In [26]:
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [28]:
# create a dummy ratings matrix with all null values imputed to 0
# (same expression as in the user-based section; rebuilt here before the item-based similarity)
r_matrix_dummy = r_matrix.copy().fillna(0)
r_matrix_dummy.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,0.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
from sklearn.metrics.pairwise import cosine_similarity

# Transpose so that every row is one movie's rating vector: the result is movie-to-movie similarity.
item_cosine_sim = cosine_similarity(r_matrix_dummy.T, r_matrix_dummy.T)
print(item_cosine_sim.shape)
# Label rows and columns with the movie ids. The data wrapped here must be the
# item matrix computed above, not the user-to-user `cosine_sim` frame.
item_cosine_sim = pd.DataFrame(item_cosine_sim, index=r_matrix.columns, columns=r_matrix.columns)
item_cosine_sim.head(10)

(1641, 1641)


movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.260375,0.28411,0.339919,0.188551,0.075488,0.493766,0.346421,0.408303,0.196823,...,0.0,0.03838,0.040708,0.0,0.0,0.0,0.0,0.0,0.054278,0.0
2,0.260375,1.0,0.18335,0.362014,0.256462,0.098676,0.286996,0.271497,0.186905,0.099162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095783
3,0.28411,0.18335,1.0,0.261785,0.164305,0.063693,0.296699,0.175637,0.225768,0.124924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107006
4,0.339919,0.362014,0.261785,1.0,0.192404,0.049803,0.357379,0.367472,0.337266,0.190223,...,0.0,0.046614,0.0,0.0,0.10987,0.0,0.0,0.0,0.065922,0.087896
5,0.188551,0.256462,0.164305,0.192404,1.0,0.060136,0.276375,0.18241,0.261563,0.045282,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.075488,0.098676,0.063693,0.049803,0.060136,1.0,0.112621,0.070239,0.087844,0.108924,...,0.0,0.048564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.493766,0.286996,0.296699,0.357379,0.276375,0.112621,1.0,0.329754,0.442689,0.248735,...,0.0,0.042164,0.0,0.059628,0.0,0.0,0.0,0.0,0.059628,0.0
8,0.346421,0.271497,0.175637,0.367472,0.18241,0.070239,0.329754,1.0,0.360556,0.170206,...,0.0,0.0,0.0,0.094694,0.075755,0.094694,0.0,0.0,0.0,0.0
9,0.408303,0.186905,0.225768,0.337266,0.261563,0.087844,0.442689,0.360556,1.0,0.262773,...,0.0,0.0,0.0,0.0,0.065591,0.081989,0.0,0.0,0.0,0.081989
10,0.196823,0.099162,0.124924,0.190223,0.045282,0.108924,0.248735,0.170206,0.262773,1.0,...,0.0,0.07009,0.0,0.0,0.099123,0.0,0.0,0.0,0.0,0.0


In [35]:
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [54]:
def cf_item_wmean(user_id, movie_id):
    """Predict user_id's rating of movie_id from the user's own ratings of other movies.

    Every movie the user rated in the training data contributes its rating,
    weighted by that movie's cosine similarity to ``movie_id``.

    Parameters
    ----------
    user_id
        Label of the target user in the index of ``r_matrix``.
    movie_id
        Label of the target movie in the columns of ``r_matrix``.

    Returns
    -------
    float
        The weighted mean rating, or 3.0 when the movie or the user is absent
        from the training data or when all similarity weights are zero.
    """
    default_rating = 3.0

    # Cold start: nothing to work with if the movie or the user was never seen in training.
    if movie_id not in r_matrix or user_id not in r_matrix.index:
        return default_rating

    # Ratings given by the target user, restricted to the movies they actually rated.
    u_ratings = r_matrix.loc[user_id].dropna()
    # Similarity of the target movie to exactly those movies, in the same order as u_ratings.
    sim_scores = item_cosine_sim.loc[u_ratings.index, movie_id]

    weight_total = sim_scores.sum()
    if weight_total == 0:
        # None of the user's rated movies has any similarity to the target movie.
        return default_rating

    # The zero case is handled above, so no epsilon is needed in the denominator.
    return float(np.dot(sim_scores, u_ratings) / weight_total)

In [55]:
score(cf_item_wmean)

1.0184931272898352

We can observe that the item-based collaborative filter has a lower RMSE than the weighted-mean user-based collaborative filter.

## Model-based approaches
The collaborative filters we have built thus far are known as memory-based filters. This is
because they only make use of similarity metrics to come up with their results.
They do not learn any parameters from the data or assign classes/clusters to the data. In other
words, they do not make use of machine learning algorithms.

### Clustering
One of the major drawbacks of the demographic filters was that they were based on the
assumption that people from a certain demographic think and rate alike. However, we can
safely say that this is an overreached assumption. Not all men like action movies. Nor do all children like animated movies. Similarly, it is extremely far-fetched to assume that people from a particular area or occupation will have the same taste.


In this section, we will use k-means' sister algorithm, kNN, to build our clustering-based
collaborative filter. In a nutshell, given a user, u, and a movie, m, these are the steps
involved:
1. Find the k-nearest neighbors of u who have rated movie m
2. Output the average rating of the k users for the movie m

### Surprise
Surprise is a scikit (or scientific kit) for building recommender systems in Python. You can
think of it as scikit-learn's recommender systems counterpart. According to its
documentation, surprise stands for Simple Python Recommendation System Engine.
Within a very short span of time, surprise has gone on to become one of the most
popularly used recommender libraries. This is because it is extremely robust and easy to
use. It gives us ready-to-use implementations of most of the popular collaborative filtering
algorithms and also allows us to integrate an algorithm of our own into the framework.

In [15]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [32]:
ratings.isnull().sum()

user_id     0
movie_id    0
rating      0
dtype: int64

In [33]:
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate

# Reader is left at its defaults — presumably these match the 1-5 rating scale; confirm against the surprise docs.
reader = Reader()
# Load the three-column ratings frame (user_id, movie_id, rating) into a surprise Dataset.
data = Dataset.load_from_df(ratings, reader)
# KNNBasic with default settings; the log printed below reports an "msd" similarity matrix.
knn = KNNBasic()

# 5-fold cross-validation over the full ratings data, reporting RMSE and MAE per fold.
cross_validate(knn, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9841  0.9773  0.9708  0.9819  0.9825  0.9793  0.0048  
MAE (testset)     0.7764  0.7730  0.7684  0.7758  0.7743  0.7735  0.0028  
Fit time          0.35    0.34    0.40    0.37    0.42    0.38    0.03    
Test time         3.27    3.17    2.95    3.32    3.40    3.22    0.15    


{'test_rmse': array([0.98407687, 0.97726987, 0.97076385, 0.98189263, 0.98248595]),
 'test_mae': array([0.77637547, 0.77295085, 0.76837626, 0.77577312, 0.77425459]),
 'fit_time': (0.3491382598876953,
  0.34187936782836914,
  0.3959615230560303,
  0.37213611602783203,
  0.4196145534515381),
 'test_time': (3.2672736644744873,
  3.17242169380188,
  2.953244209289551,
  3.3167967796325684,
  3.3968868255615234)}

**We see that the mean RMSE obtained by this model is 0.98 (0.9793 across the five folds). This is, by far, the best result we
have achieved.**

The output indicates that the filter is making use of a technique known as fivefold cross-
validation. In a nutshell, this means that surprise divides the data into five equal parts.
It then uses four parts as the training data and tests it on the fifth part. This is done five
times, in such a way that every part plays the role of the test data once.

In [34]:
X_test.head()

Unnamed: 0,user_id,movie_id,rating
53814,459,16,2
11263,389,429,4
41460,666,122,2
3853,7,162,5
40713,506,198,2


In [35]:
# NOTE(review): this run cross-validates on X_test (the 25% hold-out) instead of the
# full ratings frame, so every fold trains on far fewer ratings. Its scores are not
# directly comparable with the cross-validation on `ratings`.
reader = Reader()
data = Dataset.load_from_df(X_test, reader)
knn = KNNBasic()

# 5-fold cross-validation, reporting RMSE and MAE per fold.
cross_validate(knn, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0686  1.0541  1.0599  1.0740  1.0739  1.0661  0.0079  
MAE (testset)     0.8509  0.8307  0.8429  0.8480  0.8493  0.8443  0.0073  
Fit time          0.06    0.05    0.04    0.03    0.03    0.04    0.01    
Test time         0.34    0.24    0.25    0.24    0.24    0.26    0.04    


{'test_rmse': array([1.06861501, 1.05405771, 1.05990289, 1.07400329, 1.07389395]),
 'test_mae': array([0.85086214, 0.83070557, 0.84287903, 0.84801723, 0.84925077]),
 'fit_time': (0.060788869857788086,
  0.04543185234069824,
  0.03997945785522461,
  0.031974077224731445,
  0.03447985649108887),
 'test_time': (0.33794713020324707,
  0.24402213096618652,
  0.2522573471069336,
  0.23864150047302246,
  0.2433621883392334)}

## Supervised learning and dimensionality reduction
Consider our ratings matrix once again. It is of the m × n shape, where every row represents
one of the m users and every column represents one of the n items.

Let's now remove one of the n columns (say nj). We now have an m × (n-1) matrix. If we
treat the m × (n-1) matrix as the predictor variables and nj as the target variable, we can use
supervised learning algorithms to train on the values available in nj to predict values that
are not. This can be repeated n times for every column to eventually complete our matrix.

One big problem is that most supervised learning algorithms do not work with missing
data. In standard problems, it is common practice to impute the missing values with the
mean or median of the column it belongs to.

In [36]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [38]:
from surprise import SVD

In [39]:
# Same evaluation protocol as the KNNBasic run on `ratings`, with the SVD algorithm instead.
reader = Reader()
data = Dataset.load_from_df(ratings, reader)
# SVD with default settings.
svd = SVD()

# 5-fold cross-validation over the full ratings data, reporting RMSE and MAE per fold.
cross_validate(svd, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9376  0.9428  0.9377  0.9304  0.9373  0.9372  0.0040  
MAE (testset)     0.7392  0.7431  0.7389  0.7328  0.7376  0.7383  0.0033  
Fit time          1.29    1.30    1.29    1.28    1.28    1.29    0.01    
Test time         0.11    0.19    0.14    0.19    0.11    0.15    0.03    


{'test_rmse': array([0.93763828, 0.94283292, 0.93767089, 0.93038109, 0.93734996]),
 'test_mae': array([0.73923354, 0.74306381, 0.73889103, 0.73275099, 0.73762304]),
 'fit_time': (1.287940263748169,
  1.3043556213378906,
  1.2862825393676758,
  1.2752282619476318,
  1.2828142642974854),
 'test_time': (0.11468076705932617,
  0.188554048538208,
  0.1400449275970459,
  0.18929147720336914,
  0.11342144012451172)}

The SVD filter outperforms all other filters, with a mean RMSE score of 0.9372.