# Hands On Recommendation Systems with Python
#### Chapter 6 : Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
import os

## Data

In [2]:
# User Data
datapath = os.getcwd()
datafile = '/Data/movielens_100k/u.user'

user_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
df_users = pd.read_csv(datapath + datafile, sep='|', names=user_cols, encoding='latin-1')
df_users.head(3)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067


In [3]:
# Movie Data
datapath = os.getcwd()
datafile = '/Data/movielens_100k/u.item'

movie_cols = ['movie_id', 'title_year' ,'release_date','video_release_date',
              'IMDB URL', 'unknown', 'Action', 'Adventure','Animation', 
              'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 
              'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
              'Sci-Fi', 'Thriller', 'War', 'Western']

df_movies = pd.read_csv(datapath + datafile, sep='|', names=movie_cols, encoding='latin-1')
df_movies.head(3)

Unnamed: 0,movie_id,title_year,release_date,video_release_date,IMDB URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Remove all columns except movie_id and title_year
df_movies = df_movies[['movie_id', 'title_year']]

In [5]:
# Rating Data
datapath = os.getcwd()
datafile = '/Data/movielens_100k/u.data'

rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

df_ratings = pd.read_csv(datapath + datafile, sep='\t', names=rating_cols, encoding='latin-1')
df_ratings.head(3)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [6]:
# Drop timestamp column from Rating Data
df_ratings = df_ratings.drop(columns=['timestamp'], axis=1)

In [7]:
print( df_ratings.user_id.nunique()  )
print( df_ratings.movie_id.nunique() )

943
1682


In [8]:
# df_ratings['user_id'].value_counts().sort_values().head(35).sort_index()
# df_ratings.groupby(['user_id']).size().sort_values().head(35).sort_index()

## Data Split

In [9]:
from sklearn.model_selection import train_test_split

X = df_ratings.copy()
y = df_ratings['user_id']

# Note that y_train and y_test are just place holders. Do not use them for any calculation.   
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, stratify=y, random_state=42)

## Model 1 : Baseline Model

In [10]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

In [11]:
def baseline_model(user_id, movie_id):
    return 3.0

In [12]:
def score(fitted_model):    
    # Construct a list of user-movie tuples from the testing dataset
    user_movie_pairs = zip(X_test['user_id'], X_test['movie_id'])
    
    # Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])
    
    #  Predict the rating for every user-movie tuple
    y_pred = np.array([fitted_model(user, movie) for (user, movie) in user_movie_pairs])
     
    return rmse(y_true, y_pred)

In [13]:
score(baseline_model)

1.2488234462885457

## Model 2 : User Based Collaborative Filtering

#### User_Movie Matrix

In [14]:
user_movie = X_train.pivot(index='user_id', columns='movie_id', values='rating')
user_movie.head(3)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


#### Utility Matrix

Imput null values by 0

In [15]:
utility = user_movie.copy().fillna(0)
utility.head(3)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,0.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df_ratings.movie_id.sort_values().tail(3)

80394    1680
92329    1681
95376    1682
Name: movie_id, dtype: int64

#### Similarity Matrix

In [17]:
from sklearn.metrics import pairwise_distances
cosine_sim = pairwise_distances(utility, utility, metric='cosine')

In [18]:
cosine_sim = pd.DataFrame(cosine_sim, index=user_movie.index, columns=user_movie.index)
cosine_sim.head(5)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.332268e-15,0.891639,0.953362,0.9704227,0.7542473,0.664147,0.655276,0.808418,0.942851,0.748021,...,0.742927,0.930588,0.768357,0.891907,0.823158,0.895201,0.767528,0.948472,0.870445,0.743667
2,0.8916386,0.0,0.942387,0.8697628,0.9450823,0.809448,0.920601,0.923854,0.832008,0.852624,...,0.863007,0.747113,0.744546,0.714807,0.767249,0.850912,0.897193,0.937614,0.890857,0.892314
3,0.9533615,0.942387,0.0,0.8601954,1.0,0.967515,0.956131,0.919032,0.977737,0.940075,...,0.972598,1.0,0.82494,0.989657,0.894365,0.980948,0.872901,0.976083,0.939608,1.0
4,0.9704227,0.869763,0.860195,1.110223e-16,1.0,0.95481,0.911414,0.800474,0.864987,0.973081,...,0.944608,0.950227,0.923451,0.860618,0.886114,1.0,0.869657,0.922643,0.84211,0.936089
5,0.7542473,0.945082,1.0,1.0,6.661338e-16,0.823557,0.71814,0.867795,0.96121,0.8658,...,0.816031,0.980695,0.926286,0.958193,0.918912,0.970257,0.811608,0.931658,0.944443,0.792741


#### 2.1 : Mean Value Model

In [19]:
# User Based Collaborative Filter using Mean Ratings
def mean_model(user_id, movie_id):
    
    # Check if movie_id exists in rating matrix
    if movie_id in user_movie:
        # Compute the mean of all the ratings given to the movie
        mean_rating = user_movie[movie_id].mean()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    
    return mean_rating

In [20]:
# Check
# y_true = X_test['rating'].to_list()
# y_pred = [ round(mean_model(user, movie),2) for (user,movie) in zip(X_test['user_id'], X_test['movie_id']) ]
# print( y_true[0:5], y_pred[0:5])

In [21]:
score(mean_model)

1.0300824802393536

#### 2.2 : Weighted Mean Value Model

In [22]:
# User-User Collaborative Filter using Weighted Mean Ratings
def weightedmean_model(user_id, movie_id):
    
    # Check if movie_id exists in r_matrix
    if movie_id in user_movie:
        
        # Get the similarity scores for the user in question with every other user
        similarity_scores = cosine_sim[user_id]
        
        # Get the user ratings for the movie in question
        movie_ratings = user_movie[movie_id]
        
        # Extract the indices containing NaN in the m_ratings series
        idx = movie_ratings[movie_ratings.isnull()].index
        
        # Drop NaN values from movie_ratings Series
        movie_ratings = movie_ratings.dropna()
        
        # Drop corresponding cosine scores from the similarity_scores series
        similarity_scores = similarity_scores.drop(idx)
        
        # Compute the final weighted mean
        wmean_rating = np.dot(similarity_scores, movie_ratings)/ similarity_scores.sum()
    
    else:
        wmean_rating = 3.0
    
    return wmean_rating

In [23]:
score(weightedmean_model)

1.032506794519864

## Model 3 : Mean Value using Information from Additional Feature

In [24]:
# Merge the original users dataframe with the training set 
merged_df = pd.merge(X_train, df_users)
merged_df.head(3)

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,862,177,4,25,M,executive,13820
1,862,416,3,25,M,executive,13820
2,862,1093,5,25,M,executive,13820


In [25]:
print( merged_df['rating'].value_counts() )

4    25733
3    20347
5    15832
2     8538
1     4550
Name: rating, dtype: int64


In [26]:
gender_mean = merged_df.groupby(['movie_id', 'sex']).agg({'rating': 'mean'})
gender_mean = gender_mean.reset_index(level=[0,1], drop=False)
gender_mean.head(5)

Unnamed: 0,movie_id,sex,rating
0,1,F,3.797872
1,1,M,3.888446
2,2,F,3.285714
3,2,M,3.202703
4,3,F,2.916667


In [27]:
# Compute the mean rating of every movie by gender
tmpdf = merged_df[['movie_id', 'sex', 'rating']]
gender_mean = tmpdf.groupby(['movie_id','sex'])['rating'].mean()
gender_mean.head(5)

movie_id  sex
1         F      3.797872
          M      3.888446
2         F      3.285714
          M      3.202703
3         F      2.916667
Name: rating, dtype: float64

In [28]:
# Set the index of the users dataframe to the user_id
users = df_users.set_index('user_id')
users.head(3)

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067


## Model 4 :

In [29]:
# Gender Based Collaborative Filter using Mean Ratings
def cf_gender(user_id, movie_id):
    
    #Check if movie_id exists in r_matrix (or training set)
    if movie_id in user_movie:
        #Identify the gender of the user
        gender = users.loc[user_id]['sex']
    
        
        #Check if the gender has rated the movie
        if gender in gender_mean[movie_id]:
            
            #Compute the mean rating given by that gender to the movie
            gender_rating = gender_mean[movie_id][gender]
        
        else:
            gender_rating = 3.0
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        gender_rating = 3.0
    
    return gender_rating

In [30]:
score(cf_gender)

1.0392906999935203

In [31]:
# Compute the mean rating by gender and occupation
tmpdf = merged_df[['sex', 'rating', 'movie_id', 'occupation']]
gen_occ_mean = tmpdf.pivot_table(index='movie_id', columns=['occupation', 'sex'], 
                                 values='rating', 
                                 aggfunc='mean'
                                )
gen_occ_mean.head(3)

occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,...,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
sex,F,M,F,M,M,F,M,F,M,F,...,F,M,F,M,F,M,F,M,F,M
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,3.9375,3.75,5.0,3.4,3.666667,3.25,3.884615,4.0,4.083333,4.0,...,,4.0,3.5,4.0,4.043478,3.796296,4.0,3.75,4.0,3.0
2,3.0,3.666667,,,,4.0,3.5,,3.066667,,...,,,,3.0,2.666667,3.277778,,2.714286,,2.333333
3,3.5,4.0,,,,,2.0,,3.777778,,...,,,,,3.0,3.391304,,4.25,,1.0


In [32]:
# Gender and Occupation Based Collaborative Filter using Mean Ratings
def cf_gen_occ(user_id, movie_id):
    
    #Check if movie_id exists in gen_occ_mean
    if movie_id in gen_occ_mean.index:
        
        #Identify the user
        user = users.loc[user_id]
        
        #Identify the gender and occupation
        gender = user['sex']
        occ = user['occupation']
        
        #Check if the occupation has rated the movie
        if occ in gen_occ_mean.loc[movie_id]:
            
            #Check if the gender has rated the movie
            if gender in gen_occ_mean.loc[movie_id][occ]:
                
                #Extract the required rating
                rating = gen_occ_mean.loc[movie_id][occ][gender]
                
                #Default to 3.0 if the rating is null
                if np.isnan(rating):
                    rating = 3.0
                
                return rating
            
    #Return the default rating    
    return 3.0

In [33]:
score(cf_gen_occ)

1.1419651376788005

## Model-based Approach

In [34]:
# Import the required classes and methods from the surprise library
from surprise import Reader, Dataset, KNNBasic, SVD, Prediction
from surprise.model_selection import cross_validate

In [35]:
# Define a Reader object
# The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader()

# Create the dataset to be used for building the filter
data = Dataset.load_from_df(df_ratings, reader=reader)

# Define the algorithm object; in this case kNN
knn = KNNBasic()

cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
#Evaluate the performance in terms of RMSE
#evaluate(knn, data, measures=['RMSE'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9773  0.9810  0.9725  0.9781  0.9854  0.9789  0.0043  
MAE (testset)     0.7722  0.7753  0.7693  0.7710  0.7776  0.7731  0.0030  
Fit time          0.30    0.36    0.37    0.37    0.38    0.36    0.03    
Test time         2.18    2.31    2.40    2.35    2.41    2.33    0.08    


{'test_rmse': array([0.97728832, 0.98103448, 0.97250272, 0.97808569, 0.98536242]),
 'test_mae': array([0.77221028, 0.77533119, 0.76931955, 0.77101197, 0.77760957]),
 'fit_time': (0.303433895111084,
  0.363616943359375,
  0.3738858699798584,
  0.3682229518890381,
  0.37815189361572266),
 'test_time': (2.1821062564849854,
  2.3141798973083496,
  2.4034509658813477,
  2.3515231609344482,
  2.414177179336548)}

In [36]:
# Define the SVD object
svd = SVD()

cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9296  0.9401  0.9397  0.9316  0.9373  0.9357  0.0043  
MAE (testset)     0.7342  0.7384  0.7389  0.7335  0.7395  0.7369  0.0025  
Fit time          4.09    4.04    4.07    4.03    4.08    4.06    0.02    
Test time         0.10    0.17    0.09    0.09    0.18    0.13    0.04    


{'test_rmse': array([0.92959822, 0.9401207 , 0.93965357, 0.93156499, 0.93733371]),
 'test_mae': array([0.73415956, 0.73839397, 0.73893792, 0.73354688, 0.73946374]),
 'fit_time': (4.087656021118164,
  4.0390918254852295,
  4.072702169418335,
  4.027628183364868,
  4.078943967819214),
 'test_time': (0.10000896453857422,
  0.17186307907104492,
  0.09473800659179688,
  0.09419798851013184,
  0.18438196182250977)}