In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

#Directory that holds both input CSV files.
#NOTE(review): this is an absolute, machine-specific path - change it here (one place) when running elsewhere
DATA_DIR = '/Users/paramanandbhat/Downloads/UserBasedCollaborativeFilteringfromscratch-201024-234223'

#Reading ratings file:
ratings = pd.read_csv(DATA_DIR + '/ratings.csv')

#Reading Movie Info File
movie_info = pd.read_csv(DATA_DIR + '/movie_info.csv')

In [2]:
#Left-join the movie titles onto the ratings (ratings.movie_id matches movie_info['movie id']),
#so every rating row is kept
ratings = ratings.merge(
    movie_info.loc[:, ['movie id', 'movie title']],
    left_on='movie_id',
    right_on='movie id',
    how='left',
)

In [3]:
#Preview the first rows of the merged ratings/titles frame
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,movie id,movie title
0,196,242,3,881250949,242,Kolya (1996)
1,186,302,3,891717742,302,L.A. Confidential (1997)
2,22,377,1,878887116,377,Heavyweights (1994)
3,244,51,2,880606923,51,Legends of the Fall (1994)
4,166,346,1,886397596,346,Jackie Brown (1997)


In [4]:
'''Let's also combine movie id and movie title, separated by ': ', and store the result in a new column named movie'''

"Let's also combine movie id and movie title, separated by ': ', and store the result in a new column named movie"

In [5]:
#Build the combined "<movie_id>: <movie title>" label for every rating row
ratings['movie'] = (
    ratings['movie_id'].map(str)
    + ': '
    + ratings['movie title'].map(str)
)

In [6]:
#List the column labels currently present in ratings
ratings.columns

Index(['user_id', 'movie_id', 'rating', 'unix_timestamp', 'movie id',
       'movie title', 'movie'],
      dtype='object')

In [7]:
#Discard the raw id, title and timestamp columns; the combined 'movie' label replaces them
ratings = ratings.drop(columns=['movie id', 'movie title', 'movie_id', 'unix_timestamp'])

In [8]:
#Keep the three modelling columns in a fixed order: user, movie label, rating
ratings = ratings.loc[:, ['user_id', 'movie', 'rating']]

In [9]:
## 3. Creating Train & Test Data & Setting Evaluation Metric

In [10]:
#Assign X as a copy of the ratings dataframe, so the split works on its own object
X = ratings.copy()

#Split into training (75%) and test (25%) datasets; random_state pins the shuffle so the split is repeatable
X_train, X_test = train_test_split(X, test_size = 0.25, random_state=42)

In [11]:
#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Both arguments are forwarded unchanged to sklearn's `mean_squared_error`;
    the square root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [12]:
## 4. Simple Baseline using average of all ratings

In [13]:
#Define the baseline model to always return average of all available ratings
def baseline(user_id, movie):
    """Predict the mean of all training ratings, whatever the user or movie.

    `user_id` and `movie` are accepted only so the function has the same
    signature as the other models; neither is used.
    """
    train_ratings = X_train['rating']
    return train_ratings.mean()

In [14]:
#Function to compute the RMSE score obtained on the test set by a model
def rmse_score(model):
    """Return the RMSE of `model` on the test set `X_test`.

    `model` is called as `model(user_id, movie)` once per test row, in row
    order, and must return the predicted rating for that pair.
    """
    #Predict the rating for every (user, movie) pair of the test dataset
    predictions = []
    for user, movie in zip(X_test['user_id'], X_test['movie']):
        predictions.append(model(user, movie))
    y_pred = np.array(predictions)

    #Actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])

    #Compare predictions with the actual ratings
    return rmse(y_true, y_pred)

In [15]:
#Score the global-mean baseline on the test set
rmse_score(baseline)

1.1244396573898978

In [16]:
## 6. User based Collaborative filtering with simple user mean


In [17]:
#Build the user x movie ratings matrix from the training rows: one row per user_id,
#one column per movie label, NaN where the user has no training rating for the movie
r_matrix = X_train.pivot_table(
    index='user_id',
    columns='movie',
    values='rating',
)

r_matrix.head()

movie,1000: Lightning Jack (1994),"1001: Stupids, The (1996)","1002: Pest, The (1997)",1003: That Darn Cat! (1997),1004: Geronimo: An American Legend (1993),"1005: Double vie de Véronique, La (Double Life of Veronique, The) (1991)",1006: Until the End of the World (Bis ans Ende der Welt) (1991),1007: Waiting for Guffman (1996),1008: I Shot Andy Warhol (1996),1009: Stealing Beauty (1996),...,992: Head Above Water (1996),993: Hercules (1997),"994: Last Time I Committed Suicide, The (1997)","995: Kiss Me, Guido (1997)","996: Big Green, The (1995)",997: Stuart Saves His Family (1995),998: Cabin Boy (1994),999: Clean Slate (1994),99: Snow White and the Seven Dwarfs (1937),9: Dead Man Walking (1995)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,3.0,5.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [22]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie):
    """Predict the mean of the training ratings given to `movie`.

    `user_id` is not used. When `movie` is not a column of `r_matrix`
    (no training rating for it), the mean of all training ratings is
    returned instead.
    """
    #Movie unseen in the train set: default to the average training rating
    if movie not in r_matrix:
        return X_train['rating'].mean()

    #Mean of all the ratings given to the movie (NaN cells are skipped by mean)
    return r_matrix[movie].mean()

In [23]:
#Compute the test-set RMSE for the per-movie mean model
rmse_score(cf_user_mean)

1.0224465207437918

In [20]:
## 7. User based Collaborative filtering with similarity weighted mean

In [21]:
#Pearson correlation between every pair of users: corr() correlates columns pairwise,
#so the ratings matrix is transposed first to put the users in the columns
pearson_corr = r_matrix.transpose().corr(method='pearson')

In [24]:
#Rebuild the correlation matrix as a DataFrame whose rows and columns are both labelled by r_matrix.index
#NOTE(review): corr() presumably already returns such a frame, so this may be a no-op - confirm
pearson_corr = pd.DataFrame(pearson_corr, index=r_matrix.index, columns=r_matrix.index)

#Show the first 10 rows
pearson_corr.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-0.01785714,-0.2758386,-0.688247,0.343604,0.167618,0.35613,0.669623,-0.3015113,-0.2648507,...,0.116327,-0.255377,0.3556769,0.0,0.148884,0.787562,0.4268828,-2.166933e-16,-0.4372411,0.102244
2,-0.017857,1.0,9.930137e-17,0.57735,0.0,0.411569,0.514376,0.0,0.5,0.06933752,...,0.104828,0.174078,0.1518871,0.081044,-0.09505,,0.2000817,,0.02054554,0.583333
3,-0.275839,9.930137e-17,1.0,0.207514,,-0.265949,-0.735147,0.102598,,0.5773503,...,,,-0.1705606,-0.57735,-0.158777,,-8.392497000000001e-17,,0.3370999,
4,-0.688247,0.5773503,0.2075143,1.0,,,-0.328897,0.57735,,,...,,,,,0.866025,,0.7938842,,,
5,0.343604,0.0,,,1.0,0.237095,0.239475,0.636003,,-0.0181116,...,0.121353,-0.5,0.2973177,0.5,0.678003,0.904534,-0.1607116,0.4082483,0.3185591,0.475075
6,0.167618,0.4115688,-0.2659489,,0.237095,1.0,0.145616,0.726489,0.07537784,0.362786,...,-0.144049,-0.229416,0.4193636,0.296961,0.038835,,0.03869116,0.1324532,0.1098244,0.078826
7,0.35613,0.5143759,-0.735147,-0.328897,0.239475,0.145616,1.0,0.291131,-0.1075829,0.2729831,...,-0.109807,-0.340307,0.5053534,0.592965,0.125578,0.26968,-0.08774509,,0.4660431,0.361683
8,0.669623,0.0,0.1025978,0.57735,0.636003,0.726489,0.291131,1.0,,0.3887408,...,-0.110657,,0.7644708,0.944911,0.877515,,0.3994298,-1.0,-1.532253e-16,0.239229
9,-0.301511,0.5,,,,0.075378,-0.107583,,1.0,3.4399e-16,...,0.866025,,0.0,0.755929,,,-0.5,1.0,,
10,-0.264851,0.06933752,0.5773503,,-0.018112,0.362786,0.272983,0.388741,3.4399e-16,1.0,...,-0.219333,-0.075378,1.279469e-16,-0.612372,,,0.1356748,0.5,0.1814575,-0.114645


In [25]:
#Fill all the missing correlations with 0
#NOTE(review): the filled copy is bound to pearson_cor (single 'r'); pearson_corr itself is not reassigned here,
#and cf_user_wmean reads pearson_corr - confirm which name is intended
pearson_cor = pearson_corr.fillna(0)

In [26]:
'''Now we have the user-user similarities stored in the matrix pearson_cor. We will define a function to predict the unknown ratings in the test set using user-based collaborative filtering, with Pearson correlation as the similarity and all neighbours with positive correlation. For each user-movie pair:
1. Check whether the movie is in the train set; if it is not, predict the mean of all train ratings
2. Calculate the mean rating for the active user
3. Extract the correlation values from the matrix pearson_corr and sort them in decreasing order
4. Keep only the similarity scores of users with positive correlation with the active user
5. Drop all the users who are similar to the active user but haven't rated the target movie
6. If no similar users have rated the target movie, predict the mean rating of that movie
7. Otherwise, predict the active user's mean rating plus the similarity-weighted average of the neighbours' mean-centred ratings'''

"Now we have the user-user similarities stored in the matrix pearson_cor. We will define a function to predict the unknown ratings in the test set using user-based collaborative filtering, with Pearson correlation as the similarity and all neighbours with positive correlation. For each user-movie pair:\n1. Check whether the movie is in the train set; if it is not, predict the mean of all train ratings\n2. Calculate the mean rating for the active user\n3. Extract the correlation values from the matrix pearson_corr and sort them in decreasing order\n4. Keep only the similarity scores of users with positive correlation with the active user\n5. Drop all the users who are similar to the active user but haven't rated the target movie\n6. If no similar users have rated the target movie, predict the mean rating of that movie\n7. Otherwise, predict the active user's mean rating plus the similarity-weighted average of the neighbours' mean-centred ratings"

In [27]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as the user's mean plus a similarity-weighted offset.

    Neighbours are the users whose Pearson correlation with `user_id` is
    strictly positive and who have a training rating for `movie_id`. Each
    neighbour's rating is centred on that neighbour's own mean rating, the
    centred ratings are averaged with the correlations as weights, and the
    result is added to the active user's mean rating. The prediction is not
    clipped to the rating scale.

    Reads the notebook globals `r_matrix`, `pearson_corr` and `X_train`.

    Parameters
    ----------
    user_id : label looked up in the index of `r_matrix` and the columns of
        `pearson_corr`.
    movie_id : label looked up in the columns of `r_matrix`.

    Returns
    -------
    The predicted rating, with these fallbacks:
    - `movie_id` not in the training matrix: mean of all training ratings;
    - `user_id` not in the training matrix: mean training rating of the movie
      (previously this case raised KeyError);
    - no positively correlated user rated the movie: mean training rating of
      the movie.
    """
    #No information on the movie in the train set: default to the average training rating
    if movie_id not in r_matrix:
        return X_train['rating'].mean()

    #Cold-start user (no training ratings): there is no user mean and no correlation to
    #work with, so fall back to the movie's average rating instead of raising KeyError
    if user_id not in r_matrix.index:
        return r_matrix[movie_id].mean()

    #Mean rating for active user
    ra = r_matrix.loc[user_id].mean()

    #Similarity scores of the active user with every other user, highest first
    sim_scores = pearson_corr[user_id].sort_values(ascending = False)

    #Keep only positively correlated users (NaN correlations fail the comparison and are dropped too)
    sim_scores_pos = sim_scores[sim_scores > 0]

    #Ratings those users gave to the movie, without the users who have not rated it
    m_ratings = r_matrix[movie_id][sim_scores_pos.index].dropna()

    #No similar user rated the movie: default to the movie's average rating
    if len(m_ratings) == 0:
        return r_matrix[movie_id].mean()

    #Keep the correlation scores of the remaining neighbours only, in the same order as m_ratings
    sim_scores_pos = sim_scores_pos.loc[m_ratings.index]

    #Subtract average rating of each neighbour from their rating (rbp - mean(rb))
    m_ratings = m_ratings - r_matrix.loc[m_ratings.index].mean(axis = 1)

    #Weighted mean of the centred ratings (dot product divided by the sum of weights),
    #added to the active user's mean; the sum is > 0 because every weight is > 0
    return ra + (np.dot(sim_scores_pos, m_ratings) / sim_scores_pos.sum())

In [28]:
#Compute the test-set RMSE for the similarity-weighted model
rmse_score(cf_user_wmean)

0.9568512581492972