# Memory Based Collaborative Filtering (Item-Based Approach)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:
ratings_all = pd.read_csv("data/raw/ratings.csv")

In [99]:
def data_sampling (df,item_nos=900,item_split=[0.75,0.25]):
    
    ##############################    Data Preprocessing from User Perspective   #########################
    
    #Frequency of movie rating by each user
    user_rtgs_cnt = (df.groupby(['userId']).count()).iloc[:,0:1].reset_index().rename(columns={"movieId":"rating_cnt"})
    print ("Original number of users in dataset : ",len(user_rtgs_cnt))
    
    quantile_user=user_rtgs_cnt.quantile([0.1,.25,.75,0.9], axis = 0).drop(["userId"],axis=1)
    print("Data distribution of frequency of movies rated by users : \n ", quantile_user)
    
    #Removing the higher and lower 10% of the outliers.
    user_rtgs_cnt=user_rtgs_cnt[np.logical_and((user_rtgs_cnt.rating_cnt>=quantile_user.iloc[0,0]),(user_rtgs_cnt.rating_cnt<=quantile_user.iloc[3,0]))]
    print ("Number of users in dataset post removal of bias based on user activity: ",len(user_rtgs_cnt))
    
    #These users are then removed from the dataset
    df=df.merge(user_rtgs_cnt[['userId']],on="userId", how="inner")   
    
    ##############################  Data Preprocessing from Item Perspective   #########################
    
    #Count of Ratings per movie
    item_count = (df[["movieId","rating"]].groupby(['movieId']).count()).reset_index().rename(columns={"rating":"rating_per_item"})
    print("Original number of movies in dataset :\n ",len(item_count))
    
    quantile_item=item_count.quantile([0.1,.25,.75,1], axis = 0).drop(["movieId"],axis=1)
    print("Data distribution of frequency of ratings per movie : \n ", quantile_item)
    
    #Removing all items which have less than 3 user counts i.e Q1 or based on a fixed number 
    #item_count=item_count[item_count.rating_per_item>=quantile_item.iloc[1,0]].reset_index(drop=True)
    item_count=item_count[item_count.rating_per_item>=5].reset_index(drop=True)
    item_count["item_subset"]=np.where(item_count.rating_per_item < quantile_item.iloc[2,0],1,2)
    print("Total number of movies in dataset post removal of low rated movies: ",len(item_count))
    
    
    ##############################  Data Sampling   #########################
    
    sampled_ratings=pd.DataFrame()
    j=len(item_split)-1
    
    for i in item_count.item_subset.unique():
        sampled_ratings=sampled_ratings.append(item_count[item_count.item_subset==i].sample(n=int(item_split[j]*item_nos), random_state=10))
        j=j-1
        
    sampled_ratings.reset_index(drop=True, inplace=True)   
    print ("Sum of all the ratings for the selected movies : ",sampled_ratings['rating_per_item'].sum())
    
    
    #Select user rows for only those movies which have been sampled
    df=df.merge(sampled_ratings[['movieId']],on="movieId", how="inner")
    
    #Since not all items are selected it may happen that we again get items with only user frequency.
    #Removing single frequency users so as to reduce sparsity and enable item-item comparison between pairs
    
    user_rtgs_cnt_2=(df.groupby(['userId']).count()).iloc[:,0:1].reset_index().rename(columns={"movieId":"user_freq"})
    df=df.merge(user_rtgs_cnt_2,on="userId", how="inner")
    
    #For any personalized recommendation to a user, we are setting a rule that user should have watched 5 movies before. 
    #Before that only popular recommendations to him
    df=df[df.user_freq>5] 
    
    
    print("Number of rows in sampled dataset : ", len(df))
    
    return df

In [109]:
#ratings = ratings_all.sample(n=100000).reset_index(drop=True)
ratings = data_sampling(ratings_all, item_split=[0.90, 0.10])
ratings.drop(columns=['timestamp'], inplace=True)
print(ratings.shape)
ratings.head()

Original number of users in dataset :  138493
Data distribution of frequency of movies rated by users : 
        rating_cnt
0.10        24.0
0.25        35.0
0.75       155.0
0.90       334.0
Number of users in dataset post removal of bias based on user activity:  111593
Original number of movies in dataset :
  18128
Data distribution of frequency of ratings per movie : 
        rating_per_item
0.10              1.0
0.25              3.0
0.75            150.0
1.00          53083.0
Total number of movies in dataset post removal of low rated movies:  12376
Sum of all the ratings for the selected movies :  245692
Number of rows in sampled dataset :  76894
(76894, 4)


Unnamed: 0,userId,movieId,rating,user_freq
0,1,151,4.0,6
1,1,2542,4.0,6
2,1,2692,3.5,6
3,1,4993,5.0,6
4,1,6093,4.0,6


In [110]:
ratings.shape

(76894, 4)

In [111]:
# split the ratings into training and test with 70-30%
ratings_training = ratings.sample(frac=0.7)
ratings_test = ratings.drop(ratings_training.index)
ratings_training.reset_index(drop=True, inplace=True)
ratings_test.reset_index(drop=True, inplace=True)

In [113]:
print("training data size", ratings_training.shape)
print("test data size", ratings_test.shape)
ratings_training.head()

training data size (53826, 4)
test data size (23068, 4)


Unnamed: 0,userId,movieId,rating,user_freq
0,91423,500,3.5,6
1,3573,6709,4.0,7
2,105275,280,4.0,9
3,82564,2542,4.0,7
4,50071,33493,4.0,7


## Build an item-to-item matrix which will be used for prediction

In [115]:
def build_weight_matrix(ratings):
    
    # define weight matrix
    w_matrix_columns = ['movie_1', 'movie_2', 'weight']
    w_matrix = pd.DataFrame(columns = w_matrix_columns)

    # calculate the similarity between pairs of movies
    unique_movies = np.unique(ratings['movieId'])
    print("Number of unique movies: ", len(unique_movies))
    
    i = 0
    for movie_1 in unique_movies:
        if i%10==0:
            print("Processing ", i , " movie out of ", len(unique_movies), " movies")

        # extract all users who rated movie_1
        user_data = ratings[ratings['movieId'] == movie_1]
        unique_users = np.unique(user_data['userId'])

        # record the ratings for users who rated both movie_1 and movie_2
        record_row_columns = ['userId', 'movie_1', 'movie_2', 'rating_1', 'rating_2']
        record_movie_1_2 = pd.DataFrame(columns=record_row_columns)
        
        # for each customer C who rated movie_1 record the her ratings for movie_2 
        for c_userid in unique_users:
            c_movie_1_rating = user_data[user_data['userId'] == c_userid]['rating'].iloc[0]
            # all movies of user c excluding movie_1
            c_user_data = ratings[(ratings['userId'] == c_userid) & (ratings['movieId'] != movie_1)]
            c_unique_movies = np.unique(c_user_data['movieId'])

            # Iterate through all movies rated by customer C as movie=2
            for movie_2 in c_unique_movies:
               # the customer's rating for movie_2
                c_movie_2_rating = c_user_data[c_user_data['movieId'] == movie_2]['rating'].iloc[0]
                record_row = pd.Series([c_userid, movie_1, movie_2, c_movie_1_rating, c_movie_2_rating], index=record_row_columns)
                record_movie_1_2 = record_movie_1_2.append(record_row, ignore_index=True)
        
        # computing the similarity between movie_1 and the other recorded movies tagged as movie_2
        unique_movie_2 = np.unique(record_movie_1_2['movie_2'])
        # going through each movie 2
        for movie_2 in unique_movie_2:
            paired_movie_1_2 = record_movie_1_2[record_movie_1_2['movie_2'] == movie_2]
            cosine_sim_numerator = (paired_movie_1_2['rating_1'] * paired_movie_1_2['rating_2']).sum()
            cosine_sim_denominator = np.sqrt(np.square(paired_movie_1_2['rating_1']).sum()) * np.sqrt(np.square(paired_movie_1_2['rating_2']).sum())
            cosine_sim_denominator = cosine_sim_denominator if cosine_sim_denominator != 0 else 1e-8
            sim_value = cosine_sim_numerator / cosine_sim_denominator
            w_matrix = w_matrix.append(pd.Series([movie_1, movie_2, sim_value], index=w_matrix_columns), ignore_index=True)
            
        i = i + 1
    #return the computed weight matrix
    return w_matrix

In [116]:
w_matrix = build_weight_matrix(ratings_training)


Number of unique movies:  887
Processing  0  movie out of  887  movies
Processing  10  movie out of  887  movies
Processing  20  movie out of  887  movies
Processing  30  movie out of  887  movies
Processing  40  movie out of  887  movies
Processing  50  movie out of  887  movies
Processing  60  movie out of  887  movies
Processing  70  movie out of  887  movies
Processing  80  movie out of  887  movies
Processing  90  movie out of  887  movies
Processing  100  movie out of  887  movies
Processing  110  movie out of  887  movies
Processing  120  movie out of  887  movies
Processing  130  movie out of  887  movies
Processing  140  movie out of  887  movies
Processing  150  movie out of  887  movies
Processing  160  movie out of  887  movies
Processing  170  movie out of  887  movies
Processing  180  movie out of  887  movies
Processing  190  movie out of  887  movies
Processing  200  movie out of  887  movies
Processing  210  movie out of  887  movies
Processing  220  movie out of  887 

In [118]:
len(np.unique(w_matrix['weight']))

6635

## Predict the rating of unrated movies for each user

In [119]:
# predict a rating for a given user and given movie
def predict(userId, movieId, w_matrix, ratings):
    # predict the rating of the given movie by the given user
    user_other_ratings = ratings[ratings['userId'] == userId]
    user_unique_movies = np.unique(user_other_ratings['movieId'])
    sum_weighted_other_ratings = 0
    sum_weghts = 0
    for movie_j in user_unique_movies:
        # only calculate the weighted values when the weight between movie_1 and movie_2 exists in weight matrix
        w_movie_1_2 = w_matrix[(w_matrix['movie_1'] == movieId) & (w_matrix['movie_2'] == movie_j)]
        if len(w_movie_1_2) > 0:
            user_rating_j = user_other_ratings[user_other_ratings['movieId']==movie_j]
            sum_weighted_other_ratings += (user_rating_j['rating'].iloc[0] * w_movie_1_2['weight'].iloc[0])
            sum_weghts += np.abs(w_movie_1_2['weight'].iloc[0])

    # when sum_weights is 0 (in case there is no ratings from new users), use the mean ratings as 2.5
    if sum_weghts == 0:
        predicted_rating = 2.5
    else:
        predicted_rating = sum_weighted_other_ratings/sum_weghts
    predicted_rating = round(predicted_rating, 3)
    return predicted_rating

In [123]:
# predict a rating for a given user and given movie
predicted_rating = predict(1, 151, w_matrix, ratings_training)
print('The predicted rating: %f' % predicted_rating)

The predicted rating: 3.835000


### RMSE of Model

In [131]:
# evaluate the learned recommender system on test data by converting the ratings to negative and positive
def rmse_eval(ratings_test, w_matrix, ratings_training):
    # predict all the ratings for test data
    ratings_test['prediction'] = pd.Series(np.zeros(ratings_test.shape[0]))
    
    for index, row_rating in ratings_test.iterrows():
        predicted_rating = predict(row_rating['userId'], row_rating['movieId'], w_matrix, ratings_training)
        ratings_test.loc[index, 'prediction'] = predicted_rating
    
    rmse = np.sqrt(np.mean(( ratings_test['prediction']-ratings_test['rating'])**2))
    return rmse, ratings_test

In [132]:
# run the evaluation
rmse, ratings_test_predicted = rmse_eval(ratings_test, w_matrix, ratings_training)
print('Evaluation result on test data (RMSE) : ', rmse)

Evaluation result on test data (RMSE) :  1.094312474949409


### Top K recommendations for any given user

In [180]:
# recommend top k movies for given userId from movies that he/she has not seen
def recommend(userID, w_matrix, ratings, k=10):
    
    distinct_movies = np.unique(ratings['movieId'])
    user_rated_movies = np.unique(ratings[ratings['userId']==userID]['movieId'])

    user_unrated_movies = pd.DataFrame(columns=['movieId', 'rating'])

    # predict the ratings for all movies that the user hasn't rated
    i = 0
    for movie in distinct_movies:
        if movie not in user_rated_movies:
            rating_value = predict(userID, movie, w_matrix, ratings)
            user_unrated_movies.loc[i] = [movie, rating_value]
            i = i + 1
        else:
            continue
            
    # select top k movies based on predicted ratings
    recommendations = user_unrated_movies.sort_values(by=['rating'], ascending=False).head(k)
    recommendations_list = [ [int(row['movieId']), row['rating']] for i,row in recommendations.iterrows() ]
    return recommendations_list

In [181]:
user = 12
print("Recommended movies for User: ", user)
recommend(user, w_matrix, ratings_training, k=10)

Recommended movies for User:  12


[[30, 2.5],
 [48825, 2.5],
 [45506, 2.5],
 [46337, 2.5],
 [46948, 2.5],
 [47200, 2.5],
 [47254, 2.5],
 [47287, 2.5],
 [47306, 2.5],
 [47330, 2.5]]

In [182]:
# taking top k recommendation for given list of users
def make_recommendation_for_users(users_list, ratings_training):
    users_recommendations_df = pd.DataFrame(columns=['userId', 'recommendation'])
    count = 0
    for user in users_list:
        recommendations = recommend(user, w_matrix, ratings_training, k=10)
        users_recommendations_df.loc[count] = [user, recommendations]
        count+=1
    return users_recommendations_df


In [None]:
users_list_for_reconmmendation = list(set(ratings_training['userId']) & set(ratings_test['userId']))
users_recommendations_df = make_recommendation_for_users(users_list_for_reconmmendation, ratings_training)
users_recommendations_df.head()

### normalize users