USER-BASED COLLABORATIVE FILTERING RECOMMENDATIONS

In [427]:
import pandas as pd
import numpy as np
import math as m
import random as r
from scipy.stats import pearsonr
from scipy.stats import spearmanr


In [428]:
links = pd.read_csv('ml-latest-small/links.csv')
links.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [429]:
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [430]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [431]:
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [432]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [433]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [434]:
#dropping the timestamp column
ratings = ratings.drop(['timestamp'], axis=1)


In [435]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [436]:
ratings.shape

(100836, 3)

In [437]:
ratings['rating'].unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [438]:
#movie and ratings dataset
movie_ratings = pd.merge(ratings, movies, on='movieId')
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [439]:
#reshaping the data to table based on column values
user_ptable= ratings.pivot(index='userId', columns='movieId', values='rating')
user_ptable.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [440]:
#limiting data
#user_ptable = user_ptable.head(2000)

In [441]:
#pearson correlation coefficient
def pearson_correlation(user_a_ratings,user_b_ratings):
    corr,_ = pearsonr(user_a_ratings,user_b_ratings)
    return corr


In [442]:
def user_collaborative_filtering(target_user,p_table,correlationfunction):
    '''
    Gets the most similar users and their correlations to the target user
    Parameters: int target_user -user id in the dataset
                p_table - data as a pivot table
                correlationfunction - the correlation function to be used
    Return: dict similar_users -dictionary of users who have rated similar movies as the target user
    with their ratings.
    '''
    similar_users = {}
    #other users who are not the target user
    for user_b in p_table.index:
        if user_b != target_user:
            # ratings for the target user and user_b
            target_user_ratings = p_table.loc[target_user].dropna()
            user_b_ratings = p_table.loc[user_b].dropna()

            # common rated movies
            common_rated_movies = target_user_ratings.index.intersection(user_b_ratings.index)
            #filter for at least 2  common rated movies
            if len(common_rated_movies) >= 2:
                #filter  ratings to include only common rated movies
                target_user_ratings = target_user_ratings[common_rated_movies]
                user_b_ratings = user_b_ratings[common_rated_movies]
                #check if either contains all the same elements as correlation will be 1 regardless of actual rating
                if len(set(target_user_ratings)) == 1 or len(set(user_b_ratings)) == 1:
                    continue
                similar_users[user_b] = correlationfunction(target_user_ratings,user_b_ratings)
                    
    return similar_users

In [444]:
#top 10 similar users to the target user
user_ids = user_ptable.index
#random  target user from the list of user IDs
target_user = r.choice(user_ids)

similar_users = user_collaborative_filtering(target_user,user_ptable,pearson_correlation)
sorted_similar_users =  sorted(similar_users.items(), key=lambda item: item[1],reverse=True)
top_similar_users = sorted_similar_users[:10]
top_10_similar_users_dict={}
print(f'10 most similar users to User {target_user}')
for user,similarity in top_similar_users:
    top_10_similar_users_dict[user]=similarity
    print(f'UserID {user} : Similarity {similarity}')



10 most similar users to User 351
UserID 5 : Similarity 1.0
UserID 31 : Similarity 1.0
UserID 40 : Similarity 1.0
UserID 46 : Similarity 1.0
UserID 48 : Similarity 1.0
UserID 120 : Similarity 1.0
UserID 138 : Similarity 1.0
UserID 145 : Similarity 1.0
UserID 157 : Similarity 1.0
UserID 242 : Similarity 1.0


In [445]:
def user_prediction(user_a,item_p,p_table,similarities):
    '''
    Calculates the predicted rating of user `user_a` for item `item_p`.
    Parameters: int user_a - the index of the target user
                int item_p - the index of the unseen movie by target user
                p_table - pivot table of data
                similarities - the dictionary of correlations between target user
                  and other users.
    Return: int prediction - rating of user a for item p
    '''
    user_a_ratings = p_table.loc[user_a]
    mean_usera_ratings = user_a_ratings.mean()
    unseen_item_ratings = p_table.loc[:, item_p].dropna()

    # Get the similarity scores between the target user and other users who have rated the unseen item.
    #relevant_similarities = {}
    predicted_rating = 0
    weighted_difference = 0
    similarity_sum = 0
    for user_b, similarity in similarities.items():
        if user_b != user_a and user_b in unseen_item_ratings.index:
            user_b_ratings = p_table.loc[user_b]
            mean_userb_ratings = user_b_ratings.mean()
            rating_difference = unseen_item_ratings.loc[user_b] - mean_userb_ratings
            weighted_difference += (similarity*rating_difference)
            similarity_sum += abs(similarity)

    if similarity_sum != 0:
        # the prediction as the active user's mean plus the weighted rating differences
        predicted_rating = mean_usera_ratings + (weighted_difference / similarity_sum)
    else:
        predicted_rating = mean_usera_ratings

    return np.clip(predicted_rating,0.5,5)
   
   

In [446]:
recommended = user_prediction(1,193571,user_ptable,top_10_similar_users_dict)
recommended

4.366379310344827

In [447]:
#unseen movies by target user
target_unrated_movies = user_ptable.loc[target_user].isna()
unrated_movie_ids = target_unrated_movies.index[target_unrated_movies]
movie_ids_list = list(unrated_movie_ids)

#predicting ratings using the top 10 similar users to the target user
recommended = {}
for movie_id in movie_ids_list:
    recommended[movie_id] = user_prediction(target_user,movie_id,user_ptable,top_10_similar_users_dict)
sorted_recommendations =  sorted(recommended.items(), key=lambda item: item[1],reverse=True)
top_movies = sorted_recommendations[:10]

In [448]:
movie_titles = {}
for index,row in movie_ratings.iterrows():
    movie_titles [row['movieId']]  = row['title']
    
# movie names based on the movie IDs 
recommendations = {}
for movie_id, rating in top_movies:
    recommendations[movie_titles[movie_id]] = rating
print(f'Top 10 movies for User {target_user}')

df = pd.DataFrame(list(recommendations.items()), columns=['Movie', 'Predicted Rating'])
df

Top 10 movies for User 351


Unnamed: 0,Movie,Predicted Rating
0,"Postman, The (Postino, Il) (1994)",5.0
1,Circle of Friends (1995),5.0
2,Heavenly Creatures (1994),5.0
3,Star Wars: Episode IV - A New Hope (1977),5.0
4,Once Were Warriors (1994),5.0
5,In the Name of the Father (1993),5.0
6,Schindler's List (1993),5.0
7,Snow White and the Seven Dwarfs (1937),5.0
8,Pinocchio (1940),5.0
9,Kingpin (1996),5.0


The Spearmans rank correlation coefficient is useful in getting similarities in the user based collaborative filtering approach since it does not assume a linear relationship between variables which can be a more realistic assumption in many real world recommendation problems.The relationship between users and their preferences can be non linear and the spearmans rank correlation can capture these relationships effectively.

In [449]:
def spearman_rank(user_a_ratings,user_b_ratings):
    rho,_ = spearmanr(user_a_ratings,user_b_ratings)
    return rho

In [450]:
similar_users = user_collaborative_filtering(target_user,user_ptable,spearman_rank)
sorted_similar_users =  sorted(similar_users.items(), key=lambda item: item[1],reverse=True)
top_similar_users = sorted_similar_users[:10]
print(f'10 most similar users to User {target_user}')
for user,similarity in top_similar_users:
    print(f'UserID {user} : Similarity {similarity}')

10 most similar users to User 351
UserID 5 : Similarity 1.0
UserID 128 : Similarity 1.0
UserID 242 : Similarity 1.0
UserID 394 : Similarity 1.0
UserID 403 : Similarity 1.0
UserID 604 : Similarity 1.0
UserID 609 : Similarity 1.0
UserID 31 : Similarity 0.9999999999999999
UserID 40 : Similarity 0.9999999999999999
UserID 46 : Similarity 0.9999999999999999
