COLLABORATIVE FILTERING

In [201]:
import pandas as pd
import numpy as np
import math as m
import random as r
from scipy.stats import pearsonr
from scipy.stats import spearmanr


In [51]:
# Read the links table from the ml-latest-small folder and preview its first rows.
links = pd.read_csv('ml-latest-small/links.csv')
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [52]:
# Read the movies table from the ml-latest-small folder and preview its first rows.
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [53]:
# Print a structural summary of `movies`: index range, columns, non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [54]:
# Read the tags table from the ml-latest-small folder and preview its first rows.
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [55]:
# Read the ratings table from the ml-latest-small folder and preview its first rows.
ratings = pd.read_csv("ml-latest-small/ratings.csv")
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [56]:
# Print a structural summary of `ratings`: index range, columns, non-null counts and dtypes.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [57]:
# Keep every ratings column except 'timestamp'; `ratings` itself is left untouched.
ratings2 = ratings.drop(columns=['timestamp'])


In [58]:
# Preview the first five rows of the trimmed ratings frame.
ratings2.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [192]:
# List the distinct rating values that occur in the data.
ratings2.loc[:, 'rating'].unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [193]:
# Reshape the long ratings frame into a user x movie matrix: one row per userId,
# one column per movieId, each cell holding that user's rating for that movie
# (NaN where the user has not rated it).
user_ptable = ratings2.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
user_ptable.head(n=5)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [154]:
# Cap the rating matrix at its first 2000 user rows to bound the work done below.
user_ptable = user_ptable.head(n=2000)

In [164]:
def pearson_correlation(user_a_ratings,user_b_ratings):
    '''Return the Pearson correlation coefficient of two equally long rating sequences.

    Thin wrapper around ``scipy.stats.pearsonr``: only the coefficient (the
    first element of its result) is returned; the p-value is discarded.
    '''
    result = pearsonr(user_a_ratings, user_b_ratings)
    return result[0]


In [166]:
def user_collaborative_filtering(target_user,p_table,correlationfunction,min_common=2):
    '''Score every other user in ``p_table`` by rating similarity to ``target_user``.

    Parameters
    ----------
    target_user : label
        Row label of the user everyone else is compared against.
    p_table : pandas.DataFrame
        Rating matrix with one row per user and one column per item; NaN marks
        an item the user has not rated.
    correlationfunction : callable
        Called as ``correlationfunction(target_ratings, other_ratings)`` with two
        Series restricted to the items both users rated (same labels, same
        order). Expected to return a single number.
    min_common : int, optional
        Minimum number of co-rated items required before a pair is scored.
        Defaults to 2, the smallest overlap a correlation can be computed on;
        smaller values are treated as 2. With only two co-rated items a Pearson
        correlation can only be +1 or -1, so a larger value gives more
        trustworthy neighbours.

    Returns
    -------
    dict
        ``{user_label: similarity}`` for every other user that shares at least
        ``min_common`` rated items with the target, where neither user's ratings
        on those items are constant and the similarity is not NaN.

    Raises
    ------
    KeyError
        If ``target_user`` is not a row label of ``p_table``.
    '''
    min_common = max(2, min_common)

    # The target's rated items do not depend on the other user: compute them once.
    target_rated = p_table.loc[target_user].dropna()

    similar_users = {}
    for user_b in p_table.index:
        if user_b == target_user:
            continue

        other_rated = p_table.loc[user_b].dropna()
        common_rated_movies = target_rated.index.intersection(other_rated.index)
        if len(common_rated_movies) < min_common:
            continue

        # Restrict both users to the co-rated items, in the same order.
        target_common = target_rated.loc[common_rated_movies]
        other_common = other_rated.loc[common_rated_movies]

        # A correlation is undefined when either side has zero variance.
        if target_common.nunique() < 2 or other_common.nunique() < 2:
            continue

        similarity = correlationfunction(target_common, other_common)
        # A NaN score would poison later sorting and weighted sums; drop it.
        if pd.isna(similarity):
            continue
        similar_users[user_b] = similarity
    return similar_users

In [167]:
# Candidate users are the row labels of the rating matrix.
user_ids = user_ptable.index
# Draw the target user from a privately seeded generator so that re-running the
# notebook analyses the same user every time (change the seed to sample another
# one). A plain list is passed because random.choice only needs a sequence and
# should not have to rely on how a pandas Index behaves in a truth test.
target_user = r.Random(42).choice(user_ids.tolist())

similar_users = user_collaborative_filtering(target_user,user_ptable,pearson_correlation)
# Rank the neighbours from most to least similar and keep the ten best.
sorted_similar_users = sorted(similar_users.items(), key=lambda item: item[1], reverse=True)
top_similar_users = sorted_similar_users[:10]
print(f'10 most similar users to User {target_user}')
for user,similarity in top_similar_users:
    print(f'UserID {user} : Similarity {similarity}')



10 most similar users to User 372
UserID 13 : Similarity 1.0
UserID 48 : Similarity 1.0
UserID 245 : Similarity 1.0
UserID 252 : Similarity 1.0
UserID 278 : Similarity 1.0
UserID 281 : Similarity 1.0
UserID 392 : Similarity 1.0
UserID 473 : Similarity 1.0
UserID 511 : Similarity 1.0
UserID 550 : Similarity 0.9999999999999999


In [202]:
def prediction(user_a,item_p,p_table,similarities):
    ''' Returns the prediction of user a for item p.

    The estimate is user a's mean rating plus the similarity-weighted average
    of how far each neighbour's rating of the item deviates from that
    neighbour's own mean:

        mean_a + sum(sim_b * (r_b,p - mean_b)) / sum(|sim_b|)

    Parameters
    ----------
    user_a : label
        Row label in ``p_table`` of the user to predict for.
    item_p : label
        Column label in ``p_table`` of the item to predict.
    p_table : pandas.DataFrame
        Rating matrix (rows = users, columns = items, NaN = not rated).
    similarities : mapping
        ``{user_label: similarity}``; an entry for ``user_a`` itself and
        entries whose similarity is NaN are ignored.

    Returns
    -------
    float
        Predicted rating clipped to the range [0.5, 5.0]. When no neighbour
        with a usable similarity has rated the item, user a's mean rating is
        returned instead of an artificial minimum score. The result is NaN if
        user a has no ratings at all.
    '''
    mean_usera_ratings = p_table.loc[user_a].mean()

    weighted_sum = 0.0
    similarity_sum = 0.0
    for user_b, similarity in similarities.items():
        if user_b == user_a or pd.isna(similarity):
            continue
        rating_b = p_table.at[user_b, item_p]
        if pd.isna(rating_b):
            # This neighbour has not rated the item, so contributes nothing.
            continue
        mean_userb_ratings = p_table.loc[user_b].mean()
        weighted_sum += similarity * (rating_b - mean_userb_ratings)
        similarity_sum += abs(similarity)

    if similarity_sum > 0:
        # the prediction as the active user's mean plus the weighted rating differences
        predicted = mean_usera_ratings + (weighted_sum / similarity_sum)
    else:
        # No evidence from neighbours: the user's own average is the best guess.
        predicted = mean_usera_ratings
    return np.clip(predicted,0.5,5.0)
   

In [203]:
movie_ids = user_ptable.columns
movie_ids_list = list(movie_ids)
# Movies the target user has already rated are not recommendation candidates:
# scoring them would let already-seen titles crowd out new ones in the top 10.
already_rated = set(user_ptable.loc[target_user].dropna().index)
recommended = {}
for movie_id in movie_ids_list:
    if movie_id in already_rated:
        continue
    recommended[movie_id] = prediction(target_user,movie_id,user_ptable,similar_users)
# Highest predicted rating first; keep the ten best candidates.
sorted_recommendations = sorted(recommended.items(), key=lambda item: item[1], reverse=True)
top_movies = sorted_recommendations[:10]
top_movies

[(476, 5.0),
 (2164, 5.0),
 (2314, 5.0),
 (3086, 5.0),
 (3379, 5.0),
 (5490, 5.0),
 (5706, 5.0),
 (6347, 5.0),
 (6818, 5.0),
 (8477, 5.0)]

In [147]:
def spearman_rank(user_a_ratings,user_b_ratings):
    '''Return Spearman's rank correlation coefficient of two equally long rating sequences.

    Thin wrapper around ``scipy.stats.spearmanr``: only the coefficient (the
    first element of its result) is returned; the p-value is discarded.
    '''
    result = spearmanr(user_a_ratings, user_b_ratings)
    return result[0]

In [150]:
# Repeat the neighbour search for the same target user, this time scoring
# users with the Spearman rank correlation.
similar_users = user_collaborative_filtering(
    target_user,
    user_ptable,
    spearman_rank,
)
# Order the (user, similarity) pairs from most to least similar.
sorted_similar_users = sorted(
    similar_users.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
top_similar_users = sorted_similar_users[:10]
print(f'10 most similar users to User {target_user}')
for user, similarity in top_similar_users:
    print(f'UserID {user} : Similarity {similarity}')

10 most similar users to User 514
UserID 529 : Similarity 1.0
UserID 60 : Similarity 0.9999999999999999
UserID 259 : Similarity 0.9999999999999999
UserID 147 : Similarity 0.8660254037844387
UserID 316 : Similarity 0.8660254037844387
UserID 442 : Similarity 0.8660254037844387
UserID 161 : Similarity 0.8326688171765703
UserID 481 : Similarity 0.8164965809277261
UserID 257 : Similarity 0.8029550685469662
UserID 454 : Similarity 0.7905694150420948
