# Collaborative Filtering

In [31]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
import random

In [2]:
ratings_raw =  pd.read_csv("../dataset/ratings_raw.csv")

In [3]:
ratings_raw.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [4]:
ratings_raw["userId"].nunique()

270896

In [5]:
ratings_raw.shape

(26024289, 4)

In [6]:
ratings_raw["movieId"].nunique()

45115

In [7]:
movies_ratings_df = pd.DataFrame({'movie_id': ratings_raw['movieId'].unique()})
movies_ratings_df['rate_avg'] = ratings_raw.groupby('movieId')['rating'].mean().values
movies_ratings_df['ratings_count'] = ratings_raw['movieId'].value_counts().values

In [8]:
movies_ratings_df

Unnamed: 0,movie_id,rate_avg,ratings_count
0,110,3.888157,91921
1,147,3.236953,91082
2,858,3.175550,87901
3,1221,2.875713,84078
4,1246,3.079565,77960
...,...,...,...
45110,159050,4.000000,1
45111,159053,3.500000,1
45112,165649,5.000000,1
45113,171051,1.000000,1


In [9]:
dt_object = datetime.fromtimestamp(ratings_raw["timestamp"][20000000])

In [14]:
user_ids = ratings_raw['userId'].unique()
movie_ids = ratings_raw['movieId'].unique()
movie_ids = np.sort(movie_ids)

num_users = len(user_ids)
num_movies = len(movie_ids)

user_idx_dict = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_idx_dict = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

data = ratings_raw['rating']
row = ratings_raw['userId'].map(user_idx_dict)
col = ratings_raw['movieId'].map(movie_idx_dict)

user_movie_matrix = csr_matrix((data, (row, col)), shape=(num_users, num_movies))

print(user_movie_matrix)

  (0, 108)	1.0
  (0, 145)	4.5
  (0, 843)	5.0
  (0, 1195)	5.0
  (0, 1218)	5.0
  (0, 1885)	4.0
  (0, 2677)	4.5
  (0, 2833)	5.0
  (0, 2874)	4.0
  (0, 4133)	4.0
  (0, 4783)	5.0
  (0, 5481)	5.0
  (0, 10177)	4.0
  (0, 12047)	3.5
  (0, 12545)	4.0
  (0, 12652)	5.0
  (0, 13711)	5.0
  (0, 13963)	5.0
  (0, 14631)	5.0
  (0, 16227)	5.0
  (0, 18359)	2.5
  (0, 18373)	5.0
  (0, 18622)	5.0
  (0, 19603)	5.0
  (0, 20118)	0.5
  :	:
  (270895, 11132)	4.5
  (270895, 11365)	4.0
  (270895, 11367)	5.0
  (270895, 11375)	4.5
  (270895, 11413)	5.0
  (270895, 11414)	5.0
  (270895, 11464)	4.0
  (270895, 11497)	4.0
  (270895, 11627)	5.0
  (270895, 11638)	2.5
  (270895, 11677)	4.5
  (270895, 11722)	4.5
  (270895, 11872)	3.5
  (270895, 11987)	3.0
  (270895, 11989)	4.0
  (270895, 12027)	4.5
  (270895, 12047)	4.0
  (270895, 12233)	5.0
  (270895, 12280)	3.5
  (270895, 12293)	4.5
  (270895, 12545)	5.0
  (270895, 12768)	5.0
  (270895, 13126)	4.5
  (270895, 13285)	4.5
  (270895, 14380)	2.0


In [18]:
# user_movie_train, user_movie_test = train_test_split(user_movie_matrix, test_size=0.2, random_state=42)

In [78]:
# user_movie_train.shape

In [79]:
# user_movie_test.shape

In [80]:
# user_movie_test[0].toarray()

Get random users for testing

In [81]:
# random_test_user_ids = []
# for _ in range(54180):
#     random_user = random.randint(1, 270896)
#     random_test_user_ids.append(random_user)
# random_test_user_ids = np.sort(random_test_user_ids)

In [82]:
# mask = np.ones(user_movie_matrix.shape[0], dtype=bool)
# mask[random_test_user_ids] = False
# train_user_movie_matrix = user_movie_matrix[mask, :]

In [83]:
# train_user_movie_matrix.shape

In [279]:
class RecommenderSystem:
    def __init__(self, user_movie_matrix):
        self.user_movie_matrix = user_movie_matrix
#         self.collaborative_filtering()

    def collaborative_filtering(self, num_latent_factors=10):
        num_latent_factors = num_latent_factors  
        model = NMF(n_components=num_latent_factors, init='random', random_state=0)
        self.user_factors = model.fit_transform(self.user_movie_matrix)
        self.movie_factors = model.components_
        
    def get_collab_recom(self, user_id):
        user_idx = user_idx_dict[user_id]
        recommendations = np.dot(self.user_factors[user_idx, :], self.movie_factors)

        sorted_indices = np.argsort(recommendations)[::-1]
        sorted_movie_ids = [movie_ids[idx] for idx in sorted_indices]
        return sorted_movie_ids
    
    def ndcg(self):
        sum = 0
        total_count = 0
        users_count = self.user_movie_matrix.shape[0]
        for user_id in range(0,users_count,5000):
            total_count += 1
            recommended_movies_id = self.get_collab_recom(user_id+1)
            ground_truth = ratings_raw[ratings_raw["userId"] == user_id]
            rated_movies_id = [x for x in recommended_movies_id if x in ground_truth["movieId"].values]
            rated_movies_rate = []
            for id in rated_movies_id:
                rated_movies_rate.append(self.user_movie_matrix[user_idx_dict[target_user_id],movie_idx_dict[id]].item())
            
            prod_relevance_dict = dict(zip(rated_movies_id, rated_movies_rate))
            ideal_prod_relevance_dict = {k: v for k, v in sorted(prod_relevance_dict.items(),key=lambda item: item[1], reverse=True)}
            true_relevance = np.fromiter(ideal_prod_relevance_dict.values(), dtype=int).reshape(1,-1)
            relevance_score = np.fromiter(prod_relevance_dict.values(), dtype=int).reshape(1,-1)
            print(true_relevance, relevance_score)
            dcg = dcg_score(true_relevance, relevance_score)
            idcg = dcg_score(true_relevance, true_relevance)
            sum += dcg/idcg
            print('nDCG score: ', dcg/idcg)
        return round(sum/total_count,4)

In [277]:
rs = RecommenderSystem(user_movie_matrix)

In [280]:
ndcg_score = rs.ndcg()
print("nDCG score is:", ndcg_score)

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.

In [281]:
def ndcg():
        sum = 0
        total_count = 0
        users_count = user_movie_matrix.shape[0]
        for user_id in range(0,users_count,10000):
            total_count += 1
            recommended_movies_id = rs.get_collab_recom(user_id+1)
            ground_truth = ratings_raw[ratings_raw["userId"] == target_user_id]
            rated_movies_id = [x for x in recommended_movies_id if x in ground_truth["movieId"].values]
            rated_movies_rate = []
            for id in rated_movies_id:
                rated_movies_rate.append(user_movie_matrix[user_idx_dict[target_user_id],movie_idx_dict[id]].item())
            
            prod_relevance_dict = dict(zip(rated_movies_id, rated_movies_rate))
            ideal_prod_relevance_dict = {k: v for k, v in sorted(prod_relevance_dict.items(),key=lambda item: item[1], reverse=True)}
            true_relevance = np.fromiter(ideal_prod_relevance_dict.values(), dtype=int).reshape(1,-1)
            relevance_score = np.fromiter(prod_relevance_dict.values(), dtype=int).reshape(1,-1)
            dcg = dcg_score(true_relevance, relevance_score)
            idcg = dcg_score(true_relevance, true_relevance)
            sum += dcg/idcg
#             print('nDCG score: ', dcg/idcg)
        return round(sum/total_count,4)

In [282]:
ndcg()

0.9304

In [285]:
target_user_id = 1
recoms = rs.get_collab_recom(target_user_id)
recom_np = np.array(recoms)
print("Collaborative Filtering Recommendations for User", target_user_id)
for i in range(5):
    print(recoms[i])

Collaborative Filtering Recommendations for User 1
318
2959
296
2571
79132


In [283]:
ground_truth = ratings_raw[ratings_raw["userId"] == target_user_id]
ground_truth

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
5,1,1968,4.0,1425942148
6,1,2762,4.5,1425941300
7,1,2918,5.0,1425941593
8,1,2959,4.0,1425941601
9,1,4226,4.0,1425942228


In [291]:
ind = np.argwhere(recom_np == 68358)
ind

array([[65]], dtype=int64)