In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from tqdm import tqdm
from utils import Jaccard, nlp, movies_per_user, count_recall_at
from models import ContentBasedFiltering
from consts import VALID_SET

# Content-based Filtering

In [2]:
# Instantiate the content-based recommender, pointing it at the processed
# movie-metadata parquet file (path is relative to this notebook's directory).
content_based_filtering = ContentBasedFiltering(meta_file="../data/processed_data.parquet")

In [4]:
%%time
# Expensive step: the recorded run took roughly 1.5 minutes of wall time.
# NOTE(review): presumably this also populates `tfidf_matrix`, which the next
# cell inspects — confirm in models.ContentBasedFiltering.
content_based_filtering.compute_cosine_similarity()

CPU times: user 59.6 s, sys: 15.8 s, total: 1min 15s
Wall time: 1min 25s


In [5]:
# Sanity-check the dimensions of the TF-IDF matrix.
# NOTE(review): presumably (number of movies, vocabulary size) — confirm.
content_based_filtering.tfidf_matrix.shape

(45116, 2235939)

In [6]:
%%time
# Compute the Jaccard similarity, passing the path of an .npz file.
# NOTE(review): not visible from this notebook whether `fname` is read as a
# cache or written as output — confirm in models.ContentBasedFiltering.
content_based_filtering.compute_jaccard_similarity(fname="../data/JACCARD_SIM.npz")

CPU times: user 25 s, sys: 4.75 s, total: 29.8 s
Wall time: 30.1 s


In [7]:
# Load the validation split, keeping only the (user, movie) pair columns,
# then display it as the cell's output.
validation_set = pd.read_csv(
    VALID_SET,
    usecols=["userId", "movieId"],
)
validation_set

Unnamed: 0,userId,movieId
0,529,1617
1,19,1275
2,639,46
3,358,1339
4,641,539
...,...,...
8795,494,318
8796,85,316
8797,580,4247
8798,580,1562


In [8]:
def recommend(user_id, movie_id, k, topk=20):
    """Recommend movies similar to ``movie_id`` and report recall@k for a user.

    Prints the recommended movie ids, the movies the user has watched and the
    resulting recall, then returns the recall so callers can aggregate it
    across several (user, movie) pairs instead of only reading printed output.

    Parameters
    ----------
    user_id : int
        Key into ``movies_per_user`` identifying the user to evaluate against.
    movie_id : int
        Seed movie handed to ``content_based_filtering.recommend``.
    k : int
        Cut-off forwarded to ``count_recall_at``.
    topk : int, default 20
        Number of candidates requested from the recommender. The default
        preserves the previously hard-coded value.
        NOTE(review): presumably ``topk`` should be >= ``k`` for recall@k to be
        meaningful — confirm against ``count_recall_at``.

    Returns
    -------
    The value returned by ``count_recall_at`` for this user and prediction.
    """
    recomm = content_based_filtering.recommend(movie_id, topk=topk)
    print(f"Recommended movie id: {recomm}")
    print(f"movies watched by user: {movies_per_user[user_id]}")
    recall = count_recall_at(user=user_id, prediction=recomm, k=k)
    print(f"recall {recall}")
    # Hand the metric back as well; previously it was printed and then lost.
    return recall

In [9]:
# Spot-check the recommender on the first 10 validation (user, movie) pairs,
# reporting recall at k=5 for each.
for user, movie in validation_set.values[:10]:
    recommend(user_id=int(user), movie_id=int(movie), k=5)

Recommend for L.A. Confidential:
['L.A. Confidential', 'The Negotiator', 'The Nice Guys', 'The Killing Jar', 'Prince of the City', 'The Big Easy', 'The Long Goodbye', 'Memories of Murder', 'Columbus Circle', 'Jennifer Eight', 'Night Falls on Manhattan', 'Jesse Stone: Innocents Lost', 'Best Laid Plans', 'Mirrors', 'The First Deadly Sin', 'Lee Rock', "The General's Daughter", 'Hotel Noir', 'The Bedroom Window', 'Witness']
Recommended movie id: [1617, 2058, 158238, 127090, 3734, 4086, 2511, 31364, 94496, 3557, 1404, 99873, 2842, 61262, 4940, 139647, 2688, 120108, 2753, 1674]
movies watched by user: {1, 2057, 2065, 2067, 24, 26649, 2074, 25, 2077, 2078, 30749, 32, 2080, 34, 8228, 2085, 52, 55, 57, 58, 62, 2112, 68, 6216, 2132, 6228, 32853, 2136, 4187, 6235, 2145, 2146, 105, 6254, 111, 2160, 4217, 6269, 2174, 6271, 141, 8337, 2194, 6299, 164, 26788, 6327, 6331, 2236, 2243, 2247, 4298, 2268, 2272, 230, 6380, 2289, 2291, 2297, 2300, 2302, 2303, 260, 261, 265, 4361, 41226, 39183, 6416, 2321, 2