In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import recall_at_k, read_meta_data, read_training_data, read_validation_data, read_testing_data, get_movies_per_user
from consts import JACCARD_SIM
from models import ContentBasedFiltering
from collections import defaultdict
from surprise import dump
from surprise.prediction_algorithms.matrix_factorization import SVD, SVDpp

# Load data

In [2]:
%%time
# Read the three rating splits through the project helpers imported from `utils`.
train_set = read_training_data()
validation_set = read_validation_data()
# NOTE(review): `test_set` is not used by any cell visible in this notebook.
test_set = read_testing_data()
# Derived from the training split only; passed to `recall_at_k` as `items_per_user`.
# Presumably maps each user to the movies they rated in training -- confirm in utils.py.
movies_per_user = get_movies_per_user(train_set)

CPU times: user 1min 2s, sys: 5.53 s, total: 1min 7s
Wall time: 1min 8s


# 1. Content-based filtering recommender

In [3]:
# Build the content-based recommender from the movie metadata returned by
# `read_meta_data()`. Later cells use its `meta_df`, `movieId_to_title` and
# `recommend(...)` members.
content_based_filtering = ContentBasedFiltering(meta_df=read_meta_data())

In [4]:
%%time
# Prepare the Jaccard similarity on the recommender; `JACCARD_SIM` is a project
# constant imported from `consts`.
# NOTE(review): `fname` is presumably a file the similarity is loaded from / saved
# to -- confirm in models.py.
content_based_filtering.compute_jaccard_similarity(fname=JACCARD_SIM)

CPU times: user 28.6 s, sys: 6.44 s, total: 35 s
Wall time: 36.8 s


In [5]:
%%time
# Prepare the cosine similarity on the recommender (no file name is passed here,
# unlike the Jaccard step).
# NOTE(review): which of the two similarities `recommend` uses, and how, is not
# visible from this notebook -- see models.py.
content_based_filtering.compute_cosine_similarity()

CPU times: user 1min 5s, sys: 20.1 s, total: 1min 25s
Wall time: 1min 40s


## Recommend 40 movies for each validation pair

In [6]:
%%time
topk = 40
content_based_recommendation = defaultdict(list)
for u, m, _ in tqdm(validation_set.values, position=0, leave=True):
    if m not in content_based_recommendation.keys():
        try:
            content_based_recommendation[m] = content_based_filtering.recommend(int(m), topk=topk)
        except:
            print(u, m)

100%|██████████| 2597428/2597428 [03:09<00:00, 13686.47it/s]

CPU times: user 1min 30s, sys: 44.9 s, total: 2min 15s
Wall time: 3min 9s





### Recommendations based on `Toy Story`

In [7]:
# Select the `movieId` of the metadata row(s) titled exactly "Toy Story";
# `.squeeze()` collapses a single match down to a scalar.
movie_id = content_based_filtering.meta_df.loc[
    content_based_filtering.meta_df.title == "Toy Story", "movieId"
].squeeze()

In [8]:
topk = 10
# Print the titles of at most `topk` leading recommendations for the selected movie.
# Recommendation keys are looked up by the string form of the movie id.
for _rank, rec_id in zip(range(topk), content_based_recommendation[str(movie_id)]):
    print(content_based_filtering.movieId_to_title[int(rec_id)])

Toy Story 2
Toy Story 3
Toy Story of Terror!
Toy Story That Time Forgot
Small Fry
Hawaiian Vacation
Partysaurus Rex
Tom and Jerry: Shiver Me Whiskers
The Tangerine Bear: Home in Time for Christmas!
Superstar Goofy


### Recommendations based on `Iron Man`

In [9]:
# Select the `movieId` of the metadata row(s) titled exactly "Iron Man".
# `.squeeze()` only yields a scalar when there is a single match; otherwise the
# result stays a Series.
movie_id = content_based_filtering.meta_df.loc[
    content_based_filtering.meta_df.title == "Iron Man", "movieId"
].squeeze()

In [10]:
# Several movies share the title `Iron Man`, so `movie_id` is a Series of ids here
# rather than a single scalar; inspect it to pick one by hand.
movie_id

12578     59315
34468    147070
41503    167296
Name: movieId, dtype: int64

In [11]:
topk = 10
# 59315 is the first of the ids listed for the title `Iron Man`; print the titles of
# at most `topk` leading recommendations for it.
for _rank, rec_id in zip(range(topk), content_based_recommendation["59315"]):
    print(content_based_filtering.movieId_to_title[int(rec_id)])

Iron Man 2
Iron Man 3
Jocks
Avengers: Age of Ultron
Ant-Man
The Invincible Iron Man
Thor
Marvel One-Shot: The Consultant
Iron Man & Hulk: Heroes United
Relentless


## Evaluation: Recall at 10 based on 40 movies

In [12]:
%%time
k =10
avg_recall = 0
for u, m, _ in tqdm(validation_set.values, position=0, leave=True):
    avg_recall += recall_at_k(user=u, 
                              prediction=content_based_recommendation[m], 
                              k=k, 
                              items_per_user=movies_per_user)

100%|██████████| 2597428/2597428 [00:38<00:00, 67582.84it/s] 

CPU times: user 18.5 s, sys: 8.94 s, total: 27.4 s
Wall time: 38.4 s





In [13]:
# `avg_recall` is the sum accumulated over all validation rows; dividing by the
# number of rows (`shape[0]`) turns it into the mean recall@k.
print(f"Recall at {k}: {avg_recall / validation_set.shape[0]}")

Recall at 10: 0.04991483883318549


# 2. Hybrid Recommender: Content-based filtering + Collaborative filtering

### Load Collaborative-filtering model

In [14]:
# `surprise.dump.load` returns a `(predictions, algo)` tuple; only the trained
# algorithm is needed here, so the predictions slot is discarded.
# NOTE(review): surprise dumps are pickle-based -- only load files from a trusted source.
_, svd = dump.load("../data/svd_k1_reg_b_0135_reg_r_012_epochs_50.pkl")

## Recommend movies for each validation pair

In [15]:
%%time
hybrid_recommendation = defaultdict(list)
for user, movie, _ in tqdm(validation_set.values, position=0, leave=True):
    for m in content_based_recommendation[movie]:
        try:
            pred_rating = svd.predict(user, m).est
        except:
            print(f"ERROR: pair({user},{movie}) | m: {m}")
            pred_rating = svd.default_prediction()
        
        hybrid_recommendation[user, movie].append((pred_rating, str(m)))    

100%|██████████| 2597428/2597428 [18:40<00:00, 2318.20it/s]  

CPU times: user 16min 13s, sys: 5min 43s, total: 21min 57s
Wall time: 18min 40s





In [16]:
%%time
hybrid_recommendation_sorted = {key: sorted(val, key=lambda x: x[0], reverse=True) for key, val in hybrid_recommendation.items()}

CPU times: user 1min 9s, sys: 2min 15s, total: 3min 24s
Wall time: 4min 28s


In [17]:
%%time
hybrid_recommendation_sorted_idonly = {key: [m for (r, m) in l] for (key, l) in hybrid_recommendation_sorted.items()}

CPU times: user 1min 4s, sys: 2min 37s, total: 3min 42s
Wall time: 5min


## Evaluation: Recall at 10 based on 40 movies

In [18]:
%%time
k =10
avg_recall = 0
for u, m, _ in tqdm(validation_set.values, position=0, leave=True):
    avg_recall += recall_at_k(user=u, 
                              prediction=hybrid_recommendation_sorted_idonly[u, m], 
                              k=10,
                              items_per_user=movies_per_user)

100%|██████████| 2597428/2597428 [01:05<00:00, 39868.90it/s]

CPU times: user 25 s, sys: 24.5 s, total: 49.5 s
Wall time: 1min 5s





In [19]:
# `avg_recall` is the sum accumulated over all validation rows; dividing by the
# number of rows (`shape[0]`) turns it into the mean recall@k.
print(f"Recall at {k}: {avg_recall / validation_set.shape[0]}")

Recall at 10: 0.05109804776144093
