In [13]:
from pathlib import Path
import pandas as pd
import torch
from tqdm import tqdm, trange

from metrics import mean_average_precision, precision_recall_at_k

In [2]:
folder = Path("data/data_kion")
train_data = pd.read_parquet(folder / "train_val_triplets.parquet")
test_data   = pd.read_parquet(folder / "test_pos.parquet")

# Top pop

In [3]:
recommendations = torch.tensor(train_data['film_pos'].value_counts()[:100].index.tolist())
recommendations

tensor([13387,  9393,  3599, 14769, 10072,   139,  8333,  4000,  1784,  7309,
        10839,  4337,  4300, 13933,  7159, 14197,  6576, 12539,  4279,  8852,
         6855, 15616,  7364,   636,  3067,  9651, 13821, 14235, 14927,  5496,
        12525, 12562,  4571,  3791,  3648, 11362, 11745,  7560,  5462, 12057,
         1761, 10908,  1247, 11338,  6394, 10388,  1728, 12295, 10096, 13960,
        13434,  9724, 14390,  1096,  7320, 10377, 12390,  4313,  5243, 10553,
        15716,  5232, 13863,   825, 11918,  6174,  3383, 15806,  6227,  6961,
        12781,  2844,  2161,  5940, 11888, 15538, 15369, 11333,  8315,  2958,
         5556, 12700,  2144, 11564,  4528,  8148,   283, 14738,  2625,  1506,
         8673,  8140,   355, 12177, 10068,  4550,  5112,  8512,  2222,  8895])

In [4]:
true_relevant = test_data.groupby('user_id')['film_pos'].apply(lambda x: torch.tensor(pd.unique(x).tolist()))
true_relevant

user_id
3         [tensor(5986), tensor(139), tensor(9879), tens...
17                                          [tensor(13986)]
23        [tensor(10855), tensor(1072), tensor(2265), te...
40                           [tensor(15870), tensor(15089)]
75                                [tensor(496), tensor(87)]
                                ...                        
879403    [tensor(496), tensor(10726), tensor(7179), ten...
879411    [tensor(11552), tensor(3599), tensor(6656), te...
879414    [tensor(3243), tensor(9393), tensor(15716), te...
879419                         [tensor(9393), tensor(9001)]
879476                                      [tensor(14927)]
Name: film_pos, Length: 44085, dtype: object

In [5]:
watched_before = train_data.groupby('user_id')['film_pos'].apply(lambda x: torch.tensor(pd.unique(x).tolist())).to_dict()
watched_before

{2: tensor([ 7309, 15616, 11167, 14671, 11274,  6542, 12509,  4317,  2846, 14738,
          1961,   325,  5686,  9764,  9178, 11129,  3413, 12295,  2844,  9016,
          6961,  5496, 12390,  6591, 13458,   543, 10628,   372,  6859,  8183,
          7646,  5616, 12006,  3496,   239,  8847, 13389, 15481]),
 3: tensor([ 3599,  9393, 10072,  4279,  4000, 13312, 10096,  7968,  2144, 15175]),
 9: tensor([ 9393, 13387]),
 12: tensor([10839, 13933, 15040, 10096]),
 17: tensor([ 9921, 15449,  4189, 11828,   825,   483, 10839,  1023,  4373, 13387,
          7201,   301, 12539, 11756,  4904, 12390,  8512,  6855,  6157,  7967,
          9092,  7309, 10072, 10490,  4337, 14098]),
 23: tensor([  655, 11975,  9393,  2643,  9504,  6997,  2919]),
 28: tensor([14769,  3599,   443, 10072, 12361,  9393]),
 32: tensor([13750]),
 35: tensor([10769,  4422,  8333,   139, 10458]),
 40: tensor([13387, 15558,  4458, 10148,  6376,   139,  3599, 11819]),
 47: tensor([ 5462,  8565,  1152, 12539, 13387, 10424,  129

In [6]:
pred_recs = [recommendations] * len(true_relevant)
filtered_pred_recs = [None] * len(true_relevant)
for i, user_id in enumerate(tqdm(true_relevant.index)):
    filtered_pred_recs[i] = recommendations[torch.isin(recommendations, watched_before.get(user_id, torch.tensor([], dtype=int)), invert=True)]

100%|██████████| 44085/44085 [00:01<00:00, 22669.78it/s]


In [7]:
true_relevant = true_relevant.tolist()

In [8]:
K = 10
N_JOBS = 4

Without filtering

In [9]:
precision_score, recall_score = precision_recall_at_k(true_relevant, pred_recs, k=K, n_jobs=N_JOBS)
map_score = mean_average_precision(true_relevant, pred_recs, k=K, n_jobs=N_JOBS)
f1_score = 2 * (precision_score * recall_score) / (precision_score + recall_score)
print(f'Precision@{K}: {precision_score:.5f} | Recall@{K}: {recall_score:.5f} | F1@{K}: {f1_score:.5f} | mAP@{K}: {map_score:.5f}')

                                                                                                

Precision@10: 0.02890 | Recall@10: 0.14585 | F1@10: 0.04824 | mAP@10: 0.06038


With filtering

In [10]:
ecision_score, recall_score = precision_recall_at_k(true_relevant, filtered_pred_recs, k=K, n_jobs=N_JOBS)
map_score = mean_average_precision(true_relevant, filtered_pred_recs, k=K, n_jobs=N_JOBS)
f1_score = 2 * (precision_score * recall_score) / (precision_score + recall_score)
print(f'Precision@{K}: {precision_score:.5f} | Recall@{K}: {recall_score:.5f} | F1@{K}: {f1_score:.5f} | mAP@{K}: {map_score:.5f}')

                                                                                               

Precision@10: 0.02890 | Recall@10: 0.14900 | F1@10: 0.04841 | mAP@10: 0.06728


# Random

In [11]:
all_train_films = torch.from_numpy(train_data['film_pos'].unique())
all_train_films

tensor([ 7128,  8172,  9767,  ..., 12475,   101,  5945], dtype=torch.int32)

In [14]:
rand_pred_recs = [None] * len(true_relevant)
for i in (trange(len(true_relevant))):
    rand_pred_recs[i] = all_train_films[torch.randperm(len(all_train_films))[:100]]

100%|██████████| 44085/44085 [00:04<00:00, 10601.17it/s]


In [15]:
precision_score, recall_score = precision_recall_at_k(true_relevant, rand_pred_recs, k=K, n_jobs=N_JOBS)
map_score = mean_average_precision(true_relevant, rand_pred_recs, k=K, n_jobs=N_JOBS)
f1_score = 2 * (precision_score * recall_score) / (precision_score + recall_score)
print(f'Precision@{K}: {precision_score:.5f} | Recall@{K}: {recall_score:.5f} | F1@{K}: {f1_score:.5f} | mAP@{K}: {map_score:.5f}')

                                                                                               

Precision@10: 0.00026 | Recall@10: 0.00108 | F1@10: 0.00042 | mAP@10: 0.00030
