# Categories: Feature Extraction and Similarity Computation

We now focus on the category of each video.

## Stages of Implementation:
1. Data Loading and Pre-processing
2. One Hot Encoding of Categories
3. Cosine Similarity Computation
4. Metrics Calculation

In [5]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# set plot size
plt.rcParams["figure.figsize"] = (20, 13)
%matplotlib inline
%config InlineBackend.figure_format = "retina"

train_data = pd.read_csv("dataset/kuairec/data/big_matrix.csv")
test_data = pd.read_csv("dataset/kuairec/data/small_matrix.csv")
categories = pd.read_csv("dataset/kuairec/data/item_categories.csv")

train_data.dropna(inplace=True)
test_data.dropna(inplace=True)
train_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)
train_data = train_data[train_data["timestamp"] >= 0]
test_data = test_data[test_data["timestamp"] >= 0]

# Construct item feature matrix
items = train_data.groupby("video_id")
items = items.agg(
    {
        "user_id": "count",
        "video_duration": "mean",
        "timestamp": "max",
        "watch_ratio": "mean",
        "time": "max",
        "date": "max",
    }
)
items.drop(columns=["user_id"], inplace=True)
items.drop(columns=["timestamp"], inplace=True)
items.drop(columns=["watch_ratio"], inplace=True)
items.drop(columns=["date"], inplace=True)

# videos represent by their categories
import ast

for i in range (31):
    categories['category_' + str(i)] = 0

categories['feat'] = categories['feat'].apply(ast.literal_eval)
for index, row in categories.iterrows():
    # Get the list of features for the current row
    features = row['feat']

    # Set the corresponding category columns to 1
    for feat in features:
        col_name = f'category_{feat}'
        if col_name in categories.columns:
            categories.at[index, col_name] = 1
categories.drop(columns=['feat'], inplace=True)



In [6]:
from sklearn.metrics.pairwise import cosine_similarity
item_similarity = cosine_similarity(categories.drop(columns=['video_id']), categories.drop(columns=['video_id']))
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=categories['video_id'],
    columns=categories['video_id']
)

In [7]:
from collections import defaultdict
def generate_top_n_recommendations_from_logs(interactions_df, cosine_sim, video_ids, N=10, min_watch_ratio=0.5):
    video_ids = pd.Index(video_ids)  # FIX HERE

    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]
    user_histories = filtered.groupby('user_id')['video_id'].apply(set).to_dict()

    recommendations = {}

    for user, watched in user_histories.items():
        watched_indices = [video_ids.get_loc(v) for v in watched if v in video_ids]
        if not watched_indices:
            recommendations[user] = []
            continue

        sim_scores = np.array(np.sum(cosine_sim[watched_indices], axis=0)).ravel()

        for v in watched:
            if v in video_ids:
                sim_scores[video_ids.get_loc(v)] = -1

        top_indices = np.argsort(sim_scores)[::-1][:N]
        top_videos = video_ids[top_indices].tolist()
        recommendations[user] = top_videos

    return recommendations


In [8]:
def prepare_test_ground_truth(interactions_df, min_watch_ratio=0.75):
    """
    Return user -> set of relevant video_ids from test interactions.
    """
    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]
    return filtered.groupby('user_id')['video_id'].apply(set).to_dict()

test_truth = prepare_test_ground_truth(
    interactions_df=test_data,
    min_watch_ratio=0.75
)

from sklearn.metrics import ndcg_score
def hit_rate_log(recommendations, test_ground_truth):
    hits, total = 0, 0
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        hits += len(set(recs) & true_items)
        total += len(true_items)
    return hits / total if total else 0

def precision_at_k_log(recommendations, test_ground_truth, k=10):
    precisions = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        hits = len(set(recs[:k]) & true_items)
        precisions.append(hits / k)
    return np.mean(precisions) if precisions else 0

def ndcg_at_k_log(recommendations, test_ground_truth, k=10):
    ndcgs = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        y_true = [1 if vid in true_items else 0 for vid in recs[:k]]
        y_score = list(range(k, 0, -1))
        ndcgs.append(ndcg_score([y_true], [y_score]))
    return np.mean(ndcgs) if ndcgs else 0

def mrr_log(recommendations, test_ground_truth):
    rr = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        for rank, vid in enumerate(recs, 1):
            if vid in true_items:
                rr.append(1 / rank)
                break
        else:
            rr.append(0)
    return np.mean(rr) if rr else 0

In [9]:
N = [10, 20, 50, 100]
for n in N:
    recs_train_category = generate_top_n_recommendations_from_logs(
        interactions_df=train_data,
        cosine_sim=item_similarity,
        video_ids=pd.Index(train_data['video_id'].unique()),
        N=n,
        min_watch_ratio=0.75
    )

    prec  = precision_at_k_log(recs_train_category, test_truth, k=n)
    ndcg = ndcg_at_k_log(recs_train_category, test_truth, k=n)
    mrr = mrr_log(recs_train_category, test_truth)
    print(f"Precision@{n} for category sim: {prec:.4f}")
    print(f"NDCG@{n} for category sim: {ndcg:.4f}")
    print(f"MRR@{n} for category sim: {mrr:.4f}")
    print("-" * 50)

Precision@10 for category sim: 0.2243
NDCG@10 for category sim: 0.5511
MRR@10 for category sim: 0.4385
--------------------------------------------------
Precision@20 for category sim: 0.1934
NDCG@20 for category sim: 0.5742
MRR@20 for category sim: 0.4447
--------------------------------------------------
Precision@50 for category sim: 0.1596
NDCG@50 for category sim: 0.5768
MRR@50 for category sim: 0.4458
--------------------------------------------------
Precision@100 for category sim: 0.1747
NDCG@100 for category sim: 0.5849
MRR@100 for category sim: 0.4458
--------------------------------------------------
