# Content-Based Recommendation System for KuaiRec Dataset

This notebook implements a content-based filtering approach for video recommendations using the KuaiRec dataset. Content-based filtering recommends items based on their features and characteristics rather than user interactions. In this implementation, we'll use video metadata (captions, categories, etc.) to create feature vectors for each video and recommend similar videos based on these features.

## Stages of Implementation:
1. Data Loading and Pre-processing
2. Feature Engineering and Extraction
3. Creating Content-Based Filters
4. Similarity Computation
5. Building the Recommendation Engine
6. Evaluation and Analysis
7. Discussion and Improvements

In [87]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# set plot size
plt.rcParams["figure.figsize"] = (20, 13)
%matplotlib inline
%config InlineBackend.figure_format = "retina"

In [88]:
train_data = pd.read_csv("dataset/kuairec/data/big_matrix.csv")

test_data = pd.read_csv("dataset/kuairec/data/small_matrix.csv")

categories = pd.read_csv("dataset/kuairec/data/item_categories.csv")

In [89]:
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)
train_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)
train_data = train_data[train_data["timestamp"] >= 0]
test_data = test_data[test_data["timestamp"] >= 0]

In [90]:
test_data

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1.593898e+09,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1.593898e+09,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1.593898e+09,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1.593898e+09,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1.593899e+09,0.418364
...,...,...,...,...,...,...,...,...
4676370,7162,9177,5315,37205,2020-09-01 20:06:35.984,20200901.0,1.598962e+09,0.142857
4676371,7162,4987,10085,8167,2020-09-02 14:44:51.342,20200902.0,1.599029e+09,1.234848
4676372,7162,7988,50523,49319,2020-09-03 08:45:01.474,20200903.0,1.599094e+09,1.024412
4676373,7162,6533,2190,8000,2020-09-04 22:56:32.021,20200904.0,1.599231e+09,0.273750


In [91]:
train_data

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1.593879e+09,1.273397
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1.593879e+09,1.244082
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1.593879e+09,0.107613
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1.593880e+09,0.089885
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1.593881e+09,0.078000
...,...,...,...,...,...,...,...,...
12530801,7175,1281,34618,140017,2020-09-05 15:07:10.576,20200905,1.599290e+09,0.247241
12530802,7175,3407,12619,21888,2020-09-05 15:08:45.228,20200905,1.599290e+09,0.576526
12530803,7175,10360,2407,7067,2020-09-05 19:10:29.041,20200905,1.599304e+09,0.340597
12530804,7175,10360,6455,7067,2020-09-05 19:10:36.995,20200905,1.599304e+09,0.913400


In [92]:
# Construct item feature matrix
items = train_data.groupby("video_id")
items = items.agg(
    {
        "user_id": "count",
        "video_duration": "mean",
        "timestamp": "max",
        "watch_ratio": "mean",
        "time": "max",
        "date": "max",
    }
)
items.drop(columns=["user_id"], inplace=True)
items.drop(columns=["timestamp"], inplace=True)
items.drop(columns=["watch_ratio"], inplace=True)
items.drop(columns=["date"], inplace=True)
items

Unnamed: 0_level_0,video_duration,time
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5968.000000,2020-09-04 05:35:08.164
1,25264.000000,2020-07-10 13:43:10.133
2,8034.000000,2020-08-02 08:47:29.037
3,23267.000000,2020-07-12 19:54:25.605
4,18228.000000,2020-07-09 04:59:09.775
...,...,...
10723,4847.500000,2020-09-05 17:45:32.098
10724,54765.000000,2020-09-05 17:48:20.05
10725,15829.454545,2020-09-05 22:37:11.064
10726,5152.250000,2020-09-05 21:18:09.055


In [93]:
categories

Unnamed: 0,video_id,feat
0,0,[8]
1,1,"[27, 9]"
2,2,[9]
3,3,[26]
4,4,[5]
...,...,...
10723,10723,[11]
10724,10724,[2]
10725,10725,[15]
10726,10726,[19]


In [94]:
# videos represent by their categories
import ast

for i in range (31):
    categories['category_' + str(i)] = 0

categories['feat'] = categories['feat'].apply(ast.literal_eval)
for index, row in categories.iterrows():
    # Get the list of features for the current row
    features = row['feat']

    # Set the corresponding category columns to 1
    for feat in features:
        col_name = f'category_{feat}'
        if col_name in categories.columns:
            categories.at[index, col_name] = 1
categories.drop(columns=['feat'], inplace=True)


In [95]:
categories

Unnamed: 0,video_id,category_0,category_1,category_2,category_3,category_4,category_5,category_6,category_7,category_8,...,category_21,category_22,category_23,category_24,category_25,category_26,category_27,category_28,category_29,category_30
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10723,10723,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10724,10724,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10725,10725,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10726,10726,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
from sklearn.metrics.pairwise import cosine_similarity
item_similarity = cosine_similarity(categories.drop(columns=['video_id']), categories.drop(columns=['video_id']))
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=categories['video_id'],
    columns=categories['video_id']
)
item_similarity_df

video_id,0,1,2,3,4,5,6,7,8,9,...,10718,10719,10720,10721,10722,10723,10724,10725,10726,10727
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.000000,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.707107,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10723,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.707107,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10724,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10725,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10726,0.0,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [97]:
from collections import defaultdict
def generate_top_n_recommendations_from_logs(interactions_df, cosine_sim, video_ids, N=10, min_watch_ratio=0.5):
    video_ids = pd.Index(video_ids)  # FIX HERE

    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]
    user_histories = filtered.groupby('user_id')['video_id'].apply(set).to_dict()

    recommendations = {}

    for user, watched in user_histories.items():
        watched_indices = [video_ids.get_loc(v) for v in watched if v in video_ids]
        if not watched_indices:
            recommendations[user] = []
            continue

        sim_scores = np.array(np.sum(cosine_sim[watched_indices], axis=0)).ravel()

        for v in watched:
            if v in video_ids:
                sim_scores[video_ids.get_loc(v)] = -1

        top_indices = np.argsort(sim_scores)[::-1][:N]
        top_videos = video_ids[top_indices].tolist()
        recommendations[user] = top_videos

    return recommendations


In [98]:


recs = generate_top_n_recommendations_from_logs(
    train_data,
    item_similarity,
    categories['video_id'].reset_index(drop=True),  # optional safety
    N=10,
    min_watch_ratio=0.5
)

# Check the type of the recommendations variable
print(f"Type of recs: {type(recs)}")
print(f"Number of users with recommendations: {len(recs)}")
# Display first 3 recommendations if available
if len(recs) > 0:
    first_user = list(recs.keys())[0]
    print(f"Example recommendations for user {first_user}: {recs[first_user][:5]}...")

Type of recs: <class 'dict'>
Number of users with recommendations: 7176
Example recommendations for user 0: [705, 8899, 4717, 555, 591]...


In [99]:
# Build ground truth from test data - using the same threshold as training (0.5)
THRESHOLD = 0.75
high_watch_test = test_data[test_data['watch_ratio'] >= THRESHOLD]
test_truth = high_watch_test.groupby('user_id')['video_id'].apply(set).to_dict()

In [105]:
N = 10
recs_train_category = generate_top_n_recommendations_from_logs(
    interactions_df=train_data,
    cosine_sim=item_similarity,
    # video_ids=categories['video_id'].reset_index(drop=True),
    video_ids=pd.Index(train_data['video_id'].unique()),
    N=N,
    min_watch_ratio=0.75
)

In [106]:
def prepare_test_ground_truth(interactions_df, min_watch_ratio=0.75):
    """
    Return user -> set of relevant video_ids from test interactions.
    """
    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]
    return filtered.groupby('user_id')['video_id'].apply(set).to_dict()

test_truth = prepare_test_ground_truth(
    interactions_df=test_data,
    min_watch_ratio=0.75
)


In [107]:
from sklearn.metrics import ndcg_score
def hit_rate_log(recommendations, test_ground_truth):
    hits, total = 0, 0
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        hits += len(set(recs) & true_items)
        total += len(true_items)
    return hits / total if total else 0

def precision_at_k_log(recommendations, test_ground_truth, k=10):
    precisions = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        hits = len(set(recs[:k]) & true_items)
        precisions.append(hits / k)
    return np.mean(precisions) if precisions else 0

def ndcg_at_k_log(recommendations, test_ground_truth, k=10):
    ndcgs = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        y_true = [1 if vid in true_items else 0 for vid in recs[:k]]
        y_score = list(range(k, 0, -1))
        ndcgs.append(ndcg_score([y_true], [y_score]))
    return np.mean(ndcgs) if ndcgs else 0

def mrr_log(recommendations, test_ground_truth):
    rr = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        for rank, vid in enumerate(recs, 1):
            if vid in true_items:
                rr.append(1 / rank)
                break
        else:
            rr.append(0)
    return np.mean(rr) if rr else 0


In [108]:
hit_rate = hit_rate_log(recs_train_category, test_truth)
prec  = precision_at_k_log(recs_train_category, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_category, test_truth, k=N)
mrr = mrr_log(recs_train_category, test_truth)
print(f"Hit Rate@{N} for category sim: {hit_rate:.4f}")
print(f"Precision@{N} for category sim: {prec:.4f}")
print(f"NDCG@{N} for category sim: {ndcg:.4f}")
print(f"MRR@{N} for category sim: {mrr:.4f}")
print("---")

Hit Rate@10 for category sim: 0.0014
Precision@10 for category sim: 0.2243
NDCG@10 for category sim: 0.5511
MRR@10 for category sim: 0.4385
---
