# Caption: Feature Extraction and Similarity Computation for Content-Based Recommendation Systems

## Stages of Implementation:
1. Loading of the kuairec_caption_category.csv dataset
2. Cleaning of the chinese text
3. TF-IDF Vectorization
4. Cosine Similarity Calculation
5. Metrics Calculation

## Loading the dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# set plot size
plt.rcParams["figure.figsize"] = (20, 13)
%matplotlib inline
%config InlineBackend.figure_format = "retina"

train_data = pd.read_csv("dataset/kuairec/data/big_matrix.csv")
test_data = pd.read_csv("dataset/kuairec/data/small_matrix.csv")
categories = pd.read_csv("dataset/kuairec/data/item_categories.csv")
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)
train_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)
train_data = train_data[train_data["timestamp"] >= 0]
test_data = test_data[test_data["timestamp"] >= 0]

# Construct item feature matrix
items = train_data.groupby("video_id")
items = items.agg(
    {
        "user_id": "count",
        "video_duration": "mean",
        "timestamp": "max",
        "watch_ratio": "mean",
        "time": "max",
        "date": "max",
    }
)
items.drop(columns=["user_id"], inplace=True)
items.drop(columns=["timestamp"], inplace=True)
items.drop(columns=["watch_ratio"], inplace=True)
items.drop(columns=["date"], inplace=True)
categories.drop(columns=["feat"], inplace=True)

captions = pd.read_csv(
    "dataset/kuairec/data/kuairec_caption_category.csv",
    encoding='utf-8',
    na_values=[],
    keep_default_na=False,
    on_bad_lines='skip',   # skip problematic rows
    engine='python'
)

captions = captions.dropna()
captions = captions.drop_duplicates()
# print first_level_category_name != UNKNOWN
captions = captions[captions["first_level_category_name"] != "UNKNOWN"]
# get all the non empty tags
captions = captions[captions["topic_tag"] != "[]"]
# print number of lines
print(f"Number of lines in captions: {captions.shape[0]}")
# cast the video_id to int
captions['video_id'] = captions['video_id'].astype(int)
# cast the video_id to int

# Merge captions with item features
categories = pd.merge(categories, captions, left_on='video_id', right_on='video_id', how='left')

# Display the merged dataset
print(f"Content features shape: {categories.shape}")

Number of lines in captions: 6097
Content features shape: (10728, 10)


## Cleaning the dataset

In [2]:
import re
import nltk
from nltk.corpus import stopwords

try:
    nltk.data.find('corpora/stopwords')
except:
    nltk.download('stopwords')

# Function to clean text
def clean_text(text):
    if not isinstance(text, str) or text == '' or text == 'UNKNOWN':
        return ''
        
    # Keep original Chinese characters but remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text.lower())
    return text.strip()

# Apply cleaning to caption and manual_cover_text
categories['clean_caption'] = categories['caption'].apply(clean_text)
categories['clean_manual_cover'] = categories['manual_cover_text'].apply(clean_text)

# Combine text features
categories['combined_text'] = categories['clean_caption'] + ' ' + categories['clean_manual_cover']
categories['combined_text'] = categories['combined_text'].str.strip()

categories.head()

Unnamed: 0,video_id,manual_cover_text,caption,topic_tag,first_level_category_id,first_level_category_name,second_level_category_id,second_level_category_name,third_level_category_id,third_level_category_name,clean_caption,clean_manual_cover,combined_text
0,0,,,,,,,,,,,,
1,1,,,,,,,,,,,,
2,2,,,,,,,,,,,,
3,3,,,,,,,,,,,,
4,4,五爱街最美美女 一天1q,#搞笑 #感谢快手我要上热门 #五爱市场 这真是完美搭配啊！,"[五爱市场,感谢快手我要上热门,搞笑]",5.0,时尚,737.0,营销售卖,2596.0,女装,搞笑 感谢快手我要上热门 五爱市场 这真是完美搭配啊,五爱街最美美女 一天1q,搞笑 感谢快手我要上热门 五爱市场 这真是完美搭配啊 五爱街最美美女 一天1q


In [3]:
sub_categories = categories[['video_id', 'combined_text', 'topic_tag']]
categories.drop(columns=["topic_tag"], inplace=True)


from sklearn.feature_extraction.text import TfidfVectorizer

# Apply TF-IDF to the combined text
# Replace empty strings with NaN and then with a space to avoid vectorizer issues
sub_categories['combined_text'].replace('', np.nan, inplace=True)
sub_categories['combined_text'].fillna(' ', inplace=True)

video_ids = sub_categories['video_id'].reset_index(drop=True)

# Create TF-IDF vectorizer
# For Chinese text, we'll use character-level n-grams
tfidf = TfidfVectorizer(
    analyzer='char_wb',  # Character n-grams with word boundaries
    ngram_range=(1, 3),  # Use 1 to 3-character n-grams
    max_features=1000,   # Limit features to manage dimensionality
    min_df=5,            # Minimum document frequency
    max_df=0.7          # Maximum document frequency
)

text_features = tfidf.fit_transform(sub_categories['combined_text'])

# Convert to DataFrame and add video_id
text_feature_names = ['text_' + str(i) for i in range(text_features.shape[1])]
tfidf_df = pd.DataFrame(
    text_features.toarray(),
    columns=text_feature_names
)
tfidf_df['video_id'] = video_ids

# Optionally, move video_id to the front
cols = ['video_id'] + [c for c in tfidf_df.columns if c != 'video_id']
tfidf_df = tfidf_df[cols]

# Done!
print(f"Shape of TF-IDF features: {tfidf_df.shape}")
# Merge TF-IDF features with categories
categories = pd.merge(categories, tfidf_df, on='video_id', how='left')
# make vidoe_id as index
# find column where type is string
string_columns = categories.select_dtypes(include=['object']).columns
# drop all string columns
categories.drop(columns=string_columns, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sub_categories['combined_text'].replace('', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_categories['combined_text'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True

Shape of TF-IDF features: (10728, 1001)


## Cosine Similarity Calculation

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
# replace NaN with 0
categories.fillna(0, inplace=True)
item_similarity = cosine_similarity(categories.drop(columns=['video_id']), categories.drop(columns=['video_id']))
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=categories['video_id'],
    columns=categories['video_id']
)

In [5]:
from collections import defaultdict
def generate_top_n_recommendations_from_logs(interactions_df, cosine_sim, video_ids, N=10, min_watch_ratio=0.5):
    video_ids = pd.Index(video_ids)  # FIX HERE

    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]
    user_histories = filtered.groupby('user_id')['video_id'].apply(set).to_dict()

    recommendations = {}

    for user, watched in user_histories.items():
        watched_indices = [video_ids.get_loc(v) for v in watched if v in video_ids]
        if not watched_indices:
            recommendations[user] = []
            continue

        sim_scores = np.array(np.sum(cosine_sim[watched_indices], axis=0)).ravel()

        for v in watched:
            if v in video_ids:
                sim_scores[video_ids.get_loc(v)] = -1

        top_indices = np.argsort(sim_scores)[::-1][:N]
        top_videos = video_ids[top_indices].tolist()
        recommendations[user] = top_videos

    return recommendations

# video_ids = categories['video_id'].reset_index(drop=True)  # Must match cosine_sim
# video_id_set = set(video_ids)
# N = 10
# recs_train_category = generate_top_n_recommendations_from_logs(
#     interactions_df=train_data,
#     cosine_sim=item_similarity,
#     # video_ids=categories['video_id'].reset_index(drop=True),
#     video_ids=pd.Index(train_data['video_id'].unique()),
#     N=N,
#     min_watch_ratio=0.75
# )

def prepare_test_ground_truth(interactions_df, min_watch_ratio=0.75):
    """
    Return user -> set of relevant video_ids from test interactions.
    """
    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]
    return filtered.groupby('user_id')['video_id'].apply(set).to_dict()

test_truth = prepare_test_ground_truth(
    interactions_df=test_data,
    min_watch_ratio=0.75
)

from sklearn.metrics import ndcg_score
def hit_rate_log(recommendations, test_ground_truth):
    hits, total = 0, 0
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        hits += len(set(recs) & true_items)
        total += len(true_items)
    return hits / total if total else 0

def precision_at_k_log(recommendations, test_ground_truth, k=10):
    precisions = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        hits = len(set(recs[:k]) & true_items)
        precisions.append(hits / k)
    return np.mean(precisions) if precisions else 0

def ndcg_at_k_log(recommendations, test_ground_truth, k=10):
    ndcgs = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        y_true = [1 if vid in true_items else 0 for vid in recs[:k]]
        y_score = list(range(k, 0, -1))
        ndcgs.append(ndcg_score([y_true], [y_score]))
    return np.mean(ndcgs) if ndcgs else 0

def mrr_log(recommendations, test_ground_truth):
    rr = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        for rank, vid in enumerate(recs, 1):
            if vid in true_items:
                rr.append(1 / rank)
                break
        else:
            rr.append(0)
    return np.mean(rr) if rr else 0



## Evaluation Results

In [None]:
N = [10, 20, 50, 100, 500]
for n in N:
    recs_train_category = generate_top_n_recommendations_from_logs(
        interactions_df=train_data,
        cosine_sim=item_similarity,
        video_ids=pd.Index(train_data['video_id'].unique()),
        N=n,
        min_watch_ratio=0.75
    )

    prec  = precision_at_k_log(recs_train_category, test_truth, k=n)
    ndcg = ndcg_at_k_log(recs_train_category, test_truth, k=n)
    mrr = mrr_log(recs_train_category, test_truth)
    print(f"Precision@{n} for category sim: {prec:.4f}")
    print(f"NDCG@{n} for category sim: {ndcg:.4f}")
    print(f"MRR@{n} for category sim: {mrr:.4f}")
    print("-" * 50)

Precision@10 for category sim: 0.1748
NDCG@10 for category sim: 0.4423
MRR@10 for category sim: 0.2832
--------------------------------------------------
