In [13]:
import pandas as pd
import sys
from pathlib import Path

# Make the parent of the current working directory importable so that the
# `src` package below can be resolved when the notebook runs from its own folder.
project_root = Path("..").resolve()
sys.path.append(str(project_root))

from src.recommender.hybrid import hybrid_recommender

# Read the TMDB titles CSV into `df` and preview the first rows.
titles_csv = "../data/tmdb_titles.csv"
df = pd.read_csv(titles_csv)
df.head()


Unnamed: 0,tmdb_id,title,overview,genres,popularity,vote_average,vote_count,language,release_date
0,1306368,The Rip,Trust frays when a team of Miami cops discover...,"['Action', 'Thriller', 'Crime']",566.971,7.11,495,en,2026-01-13
1,1242898,Predator: Badlands,"Cast out from his clan, a young Predator finds...","['Action', 'Science Fiction', 'Adventure']",320.969,7.799,1540,en,2025-11-05
2,83533,Avatar: Fire and Ash,In the wake of the devastating war against the...,"['Science Fiction', 'Adventure', 'Fantasy']",264.0534,7.346,1521,en,2025-12-17
3,1242501,Icefall,A young Indigenous game warden arrests an infa...,"['Action', 'Crime', 'Thriller']",213.1218,6.489,87,en,2025-10-16
4,1043197,Dust Bunny,Ten-year-old Aurora asks her hitman neighbor t...,"['Action', 'Fantasy', 'Thriller']",302.2415,6.871,68,en,2025-12-11


In [14]:
# Schema guard: every column named here must be present in the loaded frame.
expected_cols = {"title", "overview", "genres", "popularity", "language"}

# An empty difference means all expected columns were found.
missing_cols = expected_cols.difference(df.columns)
assert not missing_cols


In [15]:
# Display the column labels of the loaded frame.
df.columns

Index(['tmdb_id', 'title', 'overview', 'genres', 'popularity', 'vote_average',
       'vote_count', 'language', 'release_date'],
      dtype='object')

In [16]:
import ast


def _parse_genres(value):
    """Turn a stringified genre list into a real Python list.

    The CSV stores each genre list as its repr (e.g. "['Action', 'Crime']").
    Values that are already lists are returned untouched, so this cell can be
    re-run on the same frame without ``ast.literal_eval`` raising on a
    non-string input. Any other value is still passed to ``ast.literal_eval``
    exactly as before, so malformed entries continue to raise.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


df["genres"] = df["genres"].apply(_parse_genres)


In [17]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Missing overviews are encoded as empty strings rather than dropped, so the
# list of texts stays aligned with the rows of `df`.
overview_texts = df["overview"].fillna("").tolist()
embeddings = model.encode(overview_texts, show_progress_bar=True)

# list(...) splits the encoder output along its first axis, giving one entry
# per row for the new column.
df["embedding"] = list(embeddings)


Batches: 100%|██████████| 4/4 [00:01<00:00,  3.04it/s]


In [29]:
# Use the title in the first row of `df` as the query for the recommender.
sample_title = df["title"].iat[0]
recs = hybrid_recommender(df, title=sample_title, top_n=5)

# Show only the columns needed to eyeball the recommendations.
display_cols = ["title", "genres", "popularity"]
recs[display_cols]


Unnamed: 0,title,genres,popularity
3,Icefall,"[Action, Crime, Thriller]",213.1218
30,Yadang: The Snitch,"[Crime, Action, Comedy, Drama, Thriller]",67.9816
12,Trap House,"[Action, Crime, Thriller]",100.9872
14,The Shadow's Edge,"[Action, Crime, Drama, Thriller]",102.2578
19,Risqué,"[Action, Crime, Thriller]",86.203


In [28]:
import numpy as np
from src.recommender.explanations import explain_similarity

# `target` is the item whose recommendation is being explained. It was never
# assigned in the notebook (the cell only worked from leftover kernel state),
# so define it explicitly as the top-ranked row of `recs`.
# NOTE(review): the row is looked up in `df` by index label, which assumes
# hybrid_recommender keeps df's index labels (the displayed `recs` output is
# consistent with that) — confirm, and confirm the top recommendation is the
# intended target.
target = df.loc[recs.index[0]]

# Both embeddings are reshaped to 2-D arrays of shape (1, dim) before the call.
target_embedding = np.array(target["embedding"], dtype=float).reshape(1, -1)
history_embedding = np.array(
    df.loc[df["title"] == sample_title, "embedding"].values[0],  # first row with this title
    dtype=float
).reshape(1, -1)

# The "history" here is a single title: the one used to query the recommender.
similar_titles = explain_similarity(
    item_embedding=target_embedding,
    user_history_embeddings=history_embedding,  # 2-D array, one row per history title
    user_history_titles=[sample_title]
)

similar_titles

['The Rip']

In [24]:
# Sanity check: print the shape of each array passed to explain_similarity.
for checked_embedding in (target_embedding, history_embedding):
    print(checked_embedding.shape)


(1, 384)
(1, 384)


In [30]:
# Persist the frame, including the parsed `genres` and the `embedding` column
# added in earlier cells, as a pickle file.
df.to_pickle("../data/tmdb_titles_enriched.pkl")