In [1]:
# Setup: project path and OMDb API key
import os
import sys
from pathlib import Path

# Ensure project root is on sys.path so `import app` works when running from notebooks/
PROJECT_ROOT = Path.cwd().resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Configure OMDb API key for experiments.
# Prefer setting OMDB_API_KEY in your environment or `.env` file.
# For quick experiments, you can set it here (replace the placeholder):
OMDB_API_KEY = os.environ.get("OMDB_API_KEY") or "YOUR_OMDB_API_KEY_HERE"
os.environ["OMDB_API_KEY"] = OMDB_API_KEY
print("OMDB_API_KEY set:", "present" if bool(os.environ.get("OMDB_API_KEY")) and os.environ.get("OMDB_API_KEY") != "YOUR_OMDB_API_KEY_HERE" else "missing")

OMDB_API_KEY set: present


In [11]:
# Imports, async runner, and loading the recommender
import asyncio
from typing import Any

from app.services.recommender import recommend_movies


def run(coro):
    """Run an async coroutine from a notebook cell, returning its result."""
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop and loop.is_running():
        # If already in an event loop (e.g., IPython), create a new task
        return asyncio.ensure_future(coro)
    return asyncio.run(coro)


def show_results(items: list[dict[str, Any]]):
    if not items:
        print("No results.")
        return
    for i, m in enumerate(items, 1):
        print(f"{i}. {m.get('title')}  |  Rating: {m.get('vote_average')}  |  Released: {m.get('release_date')}")
        if m.get("overview"):
            print(f"   {m['overview'][:200]}{'...' if len(m['overview'])>200 else ''}")
        if m.get("poster_url"):
            print(f"   Poster: {m['poster_url']}")
        print()


In [3]:
# Example 1: Keyword strategy (default)
MOOD = "gory fun"
LIMIT = 6

movies = run(recommend_movies(mood=MOOD, limit=LIMIT, strategy="keyword"))

# If we're in a running loop (Jupyter), `run` may return a Task. Await it to get results.
try:
    from asyncio import Future
    if hasattr(movies, "__await__") or isinstance(movies, Future):
        movies = await movies
except Exception:
    pass

show_results(movies)


1. Return to Horror High  |  Rating: 4.3  |  Released: 20 Feb 1987
   In the early 1980s, a series of gruesome murders occurred at Crippen High School. A few years later, a film crew uses the now-abandoned Crippen High as the set for a film. But an uninvited guest makes...
   Poster: https://m.media-amazon.com/images/M/MV5BYjM0ZjJjNDctM2IwMC00MzNkLWFlMWQtMGMzNmI1N2JmOTg0XkEyXkFqcGc@._V1_SX300.jpg

2. Horror Express  |  Rating: 6.5  |  Released: 03 Jan 1974
   While on the Trans-Siberian Express, an anthropologist and his rival must contain the threat posed by the former's cargo: a prehistoric ape which is the host for a parasitic life-form.
   Poster: https://m.media-amazon.com/images/M/MV5BMWY4MzczOWUtYTdlYS00MjNlLTg1ODEtYjkzOTY0MTNmZmEyXkEyXkFqcGc@._V1_SX300.jpg

3. A Classic Horror Story  |  Rating: 5.7  |  Released: 14 Jul 2021
   In this gruesome suspense film, strangers traveling in southern Italy become stranded in the woods, where they must fight desperately to get out alive.
 

In [4]:
# Example 2: Embedding/TF-IDF strategy
MOOD = "claustrophobic psychological"
LIMIT = 6

movies = run(recommend_movies(mood=MOOD, limit=LIMIT, strategy="embedding"))

try:
    from asyncio import Future
    if hasattr(movies, "__await__") or isinstance(movies, Future):
        movies = await movies
except Exception:
    pass

show_results(movies)


1. The Vault of Horror  |  Rating: 6.5  |  Released: 30 Mar 1973
   An anthology of five horror stories shared by five men trapped in the basement of an office building.
   Poster: https://m.media-amazon.com/images/M/MV5BODQzOGQzYmMtZTU4Ny00MjcwLWEyOTctNGQ0OWQxYTQ4ZjIwXkEyXkFqcGc@._V1_SX300.jpg

2. The Last Horror Movie  |  Rating: 5.5  |  Released: 13 May 2005
   A serial killer uses a horror video rental to lure his next victim. What begins as a teen slasher transforms into a disturbing journey through the mind of Max Parry, a mild mannered wedding photograph...
   Poster: https://m.media-amazon.com/images/M/MV5BOTI3NDcxNDM0Nl5BMl5BanBnXkFtZTgwNTA1NzAwMTE@._V1_SX300.jpg

3. Nosferatu: A Symphony of Horror  |  Rating: 7.8  |  Released: 19 Dec 1922
   Vampire Count Orlok expresses interest in a new residence and real estate agent Hutter's wife.
   Poster: https://m.media-amazon.com/images/M/MV5BNDg1OTI1M2MtMTVlMS00ZjFhLTgyMTAtYjIzOWUwZTkyZWE5XkEyXkFqcGc@._V1_SX300.jpg

4. Horror Expres

In [5]:
# Improved OMDb helpers: plot=full, multi-page search, type/year filters
import os
from collections.abc import Iterable
from typing import Any

import httpx

OMDB_BASE_URL = "https://www.omdbapi.com/"


async def omdb_get(params: dict[str, Any]) -> dict[str, Any]:
    key = os.environ.get("OMDB_API_KEY") or ""
    merged = {"apikey": key}
    merged.update(params)
    async with httpx.AsyncClient(timeout=httpx.Timeout(12.0, connect=5.0)) as client:
        resp = await client.get(OMDB_BASE_URL, params=merged)
        resp.raise_for_status()
        data = resp.json()
    if isinstance(data, dict) and data.get("Response") == "False":
        return {}
    return data


async def search_ids(query: str, *, types: Iterable[str] = ("movie",), pages: int = 3, year: int | None = None) -> list[str]:
    ids: list[str] = []
    for t in types:
        for page in range(1, max(1, pages) + 1):
            params: dict[str, Any] = {"s": query, "type": t, "page": page}
            if year:
                params["y"] = year
            data = await omdb_get(params)
            results = data.get("Search") if isinstance(data, dict) else None
            for item in results or []:
                imdb_id = item.get("imdbID")
                if isinstance(imdb_id, str):
                    ids.append(imdb_id)
    # De-duplicate while preserving order
    return list(dict.fromkeys(ids))


async def get_detail_full(imdb_id: str) -> dict[str, Any]:
    return await omdb_get({"i": imdb_id, "plot": "full"})


async def fetch_details(ids: list[str], *, horror_only: bool = True) -> list[dict[str, Any]]:
    details: list[dict[str, Any]] = []
    for imdb_id in ids:
        d = await get_detail_full(imdb_id)
        if not d:
            continue
        genre = (d.get("Genre") or "").lower()
        if horror_only and "horror" not in genre:
            continue
        poster = d.get("Poster")
        poster_url = poster if poster and poster != "N/A" else None
        details.append(
            {
                "imdb_id": d.get("imdbID"),
                "type": (d.get("Type") or d.get("Type") or "movie"),
                "title": d.get("Title"),
                "overview": d.get("Plot") or "",
                "poster_url": poster_url,
                "release_date": d.get("Released"),
                "year": d.get("Year"),
                "vote_average": float(d.get("imdbRating") or 0) if (d.get("imdbRating") and d.get("imdbRating") != "N/A") else None,
                "imdb_votes_raw": d.get("imdbVotes"),
                "metascore_raw": d.get("Metascore"),
                "ratings": d.get("Ratings"),
                "awards": d.get("Awards"),
                "genre": d.get("Genre"),
                "language": d.get("Language"),
                "country": d.get("Country"),
            }
        )
    return details


In [6]:
# Scoring and filters: votes-aware score, year band, language, type
from math import log


def parse_int(s: str | None) -> int:
    if not s:
        return 0
    try:
        return int(str(s).replace(",", "").strip())
    except Exception:
        return 0


def parse_float(s: str | None) -> float:
    if not s or s == "N/A":
        return 0.0
    try:
        return float(s)
    except Exception:
        return 0.0


def popularity_score(item: dict[str, Any]) -> float:
    rating = float(item.get("vote_average") or 0.0)
    votes = parse_int(item.get("imdb_votes_raw"))
    metascore = parse_int(item.get("metascore_raw"))
    # Weighted score combining IMDb rating, votes and Metascore
    return rating * (1 + log(1 + votes)) + 0.02 * metascore


def filter_item(item: dict[str, Any], *, min_year: int | None = None, max_year: int | None = None, languages: set[str] | None = None, types: set[str] | None = None) -> bool:
    year_str = item.get("year") or ""
    try:
        year_int = int(str(year_str)[:4]) if year_str else None
    except Exception:
        year_int = None
    if min_year is not None and (year_int is None or year_int < min_year):
        return False
    if max_year is not None and (year_int is None or year_int > max_year):
        return False
    if languages:
        lang = (item.get("language") or "").lower()
        if not any(l.lower() in lang for l in languages):
            return False
    if types:
        t = (item.get("type") or "movie").lower()
        if t not in types:
            return False
    return True


In [7]:
# Enhanced recommend using improved helpers, filters, and scoring
from random import sample


async def recommend_enhanced(
    mood: str,
    *,
    limit: int = 6,
    types: tuple[str, ...] = ("movie",),
    min_year: int | None = None,
    max_year: int | None = None,
    pages: int = 3,
    languages: tuple[str, ...] = tuple(),
) -> list[dict[str, Any]]:
    # Expand queries similar to app's heuristic plus generic pool
    m = (mood or "").strip().lower()
    queries: list[str] = [f"{m} horror", "horror", "scary horror", "supernatural horror", "slasher horror", "zombie horror"]

    # Collect IDs from multiple pages and types
    ids: list[str] = []
    for q in queries:
        ids += await search_ids(q, types=types, pages=pages)
    ids = list(dict.fromkeys(ids))

    # Fetch full details (with plot=full) and filter
    items = await fetch_details(ids, horror_only=True)

    chosen: list[dict[str, Any]] = []
    lang_set = set(languages)
    type_set = set(t.lower() for t in types)
    for it in items:
        if not filter_item(it, min_year=min_year, max_year=max_year, languages=lang_set or None, types=type_set or None):
            continue
        it["_score"] = popularity_score(it)
        chosen.append(it)

    if not chosen:
        return []

    chosen_sorted = sorted(chosen, key=lambda x: x.get("_score", 0.0), reverse=True)
    pool = chosen_sorted[: max(10, limit * 3)]
    if len(pool) <= limit:
        return [{k: v for k, v in m.items() if k != "_score"} for m in pool][:limit]
    sampled = sample(pool, k=limit)
    return [{k: v for k, v in m.items() if k != "_score"} for m in sampled]


In [8]:
# Example A: Year range + multi-page search (movies only)
from asyncio import Future

MOOD = "atmospheric"
LIMIT = 6

# Favor modern horror: 2005-2024, search more pages for breadth
movies = run(recommend_enhanced(
    mood=MOOD,
    limit=LIMIT,
    types=("movie",),
    min_year=2005,
    max_year=2024,
    pages=4,
    languages=("english",),
))

if hasattr(movies, "__await__") or isinstance(movies, Future):
    movies = await movies

show_results(movies)


1. Zombie Horror Fright Fest!  |  Rating: None  |  Released: 14 Aug 2012
   N/A
   Poster: https://m.media-amazon.com/images/M/MV5BNzQzNjYwMjA3OV5BMl5BanBnXkFtZTgwNTM0MDk5MzE@._V1_SX300.jpg

2. A Christmas Horror Story  |  Rating: 5.8  |  Released: 02 Oct 2015
   High school students investigate a mysterious homicide that occurred the prior holiday season, a couple notices their young son is acting strangely after a snowy forest trip to cut down a traditional ...
   Poster: https://m.media-amazon.com/images/M/MV5BODg3NTAwNzk5M15BMl5BanBnXkFtZTgwNjA4OTE3MDI@._V1_SX300.jpg

3. Untitled Horror Movie  |  Rating: 4.8  |  Released: 15 Jun 2021
   A comedy about making a horror movie. When six co-stars learn their hit TV show is about to be canceled, they decide to shoot their own film, unintentionally summoning a spirit with an affinity for vi...
   Poster: https://m.media-amazon.com/images/M/MV5BYzgyNmFmMDUtMjc5OC00OTVjLWI2NzUtMzRhNWI4MmM2MTk1XkEyXkFqcGc@._V1_SX300.jpg

4. #Horror  |  Ratin

In [9]:
# Example B: Include series in candidates
from asyncio import Future

MOOD = "paranormal haunting"
LIMIT = 6

movies = run(recommend_enhanced(
    mood=MOOD,
    limit=LIMIT,
    types=("movie", "series"),
    min_year=1990,
    pages=3,
    languages=("english",),
))

if hasattr(movies, "__await__") or isinstance(movies, Future):
    movies = await movies

show_results(movies)


1. The Amityville Horror  |  Rating: 5.9  |  Released: 15 Apr 2005
   In December 1975, George and Kathy Lutz along with their three children move into an elegant Long Island house. What they don't know is that the house was the site of a horrific mass murder a year bef...
   Poster: https://m.media-amazon.com/images/M/MV5BMzc1Njc2NDc3NV5BMl5BanBnXkFtZTYwODYyNzI3._V1_SX300.jpg

2. The Last Horror Movie  |  Rating: 5.5  |  Released: 13 May 2005
   A serial killer uses a horror video rental to lure his next victim. What begins as a teen slasher transforms into a disturbing journey through the mind of Max Parry, a mild mannered wedding photograph...
   Poster: https://m.media-amazon.com/images/M/MV5BOTI3NDcxNDM0Nl5BMl5BanBnXkFtZTgwNTA1NzAwMTE@._V1_SX300.jpg

3. Masters of Horror  |  Rating: 7.4  |  Released: 28 Oct 2005
   Anchor Bay has amassed some of the greatest horror film writers and directors to bring to you the anthology series, "Masters of Horror". For the first time, the foremos

In [16]:
# Utilities: embeddings (pretrained if available), normalization, MMR diversification
from __future__ import annotations

import math
from typing import Any, Iterable

import numpy as np

# Try to use Sentence-Transformers if available, otherwise fall back to TF-IDF
from sentence_transformers import SentenceTransformer
_SBERT_MODEL: SentenceTransformer | None = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# try:
#     from sklearn.feature_extraction.text import TfidfVectorizer
# except Exception:
#     TfidfVectorizer = None  # type: ignore


def _normalize_text(s: str | None) -> str:
    return (s or "").strip().lower()


def embed_texts(texts: list[str]) -> np.ndarray:
    """Embed a list of texts to a 2D array [n, d]. Uses SBERT if present, else TF-IDF."""
    texts_norm = [_normalize_text(t) for t in texts]
    if _SBERT_MODEL is not None:
        vecs = _SBERT_MODEL.encode(texts_norm, normalize_embeddings=True)
        return np.asarray(vecs, dtype=np.float32)
    # Fallback: TF-IDF vectors (cosine-compatible); not semantic but zero-setup
    if TfidfVectorizer is None:
        raise RuntimeError("No embedding backend available")
    tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
    X = tfidf.fit_transform(texts_norm)
    # L2-normalize rows for cosine
    X = X.astype(np.float32)
    row_norms = np.sqrt((X.multiply(X)).sum(axis=1)).A1 + 1e-9
    X_norm = X.multiply(1.0 / row_norms[:, None])
    return X_norm.toarray()


def cosine_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return cosine similarity between a (1xd or nxd) and b (mxd) -> [n or 1, m]."""
    return (a @ b.T).astype(np.float32)


def minmax(x: np.ndarray) -> np.ndarray:
    if x.size == 0:
        return x
    lo, hi = float(np.min(x)), float(np.max(x))
    if not math.isfinite(lo) or not math.isfinite(hi) or abs(hi - lo) < 1e-12:
        return np.zeros_like(x, dtype=np.float32)
    return ((x - lo) / (hi - lo)).astype(np.float32)


def mmr_select(items: list[dict[str, Any]], sims: np.ndarray, k: int, lambda_: float = 0.7) -> list[dict[str, Any]]:
    """Maximal Marginal Relevance selection from a pool.
    sims: relevance scores per item (higher is better).
    """
    n = len(items)
    if n <= k:
        return items
    selected: list[int] = []
    candidates = set(range(n))
    # pick the most relevant first
    first = int(np.argmax(sims))
    selected.append(first)
    candidates.remove(first)
    # precompute pairwise similarities using title+overview embeddings if available later; here use sims proxy
    # As a proxy for intra-item similarity, reuse a content similarity matrix if provided; otherwise 0
    # We'll compute a naive similarity by Jaccard of word sets as a light proxy
    def _item_sim(i: int, j: int) -> float:
        ti = _normalize_text(items[i].get("title")) + " " + _normalize_text(items[i].get("overview"))
        tj = _normalize_text(items[j].get("title")) + " " + _normalize_text(items[j].get("overview"))
        si, sj = set(ti.split()), set(tj.split())
        if not si or not sj:
            return 0.0
        inter = len(si & sj)
        union = len(si | sj)
        return inter / union if union else 0.0

    while len(selected) < k and candidates:
        best_cand = None
        best_score = -1e9
        for c in list(candidates):
            max_sim_to_sel = 0.0
            for s in selected:
                max_sim_to_sel = max(max_sim_to_sel, _item_sim(c, s))
            score = lambda_ * float(sims[c]) - (1.0 - lambda_) * max_sim_to_sel
            if score > best_score:
                best_score = score
                best_cand = c
        if best_cand is None:
            break
        selected.append(best_cand)
        candidates.remove(best_cand)
    return [items[i] for i in selected]


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Unified recommender: hybrid scoring + optional cross-encoder rerank + MMR diversity
from __future__ import annotations

from typing import Any, Sequence

import numpy as np

# Optional cross-encoder (commented by default to avoid heavy downloads)
try:
    from sentence_transformers import CrossEncoder  # type: ignore
    _CROSS: CrossEncoder | None = None  # lazy load if used
except Exception:
    _CROSS = None  # type: ignore


def _popularity(detail: dict[str, Any]) -> float:
    rating = float(detail.get("vote_average") or 0.0)
    votes_str = (detail.get("imdb_votes_raw") or detail.get("imdbVotes") or "0").replace(",", "")
    metascore_str = (detail.get("metascore_raw") or detail.get("Metascore") or "0")
    try:
        votes = int(votes_str)
    except Exception:
        votes = 0
    try:
        metascore = int(metascore_str)
    except Exception:
        metascore = 0
    return rating * (1 + np.log1p(votes)) + 0.02 * metascore


def _facet_align_score(mood: str, item: dict[str, Any]) -> float:
    # Lightweight proxy: keyword overlap of normalized mood and title+overview
    q = set(_normalize_text(mood).split())
    t = set((_normalize_text(item.get("title")) + " " + _normalize_text(item.get("overview"))).split())
    if not q or not t:
        return 0.0
    inter = len(q & t)
    return inter / max(1, len(q))


async def recommend_unified(
    mood: str,
    *,
    items: list[dict[str, Any]],  # input pool (e.g., from your enhanced OMDb fetch)
    limit: int = 6,
    use_cross_encoder: bool = False,
    diversity_lambda: float = 0.7,
    weights: dict[str, float] | None = None,
) -> list[dict[str, Any]]:
    global _CROSS
    
    if not items:
        return []

    # Embedding-based semantic score
    plots = [(_normalize_text(mood))] + [
        _normalize_text(m.get("overview") or "") for m in items
    ]
    try:
        embs = embed_texts(plots)  # [1+N, d]
    except Exception:
        # Fallback: zero semantic when embeddings unavailable
        embs = np.zeros((1 + len(items), 1), dtype=np.float32)
    mood_vec = embs[0:1]
    plot_vecs = embs[1:]
    sem = cosine_sim(mood_vec, plot_vecs).ravel()
    sem = minmax(sem)

    # Keyword/facet/popularity
    kw = np.array([
        _facet_align_score(mood, it) for it in items
    ], dtype=np.float32)
    kw = minmax(kw)

    pop = np.array([
        _popularity(it) for it in items
    ], dtype=np.float32)
    pop = minmax(pop)

    # Recency prior (weak) using year when available
    rec = np.zeros(len(items), dtype=np.float32)
    years: list[int | None] = []
    for it in items:
        y = it.get("year") or it.get("release_date") or ""
        try:
            y_int = int(str(y)[:4])
        except Exception:
            y_int = None
        years.append(y_int)
    valid_years = [y for y in years if isinstance(y, int)]
    if valid_years:
        y_arr = np.array([y if isinstance(y, int) else min(valid_years) for y in years], dtype=np.int32)
        rec = minmax(y_arr.astype(np.float32))

    # Blend weights
    w = {
        "semantic": 0.45,
        "keyword": 0.20,
        "popularity": 0.20,
        "recency": 0.05,
    }
    if weights:
        w.update(weights)

    blended = (
        w["semantic"] * sem
        + w["keyword"] * kw
        + w["popularity"] * pop
        + w["recency"] * rec
    ).astype(np.float32)

    order = np.argsort(-blended)
    pool_idx = order[: max(10, limit * 5)]
    pool = [items[i] for i in pool_idx]
    pool_scores = blended[pool_idx]

    # Optional cross-encoder reranking of top M (e.g., 50)
    if use_cross_encoder and _CROSS is not None:
        try:
            if _CROSS is None:
                _CROSS = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
            pairs = [(mood, (it.get("overview") or "")) for it in pool]
            ce_scores = np.asarray(_CROSS.predict(pairs), dtype=np.float32)
            # Re-blend with semantic score (small weight)
            ce_scores = minmax(ce_scores)
            pool_scores = 0.8 * ce_scores + 0.2 * minmax(pool_scores)
            reranked_idx = np.argsort(-pool_scores)
            pool = [pool[i] for i in reranked_idx]
            pool_scores = pool_scores[reranked_idx]
        except Exception:
            pass

    # MMR to select final K
    selected = mmr_select(pool, sims=pool_scores, k=limit, lambda_=diversity_lambda)
    return selected

In [20]:
# Example: Run the unified recommender on OMDb-enhanced pool
from asyncio import Future

MOOD = "claustrophobic supernatural slow-burn"
LIMIT = 6

# Reuse earlier enhanced pipeline to build a candidate pool
pool = run(recommend_enhanced(
    mood=MOOD,
    limit=60,
    types=("movie",),
    min_year=1970,
    max_year=2024,
    pages=4,
    languages=("english",),
))

if hasattr(pool, "__await__") or isinstance(pool, Future):
    pool = await pool

recs = run(recommend_unified(
    mood=MOOD,
    items=pool,
    limit=LIMIT,
    use_cross_encoder=False,  # set True to enable cross-encoder rerank (heavier)
    diversity_lambda=0.7,
))

if hasattr(recs, "__await__") or isinstance(recs, Future):
    recs = await recs

show_results(recs)


1. The Amityville Horror  |  Rating: 5.9  |  Released: 15 Apr 2005
   In December 1975, George and Kathy Lutz along with their three children move into an elegant Long Island house. What they don't know is that the house was the site of a horrific mass murder a year bef...
   Poster: https://m.media-amazon.com/images/M/MV5BMzc1Njc2NDc3NV5BMl5BanBnXkFtZTYwODYyNzI3._V1_SX300.jpg

2. Woodlands Dark and Days Bewitched: A History of Folk Horror  |  Rating: 7.6  |  Released: 03 Jun 2022
   WOODLANDS DARK AND DAYS BEWITCHED explores the folk horror phenomenon from its beginnings in a trilogy of films - Michael Reeves' Witchfinder General (1968), Piers Haggard's Blood on Satan's Claw (197...
   Poster: https://m.media-amazon.com/images/M/MV5BMTllNTQzMzktODY3YS00NjRmLTkzNDItNmRiZDMzYWIxNjg1XkEyXkFqcGc@._V1_SX300.jpg

3. My Amityville Horror  |  Rating: 5.4  |  Released: 22 Sep 2012
   For the first time in 35 years, Daniel Lutz recounts his version of the infamous Amityville haunting that terri