In [1]:
%load_ext dotenv
%dotenv

import pandas as pd
import os
import csv
import re


In [2]:
### For now only read ASCII compatible text for title and description


csv_path = os.getcwd() + '/../data/netflix_titles.csv'
# df = pd.read_csv(csv_path, encoding='latin1')

# with open("data/netflix_titles.csv", "r", encoding="utf-8", errors="ignore") as f:
#     reader = csv.DictReader(f)
#     data = [row for row in reader]


ascii_only = re.compile(r'^[\x00-\x7F]+$')

def is_ascii(s):
    return bool(s) and ascii_only.match(s)

with open(csv_path, "r", encoding="utf-8", errors="ignore") as f:
    reader = csv.DictReader(f)
    data = [
        row for row in reader
        if is_ascii(row.get("title", "").strip()) and is_ascii(row.get("description", "").strip())
    ]

print(f"Considering {len(data)} ASCII-only rows")

Considering 7370 ASCII-only rows


In [3]:
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,Unnamed: 13
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",
4,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...,


In [5]:
df.count()

show_id         7370
type            7370
title           7370
director        7370
cast            7370
country         7370
date_added      7370
release_year    7370
rating          7370
duration        7370
listed_in       7370
description     7370
                7370
dtype: int64

In [6]:

df = df.drop(columns=[''], errors='ignore')
df.to_csv("../data/netflix_titles_cleaned.csv", index=False)


In [7]:

nf_data = df.to_dict(orient="records")
nf_data[:2]

[{'show_id': 's1',
  'type': 'Movie',
  'title': 'Dick Johnson Is Dead',
  'director': 'Kirsten Johnson',
  'cast': '',
  'country': 'United States',
  'date_added': 'September 25, 2021',
  'release_year': '2020',
  'rating': 'PG-13',
  'duration': '90 min',
  'listed_in': 'Documentaries',
  'description': 'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'},
 {'show_id': 's2',
  'type': 'TV Show',
  'title': 'Blood & Water',
  'director': '',
  'cast': 'Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng',
  'country': 'South Africa',
  'date_added': 'September 24, 2021',
  'release_year': '2021',
  'rating': 'TV-MA',
  'duration': '2 S

### OpenSearch

#### Docker Compose setup for OpenSearch 

```sh
docker-compose up -d
```

In [8]:
from __future__ import annotations
import os, re
from dataclasses import dataclass
from datetime import datetime
from itertools import islice
from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple

from opensearchpy import OpenSearch, RequestsHttpConnection
from opensearchpy.helpers import streaming_bulk

# Lazy import for speed if embed=False
try:
    from sentence_transformers import SentenceTransformer  # type: ignore
except Exception:
    SentenceTransformer = None  # noqa: N816

In [9]:
# -------------------- Config --------------------

@dataclass(frozen=True)
class Cfg:
    embed_model: str = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
    embed_batch: int = int(os.getenv("EMBED_BATCH", "32"))
    index: str = os.getenv("OS_INDEX", "netflix_assets")
    vector_dim: int = int(os.getenv("OS_VECTOR_DIM", "384"))
    url: str = os.getenv("OPENSEARCH_URL", "https://localhost:9200")
    user: str = os.getenv("OS_USER", "admin")
    pwd: str = os.getenv("OS_PASS", "admin")
    verify: bool = os.getenv("OS_VERIFY", "true").lower() in ("true", "1", "yes")
    timeout: int = int(os.getenv("OS_TIMEOUT", "60"))


In [10]:
# -------------------- Small, fast utils --------------------

_MONTHS: Dict[str, int] = {m: i for i, m in enumerate(
    ("January","February","March","April","May","June","July","August","September","October","November","December"), 1
)}
_RE_DUR = re.compile(r"^\s*(\d+)\s*(min|mins?|minutes?|seasons?)\b", re.I)
_RE_WS = re.compile(r"\s+")

def _clean(s: Optional[str]) -> str:
    return s.strip() if s else ""

def _to_int(s: Optional[str]) -> Optional[int]:
    s = _clean(s)
    return int(s) if s.isdigit() else None

def _batched(it: Iterable[Any], n: int) -> Iterator[List[Any]]:
    it = iter(it)
    while True:
        chunk = list(islice(it, n))
        if not chunk: return
        yield chunk

def _split_csv(s: Optional[str]) -> List[str]:
    if not s: return []
    return [p.strip() for p in s.split(",") if p.strip()]

def to_iso_date(s: Optional[str]) -> Optional[str]:
    s = _clean(s)
    if not s: return None
    s2 = s.replace(",", "")
    try:
        return datetime.strptime(s2, "%B %d %Y").date().isoformat()
    except ValueError:
        pass
    parts = s2.split()
    if len(parts) == 3 and parts[0] in _MONTHS:
        try:
            return datetime(int(parts[2]), _MONTHS[parts[0]], int(parts[1])).date().isoformat()
        except Exception:
            return None
    return None

def parse_duration(s: Optional[str]) -> Tuple[Optional[int], Optional[int]]:
    s = _clean(s).lower()
    if not s: return (None, None)
    m = _RE_DUR.match(s)
    if not m: return (None, None)
    n, unit = int(m.group(1)), m.group(2)
    return (n, None) if unit.startswith("min") else (None, n)

def text_for_embedding(d: Dict[str, Any]) -> str:
    # avoids joins/empties
    parts = []
    a = d.get("title");        parts.append(a) if a else None
    a = d.get("description");  parts.append(a) if a else None
    a = d.get("director");     parts.append(a) if a else None
    a = d.get("cast_list");    parts.append(", ".join(a)) if a else None
    a = d.get("listed_in");    parts.append(", ".join(a)) if a else None
    a = d.get("type");         parts.append(a) if a else None           
    return " | ".join(parts)

def row_normalize(row: Dict[str, str]) -> Dict[str, Any]:
    minutes, seasons = parse_duration(row.get("duration"))
    title = _clean(row.get("title"))
    show_id = _clean(row.get("show_id"))
    release_year = _to_int(row.get("release_year"))

    listed = _split_csv(row.get("listed_in"))  

    doc = {
        "show_id": show_id,
        "type": _clean(row.get("type")),
        "type_text": _clean(row.get("type")),              
        "title": title,
        "director": _clean(row.get("director")),
        "cast": _clean(row.get("cast")),
        "cast_list": _split_csv(row.get("cast")),
        "country": _clean(row.get("country")),
        "date_added_raw": _clean(row.get("date_added")),
        "date_added": to_iso_date(row.get("date_added")),
        "release_year": release_year,
        "rating": _clean(row.get("rating")),
        "duration_raw": _clean(row.get("duration")),
        "duration_minutes": minutes,
        "seasons": seasons,
        "listed_in": listed,
        "listed_in_text": ", ".join(listed) if listed else "",
        "description": _clean(row.get("description")),
    }
    if show_id:
        doc["_id"] = show_id
    else:
        slug = _RE_WS.sub("", title.lower())
        doc["_id"] = f"{slug}_{release_year if release_year is not None else 'na'}_gen"
    return doc


def index_mapping(vector_dim: int) -> Dict[str, Any]:
    return {
        "settings": {"index": {"knn": True, "refresh_interval": "1s"}},
            "mappings": {
                "properties": {
                "show_id": {"type": "keyword"},
                "type": {"type": "keyword"},
                # text copy so "movie / tv show" appears in BM25
                "type_text": {"type": "text"},

                "title": {"type": "text", "fields": {"raw": {"type": "keyword"}}},
                "director": {"type": "text", "fields": {"raw": {"type": "keyword"}}},
                "cast": {"type": "text"},
                "cast_list": {"type": "keyword"},
                "country": {"type": "keyword"},

                "date_added": {"type": "date"},
                "date_added_raw": {"type": "keyword"},
                "release_year": {"type": "integer"},
                "rating": {"type": "keyword"},
                "duration_raw": {"type": "keyword"},
                "duration_minutes": {"type": "integer"},
                "seasons": {"type": "integer"},

                "listed_in": {"type": "keyword"},
                # text copy so "sci fi" / "action" hits BM25
                "listed_in_text": {"type": "text"},

                "description": {"type": "text"},
                "vector": {
                    "type": "knn_vector",
                    "dimension": vector_dim,
                    "method": {"name": "hnsw", "engine": "faiss", "space_type": "cosinesimil"},
                },
            }
        }
    }


In [11]:
# -------------------- OpenSearch + Embedding --------------------

def make_client(cfg: Cfg) -> OpenSearch:
    if cfg.url.startswith("https://"):
        return OpenSearch(
            cfg.url,
            http_auth=(cfg.user, cfg.pwd),
            verify_certs=cfg.verify,
            ssl_assert_hostname=cfg.verify,
            ssl_show_warn=cfg.verify,
            http_compress=True,
            connection_class=RequestsHttpConnection,
            timeout=cfg.timeout, max_retries=3, retry_on_timeout=True,
        )
    return OpenSearch(cfg.url, http_compress=True, timeout=cfg.timeout, max_retries=3, retry_on_timeout=True)

_embedder_cache: Dict[str, Any] = {}

def get_embedder(cfg: Cfg):
    if SentenceTransformer is None:
        raise RuntimeError("sentence-transformers not installed, but embed=True requested.")
    key = cfg.embed_model
    emb = _embedder_cache.get(key)
    if emb is None:
        model = SentenceTransformer(cfg.embed_model)
        # quick dim probe (no hard fail in prod unless you want fail-fast)
        try:
            v = model.encode(["dim"], normalize_embeddings=True)
            dim = v.shape[1] if hasattr(v, "shape") else len(v[0])
            if dim != cfg.vector_dim:
                raise RuntimeError(f"VECTOR_DIM={cfg.vector_dim} mismatches model dim={dim}")
        except Exception:
            pass
        _embedder_cache[key] = model
        emb = model
    return emb

def ensure_index(client: OpenSearch, cfg: Cfg) -> None:
    try:
        exists = client.indices.exists(index=cfg.index)
    except TypeError:  # old clients sig
        exists = client.indices.exists(cfg.index)
    if not (exists if isinstance(exists, bool) else exists):
        client.indices.create(index=cfg.index, body=index_mapping(cfg.vector_dim), ignore=400)
        print(f"created index {cfg.index}")
    else:
        print(f"using index {cfg.index}")
    client.indices.refresh(index=cfg.index)


# -------------------- Action stream --------------------

def actions_from_rows(
    rows: Iterable[Dict[str, str]],
    cfg: Cfg,
    embed: bool = True,
    embedder=None,
) -> Iterator[Dict[str, Any]]:
    """
    Yields bulk index actions. When embed=True, encodes in efficient batches.
    """
    if not embed:
        for r in rows:
            d = row_normalize(r)
            _id = d.get("_id")
            if not _id:
                continue
            src = {k: v for k, v in d.items() if k != "_id"}
            yield {"_op_type": "index", "_index": cfg.index, "_id": _id, **src}
        return

    model = embedder or get_embedder(cfg)
    norm = (row_normalize(r) for r in rows)

    for chunk in _batched(norm, cfg.embed_batch):
        docs = [d for d in chunk if d.get("_id")]
        if not docs:
            continue
        texts = [text_for_embedding(d) for d in docs]
        vecs = model.encode(
            texts,
            normalize_embeddings=True,
            batch_size=cfg.embed_batch,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        # Avoid numpy -> list per element call overhead by one pass
        for d, v in zip(docs, vecs.tolist() if hasattr(vecs, "tolist") else vecs):
            _id = d["_id"]
            src = {k: v2 for k, v2 in d.items() if k != "_id"}
            src["vector"] = v if isinstance(v, list) else list(v)
            yield {"_op_type": "index", "_index": cfg.index, "_id": _id, **src}



In [12]:
cfg = Cfg()
os_client = make_client(cfg)
ensure_index(os_client, cfg)

using index netflix_assets


In [13]:
from opensearchpy.helpers import streaming_bulk

success = 0
for ok, item in streaming_bulk(
    client=os_client,
    actions=actions_from_rows(data, cfg, embed=True),
    chunk_size=cfg.embed_batch,          # good default: your embed batch
    max_retries=3,
    raise_on_error=False,                 # don't raise on first bad doc
    request_timeout=cfg.timeout,
):
    if ok:
        success += 1
    else:
        # optional: log the failed item
        print("FAIL:", item)
        pass

os_client.indices.refresh(index=cfg.index)
print(f"Bulk indexed {success} documents into {cfg.index}")


Bulk indexed 7370 documents into netflix_assets


### Search capability

In [14]:
from __future__ import annotations
from functools import lru_cache
from typing import Any, Dict, Iterable, Iterator, List, Optional
from opensearchpy import OpenSearch

# ---- Config-ish bits (adjust as you like)
DEFAULT_PIPELINE = "RRF"
DEFAULT_FIELDS = ["title^3","description^2","listed_in_text^1.5","cast","director"]

# ---- Model loader (cached)
@lru_cache(maxsize=1)
def get_embed_model():
    # Keep this tiny on purpose—inject your loader here
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# ---- Hybrid search (BM25 + kNN), no filters
def hybrid_search_rrf(
    client: OpenSearch,
    index: str,
    q: str,
    *,
    size: int = 10,
    knn_k: int = 200,
    pipeline: str = DEFAULT_PIPELINE,
    model=None,
):
    # Field boosts tuned for MOVIE data
    boosted_fields = [
        "title^4",
        "cast^2.5",
        "listed_in_text^2",
        "director^1.5",
        "description^1",
        "type_text^2"
    ]

    bm25 = {
        "multi_match": {
            "query": q,
            "fields": boosted_fields,
            "type": "most_fields",
            "fuzziness": "AUTO"     # forgiving typos
        }
    }

    # Vector query (leave as-is)
    m = model or get_embed_model()
    qvec = m.encode(q, normalize_embeddings=True).tolist()
    knn = {
        "knn": {
            "vector": {
                "vector": qvec,
                "k": max(knn_k, size)
            }
        }
    }

    body = {
        "size": size,
        "query": {
            "bool": {
                "must": {"hybrid": {"queries": [bm25, knn]}}
            }
        }
    }
    return client.search(index=index, body=body, params={"search_pipeline": pipeline})



# ---- Ensure RRF pipeline exists (idempotent)
def ensure_rrf_pipeline(
    client: OpenSearch,
    name: str = DEFAULT_PIPELINE,
    rank_constant: int = 60,
    weights: Optional[List[float]] = None,
) -> None:
    body: Dict[str, Any] = {
        "description": "RRF for BM25 + kNN",
        "phase_results_processors": [
            {"score-ranker-processor": {
                "combination": {"technique": "rrf", "rank_constant": rank_constant}
            }}
        ],
    }
    if weights:
        body["phase_results_processors"][0]["score-ranker-processor"]["combination"]["parameters"] = {
            "weights": weights
        }

    try:
        client.transport.perform_request("GET", f"/_search/pipeline/{name}")
    except Exception:
        client.transport.perform_request("PUT", f"/_search/pipeline/{name}", body=body)

# ---- Tiny convenience wrapper
def asset_search(client: OpenSearch, index: str, query: str, *, size: int = 10, pipeline: str = DEFAULT_PIPELINE, display: bool = False) -> List[str]:
    res = hybrid_search_rrf(client, index, query, size=size, knn_k=200, pipeline=pipeline)
    if display:
        for i, h in enumerate(res.get("hits", {}).get("hits", [])):
            src = h["_source"]
            print(f"{i+1:2d}. {src.get('title')}  ({src.get('type')}, {src.get('release_year')})  score={h['_score']:.3f}")
    return [h["_source"].get("show_id", "") for h in res.get("hits", {}).get("hits", [])]


def full_asset_search(client: OpenSearch, index: str, query: str, *, size: int = 10, pipeline: str = DEFAULT_PIPELINE) -> List[str]:
    res = hybrid_search_rrf(client, index, query, size=size, knn_k=200, pipeline=pipeline)
    docs = []
    for h in res.get("hits", {}).get("hits", []):
        src = h["_source"]
        doc = {
            "show_id": src.get("show_id"),
            "type": src.get("type"),
            "title": src.get("title"),
            "director": src.get("director"),
            "cast": src.get("cast"),
            "country": src.get("country"),
            "date_added": src.get("date_added"),
            "release_year": src.get("release_year"),
            "rating": src.get("rating"),
            "duration": src.get("duration"),
            "listed_in": src.get("listed_in"),
            "description": src.get("description"),
        }
        docs.append(doc)
    return docs

In [15]:
# =================== Example ===================
ensure_rrf_pipeline(os_client, name="RRF", rank_constant=60)
ids = asset_search(os_client, cfg.index, "sci fi movie highly rated and questioning reality", display=True)
print(ids)


 1. The Box  (Movie, 2009)  score=0.027
 2. My Scientology Movie  (Movie, 2015)  score=0.025
 3. The Matrix  (Movie, 1999)  score=0.025
 4. Reality Z  (TV Show, 2020)  score=0.016
 5. Bright  (Movie, 2017)  score=0.016
 6. To All the Boys: Always and Forever - The Afterparty  (Movie, 2021)  score=0.016
 7. D.L. Hughley: Clear  (Movie, 2014)  score=0.016
 8. Singularity  (Movie, 2017)  score=0.016
 9. Imagine That  (Movie, 2009)  score=0.016
10. D.L. Hughley: Contrarian  (Movie, 2018)  score=0.016
['s8221', 's7546', 's8415', 's2410', 's5114', 's733', 's5379', 's8026', 's7058', 's3646']


In [16]:
ids = asset_search(os_client, cfg.index, "matrix like action movies", display=True)
print(ids)

 1. The Matrix Revolutions  (Movie, 2003)  score=0.032
 2. The Matrix Reloaded  (Movie, 2003)  score=0.031
 3. The Matrix  (Movie, 1999)  score=0.031
 4. Night Moves  (Movie, 2013)  score=0.026
 5. Scary Movie  (Movie, 2000)  score=0.025
 6. Pulp Fiction  (Movie, 1994)  score=0.024
 7. The Art of War  (Movie, 2000)  score=0.023
 8. Martin Lawrence Live: Runteldat  (Movie, 2002)  score=0.016
 9. Swearnet Live  (Movie, 2014)  score=0.016
10. Boyka: Undisputed  (Movie, 2016)  score=0.016
['s8417', 's8416', 's8415', 's7585', 's7956', 's7803', 's8197', 's7412', 's5016', 's329']


### RAG Flow Implementation


In [17]:
#### Direct LLM Query

In [18]:
from openai import OpenAI
import openai
llm_client = OpenAI()


In [19]:
def generate_response(q):
    response = llm_client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": q}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [20]:
query = "Action movies post 2020"
res = generate_response(query)
print(res)

Here are some notable action movies released after 2020:

1. **Nobody (2021)** - An action-thriller about a seemingly ordinary man who reveals his explosive past when his family is threatened.
2. **F9 (Fast & Furious 9) (2021)** - The ninth installment of the Fast & Furious franchise that continues the adventures of Dominic Toretto and his crew, featuring high-octane stunts and family drama.
3. **No Time to Die (2021)** - The 25th James Bond film, where Bond comes out of retirement to confront a mysterious villain armed with dangerous technology.
4. **Shang-Chi and the Legend of the Ten Rings (2021)** - A Marvel superhero film introducing Shang-Chi, who must confront a past he thought he left behind.
5. **Dune (2021)** - While primarily a science fiction film, it features intense action sequences and epic battles, based on Frank Herbert's novel.
6. **The Matrix Resurrections (2021)** - The long-awaited continuation of the Matrix saga, mixing action and philosophy in a new narrative.
7.

In [21]:
### RAG Implementation

In [22]:
from string import Template

entry_template = Template("""
show_id: $show_id
type: $type
title: $title
director: $director
cast: $cast
country: $country
date_added: $date_added
release_year: $release_year
rating: $rating
duration: $duration
listed_in: $listed_in
description: $description
""")

prompt_template = Template("""
You are a streaming-catalog assistant.

Return ONE JSON object matching this SCHEMA exactly (no extra keys, no prose):

SCHEMA: {
  "catalog_recommendations": [
    {
      "show_id": "string",
      "type": "string",
      "title": "string",
      "director": "string",
      "cast": ["string"],
      "country": "string",
      "date_added": "string",
      "release_year": "integer",
      "rating": "string",
      "listed_in": ["string"],
      "description": "string"
    }
  ],
  "out_of_catalog_suggestions": [
    { "title": "string", "url": "string" }
  ]
}


HARD RULES
- CONTEXT is pre-ranked (earlier = more relevant). Build "catalog_recommendations" ONLY from CONTEXT.
- Scan CONTEXT top-down:
  1) Add items that plausibly match QUERY.
  2) If fewer than MIN_CATALOG and CONTEXT still has items, keep taking the next items (even weak matches)
     until you reach MIN_CATALOG or run out of CONTEXT.
- If CONTEXT has >= MIN_CATALOG items total, you MUST return at least MIN_CATALOG in "catalog_recommendations".
- Copy fields exactly from CONTEXT; for "cast", split the comma-separated string and trim; drop empties.
- Deduplicate by (title, release_year); keep the earlier one.
- EXTERNALS: If ALLOW_EXTERNAL=true, add "out_of_catalog_suggestions" to bring TOTAL items to TOTAL_RESULTS.
  Use authoritative URLs (IMDb/Wikipedia/JustWatch). If ALLOW_EXTERNAL=false, "out_of_catalog_suggestions": [].
- Final counts:
  len(catalog_recommendations) >= min(MIN_CATALOG, number_of_items_in_CONTEXT)
  len(catalog_recommendations) + len(out_of_catalog_suggestions) == TOTAL_RESULTS (when ALLOW_EXTERNAL=true), else externals=[].

INPUTS
QUERY: $query
ALLOW_EXTERNAL: $allow_external
MIN_CATALOG: $min_catalog
TOTAL_RESULTS: $total_results

CONTEXT (ranked):
$context
""".strip())



def build_prompt(query, search_results, allow_external=True, min_catalog=5, total_results=10):
    context = ""
    for doc in search_results:
        context += entry_template.substitute(**doc) + "\n\n"
    return prompt_template.substitute(
        query=query,
        context=context,
        allow_external=str(allow_external).lower(), 
        min_catalog=min_catalog,
        total_results=total_results
    )

def llm(prompt):
    response = llm_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content

def rag(query):
    search_results = full_asset_search(os_client, cfg.index, query, size=10)
    if not search_results:
        print("WARN: No relevant results found.")

    prompt = build_prompt(query, search_results)
    response = llm(prompt)
    return response


In [23]:
docs = full_asset_search(os_client, cfg.index, "action movies post 2020")
res = build_prompt(query, docs)
# print(res)

In [24]:
query

'Action movies post 2020'

In [25]:
search_results = full_asset_search(os_client, cfg.index, query, size=5)
search_results[:2]

[{'show_id': 's1487',
  'type': 'Movie',
  'title': 'Death to 2020',
  'director': 'Al Campbell, Alice Mathias',
  'cast': 'Samuel L. Jackson, Hugh Grant, Lisa Kudrow, Kumail Nanjiani, Tracey Ullman, Samson Kayo, Leslie Jones, Diane Morgan, Cristin Milioti, Joe Keery',
  'country': 'United States',
  'date_added': '2020-12-27',
  'release_year': 2020,
  'rating': 'TV-MA',
  'duration': None,
  'listed_in': ['Comedies'],
  'description': 'As the year we all want to end finally does, take a look back at 2020\'s mad glory in this comedic retrospective from the creators of "Black Mirror."'},
 {'show_id': 's1397',
  'type': 'Movie',
  'title': 'Homefront',
  'director': 'Gary Fleder',
  'cast': 'Jason Statham, James Franco, Izabela Vidovic, Kate Bosworth, Marcus Hester, Clancy Brown, Winona Ryder, Omar Benson Miller, Rachelle Lefevre, Frank Grillo, Chuck Zito, Pruitt Taylor Vince',
  'country': 'United States',
  'date_added': '2021-01-18',
  'release_year': 2013,
  'rating': 'R',
  'durati

In [26]:
rag_res =rag(query)
print(rag_res)

{
  "catalog_recommendations": [
    {
      "show_id": "s7687",
      "type": "Movie",
      "title": "Outlawed",
      "director": "Adam Collins, Luke Radford",
      "cast": [
        "Adam Collins",
        "Jessica Norris",
        "Ian Hitchens",
        "Steven Blades",
        "Zara Phythian",
        "Anthony Burrows",
        "Andy Calderwood",
        "Emmeline Kellie",
        "Andre Squire",
        "Ollie Christie"
      ],
      "country": "United Kingdom",
      "date_added": "2019-02-15",
      "release_year": 2018,
      "rating": "TV-MA",
      "listed_in": [
        "Action & Adventure"
      ],
      "description": "After a failed mission, an ex-Royal Marines Commando tries to overcome his demons while investigating the death of his ex-girlfriend's father."
    },
    {
      "show_id": "s295",
      "type": "Movie",
      "title": "Takizawa Kabuki ZERO 2020 The Movie",
      "director": "Hideaki Takizawa",
      "cast": [
        "Hikaru Iwamoto",
        "Tatsuya

In [27]:
docs = full_asset_search(os_client, cfg.index, "matrix like action movies")
for i in docs:
    print(i)

{'show_id': 's8417', 'type': 'Movie', 'title': 'The Matrix Revolutions', 'director': 'Lilly Wachowski, Lana Wachowski', 'cast': 'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, Jada Pinkett Smith, Mary Alice, Harold Perrineau, Monica Bellucci, Harry Lennix, Lambert Wilson, Nona Gaye', 'country': 'United States', 'date_added': '2019-11-01', 'release_year': 2003, 'rating': 'R', 'duration': None, 'listed_in': ['Action & Adventure', 'Sci-Fi & Fantasy'], 'description': 'The final installment in the Matrix trilogy finds an unconscious Neo trapped in a subway station in a zone between the Matrix and the machine world.'}
{'show_id': 's8416', 'type': 'Movie', 'title': 'The Matrix Reloaded', 'director': 'Lilly Wachowski, Lana Wachowski', 'cast': 'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, Jada Pinkett Smith, Gloria Foster, Harold Perrineau, Monica Bellucci, Harry Lennix, Lambert Wilson, Randall Duk Kim, Nona Gaye', 'country': 'United States', 'date_added'

In [28]:
query

'Action movies post 2020'

In [29]:
nf_data[:10]

[{'show_id': 's1',
  'type': 'Movie',
  'title': 'Dick Johnson Is Dead',
  'director': 'Kirsten Johnson',
  'cast': '',
  'country': 'United States',
  'date_added': 'September 25, 2021',
  'release_year': '2020',
  'rating': 'PG-13',
  'duration': '90 min',
  'listed_in': 'Documentaries',
  'description': 'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'},
 {'show_id': 's2',
  'type': 'TV Show',
  'title': 'Blood & Water',
  'director': '',
  'cast': 'Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng',
  'country': 'South Africa',
  'date_added': 'September 24, 2021',
  'release_year': '2021',
  'rating': 'TV-MA',
  'duration': '2 S

In [30]:
print(rag(query))

{
  "catalog_recommendations": [
    {
      "show_id": "s7687",
      "type": "Movie",
      "title": "Outlawed",
      "director": "Adam Collins, Luke Radford",
      "cast": [
        "Adam Collins",
        "Jessica Norris",
        "Ian Hitchens",
        "Steven Blades",
        "Zara Phythian",
        "Anthony Burrows",
        "Andy Calderwood",
        "Emmeline Kellie",
        "Andre Squire",
        "Ollie Christie"
      ],
      "country": "United Kingdom",
      "date_added": "2019-02-15",
      "release_year": 2018,
      "rating": "TV-MA",
      "listed_in": [
        "Action & Adventure"
      ],
      "description": "After a failed mission, an ex-Royal Marines Commando tries to overcome his demons while investigating the death of his ex-girlfriend's father."
    },
    {
      "show_id": "s295",
      "type": "Movie",
      "title": "Takizawa Kabuki ZERO 2020 The Movie",
      "director": "Hideaki Takizawa",
      "cast": [
        "Hikaru Iwamoto",
        "Tatsuya

### Minsearch

In [31]:
import minsearch

In [32]:
nf_data[:2]

[{'show_id': 's1',
  'type': 'Movie',
  'title': 'Dick Johnson Is Dead',
  'director': 'Kirsten Johnson',
  'cast': '',
  'country': 'United States',
  'date_added': 'September 25, 2021',
  'release_year': '2020',
  'rating': 'PG-13',
  'duration': '90 min',
  'listed_in': 'Documentaries',
  'description': 'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'},
 {'show_id': 's2',
  'type': 'TV Show',
  'title': 'Blood & Water',
  'director': '',
  'cast': 'Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng',
  'country': 'South Africa',
  'date_added': 'September 24, 2021',
  'release_year': '2021',
  'rating': 'TV-MA',
  'duration': '2 S

In [33]:
index = minsearch.Index(
    text_fields=["type", "title", "director", "cast", "country", "date_added", "release_year", "rating", "duration", "listed_in", "description"],
    keyword_fields=["title", "director", "cast", "country", "listed_in", "release_year"]
)


index.fit(nf_data)

<minsearch.minsearch.Index at 0x13d57c690>

In [34]:
def min_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [35]:
min_search(query)

[{'show_id': 's1903',
  'type': 'Movie',
  'title': 'All Because of You',
  'director': 'Adrian Teh',
  'cast': 'Hairul Azreen, Janna Nick, Amerul Affendi, Henley Hii, Nam Ron, Theebaan Govindasamy, Taufiq Hanafi, Hafizul Kamal, Josiah Hogan, Sophia Albarakbah, Sugeeta Chandran, Anna Jobling',
  'country': 'Malaysia',
  'date_added': 'October 1, 2020',
  'release_year': '2020',
  'rating': 'TV-PG',
  'duration': '102 min',
  'listed_in': 'Action & Adventure, Comedies, International Movies',
  'description': 'After falling for a guest, an unsuspecting hotel staff becomes embroiled in a hostage scheme and discovers true love in an unlikely place.'},
 {'show_id': 's2177',
  'type': 'Movie',
  'title': 'Rogue Warfare: The Hunt',
  'director': 'Mike Gunther',
  'cast': 'Will Yun Lee, Jermaine Love, Rory Markham, Bertrand-Xavier Corbi, Katie Keene, Fernando Chien, Gina DeCesare, Michael Blalock, Chris Mulkey, Stephen Lang',
  'country': 'United States',
  'date_added': 'August 1, 2020',
  'r

In [36]:


def build_prompt(query, search_results):
    prompt_template = """
You're a movie tv-show content catalog assistant. Answer the QUERY based on the CONTEXT from the catalog.
Use only the facts from the CONTEXT when answering the QUERY.

QUERY: {query}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
               context += entry_template.substitute(**doc) + "\n\n"

    prompt = prompt_template.format(query=query, context=context).strip()
    return prompt


def min_rag(query):
    search_results = min_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer


In [37]:
query

'Action movies post 2020'

In [38]:
res = min_rag(query)
print(res)

Here are the action movies released post-2020 based on the provided context:

- None. The action movies listed in the context all have a release year of 2020.


In [39]:
sr = min_search(query)
for s in sr:
    print(s)

{'show_id': 's1903', 'type': 'Movie', 'title': 'All Because of You', 'director': 'Adrian Teh', 'cast': 'Hairul Azreen, Janna Nick, Amerul Affendi, Henley Hii, Nam Ron, Theebaan Govindasamy, Taufiq Hanafi, Hafizul Kamal, Josiah Hogan, Sophia Albarakbah, Sugeeta Chandran, Anna Jobling', 'country': 'Malaysia', 'date_added': 'October 1, 2020', 'release_year': '2020', 'rating': 'TV-PG', 'duration': '102 min', 'listed_in': 'Action & Adventure, Comedies, International Movies', 'description': 'After falling for a guest, an unsuspecting hotel staff becomes embroiled in a hostage scheme and discovers true love in an unlikely place.'}
{'show_id': 's2177', 'type': 'Movie', 'title': 'Rogue Warfare: The Hunt', 'director': 'Mike Gunther', 'cast': 'Will Yun Lee, Jermaine Love, Rory Markham, Bertrand-Xavier Corbi, Katie Keene, Fernando Chien, Gina DeCesare, Michael Blalock, Chris Mulkey, Stephen Lang', 'country': 'United States', 'date_added': 'August 1, 2020', 'release_year': '2020', 'rating': 'R', 'd

### Ground Truth Data Generation

In [40]:
# First try
gt_prompt_template = """
You emulate a netflix streaming service customer who is looking for an interesting movie or tv-show.
Formulate 5 questions that a customer might ask based on a Netflix database record.
The record should contain the asset title (movie or tv-show) to the user query phrases, and the asset search phrases can be long or short.
If possible, use as few words as possible from the title or movie description.

The Movie/TV-Show record:

title: {title}
description: {description}
listed_in: {listed_in}
cast: {cast}
country: {country}
director: {director}
asset: {title}
rating: {rating}

Provide the output in parsable JSON format without using code blocks:

["question 1", "question 2", "question 3", "question 4", "question 5"]
""".strip()

In [41]:
# Refined based on results

gt_prompt_template = Template("""
You are emulating a Netflix customer searching for something to watch.
Given ONE movie/TV-show record, produce EXACTLY 5 diverse search queries that a human might type to find this title.

Record:

show_id: $show_id
type: $type
title: $title
director: $director
cast: $cast
country: $country
date_added: $date_added
release_year: $release_year
rating: $rating
duration: $duration
listed_in: $listed_in
description: $description


HARD CONSTRAINTS
- Output: return ONLY a valid JSON array of 5 strings (no prose, no code fences).
- Length targets (in order): 
  1st = 1 word; 2nd = 2 to 3 words; 3rd = 4 to 5 words; 4th = 6 to 7 words; 5th = 8 to 10 words.
  Do not exceed the upper bound for any item. Prefer concise synonyms over padding.
- Distinct intent for EACH query (choose 5 different ones): 
  {genre/subgenre} · {mood/tone} · {cast/creator} · {storyline/theme/plot element} · {setting/location/language} · {age rating/parental guidance} · {audience/kids/family/teens} · {release period/era} · {format: movie vs series, limited series, miniseries}
- Title/description usage:
  • Do NOT use the full title or copy full phrases from the description.
  • At most TWO non-consecutive words from the title across the entire set.
- Use ONLY information present in the record. Do NOT invent other titles, brands, or proper nouns not in the record.
- Natural query style as typed in a streaming app; no filler like “please” or “can you”.
- No punctuation except spaces and hyphens. No quotes or trailing periods.
- Type-aware phrasing: if type == "Movie", you may include “movie” in at most one query; if type == "TV Show", you may include “series/show” in at most one query.
- Avoid repeating the same key noun/adjective across different queries; vary vocabulary.


Output exactly in this JSON format:
[
  "query_1 (exactly 1 word)",
  "query_2 (exactly 2 to 3 words)",
  "query_3 (exactly 4 to 5 words)",
  "query_4 (exactly 6 to 7 words)",
  "query_5 (exactly 8 to 10 words)"
]

""".strip())



In [42]:
from openai import OpenAI
client = OpenAI()

In [43]:
import random

random.seed(42)
snf = random.sample(nf_data, 1)
snf

[{'show_id': 's6422',
  'type': 'Movie',
  'title': 'Caregiver',
  'director': 'Chito S. Roño',
  'cast': 'Sharon Cuneta, John Estrada, John Manalo, Rica Peralejo, Jhong Hilario, Saul Reichlin, Matthew Rutherford, Claire Jeater, Makisig Morales, Mickey Ferriols',
  'country': 'Philippines',
  'date_added': 'March 5, 2019',
  'release_year': '2008',
  'rating': 'TV-14',
  'duration': '131 min',
  'listed_in': 'Dramas, International Movies',
  'description': 'Sarah leaves her son in the Philippines to reunite with her husband in London, where she struggles personally and professionally as a care-home worker.'}]

In [44]:
from tqdm.auto import tqdm
import json

def gt_generate_questions(doc):
    gt_prompt = gt_prompt_template.substitute(**doc)

    response = llm_client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": gt_prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response


# results = {}

# sampled_nf_data = random.sample(nf_data, 1000)

# for doc in tqdm(sampled_nf_data):
#     doc_id = doc['show_id']
#     if doc_id in results:
#         continue
#     model_out = gt_generate_questions(doc)
#     questions = json.loads(model_out)
#     results[doc_id] = questions

In [45]:

# for doc in tqdm(sampled_nf_data):
#     doc_id = doc['show_id']
#     if doc_id in results:
#         continue
#     model_out = gt_generate_questions(doc)
#     questions = json.loads(model_out)
#     results[doc_id] = questions

In [46]:
# with open('results.json', 'w') as f:
#     json.dump(results, f, indent=2)

In [47]:
import json
ground_truth = []

with open('results.json', 'r') as f:
    results = json.load(f)


for s_id, qs in results.items():
    for q in qs:
        ground_truth.append({'query': q, 'doc_id': s_id})

In [48]:
ground_truth[:15]

[{'query': 'documentary', 'doc_id': 's1426'},
 {'query': 'chess movie', 'doc_id': 's1426'},
 {'query': 'story about a chess prodigy', 'doc_id': 's1426'},
 {'query': "making of The Queen's Gambit documentary", 'doc_id': 's1426'},
 {'query': 'behind the scenes of a chess film', 'doc_id': 's1426'},
 {'query': 'Goth', 'doc_id': 's7321'},
 {'query': 'independent movie', 'doc_id': 's7321'},
 {'query': 'film about faith and family', 'doc_id': 's7321'},
 {'query': "comedy drama about a nun's journey", 'doc_id': 's7321'},
 {'query': 'movies featuring characters returning home from war',
  'doc_id': 's7321'},
 {'query': 'thrillers', 'doc_id': 's1047'},
 {'query': 'stalker movies', 'doc_id': 's1047'},
 {'query': 'movies with Amber Midthunder', 'doc_id': 's1047'},
 {'query': 'small-town girl escaping an obsessive relationship',
  'doc_id': 's1047'},
 {'query': 'movies added in April 2021', 'doc_id': 's1047'}]

In [49]:
# Evaluation metric functions

def min_search_ids(query) -> list[str]:
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return [r['show_id'] for r in results]

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


# Evaluate

def calculate_metrics(ground_truth, search_func=min_search_ids):
    relevance_total = []
    for i in tqdm(ground_truth):
        query = i['query']
        doc_id = i['doc_id']
        s_ids = search_func(query)
        relevance = [doc_id == s_id for s_id in s_ids]
        relevance_total.append(relevance)

    mrr_score = mrr(relevance_total)
    hit_rate_score = hit_rate(relevance_total)

    return {
        "mrr": mrr_score,
        "hit_rate": hit_rate_score
    }

In [50]:
calculate_metrics(ground_truth) 

  0%|          | 0/5000 [00:00<?, ?it/s]

{'mrr': 0.12476007936507942, 'hit_rate': 0.2382}

In [51]:
def os_search(query):
    # Implement the search function using the OpenSearch API
    return asset_search(os_client, cfg.index, query)    


In [52]:
## Older Eval metrixs for OpenSearch
#1 {'mrr': 0.22870531746031753, 'hit_rate': 0.3584}
#2 {'mrr': 0.16949936507936533, 'hit_rate': 0.3234}

In [53]:
calculate_metrics(ground_truth, os_search) 

  0%|          | 0/5000 [00:00<?, ?it/s]

{'mrr': 0.16074095238095254, 'hit_rate': 0.3126}

In [54]:
# Optimization

# ---- Hybrid search (BM25 + kNN), tuned for MOVIE data
def hybrid_search_rrf_2(
    client: OpenSearch,
    index: str,
    q: str,
    *,
    size: int = 10,
    knn_k: int = 250,  # slight bump
    pipeline: str = DEFAULT_PIPELINE,
    model=None,
):
    # ultra-light normalization (optional)
    q = q.lower().replace("sci-fi", "sci fi").strip()

    # BM25 tuned: phrase + cross_fields; no description, no type_text
    bm25 = {
        "bool": {
            "should": [
                {
                    "multi_match": {
                        "query": q,
                        "type": "phrase",
                        "fields": ["title^8", "cast^5"],
                        "slop": 1
                    }
                },
                {
                    "multi_match": {
                        "query": q,
                        "type": "cross_fields",
                        "fields": ["title^6", "cast^4", "listed_in_text^3", "director^2"],
                        "operator": "AND",
                        "minimum_should_match": "2<-25%"
                    }
                },
                # optional softener for longer queries:
                {
                    "multi_match": {
                        "query": q,
                        "fields": ["title^2", "cast^1.5"],
                        "fuzziness": 1,
                        "prefix_length": 1
                    }
                }
            ],
            "minimum_should_match": 1
        }
    }

    # Vector query (unchanged)
    m = model or get_embed_model()
    qvec = m.encode(q, normalize_embeddings=True).tolist()
    knn = {
        "knn": {
            "vector": {
                "vector": qvec,
                "k": max(knn_k, size)
            }
        }
    }

    body = {
        "size": size,
        "query": {
            "bool": {
                "must": {"hybrid": {"queries": [bm25, knn]}}
            }
        }
    }
    return client.search(index=index, body=body, params={"search_pipeline": pipeline})


In [55]:
def asset_search_2(client: OpenSearch, index: str, query: str, *, size: int = 10, pipeline: str = DEFAULT_PIPELINE, display: bool = False) -> List[str]:
    res = hybrid_search_rrf_2(client, index, query, size=size, knn_k=200, pipeline=pipeline)
    if display:
        for i, h in enumerate(res.get("hits", {}).get("hits", [])):
            src = h["_source"]
            print(f"{i+1:2d}. {src.get('title')}  ({src.get('type')}, {src.get('release_year')})  score={h['_score']:.3f}")
    return [h["_source"].get("show_id", "") for h in res.get("hits", {}).get("hits", [])]


def os_search_2(query):
    # Implement the search function using the OpenSearch API
    return asset_search_2(os_client, cfg.index, query)    

In [56]:
calculate_metrics(ground_truth, os_search_2) 

  0%|          | 0/5000 [00:00<?, ?it/s]

{'mrr': 0.15720706349206368, 'hit_rate': 0.3312}

In [57]:
import re

def hybrid_search_rrf_3(
    client: OpenSearch,
    index: str,
    q: str,
    *,
    size: int = 10,
    knn_k: int = 150,           # modest k to limit vector noise
    pipeline: str = DEFAULT_PIPELINE,
    model=None,
):
    # ---- ultra-light, generic query prep (no parsing, no filters)
    qn = q.lower().strip().replace("sci-fi", "sci fi")

    # tiny, generic expansions that help many queries w/o being domain-specific
    EXPAND = {
        "kids": "children family preschool toddler",
        "family": "kids children",
        "romantic": "romance love romcom",
        "animals": "pets cats dogs kittens puppies",
        "mature": "adult",
        "comedy": "funny",
        "horror": "scary",
        "thriller": "suspense"
    }
    for k, extra in EXPAND.items():
        if k in qn:
            qn = f"{qn} {extra}"

    has_movie = any(t in qn for t in (" movie", " film "))
    has_tv    = any(t in qn for t in (" tv ", " series", " tv show"))

    # soft year nudge (no filter)
    m = re.search(r"\b(19|20)\d{2}\b", qn)
    yr = int(m.group(0)) if m else None

    # ---- BM25 side: use multiple intent buckets and let dis_max pick the best
    # Avoid free-form description matches dominating; only allow phrase in desc
    bm25_queries = [
        # Names intent: title/cast phrase (very precise)
        {"multi_match": {
            "query": qn, "type": "phrase",
            "fields": ["title^12", "cast^8"],
            "slop": 1
        }},
        # Names intent: best_fields AND for multi-token coherence
        {"multi_match": {
            "query": qn, "type": "best_fields",
            "fields": ["title^10", "cast^7", "director^3"],
            "operator": "AND", "minimum_should_match": "2<-25%"
        }},
        # Genre/audience intent: listed_in_text only (controlled)
        {"multi_match": {
            "query": qn, "type": "best_fields",
            "fields": ["listed_in_text^6"],
            "operator": "OR"
        }},
        # Theme/phrase intent: description (phrase only to reduce noise)
        {"match_phrase": {"description": {"query": qn, "slop": 2, "boost": 1.2}}},
        # Prefix support: helps short cues like "austen", "book club"
        {"match_bool_prefix": {"title": {"query": qn, "boost": 2.0}}}
    ]

    # tiny type nudge if user literally says movie/series (not a filter)
    if has_movie:
        bm25_queries.append({"term": {"type": "Movie"}})
    if has_tv:
        bm25_queries.append({"term": {"type": "TV Show"}})

    bm25_core = {"dis_max": {"tie_breaker": 0.15, "queries": bm25_queries}}

    # gentle year preference if a 4-digit year is present
    bm25 = (
        {
            "function_score": {
                "query": bm25_core,
                "boost_mode": "multiply",
                "score_mode": "sum",
                "functions": [{
                    "gauss": {"release_year": {"origin": yr, "scale": 4, "decay": 0.6}}
                }]
            }
        } if yr else bm25_core
    )

    # ---- Vector side (unchanged model)
    m = model or get_embed_model()
    qvec = m.encode(q, normalize_embeddings=True).tolist()
    knn = {"knn": {"vector": {"vector": qvec, "k": max(knn_k, size)}}}

    # ---- Final query (no filters)
    body = {
        "size": size,
        "query": {"bool": {"must": {"hybrid": {"queries": [bm25, knn]}}}}
    }
    return client.search(index=index, body=body, params={"search_pipeline": pipeline})


In [58]:
def asset_search_3(client: OpenSearch, index: str, query: str, *, size: int = 10, pipeline: str = DEFAULT_PIPELINE, display: bool = False) -> List[str]:
    res = hybrid_search_rrf_3(client, index, query, size=size, knn_k=200, pipeline=pipeline)
    if display:
        for i, h in enumerate(res.get("hits", {}).get("hits", [])):
            src = h["_source"]
            print(f"{i+1:2d}. {src.get('title')}  ({src.get('type')}, {src.get('release_year')})  score={h['_score']:.3f}")
    return [h["_source"].get("show_id", "") for h in res.get("hits", {}).get("hits", [])]


def os_search_3(query):
    # Implement the search function using the OpenSearch API
    return asset_search_3(os_client, cfg.index, query)  

In [59]:
calculate_metrics(ground_truth, os_search_3) 

  0%|          | 0/5000 [00:00<?, ?it/s]

{'mrr': 0.13141166666666676, 'hit_rate': 0.2814}

In [60]:
##

In [61]:
RET = {
    # Retrieval model tuned for short phrase queries; 384-dim, fast
    "embedder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",

    # Candidate generation
    "bm25_size": 200,    # candidate pool from BM25
    "top_k": 10,         # final results to return

    # Re-rank blend: final_score = alpha * bm25_norm + (1 - alpha) * cosine
    "alpha": 0.65,       # start here; sweep 0.5–0.8 on your eval

    # BM25 buckets (dis_max) — generic, safe
    "bm25": {
        "tie_breaker": 0.1,
        "boosts": {
            "title_phrase": 12.0,
            "cast_phrase":   8.0,
            "title_best":   10.0,
            "cast_best":     7.0,
            "director":      2.5,
            "listed_in":     9.0,   # genres/audience (biggest ROI)
            "country":       4.0,   # “German/Korean series”
            "rating":        2.5,   # “mature/TV-MA”
            "type":          1.5,   # tiny nudge
            "desc_phrase":   1.05,  # keep description constrained
        }
    },

    # Tiny, generic expansions (LLM-free)
    "expand": True
}


In [62]:
def add_text_copies_mapping(client, index):
    body = {
      "properties": {
        "listed_in_text": {"type": "text"},
        "country_text":   {"type": "text"},
        "rating_text":    {"type": "text"},
        "type_text":      {"type": "text"}
      }
    }
    client.indices.put_mapping(index=index, body=body, ignore=400)



In [63]:
import re
from datetime import datetime

def _clean(s):
    if s is None: return None
    s2 = str(s).strip()
    return s2 or None

def _split_csv(s):
    s = _clean(s)
    return [p.strip() for p in s.split(",")] if s else []

def row_normalize(row: dict) -> dict:
    listed = _split_csv(row.get("listed_in"))
    cast_list = _split_csv(row.get("cast"))

    # release_year int
    ry = _clean(row.get("release_year"))
    release_year = int(ry) if ry and ry.isdigit() else None

    doc = {
        "show_id":     _clean(row.get("show_id")),
        "type":        _clean(row.get("type")),
        "type_text":   _clean(row.get("type")),
        "title":       _clean(row.get("title")),
        "director":    _clean(row.get("director")),
        "cast":        _clean(row.get("cast")),
        "cast_list":   cast_list,
        "country":     _clean(row.get("country")),
        "country_text": _clean(row.get("country")),
        "date_added":  _clean(row.get("date_added")),  # keep as-is unless you need range filters
        "release_year": release_year,
        "rating":      _clean(row.get("rating")),
        "rating_text": _clean(row.get("rating")),
        "duration":    _clean(row.get("duration")),
        "listed_in":   listed,
        "listed_in_text": ", ".join(listed) if listed else None,
        "description": _clean(row.get("description")),
    }
    # drop empties for cleaner index
    doc = {k: v for k, v in doc.items() if v not in (None, "", [], {})}
    doc["listed_in_text"] = ", ".join(doc["listed_in"]) if doc.get("listed_in") else None
    doc["country_text"]   = doc.get("country")
    doc["rating_text"]    = doc.get("rating")
    doc["type_text"]      = doc.get("type")
    doc["_id"] = doc.get("show_id") or (re.sub(r"\s+", "", (doc.get("title") or "").lower()) + "_gen")
    return doc

def text_for_embedding(d):
    parts = []
    for k in ("title","description","director","type","country","rating"):
        v = d.get(k); parts.append(v) if v else None
    if d.get("cast_list"): parts.append(", ".join(d["cast_list"]))
    if d.get("listed_in"): parts.append(", ".join(d["listed_in"]))
    return " | ".join(parts)


In [68]:
from functools import lru_cache

@lru_cache(maxsize=1)
def get_embed_model(name: str):
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer(name)

def _normalize_expand(q: str, enable=True) -> str:
    qn = q.lower().strip().replace("sci-fi", "sci fi").replace("&", " and ")
    if not enable: return qn
    adds = []
    if "kids" in qn or "family" in qn: adds += ["children", "preschool", "toddler", "family"]
    if any(t in qn for t in ("romantic","romance","romcom")): adds += ["love", "relationships", "romcom"]
    if any(t in qn for t in ("mature","adult")): adds += ["tv ma", "r", "mature"]
    if any(t in qn for t in ("series"," tv "," tv-"," tv_"," show")): adds += ["tv show", "series"]
    for hint, country in [("german","germany"), ("korean","south korea"), ("japanese","japan"),
                          ("french","france"), ("spanish","spain"), ("italian","italy")]:
        if hint in qn: adds.append(country)
    return qn + (" " + " ".join(adds) if adds else "")

def bm25_query(qn: str, boosts: dict, tie: float) -> dict:
    queries = [
        {"multi_match": {"query": qn, "type": "phrase",
                         "fields": [f"title^{boosts['title_phrase']}", f"cast^{boosts['cast_phrase']}"], "slop": 1}},
        {"multi_match": {"query": qn, "type": "best_fields",
                         "fields": [f"title^{boosts['title_best']}", f"cast^{boosts['cast_best']}", f"director^{boosts['director']}"],
                         "operator": "AND", "minimum_should_match": "2<-25%"}},
        {"multi_match": {"query": qn, "type": "best_fields",
                         "fields": [f"listed_in_text^{boosts['listed_in']}"], "operator": "OR"}},
        {"multi_match": {"query": qn, "type": "best_fields",
                         "fields": [f"country_text^{boosts['country']}", f"rating_text^{boosts['rating']}", f"type_text^{boosts['type']}"]}},
        {"match_bool_prefix": {"title": {"query": qn, "boost": 2.0}}},
        {"match_bool_prefix": {"cast":  {"query": qn, "boost": 1.5}}},
        {"match_phrase": {"description": {"query": qn, "slop": 2, "boost": boosts["desc_phrase"]}}}
    ]
    return {"dis_max": {"tie_breaker": tie, "queries": queries}}

def bm25_candidates(client, index, q, *, cfg=RET):
    qn = _normalize_expand(q, cfg["expand"])
    bq = bm25_query(qn, cfg["bm25"]["boosts"], cfg["bm25"]["tie_breaker"])
    body = {
        "size": cfg["bm25_size"],
        "_source": ["show_id", "title", "type", "release_year", "vector"],  # include vector for re-rank
        "query": bq
    }
    return client.search(index=index, body=body)


In [69]:
import math

def _minmax(scores):
    if not scores: return ([], 0.0, 1.0)
    lo, hi = min(scores), max(scores)
    if math.isclose(hi, lo):  # avoid divide-by-zero
        return ([1.0]*len(scores), lo, hi)
    return ([(s - lo) / (hi - lo) for s in scores], lo, hi)

def rerank_with_vectors(res, qvec, *, alpha=0.65, top_k=10):
    hits = res.get("hits", {}).get("hits", [])
    if not hits: return []

    bm25_scores = [h.get("_score", 0.0) for h in hits]
    bm25_norm, _, _ = _minmax(bm25_scores)

    ranked = []
    for h, bm25_n in zip(hits, bm25_norm):
        src = h.get("_source", {})
        v = src.get("vector")
        if not v:  # safety
            final = bm25_n * alpha
        else:
            # qvec and v are already L2-normalized → dot = cosine
            # If not normalized, compute cosine(qvec,v) = dot(qvec,v) / (||q||*||v||)
            dot = sum(a*b for a,b in zip(qvec, v))
            final = alpha * bm25_n + (1.0 - alpha) * dot
        ranked.append((final, h))

    ranked.sort(key=lambda x: x[0], reverse=True)
    return [h for _, h in ranked[:top_k]]

def retrieve(client, index, q, *, cfg=RET, model=None):
    res = bm25_candidates(client, index, q, cfg=cfg)
    m = model or get_embed_model(cfg["embedder"])
    qvec = m.encode(q, normalize_embeddings=True).tolist()
    top = rerank_with_vectors(res, qvec, alpha=cfg["alpha"], top_k=cfg["top_k"])
    return top


In [70]:
def hits_ids(hits): return [h["_source"].get("show_id","") for h in hits]

def hit_rate(ids, gold, k=10): return 1.0 if gold in ids[:k] else 0.0
def mrr(ids, gold, k=10):
    for i,x in enumerate(ids[:k], 1):
        if x == gold: return 1.0 / i
    return 0.0

def evaluate(client, index, qid_to_queries: dict, *, cfg=RET):
    total_hr = total_mrr = n = 0
    for gold, qs in qid_to_queries.items():
        for q in qs:
            hits = retrieve(client, index, q, cfg=cfg)
            ids = hits_ids(hits)
            total_hr  += hit_rate(ids, gold, cfg["top_k"])
            total_mrr += mrr(ids, gold, cfg["top_k"])
            n += 1
    return {"hit_rate": total_hr / n, "mrr": total_mrr / n}

def sweep_three_knobs(client, index, qid_to_queries, base=RET):
    best = None
    for alpha in [0.5, 0.6, 0.65, 0.7, 0.8]:
        for bm25_size in [100, 200, 300]:
            for listed_boost in [8.0, 9.0, 10.0]:
                cfg = {**base,
                       "alpha": alpha,
                       "bm25_size": bm25_size,
                       "bm25": {**base["bm25"],
                                "boosts": {**base["bm25"]["boosts"], "listed_in": listed_boost}}}
                metrics = evaluate(client, index, qid_to_queries, cfg=cfg)
                if not best or metrics["mrr"] > best[0]["mrr"]:
                    best = (metrics, cfg)
    return best


In [71]:
# sample 100 from a dict

sampled_results = random.sample(list(results.items()), 10)
len(sampled_results)
sweep_three_knobs(os_client, cfg.index, dict(sampled_results))

({'hit_rate': 0.18, 'mrr': 0.09233333333333334},
 {'embedder': 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
  'bm25_size': 300,
  'top_k': 10,
  'alpha': 0.5,
  'bm25': {'tie_breaker': 0.1,
   'boosts': {'title_phrase': 12.0,
    'cast_phrase': 8.0,
    'title_best': 10.0,
    'cast_best': 7.0,
    'director': 2.5,
    'listed_in': 8.0,
    'country': 4.0,
    'rating': 2.5,
    'type': 1.5,
    'desc_phrase': 1.05}},
  'expand': True})

In [None]:
### OLd
#1   ({'hit_rate': 0.4, 'mrr': 0.17769047619047618},
