<div align="center">

# **DELIVERY 3**
## **Ranking & Filtering**

</div>

---

### **Main Code**

In [5]:
import json
import math
from collections import defaultdict, Counter
from pathlib import Path
import sys
from typing import Dict, List, Any, Iterable, Tuple

# Locate the repository root relative to the directory the notebook runs from.
NOTEBOOK_DIR = Path().resolve()  # absolute path of the current working directory
# When the working directory is named part_1 / part_2 / part_3 the root is taken
# two levels up; otherwise the working directory itself is treated as the root.
REPO_ROOT = NOTEBOOK_DIR.parents[1] if NOTEBOOK_DIR.name in {"part_1", "part_2", "part_3"} else NOTEBOOK_DIR
DATA_DIR = REPO_ROOT / "data"
INDEX_DIR = DATA_DIR / "index"

# Put <REPO_ROOT>/project_progress on the import path before importing from `utils`
# (presumably the `utils` package lives in that folder - confirm against the repo layout).
sys.path.append(str(REPO_ROOT / "project_progress"))
from utils.preprocessing import preprocess_text_field  # used by _query_tokens to tokenize queries

# Input artifacts read by this notebook.
ENRICHED_PATH = DATA_DIR / "fashion_products_dataset_enriched.json"
INVERTED_PATH = INDEX_DIR / "boolean_inverted_index.json"
DOCMAP_PATH = INDEX_DIR / "docid_pid_map.json"

print(f"Using enriched dataset: {ENRICHED_PATH}")
print(f"Using boolean index:    {INVERTED_PATH}")

# Load data
# Fail early, reporting the offending path, when a required input is missing.
# NOTE(review): DOCMAP_PATH gets no explicit check here; a missing map file only
# surfaces as the error raised by read_text() a few lines below.
if not ENRICHED_PATH.exists():
    raise FileNotFoundError(ENRICHED_PATH)
if not INVERTED_PATH.exists():
    raise FileNotFoundError(INVERTED_PATH)

# Document records; a document's doc_id is its position in this list (see the enumerate() over docs below).
docs: List[Dict[str, Any]] = json.loads(ENRICHED_PATH.read_text(encoding="utf-8"))
# term -> postings list of doc_ids.
inverted_index: Dict[str, List[int]] = json.loads(INVERTED_PATH.read_text(encoding="utf-8"))
# Only the "docid_to_pid" entry of the map file is kept; the search functions look it up with str(doc_id).
docid_to_pid = json.loads(DOCMAP_PATH.read_text(encoding="utf-8"))["docid_to_pid"]

N_DOCS = len(docs)
print(f"Loaded {N_DOCS} documents")

# Text fields used for indexing / ranking (same as Part 2)
INDEXED_TEXT_FIELDS = ["title_clean", "description_clean", "metadata_clean"]

# Record fields copied into every search result (only those actually present in the record).
REQUIRED_OUTPUT_FIELDS = [
    "pid", "title", "description", "brand", "category", "sub_category",
    "product_details", "seller", "out_of_stock", "selling_price", "discount",
    "actual_price", "average_rating", "url"
]


def _doc_tokens(record: Dict[str, Any], fields: Iterable[str]) -> List[str]:
    toks: List[str] = []
    for f in fields:
        val = record.get(f)
        if not val:
            continue
        toks.extend(str(val).split())
    return toks

def _query_tokens(q: str) -> List[str]:
    """Run ``preprocess_text_field`` on the query and return the "tokens" entry of its result.

    A falsy query (None or "") is passed to the preprocessor as the empty string.
    """
    text = q if q else ""
    return preprocess_text_field(text)["tokens"]

def _intersect_sorted(a: List[int], b: List[int]) -> List[int]:
    i = j = 0
    out: List[int] = []
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            out.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return out

def _candidate_docs(q_terms: List[str]) -> List[int]:
    """
    Conjunctive (AND) retrieval over the boolean index.

    Returns the doc_ids whose postings contain every distinct query term, or
    an empty list when the query is empty or any term has no postings.
    Postings are intersected shortest-first. When the query has a single
    distinct term, the returned list is the index's own postings list, not a
    copy. Postings lists are presumably sorted ascending, as the merge in
    _intersect_sorted expects - confirm against the index builder.
    """
    if not q_terms:
        return []

    postings_per_term = [inverted_index.get(term) for term in set(q_terms)]
    if not all(postings_per_term):
        # A term that is missing from the index (or has empty postings) rules out every document.
        return []

    shortest_first = iter(sorted(postings_per_term, key=len))
    survivors = next(shortest_first)
    for postings in shortest_first:
        survivors = _intersect_sorted(survivors, postings)
        if not survivors:
            return survivors
    return survivors

# Corpus statistics shared by the rankers below (TF-IDF cosine and BM25).

# df(t): length of each term's postings list in the boolean index.
term_df: Dict[str, int] = {term: len(postings) for term, postings in inverted_index.items()}

# Raw term frequencies per document (doc_id = position in `docs`) and document lengths in tokens.
doc_tf: Dict[int, Dict[str, int]] = {
    doc_id: dict(Counter(_doc_tokens(rec, INDEXED_TEXT_FIELDS)))
    for doc_id, rec in enumerate(docs)
}
doc_len: Dict[int, int] = {doc_id: sum(counts.values()) for doc_id, counts in doc_tf.items()}

avg_doc_len = sum(doc_len.values()) / max(N_DOCS, 1)  # max() guards against an empty corpus

# TF-IDF idf: log2(N / df); a term with df == 0 gets idf 0.
idf_tfidf: Dict[str, float] = {
    term: (math.log2(N_DOCS / df) if df > 0 else 0.0)
    for term, df in term_df.items()
}

# Per-document TF-IDF vectors, weight = (1 + log2(tf)) * idf, and their Euclidean norms.
tfidf_weights: Dict[int, Dict[str, float]] = {}
doc_norms: Dict[int, float] = {}

for doc_id, counts in doc_tf.items():
    weighted = (
        (term, (1.0 + math.log2(freq)) * idf_tfidf.get(term, 0.0))
        for term, freq in counts.items()
        if freq > 0
    )
    # Zero weights (idf 0, or term unknown to the index) are not stored.
    weights: Dict[str, float] = {term: weight for term, weight in weighted if weight != 0.0}

    # Accumulate with a plain loop so the summation order is explicit.
    squared_norm = 0.0
    for weight in weights.values():
        squared_norm += weight * weight

    tfidf_weights[doc_id] = weights
    doc_norms[doc_id] = math.sqrt(squared_norm) if squared_norm > 0 else 0.0

# BM25 stats
# idf formula: log( (N - df + 0.5) / (df + 0.5) + 1 )
idf_bm25: Dict[str, float] = {
    term: math.log((N_DOCS - df + 0.5) / (df + 0.5) + 1.0)
    for term, df in term_df.items()
}

# BM25 parameters read by _bm25_scores.
k1 = 1.5
b = 0.75


# Ranking 1: TF-IDF + cosine similarity
def _tfidf_cosine_scores(q_terms: List[str], candidate_ids: List[int]) -> Dict[int, float]:
    """Score candidate documents by cosine similarity between TF-IDF vectors.

    The query vector uses the same weighting as the precomputed document
    vectors: (1 + log2(tf)) * idf, with zero weights dropped. Returns
    ``{doc_id: cosine}`` and omits documents whose norm is zero or whose dot
    product with the query is not positive. The result is empty when there
    are no candidates or when the query vector has zero norm.
    """
    if not candidate_ids:
        return {}

    # Build the query vector.
    query_vec: Dict[str, float] = {}
    for term, count in Counter(q_terms).items():
        if count <= 0:
            continue
        weight = (1.0 + math.log2(count)) * idf_tfidf.get(term, 0.0)
        if weight != 0.0:
            query_vec[term] = weight

    query_sq = 0.0
    for weight in query_vec.values():
        query_sq += weight * weight
    if not query_sq > 0:
        # Every query term had zero weight: nothing can be scored.
        return {}
    query_norm = math.sqrt(query_sq)

    cosine: Dict[int, float] = {}
    for doc_id in candidate_ids:
        doc_norm = doc_norms.get(doc_id, 0.0)
        if doc_norm == 0.0:
            continue
        doc_vec = tfidf_weights.get(doc_id, {})
        dot = 0.0
        for term, q_weight in query_vec.items():
            d_weight = doc_vec.get(term)
            if d_weight is not None:
                dot += q_weight * d_weight
        if dot > 0.0:
            cosine[doc_id] = dot / (query_norm * doc_norm)
    return cosine

def search_tfidf_cosine(query: str, k: int = 20) -> List[Dict[str, Any]]:
    """Return the top-k documents for a query ranked by TF-IDF cosine similarity.

    Candidates are the documents containing all query terms. Each result holds
    the REQUIRED_OUTPUT_FIELDS present in the record, a "pid" (from the record,
    else from docid_to_pid) and the score under "score_tfidf".
    """
    terms = _query_tokens(query)
    scores = _tfidf_cosine_scores(terms, _candidate_docs(terms))

    # Best score first; the sort is stable, so ties keep their scoring order.
    top_ids = sorted(scores, key=scores.get, reverse=True)[:k]

    hits: List[Dict[str, Any]] = []
    for doc_id in top_ids:
        record = docs[doc_id]
        hit = {name: record[name] for name in REQUIRED_OUTPUT_FIELDS if name in record}
        hit["pid"] = record.get("pid") or docid_to_pid.get(str(doc_id))
        hit["score_tfidf"] = scores[doc_id]
        hits.append(hit)
    return hits


# Ranking 2: BM25
def _bm25_scores(q_terms: List[str], candidate_ids: List[int]) -> Dict[int, float]:
    """Score candidate documents with BM25 using the module-level k1, b and corpus statistics.

    Each distinct query term counts once; query term frequency is ignored.
    Returns ``{doc_id: score}`` and omits documents of length zero and
    documents whose total score is exactly zero.
    """
    if not candidate_ids:
        return {}

    distinct_terms = list(set(q_terms))

    totals: Dict[int, float] = {}
    for doc_id in candidate_ids:
        length = doc_len.get(doc_id, 0)
        if length == 0:
            continue
        freqs = doc_tf.get(doc_id, {})
        total = 0.0
        for term in distinct_terms:
            freq = freqs.get(term, 0)
            if freq > 0:
                # Term-frequency saturation with document-length normalisation.
                saturation = freq + k1 * (1.0 - b + b * length / avg_doc_len)
                total += idf_bm25.get(term, 0.0) * (freq * (k1 + 1.0) / saturation)
        if total != 0.0:
            totals[doc_id] = total
    return totals

def search_bm25(query: str, k: int = 20) -> List[Dict[str, Any]]:
    """Return the top-k documents for a query ranked by BM25.

    Candidates are the documents containing all query terms. Each result holds
    the REQUIRED_OUTPUT_FIELDS present in the record, a "pid" (from the record,
    else from docid_to_pid) and the score under "score_bm25".
    """
    terms = _query_tokens(query)
    scores = _bm25_scores(terms, _candidate_docs(terms))

    # Best score first; the sort is stable, so ties keep their scoring order.
    top_ids = sorted(scores, key=scores.get, reverse=True)[:k]

    hits: List[Dict[str, Any]] = []
    for doc_id in top_ids:
        record = docs[doc_id]
        hit = {name: record[name] for name in REQUIRED_OUTPUT_FIELDS if name in record}
        hit["pid"] = record.get("pid") or docid_to_pid.get(str(doc_id))
        hit["score_bm25"] = scores[doc_id]
        hits.append(hit)
    return hits


# Ranking 3: Custom score (TF-IDF + numeric boosts)
def _numeric_boost(rec: Dict[str, Any]) -> float:
    """
    Combines rating, discount, price and stock availability into a single multiplier.
    Idea:
      - Prefer in-stock items
      - Higher rating and higher discount -> better
      - Slight preference for cheaper items (within a cap)
    """
    rating = rec.get("average_rating_num") or 0.0
    discount = rec.get("discount_pct") or 0
    price = rec.get("selling_price_num") or rec.get("actual_price_num") or 0.0
    out_of_stock = rec.get("out_of_stock_bool")

    # Normalize
    rating_norm = max(0.0, min(rating / 5.0, 1.0))          # 0â€“1
    discount_norm = max(0.0, min(discount / 80.0, 1.0))     # assume 80% is "very high"
    price_cap = 4000.0
    if price <= 0:
        price_norm = 0.5
    else:
        price_norm = 1.0 - min(price, price_cap) / price_cap  # cheaper -> closer to 1

    stock_factor = 1.0 if not out_of_stock else 0.2         # strong penalty if out of stock

    # Weighted combination -> multiplier around ~[0.5, 2]
    boost = 1.0 + 0.5 * rating_norm + 0.4 * discount_norm + 0.3 * price_norm
    return boost * stock_factor

def search_custom_score(query: str, k: int = 20) -> List[Dict[str, Any]]:
    """Return the top-k documents ranked by TF-IDF cosine multiplied by the numeric boost.

    Candidates are the documents containing all query terms. Each candidate's
    cosine score is multiplied by ``_numeric_boost`` of its record. Each result
    holds the REQUIRED_OUTPUT_FIELDS present in the record, a "pid" (from the
    record, else from docid_to_pid) and the final score under "score_custom".
    Returns an empty list when no candidate gets a base score.
    """
    terms = _query_tokens(query)

    # base relevance: TF-IDF cosine
    relevance = _tfidf_cosine_scores(terms, _candidate_docs(terms))
    if not relevance:
        return []

    boosted: Dict[int, float] = {
        doc_id: base * _numeric_boost(docs[doc_id]) for doc_id, base in relevance.items()
    }

    # Best score first; the sort is stable, so ties keep their scoring order.
    top_ids = sorted(boosted, key=boosted.get, reverse=True)[:k]

    hits: List[Dict[str, Any]] = []
    for doc_id in top_ids:
        record = docs[doc_id]
        hit = {name: record[name] for name in REQUIRED_OUTPUT_FIELDS if name in record}
        hit["pid"] = record.get("pid") or docid_to_pid.get(str(doc_id))
        hit["score_custom"] = boosted[doc_id]
        hits.append(hit)
    return hits

# Small helper to try all three methods with the same query
def compare_rankers(query: str, k: int = 5):
    """Run the three rankers on the same query and print their top-k hits side by side.

    Each ranker gets its own section, with one line per hit showing the pid,
    the score to four decimals, and the title cut to 60 characters.
    """
    print(f"\n=== Query: {query!r} ===")

    # All three searches run before any section is printed.
    sections = [
        ("TF-IDF + cosine", "score_tfidf", search_tfidf_cosine(query, k=k)),
        ("BM25", "score_bm25", search_bm25(query, k=k)),
        ("Custom score", "score_custom", search_custom_score(query, k=k)),
    ]

    for label, score_key, hits in sections:
        print(f"\n-- {label} --")
        for hit in hits:
            short_title = (hit.get("title") or "")[:60]
            product_id = hit.get("pid")
            score = hit.get(score_key)
            print(f"{product_id} | {score:.4f} | {short_title}")


Using enriched dataset: C:\Users\Pol\Documents\POL\UNI\WEB\irwa-search-engine\data\fashion_products_dataset_enriched.json
Using boolean index:    C:\Users\Pol\Documents\POL\UNI\WEB\irwa-search-engine\data\index\boolean_inverted_index.json
Loaded 28080 documents


### **Testing**

In [None]:
compare_rankers("women full sleeve sweatshirt cotton", k=5)
compare_rankers("men slim jeans blue", k=5)


=== Query: 'women full sleeve sweatshirt cotton' ===

-- TF-IDF + cosine --
SWSFZVTTQCB4SJ7F | 0.7305 | Full Sleeve Solid Women Sweatshirt
SWSFQGS456JAZCQB | 0.6561 | Full Sleeve Printed Women Sweatshirt
SWSFZVTNGM4HG8BC | 0.6339 | Full Sleeve Printed Women Sweatshirt
SWSFYTYMNTBNARUN | 0.6237 | Full Sleeve Solid Women Sweatshirt
SWSFYRKYAHH4HHSM | 0.6070 | Full Sleeve Printed Women Sweatshirt

-- BM25 --
SWSFVZRFS7GHGKSF | 10.2591 | Full Sleeve Solid Women Sweatshirt
SWSFYFFFFYZ896TJ | 9.9860 | Full Sleeve Printed Women Sweatshirt
SWSFXMFPDVRHYYPH | 9.9796 | Full Sleeve Striped Women Sweatshirt
SWSFXMFPPZGDQGMW | 9.9796 | Full Sleeve Striped Women Sweatshirt
SWSFYFFYQ7Z3ZKN6 | 9.9541 | Full Sleeve Printed Women Sweatshirt

-- Custom score --
SWSFZVTNGM4HG8BC | 1.2278 | Full Sleeve Printed Women Sweatshirt
SWSFYTYMNTBNARUN | 1.2057 | Full Sleeve Solid Women Sweatshirt
SWSFY38ADYPVZHYZ | 1.1757 | Full Sleeve Printed Women Sweatshirt
SWSFY382UZHFBCNB | 1.1277 | Full Sleeve Printed Women