<div align="center">

# **DELIVERY 3**
## **Ranking & Filtering**

</div>

---

## **PART 1: Three Different Ways of Ranking (TF-IDF + Cosine Similarity, BM25 and Custom)**

### **Main Code**

In [1]:
import json
import math
from collections import defaultdict, Counter
from pathlib import Path
import sys
from typing import Dict, List, Any, Iterable, Tuple

# Resolve the repository root: when the notebook runs from a "part_N" folder the
# root is taken two levels up, otherwise the current directory is used as root.
NOTEBOOK_DIR = Path().resolve()
REPO_ROOT = NOTEBOOK_DIR.parents[1] if NOTEBOOK_DIR.name in {"part_1", "part_2", "part_3"} else NOTEBOOK_DIR
DATA_DIR = REPO_ROOT / "data"
INDEX_DIR = DATA_DIR / "index"

# The helper package lives under project_progress/, so that folder must be on
# sys.path before the import below can succeed.
sys.path.append(str(REPO_ROOT / "project_progress"))
from utils.preprocessing import preprocess_text_field  # project-local text preprocessing helper

# Input artifacts produced by earlier deliveries
ENRICHED_PATH = DATA_DIR / "fashion_products_dataset_enriched.json"
INVERTED_PATH = INDEX_DIR / "boolean_inverted_index.json"
DOCMAP_PATH = INDEX_DIR / "docid_pid_map.json"

print(f"Using enriched dataset: {ENRICHED_PATH}")
print(f"Using boolean index:    {INVERTED_PATH}")

# Load data
# Fail early with the offending path when a required input is missing.
# NOTE: DOCMAP_PATH is not checked here; read_text() below raises if it is absent.
if not ENRICHED_PATH.exists():
    raise FileNotFoundError(ENRICHED_PATH)
if not INVERTED_PATH.exists():
    raise FileNotFoundError(INVERTED_PATH)

# docs: list of product records; a record's position in the list is used as its doc id
docs: List[Dict[str, Any]] = json.loads(ENRICHED_PATH.read_text(encoding="utf-8"))
# inverted_index: term -> list of doc ids (assumed sorted ascending by the
# merge-based intersection used later — TODO confirm against the index builder)
inverted_index: Dict[str, List[int]] = json.loads(INVERTED_PATH.read_text(encoding="utf-8"))
# docid_to_pid: looked up later with str(doc_id), so presumably keyed by stringified doc id — confirm
docid_to_pid = json.loads(DOCMAP_PATH.read_text(encoding="utf-8"))["docid_to_pid"]

N_DOCS = len(docs)
print(f"Loaded {N_DOCS} documents")

# Text fields used for indexing / ranking (same as Part 2)
INDEXED_TEXT_FIELDS = ["title_clean", "description_clean", "metadata_clean"]

# Record fields copied into every search result (only those present in the record)
REQUIRED_OUTPUT_FIELDS = [
    "pid", "title", "description", "brand", "category", "sub_category",
    "product_details", "seller", "out_of_stock", "selling_price", "discount",
    "actual_price", "average_rating", "url"
]


def _doc_tokens(record: Dict[str, Any], fields: Iterable[str]) -> List[str]:
    toks: List[str] = []
    for f in fields:
        val = record.get(f)
        if not val:
            continue
        toks.extend(str(val).split())
    return toks

def _query_tokens(q: str) -> List[str]:
    """
    Preprocess a raw query and return the "tokens" entry of the result.

    A falsy query (None or "") is preprocessed as the empty string.
    """
    text = q if q else ""
    return preprocess_text_field(text)["tokens"]

def _intersect_sorted(a: List[int], b: List[int]) -> List[int]:
    i = j = 0
    out: List[int] = []
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            out.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return out

def _candidate_docs(q_terms: List[str]) -> List[int]:
    """
    Boolean AND retrieval over the inverted index.

    Returns the doc ids whose postings contain every distinct query term.
    The result is empty when there are no terms or when any term has no
    (or empty) postings.
    """
    if not q_terms:
        return []

    postings = [inverted_index.get(term) for term in set(q_terms)]
    if not all(postings):
        # at least one term is absent from the index, so no doc can match all terms
        return []

    # Start from the shortest list so intermediate intersections stay small.
    shortest, *others = sorted(postings, key=len)
    matches = shortest
    for other in others:
        matches = _intersect_sorted(matches, other)
        if not matches:
            return matches
    return matches

# Corpus statistics shared by the TF-IDF and BM25 rankers
# df(t): number of postings of each term in the boolean index
term_df: Dict[str, int] = {term: len(postings) for term, postings in inverted_index.items()}

# Raw term frequencies and token counts per document (doc id = position in docs)
doc_tf: Dict[int, Dict[str, int]] = {}
doc_len: Dict[int, int] = {}

for doc_id, rec in enumerate(docs):
    counts = Counter(_doc_tokens(rec, INDEXED_TEXT_FIELDS))
    doc_tf[doc_id] = dict(counts)
    doc_len[doc_id] = sum(counts.values())

# max(..., 1) guards the division when the corpus is empty
avg_doc_len = sum(doc_len.values()) / max(N_DOCS, 1)

# TF-IDF: idf(t) = log2(N / df(t)); terms with df == 0 get idf 0
idf_tfidf: Dict[str, float] = {
    term: (math.log2(N_DOCS / df) if df > 0 else 0.0)
    for term, df in term_df.items()
}

# Document vectors with weight (1 + log2(tf)) * idf, plus their Euclidean norms
tfidf_weights: Dict[int, Dict[str, float]] = {}
doc_norms: Dict[int, float] = {}

for doc_id, tf_map in doc_tf.items():
    weights: Dict[str, float] = {}
    for term, freq in tf_map.items():
        if freq > 0:
            weight = (1.0 + math.log2(freq)) * idf_tfidf.get(term, 0.0)
            if weight != 0.0:
                # zero weights are dropped so the stored vector stays sparse
                weights[term] = weight

    norm_sq = 0.0
    for weight in weights.values():
        norm_sq += weight * weight

    tfidf_weights[doc_id] = weights
    doc_norms[doc_id] = math.sqrt(norm_sq) if norm_sq > 0 else 0.0

# BM25: idf(t) = log( (N - df + 0.5) / (df + 0.5) + 1 )
idf_bm25: Dict[str, float] = {
    term: math.log((N_DOCS - df + 0.5) / (df + 0.5) + 1.0)
    for term, df in term_df.items()
}

# BM25 free parameters: k1 (term-frequency saturation) and b (length normalisation)
k1 = 1.5
b = 0.75


# Ranking 1: TF-IDF + cosine similarity
def _tfidf_cosine_scores(q_terms: List[str], candidate_ids: List[int]) -> Dict[int, float]:
    """
    Score candidate documents by cosine similarity between TF-IDF vectors.

    The query vector uses the same (1 + log2(tf)) * idf weighting as the
    precomputed document vectors. Returns {doc_id: score}; documents with a
    zero norm or a non-positive dot product are omitted, and the result is
    empty when there are no candidates or the query vector has zero norm.
    """
    if not candidate_ids:
        return {}

    # Build the sparse query vector, skipping zero-weight terms.
    query_vec: Dict[str, float] = {}
    for term, count in Counter(q_terms).items():
        if count > 0:
            weight = (1.0 + math.log2(count)) * idf_tfidf.get(term, 0.0)
            if weight != 0.0:
                query_vec[term] = weight

    query_sq = 0.0
    for weight in query_vec.values():
        query_sq += weight * weight
    query_norm = math.sqrt(query_sq) if query_sq > 0 else 0.0
    if query_norm == 0.0:
        return {}

    result: Dict[int, float] = {}
    for doc_id in candidate_ids:
        doc_norm = doc_norms.get(doc_id, 0.0)
        if doc_norm == 0.0:
            continue
        doc_vec = tfidf_weights.get(doc_id, {})

        # Only the query's terms can contribute to the dot product.
        dot = 0.0
        for term, q_weight in query_vec.items():
            d_weight = doc_vec.get(term)
            if d_weight is not None:
                dot += q_weight * d_weight

        if dot > 0.0:
            result[doc_id] = dot / (query_norm * doc_norm)
    return result

def search_tfidf_cosine(query: str, k: int = 20) -> List[Dict[str, Any]]:
    """
    Rank the AND-matching documents of a query by TF-IDF cosine similarity.

    Returns at most k result dicts, best first. Each holds the
    REQUIRED_OUTPUT_FIELDS present in the record, a "pid" (falling back to the
    doc-id map when the record has none) and the score under "score_tfidf".
    """
    terms = _query_tokens(query)
    scored = _tfidf_cosine_scores(terms, _candidate_docs(terms))
    top = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)[:k]

    hits: List[Dict[str, Any]] = []
    for doc_id, score in top:
        record = docs[doc_id]
        hit = {name: record.get(name) for name in REQUIRED_OUTPUT_FIELDS if name in record}
        hit["pid"] = record.get("pid") or docid_to_pid.get(str(doc_id))
        hit["score_tfidf"] = score
        hits.append(hit)
    return hits


# Ranking 2: BM25
def _bm25_scores(q_terms: List[str], candidate_ids: List[int]) -> Dict[int, float]:
    """
    Score candidate documents with BM25 using the module-level k1 and b.

    Each distinct query term counts once (query term frequency is ignored).
    Returns {doc_id: score}; empty documents and zero scores are omitted.
    """
    if not candidate_ids:
        return {}

    distinct_terms = list(set(q_terms))

    result: Dict[int, float] = {}
    for doc_id in candidate_ids:
        length = doc_len.get(doc_id, 0)
        if length == 0:
            continue
        freqs = doc_tf.get(doc_id, {})

        total = 0.0
        for term in distinct_terms:
            freq = freqs.get(term, 0)
            if freq > 0:
                # tf saturation with document-length normalisation
                saturation = freq + k1 * (1.0 - b + b * length / avg_doc_len)
                total += idf_bm25.get(term, 0.0) * (freq * (k1 + 1.0) / saturation)

        if total != 0.0:
            result[doc_id] = total
    return result

def search_bm25(query: str, k: int = 20) -> List[Dict[str, Any]]:
    """
    Rank the AND-matching documents of a query by BM25.

    Returns at most k result dicts, best first. Each holds the
    REQUIRED_OUTPUT_FIELDS present in the record, a "pid" (falling back to the
    doc-id map when the record has none) and the score under "score_bm25".
    """
    terms = _query_tokens(query)
    scored = _bm25_scores(terms, _candidate_docs(terms))
    top = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)[:k]

    hits: List[Dict[str, Any]] = []
    for doc_id, score in top:
        record = docs[doc_id]
        hit = {name: record.get(name) for name in REQUIRED_OUTPUT_FIELDS if name in record}
        hit["pid"] = record.get("pid") or docid_to_pid.get(str(doc_id))
        hit["score_bm25"] = score
        hits.append(hit)
    return hits


# Ranking 3: Custom score (TF-IDF + numeric boosts)
def _numeric_boost(rec: Dict[str, Any]) -> float:
    """
    Combines rating, discount, price and stock availability into a single multiplier.
    Idea:
      - Prefer in-stock items
      - Higher rating and higher discount -> better
      - Slight preference for cheaper items (within a cap)
    """
    rating = rec.get("average_rating_num") or 0.0
    discount = rec.get("discount_pct") or 0
    price = rec.get("selling_price_num") or rec.get("actual_price_num") or 0.0
    out_of_stock = rec.get("out_of_stock_bool")

    # Normalize
    rating_norm = max(0.0, min(rating / 5.0, 1.0))          # 0â€“1
    discount_norm = max(0.0, min(discount / 80.0, 1.0))     # assume 80% is "very high"
    price_cap = 4000.0
    if price <= 0:
        price_norm = 0.5
    else:
        price_norm = 1.0 - min(price, price_cap) / price_cap  # cheaper -> closer to 1

    stock_factor = 1.0 if not out_of_stock else 0.2         # strong penalty if out of stock

    # Weighted combination -> multiplier around ~[0.5, 2]
    boost = 1.0 + 0.5 * rating_norm + 0.4 * discount_norm + 0.3 * price_norm
    return boost * stock_factor

def search_custom_score(query: str, k: int = 20) -> List[Dict[str, Any]]:
    """
    Rank AND-matching documents by TF-IDF cosine multiplied by a numeric boost.

    Returns at most k result dicts, best first. Each holds the
    REQUIRED_OUTPUT_FIELDS present in the record, a "pid" (falling back to the
    doc-id map when the record has none) and the score under "score_custom".
    Returns an empty list when no document gets a base relevance score.
    """
    terms = _query_tokens(query)

    # base relevance: TF-IDF cosine over the AND candidate set
    relevance = _tfidf_cosine_scores(terms, _candidate_docs(terms))
    if not relevance:
        return []

    boosted: Dict[int, float] = {
        doc_id: base * _numeric_boost(docs[doc_id])
        for doc_id, base in relevance.items()
    }
    top = sorted(boosted.items(), key=lambda pair: pair[1], reverse=True)[:k]

    hits: List[Dict[str, Any]] = []
    for doc_id, score in top:
        record = docs[doc_id]
        hit = {name: record.get(name) for name in REQUIRED_OUTPUT_FIELDS if name in record}
        hit["pid"] = record.get("pid") or docid_to_pid.get(str(doc_id))
        hit["score_custom"] = score
        hits.append(hit)
    return hits

# Small helper to try all three methods with the same query
def compare_rankers(query: str, k: int = 5):
    """
    Print the top-k results of the three Part 1 rankers for the same query.

    All three searches run first; then one section per ranker is printed with
    "pid | score | title" lines (title truncated to 60 characters).
    """
    print(f"\n=== Query: {query!r} ===")

    sections = (
        ("TF-IDF + cosine", search_tfidf_cosine(query, k=k), "score_tfidf"),
        ("BM25", search_bm25(query, k=k), "score_bm25"),
        ("Custom score", search_custom_score(query, k=k), "score_custom"),
    )

    for label, hits, score_key in sections:
        print(f"\n-- {label} --")
        for hit in hits:
            pid = hit.get("pid")
            score = hit.get(score_key)
            title = (hit.get("title") or "")[:60]
            print(f"{pid} | {score:.4f} | {title}")


Using enriched dataset: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_enriched.json
Using boolean index:    /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/index/boolean_inverted_index.json
Loaded 28080 documents


### **Testing**

In [3]:
# Run the three Part 1 rankers side by side on two sample queries (top 5 each).
compare_rankers("women full sleeve sweatshirt cotton", k=5)
compare_rankers("men slim jeans blue", k=5)


=== Query: 'women full sleeve sweatshirt cotton' ===

-- TF-IDF + cosine --
SWSFZVTTQCB4SJ7F | 0.7305 | Full Sleeve Solid Women Sweatshirt
SWSFQGS456JAZCQB | 0.6561 | Full Sleeve Printed Women Sweatshirt
SWSFZVTNGM4HG8BC | 0.6339 | Full Sleeve Printed Women Sweatshirt
SWSFYTYMNTBNARUN | 0.6237 | Full Sleeve Solid Women Sweatshirt
SWSFYRKYAHH4HHSM | 0.6070 | Full Sleeve Printed Women Sweatshirt

-- BM25 --
SWSFVZRFS7GHGKSF | 10.2591 | Full Sleeve Solid Women Sweatshirt
SWSFYFFFFYZ896TJ | 9.9860 | Full Sleeve Printed Women Sweatshirt
SWSFXMFPDVRHYYPH | 9.9796 | Full Sleeve Striped Women Sweatshirt
SWSFXMFPPZGDQGMW | 9.9796 | Full Sleeve Striped Women Sweatshirt
SWSFYFFYQ7Z3ZKN6 | 9.9541 | Full Sleeve Printed Women Sweatshirt

-- Custom score --
SWSFZVTNGM4HG8BC | 1.2278 | Full Sleeve Printed Women Sweatshirt
SWSFYTYMNTBNARUN | 1.2057 | Full Sleeve Solid Women Sweatshirt
SWSFY38ADYPVZHYZ | 1.1757 | Full Sleeve Printed Women Sweatshirt
SWSFY382UZHFBCNB | 1.1277 | Full Sleeve Printed Women

## **PART 2: Word2vec + Cosine Ranking Score**

### **Main Code**

In [9]:
from pathlib import Path
from typing import List, Dict, Any, Iterable, Tuple
import json, math
import numpy as np

# gensim allowed for Word2Vec
from gensim.models import Word2Vec

# On-disk artifacts of the Word2Vec ranker, all stored under INDEX_DIR so that
# the model and the document vectors are reused across runs.
W2V_MODEL_PATH = INDEX_DIR / "word2vec.model"
DOCVECS_PATH   = INDEX_DIR / "word2vec_docvecs.npy"   # dense matrix (n_docs, dim)
DOCMASK_PATH   = INDEX_DIR / "word2vec_docmask.npy"   # boolean mask: doc has at least 1 in-vocab token
META_PATH      = INDEX_DIR / "word2vec_meta.json"     # stores dim, min_count, etc.

# Helpers: training corpus
def _sentences_from_docs(records: List[Dict[str, Any]], fields: Iterable[str]) -> List[List[str]]:
    """
    Build training sentences from cleaned fields.
    Each doc contributes one sentence per field (tokenized by .split()).
    """
    sents: List[List[str]] = []
    for r in records:
        for f in fields:
            txt = r.get(f)
            if not txt:
                continue
            toks = str(txt).split()
            if toks:
                sents.append(toks)
    return sents

# Train (or load) Word2Vec
def get_or_train_w2v(
    records: List[Dict[str, Any]],
    fields: Iterable[str],
    vector_size: int = 100,
    window: int = 5,
    min_count: int = 2,
    workers: int = 4,
    sg: int = 1,  # 1 = skip-gram, 0 = CBOW
    epochs: int = 10,
) -> Word2Vec:
    """
    Load the cached Word2Vec model, or train and cache a new one.

    If W2V_MODEL_PATH exists, the saved model is loaded and returned as is;
    the hyperparameters passed here are NOT compared against it. Delete the
    cached file to retrain with different settings.

    Otherwise a model is trained on one sentence per non-empty field of each
    record, saved to W2V_MODEL_PATH, and its hyperparameters are written to
    META_PATH as JSON.

    Args:
        records: product records providing the training text.
        fields: names of the cleaned text fields to train on.
        vector_size, window, min_count, workers, sg: passed to gensim's Word2Vec.
        epochs: number of training epochs (previously hard-coded to 10).

    Returns:
        The loaded or freshly trained Word2Vec model.
    """
    if W2V_MODEL_PATH.exists():
        return Word2Vec.load(str(W2V_MODEL_PATH))

    sents = _sentences_from_docs(records, fields)
    model = Word2Vec(
        sentences=sents,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg,
        epochs=epochs,
    )
    model.save(str(W2V_MODEL_PATH))

    # Record the settings actually used so the cached model can be identified later.
    meta = {
        "vector_size": vector_size,
        "window": window,
        "min_count": min_count,
        "sg": sg,
        "epochs": epochs,
    }
    META_PATH.write_text(json.dumps(meta, indent=2), encoding="utf-8")
    return model

# Building / loading document vectors
def _avg_vec(tokens: List[str], model: Word2Vec) -> np.ndarray:
    """
    Mean of the word vectors of the tokens known to the model, as float32.

    Tokens outside the vocabulary are ignored; when none is in the vocabulary
    a zero vector of length model.vector_size is returned.
    """
    known = [model.wv[token] for token in tokens if token in model.wv]
    if known:
        return np.vstack(known).mean(axis=0).astype(np.float32)
    return np.zeros(model.vector_size, dtype=np.float32)

def build_or_load_doc_matrix(model: Word2Vec) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load the cached document vectors, or build and cache them.

    A cached pair of files is reused only when its shapes match the current
    corpus size and the model's vector size. A mismatching (stale) cache is
    rebuilt and overwritten instead of being returned.

    Returns:
      doc_mat: (N_DOCS, dim) float32, averaged word vectors per document
      has_vec: (N_DOCS,) boolean, True if the doc's averaged vector is non-zero
               (i.e. it has at least 1 in-vocab token)
    """
    dim = model.vector_size

    if DOCVECS_PATH.exists() and DOCMASK_PATH.exists():
        doc_mat = np.load(DOCVECS_PATH)
        has_vec = np.load(DOCMASK_PATH)
        if doc_mat.shape == (N_DOCS, dim) and has_vec.shape == (N_DOCS,):
            return doc_mat, has_vec
        # Cache was built for another corpus or model size: fall through and rebuild.

    doc_mat = np.zeros((N_DOCS, dim), dtype=np.float32)
    has_vec = np.zeros(N_DOCS, dtype=bool)

    for did, r in enumerate(docs):
        # Same token extraction as the TF-IDF / BM25 statistics.
        v = _avg_vec(_doc_tokens(r, INDEXED_TEXT_FIELDS), model)
        doc_mat[did, :] = v
        has_vec[did] = np.any(v != 0.0)

    np.save(DOCVECS_PATH, doc_mat)
    np.save(DOCMASK_PATH, has_vec)
    return doc_mat, has_vec

# Cosine similarity
def _cosine(a: np.ndarray, b: np.ndarray) -> float:
    da = float(np.linalg.norm(a))
    db = float(np.linalg.norm(b))
    if da == 0.0 or db == 0.0:
        return 0.0
    return float(np.dot(a, b) / (da * db))

# Word2Vec search (AND-filtered)
def search_w2v_cosine(query: str, k: int = 20) -> List[Dict[str, Any]]:
    """
    Rank the AND-matching documents of a query by Word2Vec cosine similarity.

    The query and each document are represented by the average of their
    in-vocabulary word vectors. Only documents containing all query terms
    (boolean AND over the inverted index) are scored.

    Returns at most k result dicts, best first. Each holds the
    REQUIRED_OUTPUT_FIELDS present in the record, a "pid" (falling back to the
    doc-id map when the record has none) and the score under "score_w2v".
    Returns an empty list when no query token is in the model vocabulary, when
    no document matches all terms, or when no candidate has a positive score.
    """
    # Ensure model + vectors exist.
    # NOTE(review): both are loaded (or built) on every call; callers issuing
    # many queries may want to cache them.
    model = get_or_train_w2v(docs, INDEXED_TEXT_FIELDS)
    doc_mat, has_vec = build_or_load_doc_matrix(model)

    # Tokenise once with the shared helper; the same tokens feed both the
    # query vector and the boolean candidate filter.
    q_terms = _query_tokens(query)

    q_vec = _avg_vec(q_terms, model)
    if not np.any(q_vec):
        return []  # nothing in-vocab, no good semantic signal

    # AND candidate set
    cand_ids = _candidate_docs(q_terms)
    if not cand_ids:
        return []

    # Score candidates by cosine; docs without any in-vocab token are skipped.
    scores: Dict[int, float] = {}
    for did in cand_ids:
        if not has_vec[did]:
            continue
        s = _cosine(q_vec, doc_mat[did, :])
        if s > 0.0:
            scores[did] = s

    if not scores:
        return []

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:k]

    results: List[Dict[str, Any]] = []
    for did, s in ranked:
        rec = docs[did]
        view = {f: rec.get(f) for f in REQUIRED_OUTPUT_FIELDS if f in rec}
        view["pid"] = rec.get("pid") or docid_to_pid.get(str(did))
        view["score_w2v"] = float(s)
        results.append(view)
    return results

# Batch run for our queries from Part 2
def run_word2vec_on_proposed(k: int = 20, save_path: Path = (INDEX_DIR / "w2v_results.json")) -> Path:
    """
    Rank the proposed Part 2 queries with Word2Vec + cosine and save the results.

    Reads the "queries" list from proposed_test_queries.json under INDEX_DIR,
    runs search_w2v_cosine on each query and writes a JSON object mapping each
    query to its top-k hits (pid, title, score_w2v) to save_path.

    If the queries file does not exist, an info message is printed and nothing
    is written. save_path is returned in both cases.
    """
    queries_file = INDEX_DIR / "proposed_test_queries.json"
    if not queries_file.exists():
        print(f"[INFO] Proposed queries file not found at {queries_file}; run your Part 2 query-mining first.")
        return save_path

    payload = json.loads(queries_file.read_text(encoding="utf-8"))
    queries = payload.get("queries", [])

    results = {
        query: [
            {"pid": hit.get("pid"), "title": hit.get("title"), "score_w2v": hit.get("score_w2v")}
            for hit in search_w2v_cosine(query, k=k)
        ]
        for query in queries
    }

    serialized = json.dumps(results, ensure_ascii=False, indent=2)
    save_path.write_text(serialized, encoding="utf-8")
    print(f"Wrote Word2Vec top-{k} for {len(queries)} queries to: {save_path}")
    return save_path


### **Testing**

In [10]:
# Single query demo: print the top-5 Word2Vec hits as "pid | score | title"
# (title truncated to 70 characters) for two sample queries.
for q in ["women full sleeve sweatshirt cotton", "men slim jeans blue"]:
    res = search_w2v_cosine(q, k=5)
    print(f"\n== {q} ==")
    for r in res:
        print(f"{r['pid']} | {r['score_w2v']:.4f} | {(r.get('title') or '')[:70]}")

# Batch on our queries saved in Part 2
# As the last expression of the cell, the returned path is displayed by the notebook.
run_word2vec_on_proposed(k=20)


== women full sleeve sweatshirt cotton ==
SWSFY5ZHUEZPZZYV | 0.8426 | Full Sleeve Printed Women Sweatshirt
SWSF9W528G7VEGCV | 0.8423 | Full Sleeve Striped Women Sweatshirt
SWSFY5ZHEJ2HYWDG | 0.8421 | Full Sleeve Printed Men Sweatshirt
SWSF9W5YHFAAHNJZ | 0.8404 | Full Sleeve Striped Women Sweatshirt
SWSFMJF98EY2FXBH | 0.8377 | Full Sleeve Solid Women Sweatshirt

== men slim jeans blue ==
JEAFSKYHRVZSABPR | 0.7829 | Slim Men Blue Jeans
JEAFSKYHTE76YWH7 | 0.7829 | Slim Men Blue Jeans
JEAFSKYH539HTZB8 | 0.7803 | Tapered Fit Men Blue Jeans
JEAFXUHCWV9C5WNX | 0.7783 | Super Skinny Men Blue Jeans
JEAFXUHA5YF8WYQY | 0.7780 | Tapered Fit Men Blue Jeans
Wrote Word2Vec top-20 for 4 queries to: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/index/w2v_results.json


PosixPath('/Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/index/w2v_results.json')