<div align="center">

# DELIVERY 2  
## **Indexing and Evaluation**

</div>

---

## **PART 1: Indexing**

### **STEP 1 — Build inverted index:**


### **Main Code**

In [4]:
import json
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Set, Any, Iterable
from pathlib import Path
import sys

# Make the project's own packages importable from the notebook. When the
# working directory is a "part_1"/"part_2" folder the root is its grandparent;
# otherwise the working directory itself is taken as the root.
NOTEBOOK_DIR = Path().resolve()
if NOTEBOOK_DIR.name in {"part_1", "part_2"}:
    REPO_ROOT = NOTEBOOK_DIR.parents[1]
else:
    REPO_ROOT = NOTEBOOK_DIR
sys.path.append(str(REPO_ROOT / "project_progress"))
from utils.preprocessing import preprocess_text_field



# Paths
# Resolve the repository root with the same part_1/part_2 guard that is used
# for the sys.path setup, so the dataset and the project modules are always
# looked up under the same root regardless of where the notebook is started.
NOTEBOOK_DIR = Path().resolve()
REPO_ROOT = NOTEBOOK_DIR.parents[1] if NOTEBOOK_DIR.name in {"part_1", "part_2"} else NOTEBOOK_DIR
DATA_DIR = REPO_ROOT / "data"
INPUT = DATA_DIR / "fashion_products_dataset_enriched.json"

# Directory receiving the index artefacts; created (with parents) if missing.
INDEX_DIR = DATA_DIR / "index"
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# One file per artefact: postings, doc-id -> pid mapping, indexed field names.
INDEX_FILE = INDEX_DIR / "boolean_inverted_index.json"
DOCMAP_FILE = INDEX_DIR / "docid_pid_map.json"
FIELDS_FILE = INDEX_DIR / "indexed_fields.json"

print(f"Reading enriched dataset: {INPUT}")
print(f"Index will be saved in:   {INDEX_DIR}")


# Load the whole enriched dataset into memory, failing early with an explicit
# message when the input file is not there.
if not INPUT.exists():
    raise FileNotFoundError(f"Enriched dataset not found: {INPUT}")
with INPUT.open(encoding="utf-8") as dataset_file:
    docs: List[Dict[str, Any]] = json.load(dataset_file)
print(f"Loaded {len(docs)} docs")


# Record fields whose whitespace-separated contents are fed into the index.
# The order only determines the order in which a document's tokens are read.
INDEXED_TEXT_FIELDS: List[str] = ["title_clean", "description_clean", "metadata_clean"]


# doc_id is an integer, stable order = index in list.
# Two lookup tables are kept: doc_id -> pid and pid -> doc_id. If several
# records share a pid, the last one wins in pid_to_docid.
docid_to_pid: Dict[int, str] = {}
pid_to_docid: Dict[str, int] = {}

for i, r in enumerate(docs):
    # Chain the fallbacks with `or`: dict.get's default is only used when the
    # key is absent, so a present-but-empty "_id" would otherwise leak through
    # as None/"" and become the document's identifier.
    pid = r.get("pid") or r.get("_id") or f"missing_pid_{i}"
    docid_to_pid[i] = pid
    pid_to_docid[pid] = i

def _doc_tokens(record: Dict[str, Any], fields: Iterable[str]) -> List[str]:
    toks: List[str] = []
    for f in fields:
        val = record.get(f)
        if not val:
            continue
        # We already have cleaned strings; just split.
        toks.extend(str(val).split())
    return toks


# Build the Boolean inverted index: term -> ids of the documents containing it.
vocab: Dict[str, Set[int]] = defaultdict(set)

for doc_id, rec in enumerate(docs):
    # Only presence matters for Boolean retrieval, so each distinct term of a
    # document is recorded once; a document without tokens adds nothing.
    for term in set(_doc_tokens(rec, INDEXED_TEXT_FIELDS)):
        vocab[term].add(doc_id)

# Turn every posting set into an ascending list: JSON-serialisable and ready
# for merge-style AND intersections.
inverted_index: Dict[str, List[int]] = {
    term: sorted(postings) for term, postings in vocab.items()
}
print(f"Vocabulary size: {len(inverted_index):,}")


# Persist the three artefacts as UTF-8 JSON, one after the other.
index_json = json.dumps(inverted_index)
INDEX_FILE.write_text(index_json, encoding="utf-8")

# NOTE: JSON object keys are strings, so the integer doc ids of docid_to_pid
# come back as strings when this file is loaded again.
docmap_json = json.dumps({"docid_to_pid": docid_to_pid}, ensure_ascii=False)
DOCMAP_FILE.write_text(docmap_json, encoding="utf-8")

fields_json = json.dumps(
    {"indexed_fields": INDEXED_TEXT_FIELDS},
    ensure_ascii=False,
    indent=2,
)
FIELDS_FILE.write_text(fields_json, encoding="utf-8")

print(f"Saved inverted index to: {INDEX_FILE}")
print(f"Saved doc map         to: {DOCMAP_FILE}")
print(f"Saved fields          to: {FIELDS_FILE}")


# Record keys copied into every search result (when the record has them),
# in the order they appear in the result dict.
REQUIRED_OUTPUT_FIELDS = [
    "pid",
    "title",
    "description",
    "brand",
    "category",
    "sub_category",
    "product_details",
    "seller",
    "out_of_stock",
    "selling_price",
    "discount",
    "actual_price",
    "average_rating",
    "url",
]

def _query_tokens(q: str) -> List[str]:
    """Return the tokens of a query as produced by ``preprocess_text_field``.

    The query goes through the project's preprocessing helper (intended to be
    the same normalization and stemming applied to the documents in Step 1) so
    that query terms can match index terms. A falsy query (``None``, ``""``)
    is preprocessed as the empty string.
    """
    processed = preprocess_text_field(q if q else "")
    return processed["tokens"]

def _intersect_sorted(a: List[int], b: List[int]) -> List[int]:
    """Intersect two sorted posting lists."""
    i=j=0
    out: List[int] = []
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            out.append(a[i])
            i+=1; j+=1
        elif a[i] < b[j]:
            i+=1
        else:
            j+=1
    return out

def search_and(query: str, fields: List[str] = None, k: int = 20) -> List[Dict[str, Any]]:
    """
    Conjunctive (AND) Boolean search.

    Every returned doc must contain ALL query terms (after preprocessing).

    Args:
        query: Free-text query; it is tokenized with ``_query_tokens``.
        fields: Ignored for now (kept for future extension); the index is
            already built over INDEXED_TEXT_FIELDS.
        k: Maximum number of records to return. Zero or a negative value
            yields an empty list; ``None`` means "no limit".

    Returns:
        Up to ``k`` dicts in ascending doc-id order. Each dict holds the
        REQUIRED_OUTPUT_FIELDS that exist in the record, plus a "pid" entry
        that falls back to the doc-id mapping when the record has no usable
        pid of its own. An empty list is returned when the query has no
        terms or any term is absent from the index.
    """
    _ = fields  # kept for future extension; current index already built over INDEXED_TEXT_FIELDS

    # A negative k would otherwise slice from the end (k=-1 drops the last hit).
    if k is not None and k <= 0:
        return []

    # Repeated terms cannot change an AND result; drop them (keeping order) so
    # the same posting list is not intersected with itself.
    q_terms = list(dict.fromkeys(_query_tokens(query)))
    if not q_terms:
        return []

    # Load postings lists; if any term not in vocab -> empty result
    postings_lists: List[List[int]] = []
    for t in q_terms:
        p = inverted_index.get(t)
        if not p:
            return []
        postings_lists.append(p)

    # Intersect from shortest to longest for speed
    postings_lists.sort(key=len)
    result_ids = postings_lists[0]
    for pl in postings_lists[1:]:
        result_ids = _intersect_sorted(result_ids, pl)
        if not result_ids:
            break

    # Map to records and keep only required output fields (when present)
    out: List[Dict[str, Any]] = []
    for did in result_ids[:k]:
        rec = docs[did]
        view = {f: rec[f] for f in REQUIRED_OUTPUT_FIELDS if f in rec}
        # Always include a usable pid: a missing *or empty* pid is replaced by
        # the identifier assigned to this document in the doc-id mapping.
        if not view.get("pid"):
            view["pid"] = docid_to_pid.get(did)
        out.append(view)
    return out

Reading enriched dataset: C:\Users\Pol\Documents\POL\UNI\WEB\irwa-search-engine\data\fashion_products_dataset_enriched.json
Index will be saved in:   C:\Users\Pol\Documents\POL\UNI\WEB\irwa-search-engine\data\index
Loaded 28080 docs
Vocabulary size: 9,048
Saved inverted index to: C:\Users\Pol\Documents\POL\UNI\WEB\irwa-search-engine\data\index\boolean_inverted_index.json
Saved doc map         to: C:\Users\Pol\Documents\POL\UNI\WEB\irwa-search-engine\data\index\docid_pid_map.json
Saved fields          to: C:\Users\Pol\Documents\POL\UNI\WEB\irwa-search-engine\data\index\indexed_fields.json


### **Testing**

In [5]:
# Smoke test: run a few sample queries through the AND search and list, for
# each one, the pid and the first 80 characters of the title of its hits.
tests = [
    "women full sleeve sweatshirt cotton",
    "men slim jeans blue",
]

for q in tests:
    hits = search_and(q, k=5)
    print(f"\nQuery: {q!r}  -> {len(hits)} hits (showing up to 5)")
    for h in hits[:5]:
        pid_shown = h.get("pid")
        title_preview = (h.get("title") or "")[:80]
        print(" -", pid_shown, "|", title_preview)


Query: 'women full sleeve sweatshirt cotton'  -> 5 hits (showing up to 5)
 - SWSFJY5ZBGZK49ZB | Full Sleeve Solid Women Sweatshirt
 - SWSFVEV2PZTNSVG6 | Full Sleeve Solid Women Sweatshirt
 - SWSFJY5ZRDUYGWN3 | Full Sleeve Solid Women Sweatshirt
 - SWSFJY5ZJZKVGUFT | Full Sleeve Striped Women Sweatshirt
 - SWSFJY5ZKQNGYZZJ | Full Sleeve Solid Women Sweatshirt

Query: 'men slim jeans blue'  -> 5 hits (showing up to 5)
 - TSHEXH594YXPYNEQ | Solid Men Round Neck Black, Grey T-Shirt
 - TSHEXM9HRJ3JGCZX | Solid Men V-neck Blue T-Shirt
 - BZRFUXVMEZUPAFFB | Self Design Single Breasted Casual Men Full Sleeve Blazer  (Blue)
 - SHTFTGKHFSAUBWVC | Men Slim Fit Solid Spread Collar Casual Shirt  (Pack of 3)
 - SHTFTGKHG4YSFBXZ | Men Slim Fit Solid Button Down Collar Casual Shirt  (Pack of 3)
