<div align="center">

# **DELIVERY 1**
## **TEXT PROCESSING & EXPLORATORY DATA ANALYSIS (EDA)**

</div>

---

## **PART 1: DATA PREPARATION**

### **STEP 1 — Preprocessing Pipeline (NLTK)**

### **Main Code**

In [5]:
import json
import re
from pathlib import Path
from typing import List, Dict, Any
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from unidecode import unidecode

NOTEBOOK_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path().resolve()
REPO_ROOT = NOTEBOOK_DIR.parents[1] if NOTEBOOK_DIR.name == "part_1" else NOTEBOOK_DIR
DATA_DIR = REPO_ROOT / "data"

INPUT_FILE = DATA_DIR / "fashion_products_dataset.json"
OUTPUT_FILE = DATA_DIR / "fashion_products_dataset_preprocessed.json"

print(f"Using dataset: {INPUT_FILE}")
print(f"Output will be saved to: {OUTPUT_FILE}")

def _ensure_nltk():
    try:
        stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")
    try:
        word_tokenize("test")
    except LookupError:
        nltk.download("punkt")
_ensure_nltk()



STOPWORDS = set(stopwords.words("english"))
STEMMER = PorterStemmer()


def normalize_basic(text: str) -> str:
    """
    Basic normalization:
      - Lowercase
      - Remove accents with unidecode
      - Remove currency symbols and numbers
      - Remove punctuation marks
      - Replace dashes/underscores with space
      - Collapse extra whitespace
    """
    if not text:
        return ""

    txt = unidecode(text)
    txt = txt.lower()
    txt = re.sub(r"http[s]?://\S+", " ", txt)
    txt = re.sub(r"[-_/]", " ", txt)
    txt = re.sub(r"[$€£₹¥%]+", " ", txt)
    txt = re.sub(r"\d+", " ", txt)
    txt = re.sub(r"[^\w\s]", " ", txt)
    txt = re.sub(r"[^a-z\s]", " ", txt)
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt


def tokenize(text: str) -> List[str]:
    return word_tokenize(text) if text else []


def remove_stopwords(tokens: List[str]) -> List[str]:
    return [t for t in tokens if t not in STOPWORDS]


def stem(tokens: List[str]) -> List[str]:
    return [STEMMER.stem(t) for t in tokens]


def preprocess_text_field(text: str) -> Dict[str, Any]:
    """Apply the full preprocessing pipeline to one text field."""
    cleaned = normalize_basic(text)
    toks = tokenize(cleaned)
    toks_nostop = remove_stopwords(toks)
    toks_stem = stem(toks_nostop)
    return {
        "tokens": toks_stem,
        "text": " ".join(toks_stem)
    }


def process_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    """Process one record and add preprocessed fields."""
    title = rec.get("title", "")
    desc = rec.get("description", "")

    t_proc = preprocess_text_field(title)
    d_proc = preprocess_text_field(desc)

    rec_out = dict(rec)
    rec_out["title_tokens"] = t_proc["tokens"]
    rec_out["title_clean"] = t_proc["text"]
    rec_out["description_tokens"] = d_proc["tokens"]
    rec_out["description_clean"] = d_proc["text"]
    return rec_out


def read_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
        if not isinstance(data, list):
            raise ValueError("Expected a JSON array.")
        return data


def write_json(path: Path, items):
    with path.open("w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)


if not INPUT_FILE.exists():
    raise FileNotFoundError(f"Input file not found: {INPUT_FILE}")

data = read_json(INPUT_FILE)
processed = [process_record(rec) for rec in data]
write_json(OUTPUT_FILE, processed)

print(f"Processed {len(processed)} records.")
print(f"Saved to {OUTPUT_FILE}")
print("Example output:\n")
print(json.dumps(processed[0], indent=2)[:800])


Using dataset: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset.json
Output will be saved to: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_preprocessed.json
Processed 28080 records.
Saved to /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_preprocessed.json
Example output:

{
  "_id": "fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a",
  "actual_price": "2,999",
  "average_rating": "3.9",
  "brand": "York",
  "category": "Clothing and Accessories",
  "crawled_at": 1612987911000,
  "description": "Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India",


### **Verification Code**

In [6]:
from pathlib import Path
import json, itertools

out_path = Path("../..") / "data" / "fashion_products_dataset_preprocessed.json"  # adjust if needed
data = json.loads(Path(out_path).read_text(encoding="utf-8"))

# 1) Confirm new keys exist
required_keys = {"title_tokens","title_clean","description_tokens","description_clean"}
missing = [i for i,r in enumerate(data[:200]) if not required_keys.issubset(r)]
print("Missing fields in first 200 records:", missing[:10], "(showing up to 10)")

# 2) Spot-check a few examples (before vs after)
for rec in itertools.islice((r for r in data if r.get("title")), 3):
    print("\nTITLE RAW: ", rec["title"])
    print("TITLE CLEAN:", rec["title_clean"])
    print("TITLE TOKENS:", rec["title_tokens"][:15])
    print("DESC CLEAN:", rec["description_clean"][:120], "...")


Missing fields in first 200 records: [] (showing up to 10)

TITLE RAW:  Solid Women Multicolor Track Pants
TITLE CLEAN: solid women multicolor track pant
TITLE TOKENS: ['solid', 'women', 'multicolor', 'track', 'pant']
DESC CLEAN: yorker trackpant made rich comb cotton give rich look design comfort skin friendli fabric itch free waistband great year ...

TITLE RAW:  Solid Men Blue Track Pants
TITLE CLEAN: solid men blue track pant
TITLE TOKENS: ['solid', 'men', 'blue', 'track', 'pant']
DESC CLEAN: yorker trackpant made rich comb cotton give rich look design comfort skin friendli fabric itch free waistband great year ...

TITLE RAW:  Solid Men Multicolor Track Pants
TITLE CLEAN: solid men multicolor track pant
TITLE TOKENS: ['solid', 'men', 'multicolor', 'track', 'pant']
DESC CLEAN: yorker trackpant made rich comb cotton give rich look design comfort skin friendli fabric itch free waistband great year ...


### **STEP 2 — Non-Text Fields Processing**

### **Main Code**

In [7]:
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from unidecode import unidecode

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

NOTEBOOK_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path().resolve()
REPO_ROOT = NOTEBOOK_DIR.parents[1] if NOTEBOOK_DIR.name == "part_1" else NOTEBOOK_DIR
DATA_DIR = REPO_ROOT / "data"

INPUT_FILE = DATA_DIR / "fashion_products_dataset_preprocessed.json"  # from Step 1
OUTPUT_FILE = DATA_DIR / "fashion_products_dataset_enriched.json"

print(f"Using preprocessed dataset: {INPUT_FILE}")
print(f"Enriched output will be saved to: {OUTPUT_FILE}")

def _ensure_nltk():
    try:
        stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")
    try:
        word_tokenize("test")
    except LookupError:
        nltk.download("punkt")

    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab")
        except Exception:
            pass
_ensure_nltk()

STOPWORDS = set(stopwords.words("english"))
STEMMER = PorterStemmer()

def normalize_basic(text: str) -> str:
    if not text:
        return ""
    txt = unidecode(text)
    txt = txt.lower()
    txt = re.sub(r"http[s]?://\S+", " ", txt)
    txt = re.sub(r"[-_/]", " ", txt)
    txt = re.sub(r"[$€£₹¥%]+", " ", txt)
    txt = re.sub(r"\d+", " ", txt)
    txt = re.sub(r"[^\w\s]", " ", txt)
    txt = re.sub(r"[^a-z\s]", " ", txt)
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

def preprocess_text_field(text: str) -> Dict[str, Any]:
    cleaned = normalize_basic(text)
    toks = word_tokenize(cleaned) if cleaned else []
    toks = [t for t in toks if t not in STOPWORDS]
    toks = [STEMMER.stem(t) for t in toks]
    return {"tokens": toks, "text": " ".join(toks)}

def _to_bool(val: Any) -> Optional[bool]:
    if isinstance(val, bool):
        return val
    if val is None:
        return None
    s = str(val).strip().lower()
    if s in {"true", "yes", "1"}:
        return True
    if s in {"false", "no", "0"}:
        return False
    return None

def _to_float_price(val: Any) -> Optional[float]:
    """
    Parses strings like '2,999' or '₹2,999.50' -> 2999.0 / 2999.5
    Returns None if cannot parse.
    """
    if val is None:
        return None
    s = str(val)
    s = s.replace(",", "")
    m = re.search(r"(\d+(\.\d+)?)", s)
    if not m:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        return None

def _to_int_discount_percent(val: Any) -> Optional[int]:
    """
    Parses '69% off' -> 69; '15%' -> 15
    """
    if val is None:
        return None
    m = re.search(r"(\d+)\s*%?", str(val))
    try:
        return int(m.group(1)) if m else None
    except Exception:
        return None

def _to_float_rating(val: Any) -> Optional[float]:
    if val is None:
        return None
    try:
        return float(str(val).strip())
    except ValueError:
        return None

def _norm_str(val: Any) -> Optional[str]:
    if val is None:
        return None
    s = unidecode(str(val)).strip().lower()
    return s if s else None

def flatten_product_details(pd: Any) -> Tuple[Dict[str, str], str]:
    """
    Converts product_details list of dicts into:
      - a single dict {key: value, ...} (lowercased)
      - a single text string "key: value; key2: value2"
    """
    out: Dict[str, str] = {}
    if isinstance(pd, list):
        for item in pd:
            if isinstance(item, dict):
                for k, v in item.items():
                    k_norm = _norm_str(k) or ""
                    v_norm = _norm_str(v) or ""
                    if k_norm:
                        out[k_norm] = v_norm
    elif isinstance(pd, dict):
        for k, v in pd.items():
            k_norm = _norm_str(k) or ""
            v_norm = _norm_str(v) or ""
            if k_norm:
                out[k_norm] = v_norm
    # Build a readable text
    pd_text = "; ".join([f"{k}: {v}" if v else f"{k}" for k, v in out.items()]).strip()
    return out, pd_text

def bucket_price(x: Optional[float]) -> Optional[str]:
    if x is None:
        return None
    if x < 1000: return "low"
    if x < 3000: return "mid"
    return "high"

def bucket_discount(p: Optional[int]) -> Optional[str]:
    if p is None: return None
    if p == 0: return "0"
    if p <= 20: return "1-20"
    if p <= 40: return "21-40"
    return "41+"

def bucket_rating(r: Optional[float]) -> Optional[str]:
    if r is None: return None
    if r < 2: return "0-2"
    if r < 3.5: return "2-3.5"
    if r < 4.5: return "3.5-4.5"
    return "4.5-5"

def enrich_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    out = dict(rec)  # keep everything for later output

    # Normalize categorical fields for faceting
    brand = _norm_str(rec.get("brand"))
    category = _norm_str(rec.get("category"))
    sub_category = _norm_str(rec.get("sub_category"))
    seller = _norm_str(rec.get("seller"))
    out["brand_norm"] = brand
    out["category_norm"] = category
    out["sub_category_norm"] = sub_category
    out["seller_norm"] = seller

    # product_details flattening
    pd_map, pd_text = flatten_product_details(rec.get("product_details"))
    out["product_details_map"] = pd_map
    out["product_details_text"] = pd_text

    # Build one merged "metadata text" field for recall
    parts = [
        brand or "",
        category or "",
        sub_category or "",
        seller or "",
        pd_text or "",
    ]
    metadata_text_raw = " | ".join([p for p in parts if p])
    meta_proc = preprocess_text_field(metadata_text_raw)
    out["metadata_tokens"] = meta_proc["tokens"]
    out["metadata_clean"] = meta_proc["text"]

    # Numeric/bool parsing (for filters/ranking features)
    out_of_stock = _to_bool(rec.get("out_of_stock"))
    actual_price_num = _to_float_price(rec.get("actual_price"))
    selling_price_num = _to_float_price(rec.get("selling_price"))
    discount_pct = _to_int_discount_percent(rec.get("discount"))
    average_rating_num = _to_float_rating(rec.get("average_rating"))

    out["out_of_stock_bool"] = out_of_stock
    out["actual_price_num"] = actual_price_num
    out["selling_price_num"] = selling_price_num
    out["discount_pct"] = discount_pct
    out["average_rating_num"] = average_rating_num

    # Buckets for UX/boosting/facets
    out["price_bucket"] = bucket_price(selling_price_num or actual_price_num)
    out["discount_bucket"] = bucket_discount(discount_pct)
    out["rating_bucket"] = bucket_rating(average_rating_num)

    return out

def read_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("Expected a JSON array.")
    return data

def write_json(path: Path, items):
    with path.open("w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)

if not INPUT_FILE.exists():
    raise FileNotFoundError(f"Input file not found: {INPUT_FILE}")

data = read_json(INPUT_FILE)
enriched = [enrich_record(rec) for rec in data]
write_json(OUTPUT_FILE, enriched)

print(f"Enriched {len(enriched)} records.")
print(f"Saved to {OUTPUT_FILE}")
print("Example (keys excerpt):", list(enriched[0].keys())[:20])

Using preprocessed dataset: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_preprocessed.json
Enriched output will be saved to: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_enriched.json
Enriched 28080 records.
Saved to /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_enriched.json
Example (keys excerpt): ['_id', 'actual_price', 'average_rating', 'brand', 'category', 'crawled_at', 'description', 'discount', 'images', 'out_of_stock', 'pid', 'product_details', 'seller', 'selling_price', 'sub_category', 'title', 'url', 'title_tokens', 'title_clean', 'description_tokens']


## **PART 2: EDA**

### **Main Code**

In [8]:
import json
import math
import os
from pathlib import Path
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud



OTEBOOK_DIR = Path().resolve()
REPO_ROOT = NOTEBOOK_DIR.parents[1]
DATA_DIR = REPO_ROOT / "data"
INPUT = DATA_DIR / "fashion_products_dataset_enriched.json"

OUT_DIR = DATA_DIR / "eda_outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Reading: {INPUT}")
print(f"Saving outputs to: {OUT_DIR}")



if not INPUT.exists():
    raise FileNotFoundError(f"Input not found: {INPUT}")

df = pd.read_json(INPUT)
n_docs = len(df)
print("Total products:", n_docs)

def to_num(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce")


summary_txt = OUT_DIR / "summary.txt"
with summary_txt.open("w", encoding="utf-8") as f:
    f.write(f"Total products: {n_docs}\n\n")
    f.write("Missing values (top 20 columns):\n")
    f.write(df.isna().sum().sort_values(ascending=False).head(20).to_string())
    f.write("\n\nColumns:\n")
    f.write(", ".join(df.columns))
print(f"Wrote summary: {summary_txt}")


# Text stats: word counts & vocab
def word_count_col(series: pd.Series) -> pd.Series:
    return series.fillna("").astype(str).str.split().apply(len)

df["title_word_count"] = word_count_col(df.get("title_clean", pd.Series()))
df["desc_word_count"] = word_count_col(df.get("description_clean", pd.Series()))

# Hist: title length
plt.figure()
df["title_word_count"].dropna().plot(kind="hist", bins=30)
plt.title("Title word count distribution")
plt.xlabel("Words in title")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig(OUT_DIR / "hist_title_word_count.png", dpi=150)
plt.close()

# Hist: description length
plt.figure()
df["desc_word_count"].dropna().plot(kind="hist", bins=30)
plt.title("Description word count distribution")
plt.xlabel("Words in description")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig(OUT_DIR / "hist_desc_word_count.png", dpi=150)
plt.close()

# Vocabulary size (title + description + metadata)
def concat_text(cols):
    cols = [c for c in cols if c in df.columns]
    if not cols:
        return ""
    return " ".join(df[c].dropna().astype(str) for c in cols)

# Prefer *_clean fields if present
text_sources = []
for col in ["title_clean", "description_clean", "metadata_clean"]:
    if col in df.columns:
        text_sources.append(col)

all_text = " ".join(df[col].dropna().astype(str).tolist()) if text_sources else ""
vocab_size = len(set(all_text.split())) if all_text else 0

with summary_txt.open("a", encoding="utf-8") as f:
    f.write(f"\nVocabulary size (title/desc/metadata): {vocab_size}\n")


# Word clouds
def make_wordcloud(text_series: pd.Series, out_name: str, title: str):
    text = " ".join(text_series.dropna().astype(str).tolist())
    if not text.strip():
        print(f"[WARN] No text for wordcloud: {out_name}")
        return
    wc = WordCloud(width=1200, height=600, background_color="white").generate(text)
    plt.figure(figsize=(10,5))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(OUT_DIR / out_name, dpi=150)
    plt.close()

if "title_clean" in df.columns:
    make_wordcloud(df["title_clean"], "wc_title.png", "Word Cloud — Titles")

if "description_clean" in df.columns:
    make_wordcloud(df["description_clean"], "wc_description.png", "Word Cloud — Descriptions")

if "metadata_clean" in df.columns:
    make_wordcloud(df["metadata_clean"], "wc_metadata.png", "Word Cloud — Metadata")


# Numeric distributions: rating, price, discount
if "average_rating_num" in df.columns:
    plt.figure()
    to_num(df["average_rating_num"]).dropna().plot(kind="hist", bins=25)
    plt.title("Distribution of average ratings")
    plt.xlabel("Rating")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "hist_ratings.png", dpi=150)
    plt.close()

if "selling_price_num" in df.columns:
    plt.figure()
    to_num(df["selling_price_num"]).dropna().plot(kind="hist", bins=40)
    plt.title("Distribution of selling prices")
    plt.xlabel("Price")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "hist_prices.png", dpi=150)
    plt.close()

if "discount_pct" in df.columns:
    plt.figure()
    to_num(df["discount_pct"]).dropna().plot(kind="hist", bins=40)
    plt.title("Distribution of discounts (%)")
    plt.xlabel("Discount %")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "hist_discounts.png", dpi=150)
    plt.close()


# Out-of-stock distribution
if "out_of_stock_bool" in df.columns:
    plt.figure()
    df["out_of_stock_bool"].value_counts(dropna=False).plot(kind="bar")
    plt.title("Out-of-stock distribution")
    plt.xlabel("Out of stock")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "bar_out_of_stock.png", dpi=150)
    plt.close()

# Top brands / sellers
def bar_top_counts(series: pd.Series, out_name: str, title: str, topn: int = 15):
    vc = series.dropna().astype(str).str.strip().replace({"": None}).dropna().value_counts().head(topn)
    if vc.empty:
        print(f"[WARN] No data for {title}")
        return
    plt.figure()
    vc[::-1].plot(kind="barh")
    plt.title(title)
    plt.xlabel("Count")
    plt.ylabel("Value")
    plt.tight_layout()
    plt.savefig(OUT_DIR / out_name, dpi=150)
    plt.close()

if "brand_norm" in df.columns:
    bar_top_counts(df["brand_norm"], "bar_top_brands.png", "Top brands (count)")

if "seller_norm" in df.columns:
    bar_top_counts(df["seller_norm"], "bar_top_sellers.png", "Top sellers (count)")


# Rankings: top rated / cheapest / top discounted
def save_table(df_sorted: pd.DataFrame, cols, out_csv):
    cols = [c for c in cols if c in df_sorted.columns]
    if not cols:
        return
    df_sorted[cols].to_csv(OUT_DIR / out_csv, index=False)

# Top rated
if "average_rating_num" in df.columns:
    top_rated = df.sort_values(["average_rating_num", "selling_price_num"], ascending=[False, True]).head(50)
    save_table(
        top_rated,
        ["pid", "title", "brand", "category", "sub_category", "average_rating_num", "selling_price_num", "discount_pct", "url"],
        "top_rated_products.csv"
    )

# Cheapest (by selling price if available, fallback to actual price)
price_col = "selling_price_num" if "selling_price_num" in df.columns else ("actual_price_num" if "actual_price_num" in df.columns else None)
if price_col:
    cheapest = df[df[price_col].notna()].sort_values(price_col, ascending=True).head(50)
    save_table(
        cheapest,
        ["pid", "title", "brand", "category", "sub_category", price_col, "average_rating_num", "discount_pct", "url"],
        "cheapest_products.csv"
    )

# Top discounted
if "discount_pct" in df.columns:
    top_disc = df[df["discount_pct"].notna()].sort_values("discount_pct", ascending=False).head(50)
    save_table(
        top_disc,
        ["pid", "title", "brand", "category", "sub_category", "discount_pct", "selling_price_num", "average_rating_num", "url"],
        "top_discounted_products.csv"
    )


# Optional: Named Entity Recognition (spaCy)
def try_ner_sample(df_in: pd.DataFrame, n_docs: int = 20):
    try:
        import spacy
        nlp = spacy.load("en_core_web_sm")
    except Exception as e:
        print("[INFO] spaCy NER skipped (model not installed). To enable: pip install spacy && python -m spacy download en_core_web_sm")
        return

    sample = df_in["description"].dropna().astype(str).sample(min(n_docs, len(df_in)), random_state=42)
    ents_rows = []
    for text in sample:
        doc = nlp(text[:2000])  # limit very long descriptions
        for ent in doc.ents:
            ents_rows.append({"text": text[:80] + ("..." if len(text) > 80 else ""),
                              "entity": ent.text, "label": ent.label_})
    if ents_rows:
        ner_df = pd.DataFrame(ents_rows)
        ner_csv = OUT_DIR / "ner_sample_entities.csv"
        ner_df.to_csv(ner_csv, index=False)
        print(f"Saved NER sample entities: {ner_csv}")

try_ner_sample(df, n_docs=20)


# Print quick KPI summary
def safe_median(series):
    s = pd.to_numeric(series, errors="coerce").dropna()
    return float(s.median()) if not s.empty else None

kpis = {
    "n_products": n_docs,
    "vocab_size_title_desc_meta": vocab_size,
    "median_title_len": float(df["title_word_count"].median()) if "title_word_count" in df else None,
    "median_desc_len": float(df["desc_word_count"].median()) if "desc_word_count" in df else None,
    "median_price": safe_median(df.get("selling_price_num", pd.Series())),
    "median_rating": safe_median(df.get("average_rating_num", pd.Series())),
    "median_discount_pct": safe_median(df.get("discount_pct", pd.Series())),
    "out_of_stock_ratio": float(df["out_of_stock_bool"].mean()) if "out_of_stock_bool" in df and df["out_of_stock_bool"].notna().any() else None,
}

kpi_lines = "\n".join([f"{k}: {v}" for k, v in kpis.items()])
print("\n=== KPI SUMMARY ===")
print(kpi_lines)
with (OUT_DIR / "kpi_summary.txt").open("w", encoding="utf-8") as f:
    f.write(kpi_lines)

print("\nDone. Check the 'outputs' folder for figures, CSVs, and summaries.")

Reading: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_enriched.json
Saving outputs to: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/eda_outputs
Total products: 28080
Wrote summary: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/eda_outputs/summary.txt
[INFO] spaCy NER skipped (model not installed). To enable: pip install spacy && python -m spacy download en_core_web_sm

=== KPI SUMMARY ===
n_products: 28080
vocab_size_title_desc_meta: 6650
median_title_len: 6.0
median_desc_len: 9.0
median_price: 545.0
median_rating: 3.8
median_discount_pct: 53.0
out_of_stock_ratio: 0.05854700854700855

Done. Check the 'outputs' folder for figures, CSVs, and summaries.
