<div align="center">

# DELIVERY 1  
## **TEXT PROCESSING & EXPLORATORY DATA ANALYSIS (EDA)**

</div>

---

## PART 1: **DATA PREPARATION**

### STEP 1 — **Preprocessing Pipeline (NLTK)**

### **Main Code**

In [5]:
import json
import re
from pathlib import Path
from typing import List, Dict, Any
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from unidecode import unidecode

# Working directory: the folder holding this file when run as a script,
# otherwise (no __file__, e.g. inside a notebook kernel) the current directory.
if "__file__" in globals():
    NOTEBOOK_DIR = Path(__file__).resolve().parent
else:
    NOTEBOOK_DIR = Path().resolve()

# When executed from a folder named "part_1", the repository root is taken to
# be two levels up; in any other location the working directory itself is used.
if NOTEBOOK_DIR.name == "part_1":
    REPO_ROOT = NOTEBOOK_DIR.parents[1]
else:
    REPO_ROOT = NOTEBOOK_DIR

DATA_DIR = REPO_ROOT.joinpath("data")
INPUT_FILE = DATA_DIR.joinpath("fashion_products_dataset.json")
OUTPUT_FILE = DATA_DIR.joinpath("fashion_products_dataset_preprocessed.json")

print(f"Using dataset: {INPUT_FILE}")
print(f"Output will be saved to: {OUTPUT_FILE}")

def _ensure_nltk():
    """
    Make sure the NLTK resources used by this pipeline are available.

    Each resource is probed first; a LookupError from the probe triggers a
    download of the corresponding package ("stopwords", "punkt", "punkt_tab").
    """
    try:
        stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")
    try:
        word_tokenize("test")
    except LookupError:
        nltk.download("punkt")

    # Newer NLTK releases load the tokenizer tables from "punkt_tab" instead of
    # "punkt", so downloading "punkt" alone does not make word_tokenize usable.
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab")
        except Exception as exc:
            # Best effort: report the problem instead of aborting the setup.
            print(f"Warning: could not download NLTK resource 'punkt_tab': {exc}")
_ensure_nltk()



# English stopwords held in a set so the membership test in remove_stopwords()
# is a hash lookup rather than a list scan.
STOPWORDS = set(stopwords.words("english"))
# One Porter stemmer instance, shared by every call to stem().
STEMMER = PorterStemmer()


def normalize_basic(text: str) -> str:
    """
    Basic normalization of one raw text field.

    Steps, in order:
      - Remove currency symbols and percent signs
      - Remove accents with unidecode
      - Lowercase
      - Remove URLs
      - Replace dashes/underscores/slashes with space
      - Remove numbers
      - Remove punctuation marks and anything else outside a-z
      - Collapse extra whitespace

    Returns "" when ``text`` is empty or None.
    """
    if not text:
        return ""

    # Currency symbols must go BEFORE unidecode: it transliterates non-ASCII
    # symbols into ASCII letters (e.g. "€" becomes "EUR"), which would then
    # survive every later step and end up as spurious tokens.
    txt = re.sub(r"[$€£₹¥%]+", " ", text)
    txt = unidecode(txt)
    txt = txt.lower()
    txt = re.sub(r"http[s]?://\S+", " ", txt)
    txt = re.sub(r"[-_/]", " ", txt)
    txt = re.sub(r"\d+", " ", txt)
    txt = re.sub(r"[^\w\s]", " ", txt)
    txt = re.sub(r"[^a-z\s]", " ", txt)
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt


def tokenize(text: str) -> List[str]:
    """Split ``text`` into tokens with NLTK's word_tokenize; empty input gives []."""
    if not text:
        return []
    return word_tokenize(text)


def remove_stopwords(tokens: List[str]) -> List[str]:
    """Return the tokens that are not in STOPWORDS, keeping their order."""
    kept = []
    for token in tokens:
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept


def stem(tokens: List[str]) -> List[str]:
    """Return the stem of every token, in order, using the shared STEMMER."""
    stemmed = []
    for token in tokens:
        stemmed.append(STEMMER.stem(token))
    return stemmed


def preprocess_text_field(text: str) -> Dict[str, Any]:
    """
    Run one text field through normalize -> tokenize -> stopword removal -> stemming.

    Returns a dict with the stemmed tokens under "tokens" and the same tokens
    joined by single spaces under "text".
    """
    stemmed = stem(remove_stopwords(tokenize(normalize_basic(text))))
    return {"tokens": stemmed, "text": " ".join(stemmed)}


def process_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a shallow copy of ``rec`` extended with preprocessed text fields.

    Adds "title_tokens"/"title_clean" and "description_tokens"/
    "description_clean"; the input record itself is left untouched.
    """
    title_result = preprocess_text_field(rec.get("title", ""))
    description_result = preprocess_text_field(rec.get("description", ""))

    rec_out = dict(rec)
    rec_out.update(
        {
            "title_tokens": title_result["tokens"],
            "title_clean": title_result["text"],
            "description_tokens": description_result["tokens"],
            "description_clean": description_result["text"],
        }
    )
    return rec_out


def read_json(path: Path):
    """
    Load ``path`` as UTF-8 encoded JSON and return the parsed list.

    Raises:
        ValueError: if the top-level JSON value is not an array.
    """
    with path.open("r", encoding="utf-8") as handle:
        payload = json.load(handle)
        if isinstance(payload, list):
            return payload
        raise ValueError("Expected a JSON array.")


def write_json(path: Path, items):
    """Serialize ``items`` to ``path`` as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
    with path.open(mode="w", encoding="utf-8") as sink:
        json.dump(obj=items, fp=sink, indent=2, ensure_ascii=False)


# Fail early, with the resolved path in the message, if the raw dataset is missing.
if not INPUT_FILE.exists():
    raise FileNotFoundError(f"Input file not found: {INPUT_FILE}")

# Load -> preprocess every record -> persist the result for the next step.
data = read_json(INPUT_FILE)
processed = [process_record(rec) for rec in data]
write_json(OUTPUT_FILE, processed)

print(f"Processed {len(processed)} records.")
print(f"Saved to {OUTPUT_FILE}")
# An empty JSON array is valid input; only preview a record when one exists
# (indexing processed[0] on an empty list would raise IndexError).
if processed:
    print("Example output:\n")
    print(json.dumps(processed[0], indent=2)[:800])


Using dataset: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset.json
Output will be saved to: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_preprocessed.json
Processed 28080 records.
Saved to /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_preprocessed.json
Example output:

{
  "_id": "fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a",
  "actual_price": "2,999",
  "average_rating": "3.9",
  "brand": "York",
  "category": "Clothing and Accessories",
  "crawled_at": 1612987911000,
  "description": "Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India",


### **Verification Code**

In [6]:
from pathlib import Path
import json, itertools

# Location of the Step 1 output, relative to the current working directory.
out_path = Path("../..", "data", "fashion_products_dataset_preprocessed.json")  # adjust if needed
data = json.loads(out_path.read_text(encoding="utf-8"))

# 1) Confirm new keys exist (only the first 200 records are inspected)
required_keys = {
    "title_tokens",
    "title_clean",
    "description_tokens",
    "description_clean",
}
missing = []
for position, record in enumerate(data[:200]):
    if not required_keys.issubset(record):
        missing.append(position)
print("Missing fields in first 200 records:", missing[:10], "(showing up to 10)")

# 2) Spot-check a few examples (before vs after): the first three records
#    that have a non-empty title
with_title = (record for record in data if record.get("title"))
for rec in itertools.islice(with_title, 3):
    print("\nTITLE RAW: ", rec["title"])
    print("TITLE CLEAN:", rec["title_clean"])
    print("TITLE TOKENS:", rec["title_tokens"][:15])
    print("DESC CLEAN:", rec["description_clean"][:120], "...")


Missing fields in first 200 records: [] (showing up to 10)

TITLE RAW:  Solid Women Multicolor Track Pants
TITLE CLEAN: solid women multicolor track pant
TITLE TOKENS: ['solid', 'women', 'multicolor', 'track', 'pant']
DESC CLEAN: yorker trackpant made rich comb cotton give rich look design comfort skin friendli fabric itch free waistband great year ...

TITLE RAW:  Solid Men Blue Track Pants
TITLE CLEAN: solid men blue track pant
TITLE TOKENS: ['solid', 'men', 'blue', 'track', 'pant']
DESC CLEAN: yorker trackpant made rich comb cotton give rich look design comfort skin friendli fabric itch free waistband great year ...

TITLE RAW:  Solid Men Multicolor Track Pants
TITLE CLEAN: solid men multicolor track pant
TITLE TOKENS: ['solid', 'men', 'multicolor', 'track', 'pant']
DESC CLEAN: yorker trackpant made rich comb cotton give rich look design comfort skin friendli fabric itch free waistband great year ...


### STEP 2 — **Non-Text Fields Processing**

In [None]:
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from unidecode import unidecode

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Working directory: the folder holding this file when run as a script,
# otherwise (no __file__, e.g. inside a notebook kernel) the current directory.
if "__file__" in globals():
    NOTEBOOK_DIR = Path(__file__).resolve().parent
else:
    NOTEBOOK_DIR = Path().resolve()

# When executed from a folder named "part_1", the repository root is taken to
# be two levels up; in any other location the working directory itself is used.
if NOTEBOOK_DIR.name == "part_1":
    REPO_ROOT = NOTEBOOK_DIR.parents[1]
else:
    REPO_ROOT = NOTEBOOK_DIR

DATA_DIR = REPO_ROOT.joinpath("data")
INPUT_FILE = DATA_DIR.joinpath("fashion_products_dataset_preprocessed.json")  # from Step 1
OUTPUT_FILE = DATA_DIR.joinpath("fashion_products_dataset_enriched.json")

print(f"Using preprocessed dataset: {INPUT_FILE}")
print(f"Enriched output will be saved to: {OUTPUT_FILE}")

def _ensure_nltk():
    """
    Make sure the NLTK resources used by this step are available.

    Each resource is probed first; a LookupError from the probe triggers a
    download of the corresponding package ("stopwords", "punkt", "punkt_tab").
    The "punkt_tab" download is best effort: a failure is reported but does
    not abort the setup.
    """
    try:
        stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")
    try:
        word_tokenize("test")
    except LookupError:
        nltk.download("punkt")

    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab")
        except Exception as exc:
            # Stay best-effort, but say so: silently ignoring the failure makes
            # a later tokenizer LookupError much harder to trace back.
            print(f"Warning: could not download NLTK resource 'punkt_tab': {exc}")
_ensure_nltk()

# English stopwords held in a set so the membership test in
# preprocess_text_field() is a hash lookup rather than a list scan.
STOPWORDS = set(stopwords.words("english"))
# One Porter stemmer instance, shared by every call to preprocess_text_field().
STEMMER = PorterStemmer()

def normalize_basic(text: str) -> str:
    """
    Basic normalization of one raw text field.

    Steps, in order: remove currency symbols and percent signs, remove accents
    with unidecode, lowercase, remove URLs, turn dashes/underscores/slashes
    into spaces, remove numbers, remove punctuation and anything else outside
    a-z, collapse extra whitespace.

    Returns "" when ``text`` is empty or None.
    """
    if not text:
        return ""
    # Currency symbols must go BEFORE unidecode: it transliterates non-ASCII
    # symbols into ASCII letters (e.g. "€" becomes "EUR"), which would then
    # survive every later step and end up as spurious tokens.
    txt = re.sub(r"[$€£₹¥%]+", " ", text)
    txt = unidecode(txt)
    txt = txt.lower()
    txt = re.sub(r"http[s]?://\S+", " ", txt)
    txt = re.sub(r"[-_/]", " ", txt)
    txt = re.sub(r"\d+", " ", txt)
    txt = re.sub(r"[^\w\s]", " ", txt)
    txt = re.sub(r"[^a-z\s]", " ", txt)
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

def preprocess_text_field(text: str) -> Dict[str, Any]:
    """
    Normalize, tokenize, drop stopwords and stem one text field.

    Returns a dict with the stemmed tokens under "tokens" and the same tokens
    joined by single spaces under "text".
    """
    cleaned = normalize_basic(text)
    if not cleaned:
        return {"tokens": [], "text": ""}
    stems = [
        STEMMER.stem(token)
        for token in word_tokenize(cleaned)
        if token not in STOPWORDS
    ]
    return {"tokens": stems, "text": " ".join(stems)}

def _to_bool(val: Any) -> Optional[bool]:
    """
    Interpret ``val`` as a boolean flag.

    Booleans and None are returned unchanged. Anything else is converted to a
    stripped, lowercased string: "true"/"yes"/"1" give True, "false"/"no"/"0"
    give False, and every other value gives None.
    """
    if val is None or isinstance(val, bool):
        return val
    lookup = {
        "true": True,
        "yes": True,
        "1": True,
        "false": False,
        "no": False,
        "0": False,
    }
    return lookup.get(str(val).strip().lower())

def _to_float_price(val: Any) -> Optional[float]:
    """
    Extract a price from values such as '2,999' or '₹2,999.50'.

    Commas are dropped, then the first run of digits (with an optional
    decimal part) is converted: '2,999' -> 2999.0, '₹2,999.50' -> 2999.5.
    Returns None for None input or when nothing can be parsed.
    """
    if val is None:
        return None
    without_commas = str(val).replace(",", "")
    number = re.search(r"(\d+(\.\d+)?)", without_commas)
    if number is None:
        return None
    try:
        price = float(number.group(1))
    except ValueError:
        return None
    return price

def _to_int_discount_percent(val: Any) -> Optional[int]:
    """
    Parse a discount label into an integer percentage.

    Examples: '69% off' -> 69; '15%' -> 15.

    A number directly followed by '%' is preferred, so unrelated digits
    earlier in the label are not mistaken for the discount
    ('Buy 2, get 15% off' -> 15). Labels without a percent sign fall back to
    the first run of digits. Returns None for None input or when the label
    contains no digits.
    """
    if val is None:
        return None
    label = str(val)
    # First choice: digits attached to a percent sign; fallback: any digits.
    match = re.search(r"(\d+)\s*%", label) or re.search(r"(\d+)", label)
    return int(match.group(1)) if match else None

def _to_float_rating(val: Any) -> Optional[float]:
    """
    Parse a rating such as '3.9' into a float.

    Returns None for None input, for text that is not a number, and for the
    non-finite values float() also accepts ('nan', 'inf', 'infinity').
    """
    import math  # local import so this helper does not depend on the file-level import list

    if val is None:
        return None
    try:
        rating = float(str(val).strip())
    except ValueError:
        return None
    # A NaN rating fails every "<" comparison, so threshold-based bucketing
    # would put it in the top bucket, and json.dump would emit it as the
    # non-standard token NaN. Treat non-finite values as missing instead.
    return rating if math.isfinite(rating) else None

def _norm_str(val: Any) -> Optional[str]:
    """
    Normalize a value into a transliterated, trimmed, lowercase string.

    Returns None when ``val`` is None or the normalized string is empty.
    """
    if val is None:
        return None
    return unidecode(str(val)).strip().lower() or None

def flatten_product_details(pd: Any) -> Tuple[Dict[str, str], str]:
    """
    Flatten ``product_details`` into a lookup dict plus a readable string.

    Accepts either a list of dicts (non-dict items are skipped) or a single
    dict; any other input yields an empty result. Keys and values are
    normalized with _norm_str, entries whose normalized key is empty are
    dropped, and a later duplicate key overwrites an earlier one.

    Returns:
        (mapping, text) where ``text`` looks like "key: value; key2: value2"
        and a key with an empty value is rendered as the bare key.
    """
    # Reduce both accepted shapes to one sequence of dicts.
    if isinstance(pd, list):
        entries = [item for item in pd if isinstance(item, dict)]
    elif isinstance(pd, dict):
        entries = [pd]
    else:
        entries = []

    out: Dict[str, str] = {}
    for entry in entries:
        for raw_key, raw_value in entry.items():
            key = _norm_str(raw_key) or ""
            value = _norm_str(raw_value) or ""
            if key:
                out[key] = value

    # Build a readable text
    pieces = []
    for key, value in out.items():
        pieces.append(f"{key}: {value}" if value else f"{key}")
    return out, "; ".join(pieces).strip()

def bucket_price(x: Optional[float]) -> Optional[str]:
    """Map a price to "low" (< 1000), "mid" (< 3000) or "high"; None stays None."""
    if x is None:
        return None
    for upper_bound, label in ((1000, "low"), (3000, "mid")):
        if x < upper_bound:
            return label
    return "high"

def bucket_discount(p: Optional[int]) -> Optional[str]:
    """Map a discount percentage to "0", "1-20", "21-40" or "41+"; None stays None."""
    if p is None:
        return None
    if p == 0:
        label = "0"
    elif p <= 20:
        label = "1-20"
    elif p <= 40:
        label = "21-40"
    else:
        label = "41+"
    return label

def bucket_rating(r: Optional[float]) -> Optional[str]:
    """
    Map a rating to "0-2", "2-3.5", "3.5-4.5" or "4.5-5"; None stays None.

    Each upper bound is exclusive, so 2 falls in "2-3.5" and 4.5 in "4.5-5".
    """
    if r is None:
        return None
    thresholds = ((2, "0-2"), (3.5, "2-3.5"), (4.5, "3.5-4.5"))
    for upper_bound, label in thresholds:
        if r < upper_bound:
            return label
    return "4.5-5"

def enrich_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a copy of ``rec`` enriched with derived, search-oriented fields.

    Added keys, in this order: normalized facets (brand/category/sub_category/
    seller), the flattened product details (map and text), a preprocessed
    metadata field (tokens and text), parsed boolean/numeric values, and the
    price/discount/rating bucket labels. The input record is not modified.
    """
    out = dict(rec)  # every original field is carried over for later output

    # Normalized categorical fields, used for faceting.
    brand, category, sub_category, seller = (
        _norm_str(rec.get(field))
        for field in ("brand", "category", "sub_category", "seller")
    )
    out.update(
        {
            "brand_norm": brand,
            "category_norm": category,
            "sub_category_norm": sub_category,
            "seller_norm": seller,
        }
    )

    # Flattened product_details.
    details_map, details_text = flatten_product_details(rec.get("product_details"))
    out.update(
        {
            "product_details_map": details_map,
            "product_details_text": details_text,
        }
    )

    # One merged metadata text (empty parts skipped), run through the text pipeline.
    metadata_text_raw = " | ".join(
        piece
        for piece in (brand, category, sub_category, seller, details_text)
        if piece
    )
    metadata = preprocess_text_field(metadata_text_raw)
    out.update(
        {
            "metadata_tokens": metadata["tokens"],
            "metadata_clean": metadata["text"],
        }
    )

    # Typed values for filters and ranking features.
    actual_price = _to_float_price(rec.get("actual_price"))
    selling_price = _to_float_price(rec.get("selling_price"))
    discount = _to_int_discount_percent(rec.get("discount"))
    rating = _to_float_rating(rec.get("average_rating"))
    out.update(
        {
            "out_of_stock_bool": _to_bool(rec.get("out_of_stock")),
            "actual_price_num": actual_price,
            "selling_price_num": selling_price,
            "discount_pct": discount,
            "average_rating_num": rating,
        }
    )

    # Bucket labels; the price bucket uses the selling price and falls back to
    # the actual price when the selling price is missing or zero.
    out.update(
        {
            "price_bucket": bucket_price(selling_price or actual_price),
            "discount_bucket": bucket_discount(discount),
            "rating_bucket": bucket_rating(rating),
        }
    )

    return out

def read_json(path: Path):
    """
    Load ``path`` as UTF-8 encoded JSON and return the parsed list.

    Raises:
        ValueError: if the top-level JSON value is not an array.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, list):
        return payload
    raise ValueError("Expected a JSON array.")

def write_json(path: Path, items):
    """Serialize ``items`` to ``path`` as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
    with path.open(mode="w", encoding="utf-8") as sink:
        json.dump(obj=items, fp=sink, indent=2, ensure_ascii=False)

# Fail early, with the resolved path in the message, if the Step 1 output is missing.
if not INPUT_FILE.exists():
    raise FileNotFoundError(f"Input file not found: {INPUT_FILE}")

# Load -> enrich every record -> persist the result.
data = read_json(INPUT_FILE)
enriched = [enrich_record(rec) for rec in data]
write_json(OUTPUT_FILE, enriched)

print(f"Enriched {len(enriched)} records.")
print(f"Saved to {OUTPUT_FILE}")
# An empty JSON array is valid input; only preview a record when one exists
# (indexing enriched[0] on an empty list would raise IndexError).
if enriched:
    print("Example (keys excerpt):", list(enriched[0].keys())[:20])

Using preprocessed dataset: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_preprocessed.json
Enriched output will be saved to: /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_enriched.json
Enriched 28080 records.
Saved to /Users/pauchaves/Documents/Mathematical Engineering in Data Science/4th Year/1st Trimester/WEB RETRIEVAL/irwa-search-engine/data/fashion_products_dataset_enriched.json
Example (keys excerpt): ['_id', 'actual_price', 'average_rating', 'brand', 'category', 'crawled_at', 'description', 'discount', 'images', 'out_of_stock', 'pid', 'product_details', 'seller', 'selling_price', 'sub_category', 'title', 'url', 'title_tokens', 'title_clean', 'description_tokens']
