In [54]:
import pandas as pd
from fuzzywuzzy import process, fuzz
import json
import uuid
import logging
import time

# Configure logging
# Root logger reports INFO and above; each line is prefixed with a
# "YYYY-MM-DD HH:MM:SS [LEVEL]" stamp.
_LOG_OPTIONS = dict(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logging.basicConfig(**_LOG_OPTIONS)

# Wall-clock reference for the total-runtime figure logged at the end of the run.
start_time = time.time()
logging.info("Starting product normalization process...")

# Load JSON files for mapping
brands = pd.read_json("elaf_brands.json")
categories = pd.read_json("elaf_categories.json")


def _build_lookup(frame):
    """Map each normalized ``original_value`` in *frame* to its ``id``.

    Labels are normalized with ``str(...).strip().lower()``, the same
    normalization the lookup functions apply to incoming values.  Rows whose
    label is null or blank are skipped: stringifying them would create
    ``"nan"`` / ``"none"`` / ``""`` keys that product text could then match
    by accident.  When two rows normalize to the same key, the later row
    wins.
    """
    lookup = {}
    for item_id, label in zip(frame["id"], frame["original_value"]):
        # is_scalar guards pd.isna against list/dict cells, for which it
        # would return an array instead of a single bool.
        if label is None or (pd.api.types.is_scalar(label) and pd.isna(label)):
            continue
        key = str(label).strip().lower()
        if key:
            lookup[key] = item_id
    return lookup


# Preprocess brand and category mapping
brand_map = _build_lookup(brands)
cat_map = _build_lookup(categories)

def getBrandId(originalValue, threshold=80):
    """Resolve a raw brand label to its id in ``brand_map``.

    The value is stringified, stripped and lower-cased, then looked up
    exactly.  If there is no exact hit, the closest key of ``brand_map``
    according to ``fuzz.token_sort_ratio`` is used, provided its score
    reaches *threshold*.

    Args:
        originalValue: Brand cell value; may be a string, a number, ``None``
            or NaN.
        threshold: Minimum fuzzy score (0-100) required to accept a
            non-exact match.

    Returns:
        The matching brand id, or ``None`` when the value is missing/blank
        or nothing matches well enough.
    """
    # NaN is truthy, so it must be rejected explicitly; otherwise it would be
    # matched as the text "nan".
    if originalValue is None or pd.isna(originalValue) or not originalValue:
        return None

    # str() keeps numeric brand cells from crashing on .strip().
    val = str(originalValue).strip().lower()
    if not val:
        return None
    if val in brand_map:
        return brand_map[val]

    # extractOne returns None (not a tuple) when there are no choices.
    best = process.extractOne(val, brand_map.keys(), scorer=fuzz.token_sort_ratio)
    if best is None:
        return None
    match, score = best
    if score >= threshold:
        return brand_map[match]
    return None

def getCategoryId(row, threshold=80):
    """Resolve the category id for a product row via ``cat_map``.

    The candidate category columns are tried in the order listed below and
    the first column that yields a match decides the result.  For each
    non-null cell an exact lookup of the stripped, lower-cased text is tried
    first; failing that, the text is fuzzy-matched (``fuzz.token_sort_ratio``)
    against the category keys with commas and hyphens replaced by spaces.

    Args:
        row: Mapping-like row (e.g. a pandas Series) keyed by column name.
        threshold: Minimum fuzzy score (0-100) required to accept a
            non-exact match.

    Returns:
        The matching category id, or ``None`` if no column produced a match.
    """
    # NOTE(review): the names read as third/second/first sub-category and
    # category levels, i.e. presumably most specific first -- confirm.
    category_columns = [ "التصنيف الفرعي الثالث", "التصنيف الفرعي الثاني", "الصنف الثاني", "التصنيف الفرعي الأول", "الصنف الاول"  ]

    def _simplify(text):
        # Treat commas and hyphens as plain word separators.
        return text.replace(",", " ").replace("-", " ")

    # cleaned key -> category id; built lazily (only when an exact lookup
    # misses) and at most once per call instead of once per column.
    cleaned_lookup = None

    for col in category_columns:
        if col not in row or pd.isna(row[col]):
            continue
        val = str(row[col]).strip().lower()
        if val in cat_map:
            return cat_map[val]

        if cleaned_lookup is None:
            cleaned_lookup = {}
            for key, cid in cat_map.items():
                # setdefault: if several keys clean to the same text, the
                # first one wins, as with a first-match scan over the keys.
                cleaned_lookup.setdefault(_simplify(key), cid)

        # Pass a list, not the dict: fuzzywuzzy treats dict-like choices
        # differently (it scores the values and returns 3-tuples).
        best = process.extractOne(
            _simplify(val), list(cleaned_lookup), scorer=fuzz.token_sort_ratio
        )
        if best is None:
            # No categories to compare against; no column can match.
            return None
        match, score = best
        if score >= threshold:
            return cleaned_lookup[match]
    return None

def clean_sheet(df):
    """Normalize a raw worksheet DataFrame.

    Steps:
      1. Drop columns that contain only missing values.
      2. If every remaining column label contains "Unnamed", or every label
         stringifies to "nan", promote the first data row to be the header
         and remove that row from the data.
      3. Convert all column labels to ``str``.
      4. Drop columns whose label starts with "Unnamed".

    Column labels are stringified before being inspected, so sheets with
    numeric or date headers are handled, and an empty sheet is returned
    empty instead of raising.

    Args:
        df: DataFrame as read from the workbook.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    df = df.dropna(axis=1, how="all")

    # Work on plain strings: the .str accessor raises on non-string labels.
    labels = [str(label) for label in df.columns]
    header_missing = bool(labels) and (
        all("Unnamed" in label for label in labels)
        or all(label == "nan" for label in labels)
    )
    # An empty frame has no first row to promote.
    if header_missing and not df.empty:
        df.columns = df.iloc[0]
        df = df.iloc[1:]

    df.columns = df.columns.astype(str)
    # Select by position so duplicate labels are handled unambiguously.
    keep = [
        pos
        for pos, label in enumerate(df.columns)
        if not label.startswith("Unnamed")
    ]
    return df.iloc[:, keep]

# Load Excel
# sheet_name=None reads every worksheet into a {sheet name: DataFrame} dict.
df_sheets = pd.read_excel("ElafProducts.xlsx", sheet_name=None)

# Sheets excluded from processing; the remaining names keep workbook order.
_skipped_sheets = ("Countries Code", "طعام طازجold")
mainCats = [name for name in df_sheets if name not in _skipped_sheets]

# Candidate column names for each product field.  Sheets spell the headers
# differently, so for every field the first listed column that is present
# and non-null in a row supplies the value.
BRAND_COLUMNS = ("العلامه التجاريه", "العلامة التجارية")
NAME_AR_COLUMNS = ("اسم المنتج بالعربي", "اسم المنتج (عربي)")
NAME_EN_COLUMNS = ("اسم المنتج بالنجليزي", "اسم المنتج (إنجليزي)")
QTY_COLUMNS = ("العدد", "العدد بدون (pcs)")
UOM_COLUMNS = ("وحدة القياس",)
SIZE_COLUMNS = ("السعة او الحجم", "السعه")
PRICE_COLUMNS = ("السعر بالريال السعودي", "السعر  بالريال السعودي")


def _first_present(row, columns):
    """Return the first non-null value of *row* among *columns*, else None."""
    for col in columns:
        if col in row and pd.notna(row[col]):
            return row[col]
    return None


all_products = []

for cat in mainCats:
    sheet_start = time.time()
    logging.info(f"Processing sheet: {cat} ...")
    df = clean_sheet(df_sheets[cat])
    for _, row in df.iterrows():
        # Check the names first: rows without both names are skipped, so
        # there is no point paying for the fuzzy brand/category lookups.
        name_ar = _first_present(row, NAME_AR_COLUMNS)
        name_en = _first_present(row, NAME_EN_COLUMNS)
        if not name_ar or not name_en:
            continue  # Skip if either name is missing

        # Key order is kept stable because it determines the field order in
        # the exported JSON.
        product = {
            "id": str(uuid.uuid4()),
            "categoryId": getCategoryId(row),
            "brandId": getBrandId(_first_present(row, BRAND_COLUMNS)),
            "name": {"ar": name_ar, "en": name_en},
            "qty": _first_present(row, QTY_COLUMNS),
            "uom": _first_present(row, UOM_COLUMNS),
            "size": _first_present(row, SIZE_COLUMNS),
            "price": _first_present(row, PRICE_COLUMNS),
        }
        all_products.append(product)

    sheet_end = time.time()
    logging.info(f"Finished sheet: {cat} in {sheet_end - sheet_start:.2f} seconds")

# Export JSON
# default=str stringifies values the json module cannot encode natively
# (e.g. datetimes); ensure_ascii=False keeps non-ASCII text readable.
_dump_options = {"ensure_ascii": False, "indent": 4, "default": str}
with open("elaf_products.json", mode="w", encoding="utf-8") as out_file:
    json.dump(all_products, out_file, **_dump_options)

# Summary: number of exported products and elapsed wall-clock time.
total_time = time.time() - start_time
logging.info(f"Total products processed: {len(all_products)}")
logging.info(f"Total time: {total_time:.2f} seconds")


2025-12-09 00:21:38 [INFO] Starting product normalization process...
2025-12-09 00:21:42 [INFO] Processing sheet:  طعام طازج ...
2025-12-09 00:21:42 [INFO] Finished sheet:  طعام طازج in 0.19 seconds
2025-12-09 00:21:42 [INFO] Processing sheet: أكسسوارات السيارات ...
2025-12-09 00:21:43 [INFO] Finished sheet: أكسسوارات السيارات in 0.46 seconds
2025-12-09 00:21:43 [INFO] Processing sheet: خزانة الطعام ...
2025-12-09 00:22:59 [INFO] Finished sheet: خزانة الطعام in 76.06 seconds
2025-12-09 00:22:59 [INFO] Processing sheet: المشروبات ...
2025-12-09 00:23:19 [INFO] Finished sheet: المشروبات in 20.03 seconds
2025-12-09 00:23:19 [INFO] Processing sheet: خضار و فواكه ...
2025-12-09 00:23:22 [INFO] Finished sheet: خضار و فواكه in 3.55 seconds
2025-12-09 00:23:22 [INFO] Processing sheet: العاب و مستلزمات النشاطات الخار ...
2025-12-09 00:23:23 [INFO] Finished sheet: العاب و مستلزمات النشاطات الخار in 0.81 seconds
2025-12-09 00:23:23 [INFO] Processing sheet: أطعمة ومستلزمات الحيوانات الألي ...
2025