# Jalalsons Scraper
This notebook scrapes product data from Jalalsons Pakistan using Playwright with multi-branch support.

### Instructions:
1. Run the **Setup Cell** to install dependencies and Playwright browser.
2. Run the **Base Infrastructure** cell and then the **Jalalsons Scraper Logic** cell to define the classes (the scraper class depends on the base class).
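3. Run the **Execution Cell** to start scraping. (Note: Scraping all branches can be slow, so by default the scraper only processes the first matching branch and its first five categories as a demo; you can specify a target branch in the execution cell.)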

In [None]:
# @title Setup Cell
# Install the Python packages used by this notebook and the Chromium build
# that Playwright drives. Lines starting with "!" run as shell commands.
!pip install playwright nest_asyncio pandas
!playwright install chromium
import os
# Create the default output folder; exist_ok avoids an error on re-runs.
os.makedirs('data', exist_ok=True)
import nest_asyncio
# Patch asyncio so an already-running event loop can be re-entered
# (presumably needed because the notebook kernel runs its own loop).
nest_asyncio.apply()

In [None]:
# @title Base Infrastructure
import csv
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
import time
import random

class BaseScraper(ABC):
    """Common plumbing for store scrapers: logging, text parsing and CSV export.

    Subclasses must implement the coroutines ``scrape`` and ``get_categories``.
    Every product is represented as a dict keyed by ``csv_headers``.
    """

    # Maps each accepted unit spelling to the unit string that is reported.
    # Whole words are looked up, so "5 large" is not mistaken for 5 litres.
    _UNIT_ALIASES = {
        'kg': 'kg', 'kgs': 'kg',
        'g': 'g', 'gm': 'g', 'gms': 'g', 'gram': 'g', 'grams': 'g',
        'l': 'l', 'ltr': 'l', 'liter': 'l', 'liters': 'l', 'litre': 'l', 'litres': 'l',
        'ml': 'ml', 'mls': 'ml',
        'piece': 'piece', 'pieces': 'piece',
        'pcs': 'pcs',
        'pack': 'pack', 'packs': 'pack', 'packet': 'pack', 'packets': 'pack',
    }

    def __init__(self, store_name: str, base_url: str, output_dir: str = "data"):
        """Store the scraper identity and make sure the output directory exists.

        Args:
            store_name: Human-readable store name; written to every product row.
            base_url: Root URL of the store website.
            output_dir: Directory that receives the CSV exports.
        """
        self.store_name = store_name
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        # parents=True so nested paths such as "data/jalalsons" also work.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = self._setup_logger()
        self.csv_headers = [
            'store_name', 'product_name', 'brand', 'category', 'subcategory',
            'price', 'discounted_price', 'unit', 'quantity', 'url', 'image_url', 'last_updated'
        ]

    def _setup_logger(self) -> logging.Logger:
        """Return an INFO-level stream logger named after the store."""
        logger = logging.getLogger(f"{self.store_name}_scraper")
        logger.setLevel(logging.INFO)
        # Re-running the notebook cell must not stack duplicate handlers.
        if not logger.handlers:
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            ch.setFormatter(formatter)
            logger.addHandler(ch)
        return logger

    def _rate_limit(self, min_delay: float = 1.0, max_delay: float = 3.0):
        """Sleep for a random duration between ``min_delay`` and ``max_delay`` seconds.

        Uses ``time.sleep``, so it blocks the calling thread (and the event
        loop, if called from a coroutine).
        """
        time.sleep(random.uniform(min_delay, max_delay))

    def _clean_price(self, price_str: str) -> Optional[float]:
        """Convert a price label such as ``"Rs. 1,250"`` to a float.

        Returns ``None`` for empty input or text that is not a single number
        once the currency markers and thousands separators are removed.
        """
        if not price_str:
            return None
        try:
            # 'Rs.' must be removed before 'Rs' so no stray dot is left behind.
            cleaned = str(price_str).replace('Rs.', '').replace('PKR', '').replace('Rs', '').replace(',', '').strip()
            return float(cleaned)
        except (ValueError, AttributeError):
            return None

    def _parse_unit_quantity(self, text: str) -> tuple[Optional[str], Optional[float]]:
        """Extract the first ``<number><unit>`` pair from ``text``.

        The word following a number is looked up as a whole in
        ``_UNIT_ALIASES``; numbers followed by any other word are skipped.

        Returns:
            ``(unit, quantity)``, or ``(None, None)`` when no known unit follows
            a number.
        """
        if not text:
            return None, None
        import re
        for match in re.finditer(r'(\d+\.?\d*)\s*([a-z]+)', text.lower()):
            unit = self._UNIT_ALIASES.get(match.group(2))
            if unit is not None:
                return unit, float(match.group(1))
        return None, None

    def save_to_csv(self, products: List[Dict], filename: Optional[str] = None):
        """Write ``products`` to a CSV file inside ``output_dir``.

        Does nothing when ``products`` is empty. Only the columns listed in
        ``csv_headers`` are written; missing keys become empty cells.

        Args:
            products: Product dicts to export.
            filename: Target file name; defaults to ``<store>_<timestamp>.csv``.
        """
        if not products:
            return
        if filename is None:
            filename = f"{self.store_name.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        # The directory may have been removed since construction.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        filepath = self.output_dir / filename
        # utf-8-sig writes a BOM at the start of the file.
        with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=self.csv_headers)
            writer.writeheader()
            writer.writerows(
                {header: product.get(header, None) for header in self.csv_headers}
                for product in products
            )
        print(f"\u2713 Saved {len(products)} products to {filepath}")

    def validate_product(self, product: Dict) -> bool:
        """Return True when the product has a non-empty name and a truthy price."""
        return bool(product.get('product_name') and product.get('price'))

    @abstractmethod
    async def scrape(self) -> List[Dict]:
        """Scrape the store and return the collected product dicts."""

    @abstractmethod
    async def get_categories(self) -> List[Dict]:
        """Return the categories that should be scraped."""

    def create_product_dict(self, **kwargs) -> Dict:
        """Build a product dict containing every CSV column.

        Columns not supplied via ``kwargs`` are set to ``None``; ``store_name``
        and ``last_updated`` are always filled in by this method.
        """
        d = {h: kwargs.get(h) for h in self.csv_headers}
        d['store_name'] = self.store_name
        d['last_updated'] = datetime.now().isoformat()
        return d

In [None]:
# @title Jalalsons Scraper Logic
import asyncio
from playwright.async_api import async_playwright

class JalalsonsScraper(BaseScraper):
    """Playwright-based scraper for the Jalalsons storefront.

    The site shows products per delivery branch, so the scraper first selects
    a branch, then walks the category navigation and collects the product
    tiles on each category page.
    """

    # Top-level navigation entries whose products are scraped.
    TARGET_CATEGORIES = ("BAKERY", "DELI", "JS ICECREAM", "SWEETS", "DEALS", "GROCERY")

    def __init__(self, output_dir: str = "data", target_branch: Optional[str] = None,
                 max_branches: Optional[int] = 1, max_categories: Optional[int] = 5):
        """Configure the scraper.

        Args:
            output_dir: Directory that receives the CSV exports.
            target_branch: Case-insensitive substring; only branches whose name
                contains it are scraped. ``None`` keeps every branch.
            max_branches: Maximum number of branches to scrape; ``None`` means
                no limit. Defaults to 1 (demo setting).
            max_categories: Maximum number of categories scraped per branch;
                ``None`` means no limit. Defaults to 5 (demo setting).
        """
        super().__init__(store_name="Jalalsons", base_url="https://jalalsons.com.pk", output_dir=output_dir)
        self.target_branch = target_branch
        self.max_branches = max_branches
        self.max_categories = max_categories
        self.playwright = None
        self.browser = None
        self.page = None

    async def setup_browser(self):
        """Start Playwright, launch headless Chromium and open a single page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
        self.page = await self.browser.new_page()

    async def close_browser(self):
        """Release page, browser and Playwright driver, in that order.

        Each step is attempted even if an earlier one fails, so a broken page
        cannot leave the browser process or the driver running.
        """
        for attr, method in (("page", "close"), ("browser", "close"), ("playwright", "stop")):
            resource = getattr(self, attr)
            if not resource:
                continue
            try:
                await getattr(resource, method)()
            except Exception as exc:
                self.logger.warning("Could not %s %s: %s", method, attr, exc)
            finally:
                setattr(self, attr, None)

    async def close_popup(self):
        """Dismiss the site's custom popup if it is visible (best effort)."""
        try:
            if await self.page.locator("#website_custom_popup").is_visible():
                await self.page.locator('#website_custom_popup .modal-header a.cursor-pointer.ms-auto').click()
        except Exception as exc:
            # The popup is optional; failing to close it must not abort the run.
            self.logger.debug("Popup could not be closed: %s", exc)

    async def get_branches(self) -> List[str]:
        """Open the home page and return the delivery branch labels.

        The placeholder option is dropped and only labels containing "Lahore"
        are kept.
        """
        await self.page.goto(self.base_url)
        await self.close_popup()
        await self.page.click("a#delivery-loc-tab")
        await self.page.wait_for_selector("#selectDeliveryBranch")
        branches = await self.page.locator("#selectDeliveryBranch option").all_text_contents()
        return [b for b in branches if "Please select" not in b and "Lahore" in b]

    async def select_branch(self, branch_name: str, is_first: bool = False):
        """Choose ``branch_name`` in the delivery dialog and confirm it.

        Args:
            branch_name: Option label exactly as returned by ``get_branches``.
            is_first: True when the dialog is already open (directly after
                ``get_branches``); otherwise it is reopened first.
        """
        if not is_first:
            await self.page.click("a#get_current_loc")
            await self.page.wait_for_selector("#selectDeliveryBranch")
        await self.page.select_option("#selectDeliveryBranch", label=branch_name)
        # Fixed pause before confirming; presumably lets the page react to the selection.
        await asyncio.sleep(2)
        await self.page.click("a#delivery_order")
        await self.page.wait_for_selector("ul.navbar-nav")

    async def get_categories(self) -> List[Dict]:
        """Collect category links from the main navigation bar.

        Returns:
            Dicts with ``main_category``, ``subcategory`` and absolute ``url``.
            A main category without dropdown entries is listed once, acting as
            its own subcategory. Entries without a link are skipped.
        """
        # Local import keeps the class self-contained within the notebook cell.
        from urllib.parse import urljoin

        nav_items = await self.page.locator("ul.navbar-nav > li.nav-item").element_handles()
        results = []
        # The last three nav items are never categories of interest
        # (presumably utility links — confirm against the live site).
        for li in nav_items[:-3]:
            a = await li.query_selector("a.nav-link")
            if a is None:
                continue
            name = (await a.inner_text()).strip()
            if name not in self.TARGET_CATEGORIES:
                continue
            # Hover first so the dropdown entries are available.
            await a.hover()
            await asyncio.sleep(0.5)
            subs = await li.query_selector_all("ul.dropdown-content a")
            if subs:
                for s in subs:
                    href = await s.get_attribute("href")
                    if not href:
                        continue
                    results.append({
                        "main_category": name,
                        "subcategory": (await s.inner_text()).strip(),
                        "url": urljoin(self.base_url, href),
                    })
            else:
                href = await a.get_attribute("href")
                if href:
                    results.append({"main_category": name, "subcategory": name, "url": urljoin(self.base_url, href)})
        return results

    async def scrape_category(self, category: Dict, branch: str) -> List[Dict]:
        """Scrape every product tile on one category page.

        A page that fails to load yields an empty list; a single malformed
        tile is skipped without discarding the rest of the category.

        Args:
            category: Dict produced by ``get_categories``.
            branch: Branch label, appended to each product name in brackets.
        """
        products = []
        try:
            await self.page.goto(category['url'], timeout=60000)
            await self.page.wait_for_selector(".single_product_theme", timeout=10000)
            els = await self.page.query_selector_all(".single_product_theme")
        except Exception as exc:
            self.logger.warning("Could not load category %s: %s", category.get('url'), exc)
            return products

        for el in els:
            try:
                n = await el.query_selector("p.product_name_theme")
                p_val = await el.query_selector("span.price-value")
                if not (n and p_val):
                    continue
                name = (await n.inner_text()).strip()
                img = await el.query_selector("img")
                # Tiles without an image are still valid products.
                image_url = await img.get_attribute("src") if img else None
                u, q = self._parse_unit_quantity(name)
                prod = self.create_product_dict(
                    product_name=f"{name} [{branch}]",
                    price=self._clean_price(await p_val.inner_text()),
                    url=category['url'],
                    image_url=image_url,
                    category=category['main_category'],
                    subcategory=category['subcategory'],
                    unit=u,
                    quantity=q,
                )
                if self.validate_product(prod):
                    products.append(prod)
            except Exception as exc:
                self.logger.warning("Skipping a product in %s: %s", category.get('url'), exc)
        return products

    async def scrape(self) -> List[Dict]:
        """Run the full scrape and return all collected products.

        Honours ``target_branch``, ``max_branches`` and ``max_categories``.
        The browser is always closed, including when start-up fails.
        """
        all_p = []
        try:
            # Inside the try so a partially started browser is still cleaned up.
            await self.setup_browser()
            branches = await self.get_branches()
            if self.target_branch:
                branches = [b for b in branches if self.target_branch.lower() in b.lower()]
            if not branches:
                self.logger.warning("No matching branches found")
            for i, b in enumerate(branches[:self.max_branches]):
                await self.select_branch(b, i == 0)
                cats = await self.get_categories()
                for cat in cats[:self.max_categories]:
                    found = await self.scrape_category(cat, b)
                    self.logger.info("%s / %s: %d products", b, cat['subcategory'], len(found))
                    all_p.extend(found)
                    await asyncio.sleep(1)
        finally:
            await self.close_browser()
        return all_p

In [None]:
# @title Execution
# You can set target_branch="DHA" to filter
scraper = JalalsonsScraper(target_branch=None)
products = await scraper.scrape()
if products:
    scraper.save_to_csv(products)
import pandas as pd
if products:
    df = pd.DataFrame(products)
    display(df.head())