# Rahim Store Scraper
This notebook scrapes product data from Rahim Store Pakistan using Playwright with department-based pagination.

### Instructions:
1. Run the **Setup Cell** to install dependencies and Playwright browser.
2. Run the **Base Infrastructure** cell and then the **Rahim Store Scraper Logic** cell, in that order, to define the classes.
3. Run the **Execution Cell** to start scraping.

In [None]:
# @title Setup Cell
# Install the Python packages used by this notebook (shell magic: notebook-only syntax).
!pip install playwright nest_asyncio pandas
# Download the Chromium build that Playwright launches.
!playwright install chromium
import os
# Create the 'data' directory up front; it is the default output_dir of the scraper classes.
os.makedirs('data', exist_ok=True)
import nest_asyncio
# Patch asyncio so event loops can be nested.
# NOTE(review): presumably required because the notebook kernel already runs a loop — confirm.
nest_asyncio.apply()

In [None]:
# @title Base Infrastructure
import csv
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
import time
import random

class BaseScraper(ABC):
    def __init__(self, store_name: str, base_url: str, output_dir: str = "data"):
        self.store_name = store_name
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.logger = self._setup_logger()
        self.csv_headers = [
            'store_name', 'product_name', 'brand', 'category', 'subcategory',
            'price', 'discounted_price', 'unit', 'quantity', 'url', 'image_url', 'last_updated'
        ]
    
    def _setup_logger(self) -> logging.Logger:
        logger = logging.getLogger(f"{self.store_name}_scraper")
        logger.setLevel(logging.INFO)
        if not logger.handlers:
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            ch.setFormatter(formatter)
            logger.addHandler(ch)
        return logger
    
    def _rate_limit(self, min_delay: float = 1.0, max_delay: float = 3.0):
        time.sleep(random.uniform(min_delay, max_delay))
    
    def _clean_price(self, price_str: str) -> Optional[float]:
        if not price_str: return None
        try:
            cleaned = str(price_str).replace('Rs.', '').replace('PKR', '').replace('Rs', '').replace(',', '').strip()
            return float(cleaned)
        except (ValueError, AttributeError): return None
    
    def _parse_unit_quantity(self, text: str) -> tuple[Optional[str], Optional[float]]:
        if not text: return None, None
        import re
        patterns = [r'(\d+\.?\d*)\s*(kg|g|l|ml|piece|pcs|pack)', r'(\d+\.?\d*)(kg|g|l|ml|piece|pcs|pack)']
        for pattern in patterns:
            match = re.search(pattern, text.lower())
            if match: return match.group(2), float(match.group(1))
        return None, None
    
    def save_to_csv(self, products: List[Dict], filename: Optional[str] = None):
        if not products: return
        if filename is None:
            filename = f"{self.store_name.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        filepath = self.output_dir / filename
        with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=self.csv_headers)
            writer.writeheader()
            for product in products:
                row = {header: product.get(header, None) for header in self.csv_headers}
                writer.writerow(row)
        print(f"\u2713 Saved {len(products)} products to {filepath}")

    def validate_product(self, product: Dict) -> bool: return bool(product.get('product_name') and product.get('price'))
    
    @abstractmethod
    async def scrape(self) -> List[Dict]: pass
    @abstractmethod
    async def get_categories(self) -> List[Dict]: pass
    
    def create_product_dict(self, **kwargs) -> Dict:
        d = {h: kwargs.get(h) for h in self.csv_headers}
        d['store_name'] = self.store_name
        d['last_updated'] = datetime.now().isoformat()
        return d

In [None]:
# @title Rahim Store Scraper Logic
import asyncio
from playwright.async_api import async_playwright

class RahimStoreScraper(BaseScraper):
    """Playwright scraper for Rahim Store department listing pages.

    Each department id is appended to ``base_url`` and walked page by page
    using the listing's "Next" pagination link.
    """

    # CSS selector matching one product card on a department listing page.
    PRODUCT_CARD_SELECTOR = '.item.img-hover-zoom--quick-zoom'

    def __init__(self, output_dir: str = "data", max_departments: Optional[int] = 3,
                 max_pages: Optional[int] = 5):
        """Configure the scraper.

        Args:
            output_dir: Directory that receives CSV output.
            max_departments: Number of entries of ``self.departments`` to
                scrape; ``None`` scrapes all of them. Defaults to the demo
                limit of 3.
            max_pages: Maximum listing pages per department; ``None`` follows
                pagination to the end. Defaults to the demo limit of 5.
        """
        super().__init__(store_name="Rahim Store", base_url="https://www.rahimstore.com/department", output_dir=output_dir)
        self.departments = ['001', '002', '003', '004', '005', '006']
        self.max_departments = max_departments
        self.max_pages = max_pages
        self.playwright = None
        self.browser = None
        self.context = None

    async def setup_browser(self):
        """Start Playwright, launch headless Chromium and open a browser context."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
        self.context = await self.browser.new_context(viewport={'width': 1920, 'height': 1080}, user_agent='Mozilla/5.0')

    async def close_browser(self):
        """Release context, browser and Playwright driver, in that order.

        Each step runs even if an earlier one raises, and the handles are
        reset so calling this twice is harmless.
        """
        try:
            if self.context:
                await self.context.close()
        finally:
            try:
                if self.browser:
                    await self.browser.close()
            finally:
                try:
                    if self.playwright:
                        await self.playwright.stop()
                finally:
                    self.context = self.browser = self.playwright = None

    async def wait_for_products(self, page):
        """Wait up to 15 s for a product card; return True if one appeared."""
        try:
            await page.wait_for_selector(self.PRODUCT_CARD_SELECTOR, timeout=15000)
        except Exception as exc:
            # Exception (not a bare except) so task cancellation and
            # KeyboardInterrupt still propagate.
            self.logger.warning("No product cards on %s: %s", page.url, exc)
            return False
        return True

    async def _parse_card(self, card, dept_id: str, page_url: str) -> Optional[Dict]:
        """Turn one product card element into a validated product dict, or None."""
        from urllib.parse import urljoin

        name_el = await card.query_selector('a[style*="display:block"]')
        price_el = await card.query_selector('strong')
        if not (name_el and price_el):
            return None
        name = await name_el.inner_text()
        price_text = await price_el.inner_text()
        href = await name_el.get_attribute('href')
        unit, quantity = self._parse_unit_quantity(name)
        product = self.create_product_dict(
            product_name=name.strip(),
            price=self._clean_price(price_text),
            # Resolve like a browser would: handles absolute, root-relative
            # and relative hrefs, and tolerates a missing href.
            url=urljoin(page_url, href) if href else None,
            category=f"Dept {dept_id}",
            unit=unit,
            quantity=quantity,
        )
        return product if self.validate_product(product) else None

    async def scrape_dept(self, dept_id: str) -> List[Dict]:
        """Scrape one department and return its valid products.

        Stops when no product cards load, when the "Next" link is missing or
        disabled, or after ``self.max_pages`` pages.
        """
        page = await self.context.new_page()
        collected = []
        try:
            await page.goto(f"{self.base_url}/{dept_id}", wait_until='domcontentloaded')
            page_no = 1
            while self.max_pages is None or page_no <= self.max_pages:
                if not await self.wait_for_products(page):
                    break
                for card in await page.query_selector_all(self.PRODUCT_CARD_SELECTOR):
                    product = await self._parse_card(card, dept_id, page.url)
                    if product is not None:
                        collected.append(product)
                next_btn = await page.query_selector('a.page-link[aria-label="Next"]')
                if not next_btn:
                    break
                if await next_btn.evaluate('(e)=>e.parentElement.classList.contains("disabled")'):
                    break
                await next_btn.click()
                # Fixed pause to let the next page's cards render.
                await asyncio.sleep(3)
                page_no += 1
            return collected
        finally:
            await page.close()

    async def scrape(self) -> List[Dict]:
        """Scrape the configured departments and return all valid products."""
        all_products = []
        try:
            # Inside the try so a failed launch still stops the Playwright driver.
            await self.setup_browser()
            for dept_id in self.departments[:self.max_departments]:
                all_products.extend(await self.scrape_dept(dept_id))
                await asyncio.sleep(2)
        finally:
            await self.close_browser()
        return all_products

    async def get_categories(self) -> List[Dict]:
        """Return an empty list."""
        return []

In [None]:
# @title Execution
scraper = RahimStoreScraper()
products = await scraper.scrape()
if products:
    scraper.save_to_csv(products)
import pandas as pd
if products:
    df = pd.DataFrame(products)
    display(df.head())