# GrocerApp Scraper
This notebook scrapes product data from GrocerApp Pakistan (grocerapp.pk). It uses Playwright to drive a headless Chromium browser so that products loaded by infinite scroll are captured.

### Instructions:
1. Run the **Setup Cell** to install the dependencies and the Playwright Chromium browser.
2. Run the **Base Infrastructure** cell and then the **GrocerApp Scraper Logic** cell to define the classes.
3. Run the **Execution Cell** to start scraping.

In [None]:
# @title Setup Cell
# Install the Python packages used by this notebook, then download the
# Chromium build that Playwright launches.
!pip install playwright nest_asyncio pandas
!playwright install chromium
import os
# 'data' is the default output_dir of the scrapers; create it up front.
os.makedirs('data', exist_ok=True)
import nest_asyncio
# Patch asyncio so nested event loops are allowed — presumably required because
# the notebook kernel already runs its own loop (confirm for your runtime).
nest_asyncio.apply()

In [None]:
# @title Base Infrastructure
import csv
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
import time
import random

class BaseScraper(ABC):
    """Store-agnostic plumbing for scrapers: logging, text parsing, CSV export.

    Subclasses must implement :meth:`scrape` and :meth:`get_categories`.
    Every product handled by this class is a plain dict keyed by
    ``csv_headers``.
    """

    # Every accepted spelling of a unit, mapped to the canonical unit that is
    # reported to callers.
    _UNIT_ALIASES = {
        'kg': 'kg', 'kgs': 'kg',
        'g': 'g', 'gm': 'g', 'gms': 'g', 'gram': 'g', 'grams': 'g',
        'l': 'l', 'ltr': 'l', 'ltrs': 'l',
        'liter': 'l', 'liters': 'l', 'litre': 'l', 'litres': 'l',
        'ml': 'ml',
        'piece': 'piece', 'pieces': 'piece',
        'pcs': 'pcs',
        'pack': 'pack', 'packs': 'pack',
    }
    # A number followed (optionally after whitespace) by one whole alphabetic
    # word. Capturing the whole word is what stops "12 large" matching "l".
    _QUANTITY_PATTERN = r'(\d+\.?\d*)\s*([a-z]+)'

    def __init__(self, store_name: str, base_url: str, output_dir: str = "data"):
        """Store the scraper identity and make sure the output directory exists.

        Args:
            store_name: Written to every product row and used for the logger
                name and the default CSV filename.
            base_url: Root URL of the store.
            output_dir: Directory that receives CSV files. Missing parent
                directories are created as well.
        """
        self.store_name = store_name
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        # parents=True so a nested path such as "out/2024/grocer" works on a
        # fresh checkout instead of raising FileNotFoundError.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = self._setup_logger()
        self.csv_headers = [
            'store_name', 'product_name', 'brand', 'category', 'subcategory',
            'price', 'discounted_price', 'unit', 'quantity', 'url', 'image_url', 'last_updated'
        ]

    def _setup_logger(self) -> logging.Logger:
        """Return the INFO-level logger for this store, adding a console handler once."""
        logger = logging.getLogger(f"{self.store_name}_scraper")
        logger.setLevel(logging.INFO)
        # Loggers are process-wide singletons; re-running the notebook cell
        # must not stack a second handler and duplicate every message.
        if not logger.handlers:
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            ch.setFormatter(formatter)
            logger.addHandler(ch)
        return logger

    def _rate_limit(self, min_delay: float = 1.0, max_delay: float = 3.0):
        """Sleep for a random duration between ``min_delay`` and ``max_delay`` seconds.

        NOTE: this uses the blocking ``time.sleep``; inside a coroutine it
        stalls the event loop for the whole delay.
        """
        time.sleep(random.uniform(min_delay, max_delay))

    def _clean_price(self, price_str: str) -> Optional[float]:
        """Convert a price label such as ``"Rs. 1,250"`` to a float.

        Returns:
            The numeric price, or ``None`` when the input is empty or what is
            left after stripping currency markers is not a number.
        """
        if not price_str:
            return None
        cleaned = str(price_str)
        # 'Rs.' must be removed before 'Rs' so the dot is not left behind.
        for token in ('Rs.', 'PKR', 'Rs', ','):
            cleaned = cleaned.replace(token, '')
        try:
            return float(cleaned.strip())
        except ValueError:
            return None

    def _parse_unit_quantity(self, text: str) -> tuple[Optional[str], Optional[float]]:
        """Extract the first ``<number><unit>`` pair from free text.

        The unit must be a complete word ("500 g", "1.5ltr", "2 packs"), so
        ordinary words that merely start with a unit letter ("12 large",
        "100 grapes") are ignored.

        Returns:
            ``(unit, quantity)`` with the unit normalised to one of
            kg, g, l, ml, piece, pcs, pack; ``(None, None)`` when no pair is found.
        """
        if not text:
            return None, None
        import re
        for match in re.finditer(self._QUANTITY_PATTERN, text.lower()):
            unit = self._UNIT_ALIASES.get(match.group(2))
            if unit is not None:
                return unit, float(match.group(1))
        return None, None

    def save_to_csv(self, products: List[Dict], filename: Optional[str] = None):
        """Write ``products`` to a CSV file inside ``output_dir``.

        Does nothing when ``products`` is empty. Keys that are not in
        ``csv_headers`` are dropped; missing keys are written as empty cells.

        Args:
            products: Product dicts to export.
            filename: Target file name. Defaults to
                ``<store_name>_<YYYYmmdd_HHMMSS>.csv``.
        """
        if not products:
            return
        if filename is None:
            filename = f"{self.store_name.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        filepath = self.output_dir / filename
        # utf-8-sig writes a BOM at the start of the file.
        with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=self.csv_headers)
            writer.writeheader()
            writer.writerows(
                {header: product.get(header) for header in self.csv_headers}
                for product in products
            )
        print(f"\u2713 Saved {len(products)} products to {filepath}")

    def validate_product(self, product: Dict) -> bool:
        """Return True when the product has both a non-empty name and a truthy price."""
        return bool(product.get('product_name') and product.get('price'))

    @abstractmethod
    async def scrape(self) -> List[Dict]:
        """Scrape the store and return the collected product dicts."""

    @abstractmethod
    async def get_categories(self) -> List[Dict]:
        """Return the store's categories as a list of dicts."""

    def create_product_dict(self, **kwargs) -> Dict:
        """Build a product row containing every CSV column.

        Unknown keyword arguments are ignored and missing columns are set to
        ``None``. ``store_name`` and ``last_updated`` are always filled in by
        this method, overriding any value passed by the caller.
        """
        d = {h: kwargs.get(h) for h in self.csv_headers}
        d['store_name'] = self.store_name
        d['last_updated'] = datetime.now().isoformat()
        return d

In [None]:
# @title GrocerApp Scraper Logic
import asyncio
from playwright.async_api import async_playwright

class GrocerAppScraper(BaseScraper):
    """Scraper for https://grocerapp.pk built on a headless Chromium session."""

    def __init__(self, output_dir: str = "data"):
        """Configure the store identity; the browser is started later by :meth:`setup_browser`."""
        super().__init__(store_name="GrocerApp", base_url="https://grocerapp.pk", output_dir=output_dir)
        self.playwright = None
        self.browser = None
        self.context = None

    async def setup_browser(self):
        """Start Playwright, launch headless Chromium and open a browser context."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
        self.context = await self.browser.new_context(viewport={'width': 1920, 'height': 1080}, user_agent='Mozilla/5.0')

    async def close_browser(self):
        """Release the context, browser and Playwright driver, in that order.

        Each resource is released independently: a failure while closing one
        is logged and does not prevent the others from being released. The
        attributes are reset to ``None`` so the method is safe to call twice
        and the instance can be set up again.
        """
        for attr, closer in (('context', 'close'), ('browser', 'close'), ('playwright', 'stop')):
            resource = getattr(self, attr)
            if resource is None:
                continue
            try:
                await getattr(resource, closer)()
            except Exception:
                self.logger.exception("Failed to release %s", attr)
            finally:
                setattr(self, attr, None)

    async def scroll_to_bottom(self, page, max_cards: int = 300, pause: float = 3.0, max_scrolls: int = 100):
        """Scroll ``page`` down repeatedly so lazily loaded product cards render.

        Scrolling stops as soon as one of these holds: the document height did
        not grow after a scroll, more than ``max_cards`` product cards are
        present, or ``max_scrolls`` scrolls were performed.

        Args:
            page: Playwright page to scroll.
            max_cards: Stop once more than this many ``.MuiCard-root`` elements exist.
            pause: Seconds to wait after each scroll for new content to load.
            max_scrolls: Hard cap on scroll iterations, so a page that keeps
                growing without adding product cards cannot loop forever.
        """
        last_height = await page.evaluate("document.body.scrollHeight")
        for _ in range(max_scrolls):
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(pause)
            new_height = await page.evaluate("document.body.scrollHeight")
            if new_height == last_height:
                return
            last_height = new_height
            if await page.locator(".MuiCard-root").count() > max_cards:
                return
        self.logger.warning("Stopped scrolling after %d scrolls without reaching the end", max_scrolls)

    async def get_categories(self) -> List[Dict]:
        """Collect category links from the ``/categories`` page.

        Requires :meth:`setup_browser` to have been called.

        Returns:
            One ``{'name': ..., 'url': ...}`` dict per distinct link that
            contains a ``<p>`` label and does not point at the categories,
            cart or login pages.
        """
        page = await self.context.new_page()
        try:
            await page.goto(f"{self.base_url}/categories", wait_until="networkidle", timeout=60000)
            await asyncio.sleep(2)
            categories = await page.evaluate('''() => {
                const cats = [];
                document.querySelectorAll('a[href]').forEach(a => {
                    const nameEl = a.querySelector('p');
                    const name = nameEl ? nameEl.innerText.trim() : '';
                    const path = a.getAttribute('href');
                    if (name && path && path.length > 1 && !path.includes('categories') && !path.includes('cart') && !path.includes('login')) {
                        if (!cats.find(c => c.url === a.href)) cats.push({ name, url: a.href });
                    }
                });
                return cats;
            }''')
            return categories
        finally:
            await page.close()

    async def scrape_category(self, category: Dict) -> List[Dict]:
        """Scrape every product card on one category page.

        Args:
            category: Dict with ``'name'`` and ``'url'`` keys, as returned by
                :meth:`get_categories`.

        Returns:
            Product dicts that passed :meth:`validate_product`.
        """
        page = await self.context.new_page()
        products = []
        try:
            await page.goto(category['url'], wait_until="networkidle", timeout=60000)
            await self.scroll_to_bottom(page)
            # Extract all cards in a single round-trip to the browser.
            cards = await page.evaluate('''() => {
                const res = [];
                document.querySelectorAll('.MuiCard-root').forEach(card => {
                    const nameEl = card.querySelector('.MuiTypography-body1');
                    const priceEl = card.querySelector('.MuiTypography-subtitle2');
                    const imgEl = card.querySelector('img');
                    const linkEl = card.querySelector('a');
                    if (nameEl && priceEl) res.push({
                        name: nameEl.innerText.trim(),
                        priceText: priceEl.innerText.trim(),
                        img: imgEl ? imgEl.src : null,
                        url: linkEl ? linkEl.href : null
                    });
                });
                return res;
            }''')
            for card in cards:
                # Unit and quantity are parsed from the product name.
                unit, quantity = self._parse_unit_quantity(card['name'])
                product = self.create_product_dict(
                    product_name=card['name'], price=self._clean_price(card['priceText']),
                    url=card['url'], image_url=card['img'], category=category['name'],
                    unit=unit, quantity=quantity
                )
                if self.validate_product(product):
                    products.append(product)
            return products
        finally:
            await page.close()

    async def scrape(self, max_categories: Optional[int] = 10) -> List[Dict]:
        """Scrape products from the store's categories.

        A category that fails (for example on a navigation timeout) is logged
        and skipped, so products already collected from other categories are
        kept. The browser is always shut down before returning.

        Args:
            max_categories: Scrape at most this many categories; ``None``
                scrapes all of them. The default of 10 keeps test runs short.

        Returns:
            All validated product dicts from the scraped categories.
        """
        all_products: List[Dict] = []
        try:
            # Inside the try so a failed launch still stops the Playwright driver.
            await self.setup_browser()
            categories = await self.get_categories()
            if max_categories is not None:
                categories = categories[:max_categories]
            self.logger.info("Scraping %d categories", len(categories))
            for category in categories:
                try:
                    products = await self.scrape_category(category)
                except Exception:
                    self.logger.exception("Skipping category %r after a failure", category.get('name'))
                else:
                    all_products.extend(products)
                    self.logger.info("%s: %d products", category.get('name'), len(products))
                # Short pause between categories.
                await asyncio.sleep(1)
        finally:
            await self.close_browser()
        return all_products

In [None]:
# @title Execution
scraper = GrocerAppScraper()
products = await scraper.scrape()
if products:
    scraper.save_to_csv(products)
import pandas as pd
if products:
    df = pd.DataFrame(products)
    display(df.head())