# Al-Fatah Scraper
This notebook scrapes product data from Al-Fatah Pakistan using their Shopify JSON API.

### Instructions:
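1. Run the **Setup Cell** to install dependencies and create the `data/` output directory.
2. Run the **Base Infrastructure** cell to define the shared `BaseScraper` class.
3. Run the **Al-Fatah Scraper Logic** cell to define the `AlFatahScraper` class.
4. Run the **Execution** cell to start scraping; the results are saved as a CSV file under `data/`.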

In [None]:
# @title Setup Cell
!pip install aiohttp pandas
import os
# Create the 'data' directory up front (it is the scrapers' default output_dir);
# exist_ok=True makes re-running this cell harmless.
os.makedirs('data', exist_ok=True)

In [None]:
# @title Base Infrastructure
import csv
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
import time
import random

class BaseScraper(ABC):
    """Abstract base for store scrapers that emit a common product schema.

    Subclasses implement :meth:`scrape` and :meth:`get_categories`; this class
    supplies logging, price/unit parsing helpers, product-dict construction
    and CSV export using the column order in ``self.csv_headers``.
    """

    def __init__(self, store_name: str, base_url: str, output_dir: str = "data"):
        """Store the scraper settings and make sure the output directory exists.

        Args:
            store_name: Name written to the ``store_name`` column and used in
                the logger name and default CSV filename.
            base_url: Root URL of the store.
            output_dir: Directory that CSV files are written to.
        """
        self.store_name = store_name
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        # parents=True so a nested path such as "out/alfatah" does not raise
        # FileNotFoundError when the intermediate directories are missing.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = self._setup_logger()
        self.csv_headers = [
            'store_name', 'product_name', 'brand', 'category', 'subcategory',
            'price', 'discounted_price', 'unit', 'quantity', 'url', 'image_url', 'last_updated'
        ]

    def _setup_logger(self) -> logging.Logger:
        """Return an INFO-level logger named ``"<store_name>_scraper"``.

        ``logging.getLogger`` returns the same object for the same name, so the
        stream handler is attached only when the logger has none yet; this
        avoids duplicated lines when the class is instantiated again.
        """
        logger = logging.getLogger(f"{self.store_name}_scraper")
        logger.setLevel(logging.INFO)
        if not logger.handlers:
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            ch.setFormatter(formatter)
            logger.addHandler(ch)
        return logger

    def _rate_limit(self, min_delay: float = 1.0, max_delay: float = 3.0):
        """Sleep for a random duration between ``min_delay`` and ``max_delay`` seconds.

        NOTE: this uses the blocking ``time.sleep``; inside a coroutine it
        stalls the event loop for the whole delay.
        """
        time.sleep(random.uniform(min_delay, max_delay))

    def _clean_price(self, price_str: str) -> Optional[float]:
        """Convert a price such as ``"Rs. 1,250.50"`` to a float.

        Currency markers (``Rs.``, ``Rs``, ``PKR``, in any letter case) and
        thousands separators are removed before conversion.

        Args:
            price_str: Raw price text (non-string values are passed through
                ``str()`` first).

        Returns:
            The parsed price, or ``None`` when the input is empty/falsy or is
            not a number after cleaning.
        """
        if not price_str:
            return None
        import re
        # One case-insensitive pass instead of chained case-sensitive replaces,
        # so "rs 300" and "pkr 300" are accepted as well as "Rs 300" / "PKR 300".
        cleaned = re.sub(r'rs\.?|pkr|,', '', str(price_str), flags=re.IGNORECASE).strip()
        try:
            return float(cleaned)
        except ValueError:
            return None

    def _parse_unit_quantity(self, text: str) -> tuple[Optional[str], Optional[float]]:
        """Extract the first ``<number><unit>`` measure from free text.

        Matching is case-insensitive and allows optional whitespace between the
        number and the unit. The unit must end at a word boundary (an optional
        plural ``s`` is tolerated), so ordinary words that merely start with a
        unit letter - "3 Large", "2 gold" - are not mistaken for litres/grams.
        Spelled-out forms are mapped onto the short unit names.

        Args:
            text: Text to search, typically a product title.

        Returns:
            ``(unit, quantity)`` where ``unit`` is one of ``kg``, ``g``, ``l``,
            ``ml``, ``piece``, ``pcs`` or ``pack``; ``(None, None)`` when the
            text is empty or contains no measure.
        """
        if not text:
            return None, None
        import re
        # Longer spellings that should be reported under their short unit name.
        aliases = {
            'gram': 'g', 'gm': 'g',
            'litre': 'l', 'liter': 'l', 'ltr': 'l',
            'packet': 'pack',
        }
        pattern = (
            r'(\d+\.?\d*)\s*'
            r'(kg|gram|gm|g|ml|litre|liter|ltr|l|piece|pcs|packet|pack)s?\b'
        )
        match = re.search(pattern, text.lower())
        if match is None:
            return None, None
        unit = match.group(2)
        return aliases.get(unit, unit), float(match.group(1))

    def save_to_csv(self, products: List[Dict], filename: Optional[str] = None):
        """Write ``products`` to a CSV file inside ``self.output_dir``.

        Does nothing when ``products`` is empty. Only the keys listed in
        ``self.csv_headers`` are written; keys missing from a product are
        written as empty cells.

        Args:
            products: Product dictionaries to export.
            filename: Target file name. Defaults to
                ``<store_name>_<YYYYmmdd_HHMMSS>.csv`` with the store name
                lower-cased and spaces replaced by underscores.
        """
        if not products:
            return
        if filename is None:
            filename = f"{self.store_name.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        filepath = self.output_dir / filename
        # utf-8-sig writes a BOM at the start of the file.
        with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=self.csv_headers)
            writer.writeheader()
            writer.writerows(
                {header: product.get(header, None) for header in self.csv_headers}
                for product in products
            )
        print(f"\u2713 Saved {len(products)} products to {filepath}")

    def validate_product(self, product: Dict) -> bool:
        """Return True when the product has a truthy ``product_name`` and ``price``.

        A price of ``0`` or ``None`` is falsy and therefore fails validation.
        """
        return bool(product.get('product_name') and product.get('price'))

    @abstractmethod
    async def scrape(self) -> List[Dict]:
        """Scrape the store and return the collected product dictionaries."""
        pass

    @abstractmethod
    async def get_categories(self) -> List[Dict]:
        """Return the store's categories as a list of dictionaries."""
        pass

    def create_product_dict(self, **kwargs) -> Dict:
        """Build a product record containing exactly the ``csv_headers`` keys.

        Keyword arguments that are not CSV columns are ignored and missing
        columns default to ``None``. ``store_name`` and ``last_updated`` are
        always set here (store name of this scraper, current local time in
        ISO format), overriding any value passed in.
        """
        d = {h: kwargs.get(h) for h in self.csv_headers}
        d['store_name'] = self.store_name
        d['last_updated'] = datetime.now().isoformat()
        return d

In [None]:
# @title Al-Fatah Scraper Logic
import asyncio
import aiohttp

class AlFatahScraper(BaseScraper):
    """Scraper for the Al-Fatah store using its ``products.json`` endpoint."""

    def __init__(self, output_dir: str = "data"):
        """Configure the scraper for ``https://alfatah.pk``.

        Args:
            output_dir: Directory that CSV files are written to.
        """
        super().__init__(store_name="Al-Fatah", base_url="https://alfatah.pk", output_dir=output_dir)

    async def fetch_page(self, session: aiohttp.ClientSession, page: int) -> List[Dict]:
        """Fetch one page of raw product records.

        Args:
            session: Open aiohttp session used for the request.
            page: 1-based page number passed as the ``page`` query parameter.

        Returns:
            The ``products`` list from the JSON response, or ``[]`` on a
            non-200 status or any request/parsing error (both are logged).
        """
        url = f"{self.base_url}/collections/all/products.json"
        params = {'limit': 250, 'page': page}
        # Explicit ClientTimeout object rather than a bare number.
        timeout = aiohttp.ClientTimeout(total=30)
        try:
            async with session.get(url, params=params, timeout=timeout) as response:
                if response.status != 200:
                    # An empty result ends pagination in scrape(), so record why.
                    self.logger.warning(f"Page {page}: unexpected HTTP status {response.status}")
                    return []
                data = await response.json()
                return data.get('products') or []
        except Exception as e:
            # Deliberately broad: a failed page should stop the run cleanly
            # and keep whatever was collected so far, not raise.
            self.logger.error(f"Error fetching page {page}: {e}")
            return []

    async def get_categories(self) -> List[Dict]:
        """Return the single pseudo-category that covers the whole catalogue."""
        return [{'name': 'All Products', 'url': f"{self.base_url}/collections/all"}]

    def _build_product(self, item: Dict) -> Optional[Dict]:
        """Map one raw product record to the common product schema.

        Only the first variant is used. Returns ``None`` when the record has
        no variants.
        """
        variants = item.get('variants') or []
        if not variants:
            return None
        variant = variants[0]

        # In Shopify, `price` is the current selling price and
        # `compare_at_price` is the original price before a sale. Report the
        # original as `price` and the current as `discounted_price` only when
        # the item is actually marked down.
        current = self._clean_price(variant.get('price'))
        compare_at = self._clean_price(variant.get('compare_at_price'))
        if current is not None and compare_at is not None and compare_at > current:
            price, discounted_price = compare_at, current
        else:
            price, discounted_price = current, None

        # `images` may be present but empty; indexing [0] would raise IndexError.
        images = item.get('images') or []
        image_url = images[0].get('src') if images else None

        unit, quantity = self._parse_unit_quantity(item.get('title', ''))
        return self.create_product_dict(
            product_name=item.get('title'),
            price=price,
            discounted_price=discounted_price,
            url=f"{self.base_url}/products/{item.get('handle')}",
            image_url=image_url,
            brand=item.get('vendor'),
            # `or` also covers an empty-string product_type, not just a missing key.
            category=item.get('product_type') or 'General',
            unit=unit, quantity=quantity
        )

    async def scrape(self, max_pages: int = 100) -> List[Dict]:
        """Page through the catalogue and return all valid products.

        Pages are requested sequentially, one second apart, until a page comes
        back empty or ``max_pages`` pages have been processed.

        Args:
            max_pages: Upper bound on the number of pages requested.

        Returns:
            Product dictionaries that pass :meth:`validate_product`.
        """
        all_products = []
        async with aiohttp.ClientSession(headers={'User-Agent': 'Mozilla/5.0'}) as session:
            for page in range(1, max_pages + 1):
                products = await self.fetch_page(session, page)
                if not products:
                    break
                for item in products:
                    product = self._build_product(item)
                    if product is not None and self.validate_product(product):
                        all_products.append(product)
                self.logger.info(f"\u2713 Page {page}: Extracted {len(products)} products")
                # No need to wait after the final allowed page.
                if page < max_pages:
                    await asyncio.sleep(1)
        return all_products

In [None]:
# @title Execution
# Build the scraper with its default output directory ("data").
scraper = AlFatahScraper()
# Top-level await: works inside an IPython/Jupyter cell, not in a plain script.
products = await scraper.scrape()
# Write the CSV only when at least one product was collected.
if products:
    scraper.save_to_csv(products)
import pandas as pd
# Preview the first rows; `display` is supplied by the notebook environment.
if products:
    df = pd.DataFrame(products)
    display(df.head())