In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
from config.config import ITEM_URL_BASE, BASE_URL, HOME_FILE, OUTPUT_FILE
from logger.logger import logger  # Import the configured logger

headers = {'User-Agent': 'Mozilla/5.0'}

def get_unique_item_ids(max_pages=100):
    """Fetch unique item IDs from paginated pages with proper logging."""
    item_ids = set()
    page = 1
    has_more_pages = True
    
    logger.info(f"Starting to scrape item IDs (max pages: {max_pages})")
    
    while has_more_pages:
        if max_pages and page > max_pages:
            logger.info(f"Reached max pages limit ({max_pages})")
            break
            
        url = f"{BASE_URL}?page={page}"
        logger.debug(f"Fetching page {page}: {url}")
        
        try:
            response = requests.get(url, headers=headers)
            
            if response.status_code != 200:
                logger.error(f"Failed to fetch page {page}. Status code: {response.status_code}")
                break

            soup = BeautifulSoup(response.text, "html.parser")
            
            # Find all item links and extract IDs from hrefs
            current_page_ids = set()
            for a in soup.select("a.item_link"):
                href = a.get("href")
                if href:
                    match = re.search(r'/items/(\d+)', href)
                    if match:
                        item_id = match.group(1)
                        current_page_ids.add(item_id)
            
            # Check if we found any new items on this page
            if not current_page_ids:
                logger.info(f"No items found on page {page}, stopping pagination")
                has_more_pages = False
            else:
                new_items = current_page_ids - item_ids
                if not new_items:
                    logger.info(f"No new items found on page {page}, stopping pagination")
                    has_more_pages = False
                else:
                    logger.info(f"Found {len(new_items)} new items on page {page}")
                    item_ids.update(current_page_ids)
                    page += 1
                    
        except requests.RequestException as e:
            logger.error(f"Network error fetching page {page}: {str(e)}", exc_info=True)
            break
        except Exception as e:
            logger.error(f"Unexpected error processing page {page}: {str(e)}", exc_info=True)
            break

    # Save raw HTML for debugging
    try:
        with open(HOME_FILE, mode="w", newline="", encoding="utf-8") as f:
            f.write(response.text)
        logger.debug(f"Saved last page HTML to {HOME_FILE}")
    except Exception as e:
        logger.error(f"Failed to save HTML to {HOME_FILE}: {str(e)}", exc_info=True)

    logger.info(f"Finished scraping. Found {len(item_ids)} unique items")
    return item_ids

# Run the function and print results
item_ids = get_unique_item_ids()
links = list(item_ids)
print(links)
print(len(links))

2025-05-11 01:03:35 - scraper - INFO - Starting to scrape item IDs (max pages: 100)
2025-05-11 01:03:36 - scraper - INFO - Found 28 new items on page 1
2025-05-11 01:03:37 - scraper - INFO - Found 28 new items on page 2
2025-05-11 01:03:38 - scraper - INFO - Found 28 new items on page 3
2025-05-11 01:03:38 - scraper - INFO - Found 28 new items on page 4
2025-05-11 01:03:39 - scraper - INFO - Found 28 new items on page 5
2025-05-11 01:03:40 - scraper - INFO - Found 28 new items on page 6
2025-05-11 01:03:41 - scraper - INFO - Found 28 new items on page 7
2025-05-11 01:03:41 - scraper - INFO - Found 28 new items on page 8
2025-05-11 01:03:42 - scraper - INFO - Found 27 new items on page 9
2025-05-11 01:03:43 - scraper - INFO - Found 28 new items on page 10
2025-05-11 01:03:44 - scraper - INFO - Found 28 new items on page 11
2025-05-11 01:03:45 - scraper - INFO - Found 28 new items on page 12
2025-05-11 01:03:46 - scraper - INFO - Found 28 new items on page 13
2025-05-11 01:03:46 - scrape

['4732723', '2713423', '4823403', '5184339', '5156987', '4382279', '5192358', '5138970', '5192029', '5123748', '5153633', '5163670', '5192495', '5102625', '4776636', '5149972', '5192270', '5167981', '4647662', '5186769', '5184529', '5158217', '5191980', '5112392', '5191834', '4959532', '5135806', '5129932', '5140449', '5192028', '5153012', '5162563', '4786675', '5124957', '5091145', '5013154', '4797897', '5191153', '5192503', '5169318', '5187322', '5082549', '5125360', '3562101', '5165411', '4810964', '5059153', '5192510', '5179327', '5130441', '5182310', '5192309', '5161261', '4664196', '4743358', '5181171', '4875445', '5066883', '5029492', '5192060', '5192370', '5192325', '4266396', '4768514', '5186031', '4774744', '5120036', '4693377', '5178524', '3933573', '4834449', '5192323', '5101433', '4467649', '5183345', '5191990', '5100262', '5103466', '5019836', '5192214', '5155249', '4929802', '5191237', '4949873', '5192194', '5006282', '5119146', '5151886', '3788409', '3818917', '5115350'

In [2]:
def extract_item_data(link):
    """Extract detailed item data from individual product page with comprehensive logging"""
    url = ITEM_URL_BASE + link
    logger.debug(f"Starting to extract data from: {url}")
    
    try:
        response = requests.get(url, headers=headers)
        
        if response.status_code != 200:
            logger.error(f"Failed to fetch {url} - Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract item ID
        item_id = link.split("/")[-1]
        logger.debug(f"Processing item ID: {item_id}")

        # Extract amount and currency using meta tags
        amount_tag = soup.find('meta', {'property': 'product:price:amount'})
        currency_tag = soup.find('meta', {'property': 'product:price:currency'})

        amount = amount_tag['content'] if amount_tag else None
        currency = currency_tag['content'] if currency_tag else None
        logger.debug(f"Extracted price: {amount} {currency}")

        # Extract property fields dynamically
        properties = {}
        prop_div = soup.find("div", class_="product-properties__column")
        if prop_div:
            for prop in prop_div.find_all("div", class_="product-properties__i"):
                try:
                    name = prop.find("label", class_="product-properties__i-name").get_text(strip=True)
                    value = prop.find("span", class_="product-properties__i-value").get_text(strip=True)
                    properties[name] = value
                    logger.debug(f"Found property: {name} = {value}")
                except Exception as e:
                    logger.warning(f"Failed to extract property: {str(e)}", exc_info=True)

        # Extract locations
        location_tags = soup.find_all('a', {'data-stat': 'product-locations'})
        locations = [tag.get_text(strip=True) for tag in location_tags] if location_tags else []
        logger.debug(f"Found locations: {locations}")

        # Construct result dictionary
        item_data = {
            'item_id': item_id,
            'amount': amount,
            'currency': currency,
            'location': locations
        }

        # Add all found properties dynamically
        item_data.update(properties)
        
        logger.info(f"Successfully processed item: {item_id}")
        return item_data

    except requests.RequestException as e:
        logger.error(f"Network error while processing {url}: {str(e)}", exc_info=True)
        return None
    except Exception as e:
        logger.error(f"Unexpected error processing {url}: {str(e)}", exc_info=True)
        return None


def scrape_all_items():
    """Main scraping function that processes all items with progress tracking"""
    if not links:
        logger.warning("No links to process - empty list provided")
        return

    results = []
    logger.info(f"Starting to process {len(links)} items")

    for idx, link in enumerate(links, 1):
        logger.info(f"[{idx}/{len(links)}] Processing: {link}")
        data = extract_item_data(link)
        if data:
            results.append(data)
            logger.debug(f"Added item {data.get('item_id')} to results")
        else:
            logger.warning(f"Failed to process item: {link}")

    # Save to JSON
    try:
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(results)} items to {OUTPUT_FILE}")
        if len(results) != len(links):
            logger.warning(f"Processed {len(results)} out of {len(links)} items ({(len(results)/len(links))*100:.1f}% success rate)")
    except Exception as e:
        logger.error(f"Failed to save data to {OUTPUT_FILE}: {str(e)}", exc_info=True)


# Run the scraper
scrape_all_items()

2025-05-11 01:05:02 - scraper - INFO - Starting to process 2682 items
2025-05-11 01:05:02 - scraper - INFO - [1/2682] Processing: 4732723
2025-05-11 01:05:02 - scraper - INFO - Successfully processed item: 4732723
2025-05-11 01:05:02 - scraper - INFO - [2/2682] Processing: 2713423
2025-05-11 01:05:03 - scraper - INFO - Successfully processed item: 2713423
2025-05-11 01:05:03 - scraper - INFO - [3/2682] Processing: 4823403
2025-05-11 01:05:04 - scraper - INFO - Successfully processed item: 4823403
2025-05-11 01:05:04 - scraper - INFO - [4/2682] Processing: 5184339
2025-05-11 01:05:05 - scraper - INFO - Successfully processed item: 5184339
2025-05-11 01:05:05 - scraper - INFO - [5/2682] Processing: 5156987
2025-05-11 01:05:06 - scraper - INFO - Successfully processed item: 5156987
2025-05-11 01:05:06 - scraper - INFO - [6/2682] Processing: 4382279
2025-05-11 01:05:07 - scraper - INFO - Successfully processed item: 4382279
2025-05-11 01:05:07 - scraper - INFO - [7/2682] Processing: 519235