In [9]:
import requests
from bs4 import BeautifulSoup
import re
import json
from config.config import BASE_URL
from logger.logger import logger

# HTTP headers sent with every page request made by get_unique_item_ids.
# NOTE(review): the browser-like User-Agent is presumably here so the site
# does not reject the default python-requests client — confirm.
headers = {'User-Agent': 'Mozilla/5.0'}

def get_unique_item_ids(max_pages=2, timeout=10):
    """Collect unique item IDs from the paginated listing at ``BASE_URL``.

    Pages are requested as ``{BASE_URL}?page=N`` starting from N = 1. On each
    page, every ``a.products-i__link`` anchor whose ``href`` contains
    ``/autos/<digits>`` contributes that digit string as an ID.

    Pagination stops as soon as one of the following happens:

    * ``max_pages`` pages have been processed;
    * a page responds with a status code other than 200;
    * a page yields no IDs, or only IDs that were already collected;
    * the request or the parsing raises an exception.

    Errors are logged rather than raised, so the caller always receives
    whatever was collected up to that point.

    Args:
        max_pages: Maximum number of pages to fetch. A falsy value (``None``
            or ``0``) disables the limit.
        timeout: Timeout in seconds passed to ``requests.get``. Without one,
            a stalled server would block the scraper indefinitely.

    Returns:
        set: The collected IDs, each one a string of digits.
    """
    # Compiled once: the same pattern is applied to every link on every page.
    id_pattern = re.compile(r'/autos/(\d+)')
    item_ids = set()
    page = 1

    logger.info(f"Starting to scrape item IDs (max pages: {max_pages})")

    while True:
        if max_pages and page > max_pages:
            logger.info(f"Reached max pages limit ({max_pages})")
            break

        url = f"{BASE_URL}?page={page}"
        logger.debug(f"Fetching page {page}: {url}")

        try:
            response = requests.get(url, headers=headers, timeout=timeout)

            if response.status_code != 200:
                logger.error(f"Failed to fetch page {page}. Status code: {response.status_code}")
                break

            soup = BeautifulSoup(response.text, "html.parser")

            # Extract the numeric ID from every item link that has an href.
            hrefs = (a.get("href") for a in soup.select("a.products-i__link"))
            matches = (id_pattern.search(href) for href in hrefs if href)
            current_page_ids = {m.group(1) for m in matches if m}

            if not current_page_ids:
                logger.info(f"No items found on page {page}, stopping pagination")
                break

            # A page that only repeats known IDs means we are past the last
            # page with fresh content, so there is no point going further.
            new_items = current_page_ids - item_ids
            if not new_items:
                logger.info(f"No new items found on page {page}, stopping pagination")
                break

            logger.info(f"Found {len(new_items)} new items on page {page}")
            item_ids.update(new_items)
            page += 1

        except requests.RequestException as e:
            # Timeouts raised by the ``timeout`` argument also land here.
            logger.error(f"Network error fetching page {page}: {str(e)}", exc_info=True)
            break
        except Exception as e:
            # Best-effort boundary: log and return what we have so far.
            logger.error(f"Unexpected error processing page {page}: {str(e)}", exc_info=True)
            break

    logger.info(f"Finished scraping. Found {len(item_ids)} unique items")
    return item_ids

# Scrape the IDs, then show them as a list followed by how many were found
item_ids = get_unique_item_ids()
links = [*item_ids]
print(links, len(links), sep="\n")

2025-06-04 20:16:27 - scraper - INFO - Starting to scrape item IDs (max pages: 2)
2025-06-04 20:16:29 - scraper - INFO - Found 36 new items on page 1
2025-06-04 20:16:33 - scraper - INFO - Found 35 new items on page 2
2025-06-04 20:16:33 - scraper - INFO - Reached max pages limit (2)
2025-06-04 20:16:33 - scraper - INFO - Finished scraping. Found 71 unique items


['9485366', '9470045', '9499757', '9441472', '9512961', '9506811', '9511123', '9493725', '9512964', '8860926', '9512956', '9512959', '9468381', '9122833', '9502951', '9433877', '9504555', '9512955', '9499025', '9248630', '7796314', '9512958', '9459711', '9316223', '9448228', '9436263', '9419397', '9512965', '9457851', '9512972', '9435389', '8904645', '9473032', '9512940', '9059811', '9480968', '9201400', '9512971', '9444787', '9399711', '9004270', '9310974', '9428149', '8801689', '9486312', '8854919', '9479885', '9512952', '9512926', '9321904', '9480305', '9512954', '9512962', '9270343', '9490749', '9512905', '9400495', '9512920', '9375328', '9502894', '9512909', '9261772', '9434653', '8210854', '9474554', '9500610', '9497935', '9462931', '9072179', '9498204', '9508303']
71
