In [22]:
import requests
from bs4 import BeautifulSoup
import re
import json

# Prefix that an item ID is appended to in order to form a detail-page URL.
ITEM_URL_BASE = "https://bina.az/items/"
# Paginated index of listings; requested with a "?page=N" query string.
BASE_URL = "https://bina.az/items/all"
# Debug dump: raw HTML of the last index page response is written here.
HOME_FILE = "data/test.html"
# Destination for the scraped per-item records (JSON).
OUTPUT_FILE = "data/items_data.json"
# HTTP headers sent with every request made in this notebook.
headers = {'User-Agent': 'Mozilla/5.0'}

def get_unique_item_ids(max_pages=3, timeout=30):
    """Collect the unique item IDs linked from the paginated index pages.

    Pages are requested as ``BASE_URL?page=N`` starting at N=1. Pagination
    stops when the page cap is reached, a request fails or returns a non-200
    status, a page contains no item links, or a page contributes no IDs that
    were not already seen.

    Args:
        max_pages: Maximum number of index pages to fetch. A falsy value
            (None or 0) removes the cap.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        set[str]: The numeric item IDs (as strings) gathered so far. Whatever
        was collected before a failure is still returned.

    Side effects:
        Prints progress messages and, if at least one response was received,
        writes the body of the last response to HOME_FILE for debugging.
    """
    item_ids = set()
    # Body of the most recent response. Stays None when no request succeeded,
    # so the debug dump below never touches an unbound variable.
    last_html = None
    page = 1

    while not max_pages or page <= max_pages:
        url = f"{BASE_URL}?page={page}"
        print(f"Fetching page {page}: {url}")

        try:
            # Without a timeout a stalled connection would block the cell forever.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {str(e)}")
            break

        last_html = response.text

        if response.status_code != 200:
            print(f"Failed to fetch page {page}: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Pull the numeric ID out of every /items/<digits> href on the page.
        current_page_ids = set()
        for a in soup.select("a.item_link"):  # Adjust selector if needed
            match = re.search(r'/items/(\d+)', a.get("href") or "")
            if match:
                current_page_ids.add(match.group(1))

        if not current_page_ids:
            print(f"No items found on page {page}, stopping pagination.")
            break

        new_items = current_page_ids - item_ids
        if not new_items:
            # A page that only repeats known IDs is treated as the end of the listing.
            print(f"No new items found on page {page}, stopping pagination.")
            break

        print(f"Found {len(new_items)} new items on page {page}")
        item_ids.update(current_page_ids)
        page += 1

    # Optional: save raw HTML of the last response for debugging. A failure
    # here must not discard the IDs that were already collected.
    if last_html is not None:
        try:
            with open(HOME_FILE, mode="w", newline="", encoding="utf-8") as f:
                f.write(last_html)
        except OSError as e:
            print(f"Could not save debug HTML to {HOME_FILE}: {e}")

    return item_ids

# Run the function and print results.
# Called with the default max_pages=3, so at most three index pages are fetched.
item_ids = get_unique_item_ids()
# Materialise the set as a list; `links` is read later by scrape_all_items().
links = list(item_ids)
print(links)
print(len(links))

Fetching page 1: https://bina.az/items/all?page=1
Found 28 new items on page 1
Fetching page 2: https://bina.az/items/all?page=2
Found 28 new items on page 2
Fetching page 3: https://bina.az/items/all?page=3
Found 28 new items on page 3
['4959734', '5165167', '5122759', '4802446', '5138486', '5186362', '4113285', '5171230', '4351437', '4939762', '4061219', '4687575', '4839041', '5172433', '5037112', '4707683', '5114032', '4749417', '5094015', '4966017', '5151006', '4986500', '5184519', '5165139', '5038500', '4624278', '4661811', '5097244', '5170034', '5109657', '5111890', '3485086', '4822793', '5185733', '5104486', '4959777', '5153278', '4999301', '5118470', '4901100', '5115965', '4158518', '4884485', '5068771', '4891042', '5060163', '5107114', '3818945', '5144278', '4486588', '4280166', '5110014', '5112825', '4632009', '5181767', '5089925', '5103590', '4983218', '5187275', '3713070', '4038124', '5167908', '4266689', '5181362', '5186086', '5184523', '5168065', '5163744', '5187276', '51

In [23]:
# Re-declares the request headers with the same value assigned in the previous cell.
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Assume links is defined from previous scraping step (e.g., from get_links_from_base())
# Example: links = ["/items/5160924", "/items/5110023", ...]

def extract_item_data(link, timeout=30):
    """Fetch one item page and return its price and property fields.

    Args:
        link: Item identifier. Either a bare ID ("5160924") or a path that
            ends in the ID ("/items/5160924") is accepted.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        dict | None: A dict with 'item_id', 'amount' and 'currency' plus one
        entry per property label found on the page, or None when the page
        could not be fetched (network error or non-200 status).
    """
    # Reduce the link to the bare ID first, so a "/items/<id>" style link does
    # not end up with the path segment repeated after ITEM_URL_BASE.
    item_id = str(link).rstrip("/").split("/")[-1]
    url = ITEM_URL_BASE + item_id

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # One unreachable page should be skipped, not abort the whole batch.
        print(f"Failed to fetch {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Price and currency are read from the product meta tags; .get() keeps a
    # tag without a "content" attribute from raising KeyError.
    amount_tag = soup.find('meta', {'property': 'product:price:amount'})
    currency_tag = soup.find('meta', {'property': 'product:price:currency'})

    amount = amount_tag.get('content') if amount_tag is not None else None
    currency = currency_tag.get('content') if currency_tag is not None else None

    # Extract property fields dynamically.
    # NOTE(review): only the first "product-properties__column" div is read;
    # confirm whether item pages can contain more than one column.
    properties = {}
    prop_div = soup.find("div", class_="product-properties__column")
    if prop_div is not None:
        for prop in prop_div.find_all("div", class_="product-properties__i"):
            name_tag = prop.find("label", class_="product-properties__i-name")
            value_tag = prop.find("span", class_="product-properties__i-value")
            # Skip rows missing either half instead of failing on None.get_text.
            if name_tag is None or value_tag is None:
                continue
            properties[name_tag.get_text(strip=True)] = value_tag.get_text(strip=True)

    # Construct result dictionary
    item_data = {
        'item_id': item_id,
        'amount': amount,
        'currency': currency,
    }

    # Add all found properties dynamically
    item_data.update(properties)

    return item_data


def scrape_all_items():
    """Scrape every entry of the module-level `links` list and save the results.

    Each link is passed to extract_item_data(); falsy results (e.g. None for
    a page that could not be fetched) are dropped. The remaining records are
    written to OUTPUT_FILE as an indented, non-ASCII-escaped JSON array.
    """
    collected = []
    position = 0

    for link in links:
        position += 1
        print(f"[{position}/{len(links)}] Processing: {link}")
        record = extract_item_data(link)
        if not record:
            continue
        collected.append(record)

    # Persist everything in one go once the loop has finished.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
        json.dump(collected, out_file, indent=2, ensure_ascii=False)

    print(f"Scraping completed. Data saved to {OUTPUT_FILE}")


# Run the scraper: one HTTP request per entry in `links`, results saved to OUTPUT_FILE.
scrape_all_items()

[1/84] Processing: 4959734
[2/84] Processing: 5165167
[3/84] Processing: 5122759
[4/84] Processing: 4802446
[5/84] Processing: 5138486
[6/84] Processing: 5186362
[7/84] Processing: 4113285
[8/84] Processing: 5171230
[9/84] Processing: 4351437
[10/84] Processing: 4939762
[11/84] Processing: 4061219
[12/84] Processing: 4687575
[13/84] Processing: 4839041
[14/84] Processing: 5172433
[15/84] Processing: 5037112
[16/84] Processing: 4707683
[17/84] Processing: 5114032
[18/84] Processing: 4749417
[19/84] Processing: 5094015
[20/84] Processing: 4966017
[21/84] Processing: 5151006
[22/84] Processing: 4986500
[23/84] Processing: 5184519
[24/84] Processing: 5165139
[25/84] Processing: 5038500
[26/84] Processing: 4624278
[27/84] Processing: 4661811
[28/84] Processing: 5097244
[29/84] Processing: 5170034
[30/84] Processing: 5109657
[31/84] Processing: 5111890
[32/84] Processing: 3485086
[33/84] Processing: 4822793
[34/84] Processing: 5185733
[35/84] Processing: 5104486
[36/84] Processing: 4959777
[