In [17]:
import requests
from bs4 import BeautifulSoup
import re

ITEM_URL_BASE = "https://bina.az/items/"
BASE_URL = "https://bina.az/items/all"
HOME_FILE = "data/test.html"
OUTPUT_FILE = "data/items_data.json"
headers = {'User-Agent': 'Mozilla/5.0'}

def get_unique_item_ids():
    # Send a request to the base URL
    response = requests.get(BASE_URL, headers=headers)
    
    if response.status_code != 200:
        print(f"Failed to fetch page: {response.status_code}")
        return set()

    soup = BeautifulSoup(response.text, "html.parser")

    item_ids = set()
    
    # Find all item links and extract IDs from hrefs
    for a in soup.select("a.item_link"):  # Adjust selector if needed
        href = a.get("href")
        if href:
            # Use regex to extract numeric ID from /items/123456 format
            match = re.search(r'/items/(\d+)', href)
            if match:
                item_id = match.group(1)
                item_ids.add(item_id)

    # Optional: Save raw HTML for debugging
    with open(HOME_FILE, mode="w", newline="", encoding="utf-8") as f:
        f.write(response.text)

    return item_ids

# Run the function and print results
item_ids = get_unique_item_ids()
links = list(item_ids)
print(links)

['5112825', '4983218', '4113285', '4711907', '4038124', '5167908', '4687575', '4266689', '4839041', '5107114', '5172433', '5037112', '5184523', '5168065', '5184519', '5186450', '4624278', '4530215', '5179247', '5170034', '5181371', '5145673', '4999301', '5160637', '5068771', '5119065', '4632009', '5144278']


In [18]:
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Assume links is defined from previous scraping step (e.g., from get_links_from_base())
# Example: links = ["/items/5160924", "/items/5110023", ...]

def extract_item_data(link):
    url = ITEM_URL_BASE + link
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract item ID
    item_id = link.split("/")[-1]

    # Extract amount and currency using meta tags
    amount_tag = soup.find('meta', {'property': 'product:price:amount'})
    currency_tag = soup.find('meta', {'property': 'product:price:currency'})

    amount = amount_tag['content'] if amount_tag else None
    currency = currency_tag['content'] if currency_tag else None

    # Extract property fields dynamically
    properties = {}
    prop_div = soup.find("div", class_="product-properties__column")
    if prop_div:
        for prop in prop_div.find_all("div", class_="product-properties__i"):
            name = prop.find("label", class_="product-properties__i-name").get_text(strip=True)
            value = prop.find("span", class_="product-properties__i-value").get_text(strip=True)
            properties[name] = value

    # Construct result dictionary
    item_data = {
        'item_id': item_id,
        'amount': amount,
        'currency': currency,
    }

    # Add all found properties dynamically
    item_data.update(properties)

    return item_data


def scrape_all_items():
    results = []

    for idx, link in enumerate(links):
        print(f"[{idx+1}/{len(links)}] Processing: {link}")
        data = extract_item_data(link)
        if data:
            results.append(data)

    # Save to JSON
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"Scraping completed. Data saved to {OUTPUT_FILE}")


# Run the scraper
scrape_all_items()

[1/28] Processing: 5112825
[2/28] Processing: 4983218
[3/28] Processing: 4113285
[4/28] Processing: 4711907
[5/28] Processing: 4038124
[6/28] Processing: 5167908
[7/28] Processing: 4687575
[8/28] Processing: 4266689
[9/28] Processing: 4839041
[10/28] Processing: 5107114
[11/28] Processing: 5172433
[12/28] Processing: 5037112
[13/28] Processing: 5184523
[14/28] Processing: 5168065
[15/28] Processing: 5184519
[16/28] Processing: 5186450
[17/28] Processing: 4624278
[18/28] Processing: 4530215
[19/28] Processing: 5179247
[20/28] Processing: 5170034
[21/28] Processing: 5181371
[22/28] Processing: 5145673
[23/28] Processing: 4999301
[24/28] Processing: 5160637
[25/28] Processing: 5068771
[26/28] Processing: 5119065
[27/28] Processing: 4632009
[28/28] Processing: 5144278
Scraping completed. Data saved to data/items_data.json
