In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime

In [11]:
# eBay UK search-results URL that the scraper below fetches. The query string
# searches for "pram buggy pushchair stroller" (_nkw) and sets LH_Sold=1,
# LH_BIN=1, LH_ItemCondition=4, LH_SellerType=1, LH_PrefLoc=1 and _ipg=240.
# NOTE(review): presumably these limit results to sold Buy-It-Now listings with
# 240 items per page — confirm against eBay's URL parameters before relying on it.
url = "https://www.ebay.co.uk/sch/i.html?_dcat=66700&_fsrp=1&rt=nc&_from=R40&LH_PrefLoc=1&_ipg=240&LH_ItemCondition=4&LH_Sold=1&_nkw=pram+buggy+pushchair+stroller&_sacat=0&LH_BIN=1&_sop=12&LH_SellerType=1"

In [None]:


def scrape_ebay_sold_items(url):
    """
    Scrape sold listings from an eBay search-results page into a DataFrame.

    Fetches ``url``, locates every ``div.su-card-container__content`` element,
    runs ``extract_from_container`` on each one and collects the items that
    could be parsed.

    Parameters
    ----------
    url : str
        Search-results URL to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per unique listing with the columns 'date_sold',
        'description' and 'price_sold'. The frame is empty, but still carries
        these three columns, when no item could be extracted or when the
        request/parsing raised (the error is printed rather than re-raised).
    """
    # Single source of truth for the result schema so that the "nothing found"
    # path and the error path return frames with identical columns.
    columns = ['date_sold', 'description', 'price_sold']
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        print("Extracting from su-card-container__content containers...")

        # Find the correct item containers
        item_containers = soup.find_all('div', class_='su-card-container__content')
        print(f"Found {len(item_containers)} item containers")

        extracted_items = []

        for container in item_containers:
            item_data = extract_from_container(container)

            # Containers that are not complete sold listings come back as None
            if not item_data:
                continue

            extracted_items.append(item_data)

            # Show progress for first few items
            if len(extracted_items) <= 5:
                print(f"  ✓ Item {len(extracted_items)}: {item_data['description'][:40]}... - £{item_data['price_sold']}")

        # Passing `columns` keeps the schema even when no item was extracted
        df = pd.DataFrame(extracted_items, columns=columns)

        if not df.empty:
            # Remove exact duplicates
            df = df.drop_duplicates(subset=['description', 'price_sold', 'date_sold'])
            df = df.reset_index(drop=True)

            print(f"Final dataset: {len(df)} unique items")

        return df

    except Exception as e:
        # Deliberately best-effort: report the failure and hand back an empty,
        # correctly-shaped frame so callers can simply test `df.empty`.
        print(f"Error: {e}")
        return pd.DataFrame(columns=columns)

# Full English month names. A month token from a "Sold ..." label is accepted
# when it is a prefix (3+ letters) of one of these, so "Jun", "June" and "Sept"
# all resolve, independent of the process locale.
_MONTH_NAMES = (
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
)


def _parse_sold_date(date_text):
    """Return the date from a 'Sold <day> <month> <year>' label, or None if it cannot be parsed."""
    date_match = re.search(r'sold\s+(\d{1,2})\s+(\w+)\s+(\d{4})', date_text, re.I)
    if not date_match:
        return None

    day, month_token, year = date_match.groups()
    month_token = month_token.lower()
    # Require at least three letters so ambiguous stubs like "ju" or "ma" are rejected
    if len(month_token) < 3:
        return None

    for month_number, month_name in enumerate(_MONTH_NAMES, start=1):
        if month_name.startswith(month_token):
            break
    else:
        return None

    try:
        return datetime(int(year), month_number, int(day)).date()
    except ValueError:
        # Impossible calendar dates such as "31 Feb 2025"
        return None


def extract_from_container(container):
    """
    Extract complete item data from su-card-container__content

    Parameters
    ----------
    container
        Element exposing a BeautifulSoup-style ``find`` method (one
        ``su-card-container__content`` div).

    Returns
    -------
    dict or None
        ``{'date_sold': datetime.date, 'description': str, 'price_sold': float}``
        when the container holds a single '£' price, a 'Sold <day> <month>
        <year>' label and an '/itm/' link whose text is at least 5 characters
        long; otherwise None. Never raises: any unexpected error also yields
        None.
    """
    try:
        # Find price within this container
        price_elem = container.find('span', class_='s-card__price')
        if not price_elem:
            return None

        price_text = price_elem.get_text().strip()

        # Skip price ranges
        if 'to' in price_text.lower():
            return None

        price_match = re.search(r'£([\d,]+\.?\d*)', price_text)
        if not price_match:
            return None

        # Thousands separators are stripped before conversion ("1,234.50" -> 1234.5)
        price_sold = float(price_match.group(1).replace(',', ''))

        # Find sold date within this container
        date_elem = container.find('span', string=re.compile(r'sold.*\d+.*\w+.*\d{4}', re.I))
        if not date_elem:
            return None

        date_sold = _parse_sold_date(date_elem.get_text().strip())
        if date_sold is None:
            return None

        # Find description within this container
        desc_elem = container.find('a', href=re.compile(r'/itm/'))
        if not desc_elem:
            return None

        description = desc_elem.get_text().strip()

        # Validate description (should be substantial and item-specific)
        if not description or len(description) < 5:
            return None

        return {
            'date_sold': date_sold,
            'description': description,
            'price_sold': price_sold
        }

    except Exception:
        # Deliberately best-effort: one malformed card (e.g. a price of just
        # "£,") must not abort the whole scrape, so it is skipped instead.
        return None

In [None]:
# Run the scraper against the search URL, then report on and persist the result
df = scrape_ebay_sold_items(url)

if df.empty:
    # Nothing usable came back (no parsable items, or the request failed)
    print("❌ Extraction failed")
else:
    # Headline statistics for the scraped listings
    print(f"Dataset summary:")
    print(f"  Total items: {len(df)}")
    print(f"  Date range: {df['date_sold'].min()} to {df['date_sold'].max()}")
    print(f"  Price range: £{df['price_sold'].min():.2f} to £{df['price_sold'].max():.2f}")

    # Write the results to CSV, leaving out the positional index column
    df.to_csv('ebay_sold_items_final.csv', index=False)
    print(f"\n💾 Saved final dataset to 'ebay_sold_items_final.csv'")