<a href="https://colab.research.google.com/github/parpsyche/eauctionsindia_scrape/blob/main/scrape_delhi_properties.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# --- Configuration ---
# Timestamped, INFO-level logging for the whole run.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Site root; prefixed onto relative links found in scraped pages.
BASE_URL = "https://www.eauctionsindia.com"

# First listing page. The trailing path segment is the page number.
START_URL = "https://www.eauctionsindia.com/city/new-delhi/1"

# Number of threads to use for scraping detail pages.
MAX_WORKERS = 10

# Headers sent with every HTTP request (a desktop Chrome User-Agent).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# --- Helper Functions for Safe Extraction ---
def find_detail_by_strong_text(soup, card_title, detail_label):
    """Return the value displayed next to a <strong> label inside a titled card.

    The card is located through its <h5 class="text-secondary"> heading whose
    string equals ``card_title``. Within the matching card body, the first
    <strong> whose string contains ``detail_label`` is taken as the label, and
    the stripped text of the element that follows it is returned.

    Returns 'N/A' whenever any of those lookups comes up empty or fails.
    """
    def label_matches(text):
        # ``text`` is None for tags without a single string child.
        return text and detail_label in text

    try:
        heading = soup.find('h5', class_='text-secondary', string=card_title)
        if heading:
            header_div = heading.find_parent('div', class_='card-header')
            body = header_div.find_next_sibling('div', class_='card-body')
            if body:
                label = body.find('strong', string=label_matches)
                if label:
                    value = label.find_next_sibling()
                    if value:
                        return value.get_text(strip=True)
    except (AttributeError, TypeError):
        # A missing intermediate node surfaces as an attribute access on None.
        pass
    return 'N/A'

# --- Stage 1: Scraping Listing Pages ---
def get_last_page_number(session, url):
    """Work out how many listing pages exist by reading the pagination links.

    Fetches ``url`` with ``session`` and returns the largest numeric label
    found among the pagination links. Returns 1 when no numeric pagination
    link is present, and 0 when anything goes wrong (the error is logged).
    """
    try:
        logging.info(f"Determining total number of pages from: {url}")
        resp = session.get(url, headers=HEADERS, timeout=15)
        resp.raise_for_status()
        parsed = BeautifulSoup(resp.text, 'html.parser')

        # Keep only links whose label is a number (skips "next"/"prev" style links).
        numeric_labels = []
        for anchor in parsed.select('ul.pagination li a.page-link'):
            if anchor.text.isdigit():
                numeric_labels.append(int(anchor.text))

        if numeric_labels:
            highest = max(numeric_labels)
            logging.info(f"Found {highest} pages to scrape.")
            return highest

        logging.warning("Pagination controls not found. Assuming only one page.")
        return 1
    except Exception as e:
        logging.error(f"Could not determine the last page number: {e}")
        return 0

def scrape_summary_page(session, page_url):
    """Scrape the summary auction listings from a single listing page.

    Args:
        session: Session object used to fetch the page (its ``get`` is called
            with the module-level HEADERS and a 15 second timeout).
        page_url: URL of the listing page to scrape.

    Returns:
        A list with one dict per listing card. Every dict carries the keys
        'title', 'property_url', 'reserve_price' and 'auction_id'; a value
        that cannot be found on the card is the string 'N/A'. An empty list
        is returned when the page cannot be fetched (a warning is logged).
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    try:
        response = session.get(page_url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        html = response.text
    except requests.exceptions.RequestException as e:
        logging.warning(f"Could not retrieve summary page {page_url}. Error: {e}")
        return []

    def span_value(card, label, *noise):
        """Text of the first <span> containing ``label``, with ``noise`` removed."""
        span = card.find(lambda tag: tag.name == 'span' and label in tag.text)
        if not span:
            return 'N/A'
        text = span.get_text(strip=True)
        for token in noise:
            text = text.replace(token, '')
        return text.strip()

    soup = BeautifulSoup(html, 'html.parser')
    page_auctions = []

    for card in soup.select('div.col-xl-9 > div.row.mb-3[style*="border"]'):
        # Default to 'N/A' so every row has the same keys; the detail-page
        # worker treats a property_url of 'N/A' as "nothing to fetch".
        summary_data = {'title': 'N/A', 'property_url': 'N/A'}

        title_element = card.find('h5', class_='font-weight-bold')
        if title_element:
            summary_data['title'] = title_element.get_text(strip=True)
            link = title_element.parent
            # .get() avoids a KeyError (which would abort the whole run) when
            # the heading's parent carries no href attribute.
            href = link.get('href') if link is not None else None
            if href:
                # urljoin leaves absolute hrefs intact and resolves
                # site-relative ones against BASE_URL.
                summary_data['property_url'] = urljoin(BASE_URL, href)

        summary_data['reserve_price'] = span_value(card, 'Reserve Price', 'Reserve Price :')
        summary_data['auction_id'] = span_value(card, 'Auction ID', 'Auction ID :', '#')

        page_auctions.append(summary_data)
    return page_auctions

# --- Stage 2: Scraping Detail Pages (Multithreaded) ---
# (output key, card title, <strong> label) for every field read through
# find_detail_by_strong_text, in the order the keys are added to the row.
_DETAIL_FIELDS = (
    ('emd', 'Bank Details', 'EMD :'),
    ('bank_name', 'Bank Details', 'Bank Name :'),
    ('branch_name_detail', 'Bank Details', 'Branch Name :'),
    ('service_provider', 'Bank Details', 'Service Provider :'),
    ('borrower_name', 'Property Details', 'Borrower Name :'),
    ('property_type_detail', 'Property Details', 'Property Type :'),
    ('auction_start_date', 'Property Details', 'Auction Start Date :'),
    ('auction_end_time', 'Property Details', 'Auction End Time :'),
    # NOTE(review): label spelling kept exactly as before; presumably it
    # mirrors the text on the site -- verify before "correcting" it.
    ('submission_date', 'Property Details', 'Application Subbmision Date :'),
)


def _find_card_body(soup, card_title):
    """Return the card-body <div> of the card headed by ``card_title``, or None."""
    header = soup.find('h5', class_='text-secondary', string=card_title)
    if header is None:
        return None
    # find_parent/find_next_sibling take a tag name plus attribute filters,
    # not a CSS selector such as '.card-header'.
    header_div = header.find_parent('div', class_='card-header')
    if header_div is None:
        return None
    return header_div.find_next_sibling('div', class_='card-body')


def _extract_description(soup):
    """Return the text of the first <p> in the 'Description' card, or 'N/A'."""
    body = _find_card_body(soup, 'Description')
    wrapper = body.find('div', class_='mb-4') if body is not None else None
    paragraph = wrapper.find('p') if wrapper is not None else None
    return paragraph.get_text(strip=True) if paragraph is not None else 'N/A'


def _extract_download_links(soup):
    """Return the 'Downloads' card's links as a ', '-joined string, or 'N/A'."""
    body = _find_card_body(soup, 'Downloads')
    if body is None:
        return 'N/A'
    links = []
    for anchor in body.select('a[href]'):
        href = anchor['href']
        # Site-relative links are made absolute; anything else is kept as-is.
        links.append(BASE_URL + href if href.startswith('/') else href)
    return ', '.join(links) if links else 'N/A'


def scrape_property_details(args):
    """Worker function for threads to scrape a single property's detail page.

    Args:
        args: A ``(session, summary_data)`` tuple. ``summary_data`` is the
            listing's summary dict; it is updated in place.

    Returns:
        The same ``summary_data`` dict. When it has no usable 'property_url'
        (missing, empty or 'N/A'), or the page cannot be fetched (a warning
        is logged), it is returned without detail fields. Otherwise it gains
        the keys listed in ``_DETAIL_FIELDS`` plus 'description' and
        'download_links', each set to 'N/A' when not found on the page.
    """
    session, summary_data = args
    url = summary_data.get('property_url')

    if not url or url == 'N/A':
        return summary_data

    try:
        response = session.get(url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        html = response.text
    except requests.exceptions.RequestException as e:
        logging.warning(f"Failed to fetch details for {url}: {e}")
        return summary_data

    soup = BeautifulSoup(html, 'html.parser')

    for key, card_title, label in _DETAIL_FIELDS:
        summary_data[key] = find_detail_by_strong_text(soup, card_title, label)

    summary_data['description'] = _extract_description(soup)
    summary_data['download_links'] = _extract_download_links(soup)

    return summary_data

# --- Main Orchestrator ---
def main():
    """Run the full scrape and write the results to a CSV file.

    Stage 1 walks every listing page (sequentially, pausing 0.5 s between
    pages) and collects the summary rows. Stage 2 fetches each property's
    detail page on a thread pool of MAX_WORKERS threads. Stage 3 arranges the
    columns in a fixed order, prints a sample and saves the CSV.

    Returns None. The function returns early, after logging, when the page
    count cannot be determined or when no listings are collected.
    """
    with requests.Session() as session:
        # --- STAGE 1: Get all summary listings ---
        # Drop the trailing page number so pages can be addressed as base/<n>.
        pagination_base_url = START_URL.rsplit('/', 1)[0]
        last_page = get_last_page_number(session, START_URL)
        if last_page == 0:
            logging.error("Could not determine pages. Aborting.")
            return

        all_summaries = []
        logging.info("--- STAGE 1: Fetching all property summaries ---")
        for page_num in tqdm(range(1, last_page + 1), desc="Scraping Summary Pages"):
            page_url = f"{pagination_base_url}/{page_num}"
            summaries = scrape_summary_page(session, page_url)
            if summaries:
                all_summaries.extend(summaries)
            time.sleep(0.5)  # brief pause between listing-page requests

        if not all_summaries:
            logging.error("No property summaries found. Exiting.")
            return

        logging.info(f"Found {len(all_summaries)} total properties to process.")

        # --- STAGE 2: Use ThreadPoolExecutor to scrape details in parallel ---
        logging.info("\n--- STAGE 2: Fetching property details (multithreaded) ---")
        all_detailed_data = []
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_summary = {
                executor.submit(scrape_property_details, (session, summary)): summary
                for summary in all_summaries
            }
            # Results arrive in completion order, not listing order.
            for future in tqdm(as_completed(future_to_summary), total=len(all_summaries), desc="Scraping Details"):
                summary = future_to_summary[future]
                try:
                    result = future.result()
                except Exception as e:
                    logging.error(
                        f"A thread generated an exception for {summary.get('property_url')}: {e}"
                    )
                    # Keep the listing with whatever was collected so far,
                    # as the worker itself does when the request fails.
                    result = summary
                if result:
                    all_detailed_data.append(result)

    # --- STAGE 3: Process and save the collected data ---
    if not all_detailed_data:
        logging.warning("Scraping finished, but no detailed data was collected.")
        return

    logging.info(f"Successfully scraped details for {len(all_detailed_data)} listings.")
    df = pd.DataFrame(all_detailed_data)

    desired_order = [
        'auction_id', 'title', 'reserve_price', 'emd', 'bank_name',
        'property_type_detail', 'borrower_name', 'description', 'auction_start_date',
        'auction_end_time', 'submission_date', 'branch_name_detail', 'service_provider',
        'property_url', 'download_links'
    ]
    # Make sure every expected column exists before selecting them in order.
    for col in desired_order:
        if col not in df.columns:
            df[col] = 'N/A'
    df = df[desired_order]

    print("\n--- Scraping Complete ---")
    print(f"Total listings processed: {len(df)}")
    print("\nSample of final data:")
    print(df.head())

    try:
        output_filename = 'eauctionsindia_new-delhi_full_details.csv'
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        logging.info(f"All data has been successfully saved to {output_filename}")
    except Exception as e:
        logging.error(f"Failed to save data to CSV file: {e}")

# Start the scrape only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Scraping Summary Pages: 100%|██████████| 174/174 [02:23<00:00,  1.21it/s]
Scraping Details:  23%|██▎       | 474/2087 [01:14<03:49,  7.04it/s]