In [7]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
import time
import pandas as pd

In case you haven't installed playwright headless browser execute the following cell. 

In [8]:
# !playwright install

In [19]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright, TimeoutError
import pandas as pd
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Apply nest_asyncio patch
nest_asyncio.apply()

async def scrape_immoscout24():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # Set to False for debugging
        context = await browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            viewport={'width': 1920, 'height': 1080},
        )
        page = await context.new_page()
        
        # URL for apartments for sale in Berlin
        url = 'https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-kaufen'
        
        logger.info(f"Navigating to URL: {url}")
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        logger.info("Page loaded. Waiting for network idle...")
        await page.wait_for_load_state('networkidle', timeout=60000)
        logger.info("Network is idle. Looking for listings...")

        # Disable webdriver detection
        await page.evaluate("() => { Object.defineProperty(navigator, 'webdriver', { get: () => undefined }) }")

        apartments = []
        has_next_page = True
        page_num = 1

        while has_next_page:
            logger.info(f"Scraping page {page_num}...")
            
            # Wait for the listings to load
            try:
                await page.wait_for_selector('article[data-id]', state='visible', timeout=60000)
            except TimeoutError:
                logger.error(f"Timeout waiting for listings on page {page_num}. Taking a screenshot for debugging.")
                await page.screenshot(path=f'debug_screenshot_page{page_num}.png')
                logger.info(f"Current URL: {page.url}")
                break

            # Extract data from each listing
            page_apartments = await page.evaluate("""
                () => {
                    const listings = document.querySelectorAll('#listings');
                    return Array.from(listings).map(listing => {
                        const title = listing.querySelector('.result-list-entry__brand-title')?.textContent.trim();
                        const price = listing.querySelector('.result-list-entry__primary-criterion:nth-child(1) dd:nth-child(1)')?.textContent.trim();
                        const size = listing.querySelector('.result-list-entry__primary-criterion:nth-child(2) dd:nth-child(1)')?.textContent.trim();
                        const address = listing.querySelector('.result-list-entry__map-link')?.textContent.trim();
                        const link = listing.querySelector('a.result-list-entry__brand-title-container')?.href;

                        return { title, price, size, address, link };
                    });
                }
            """)

            apartments.extend(page_apartments)

            # Check if there's a next page
            has_next_page = await page.evaluate("""
                () => {
                    const nextButton = document.querySelector('a[data-is24-qa="paging-next"]');
                    return nextButton && !nextButton.hasAttribute('disabled');
                }
            """)

            if has_next_page:
                await page.click('a[data-is24-qa="paging-next"]')
                await page.wait_for_load_state('networkidle')
                page_num += 1
            
            # Add a small delay to be respectful to the website
            await asyncio.sleep(2)

        await browser.close()

        logger.info(f"Total apartments scraped: {len(apartments)}")
        return apartments

async def main():
    try:
        apartments = await scrape_immoscout24()
        
        # Convert the list of dictionaries to a DataFrame
        df = pd.DataFrame(apartments)
        
        # Display the first few rows of the DataFrame
        logger.info("First few rows of scraped data:")
        logger.info(df.head())
        
        # Save the DataFrame to a CSV file
        df.to_csv('berlin_apartments.csv', index=False)
        logger.info("Data saved to berlin_apartments.csv")

        return df 
    except Exception as e:
        logger.error(f"An error occurred in main: {str(e)}")
        raise

# Run the main function
try:
    df = asyncio.get_event_loop().run_until_complete(main())
except Exception as e:
    logger.error(f"An error occurred while running the script: {str(e)}")


INFO:__main__:Navigating to URL: https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-kaufen
INFO:__main__:Page loaded. Waiting for network idle...
INFO:__main__:Network is idle. Looking for listings...
INFO:__main__:Scraping page 1...
ERROR:__main__:Timeout waiting for listings on page 1. Taking a screenshot for debugging.
INFO:__main__:Current URL: https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-kaufen
INFO:__main__:Total apartments scraped: 0
INFO:__main__:First few rows of scraped data:
INFO:__main__:Empty DataFrame
Columns: []
Index: []
INFO:__main__:Data saved to berlin_apartments.csv


In [17]:
df.head()