In [7]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright, TimeoutError
import pandas as pd
import logging

If you have not yet installed the Playwright headless browsers, uncomment and run the following cell.

In [8]:
# !playwright install

In [50]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright, TimeoutError
import pandas as pd
import logging

# Set up logging.
# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before applying this configuration. Without it, basicConfig() does
# nothing when the root logger is already configured (e.g. when this cell is
# re-run, or when the kernel set up logging first), and the format below is
# silently ignored.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

# Apply nest_asyncio patch so that run_until_complete() can be used from code
# that is itself executing inside an already-running event loop (as is the
# case in a notebook kernel).
nest_asyncio.apply()

# CSS selectors used on the search-result pages.
_LISTING_SELECTOR = '.result-list__listing'
_COOKIE_BUTTON_SELECTOR = 'button[data-testid="uc-accept-all-button"]'
_NEXT_BUTTON_SELECTOR = '.p-next'

_DEFAULT_SEARCH_URL = 'https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-kaufen'
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Executed inside the page. Receives the listing selector as its argument and
# returns one plain object per listing element. Optional chaining (`?.`) leaves
# a field undefined instead of throwing when the element it reads is absent.
_EXTRACT_LISTINGS_JS = """
    (listingSelector) => {
        const listings = document.querySelectorAll(listingSelector);
        return Array.from(listings).map(listing => {
            const title = listing.querySelector('.result-list-entry__brand-title')?.textContent.trim();
            const price = listing.querySelector('.result-list-entry__primary-criterion:nth-child(1) dd:nth-child(1)')?.textContent.trim();
            const size = listing.querySelector('.result-list-entry__primary-criterion:nth-child(2) dd:nth-child(1)')?.textContent.trim();
            const address = listing.querySelector('.result-list-entry__map-link')?.textContent.trim();
            const link = listing.querySelector('a.result-list-entry__brand-title-container')?.href;

            return { title, price, size, address, link };
        });
    }
"""


async def _wait_for_network_idle(page, timeout=None):
    """Best-effort wait for the 'networkidle' load state; a timeout is only logged."""
    try:
        await page.wait_for_load_state('networkidle', timeout=timeout)
    except TimeoutError:
        # Pages with continuous background traffic may never go idle; the
        # caller's explicit selector waits are what actually gate progress.
        logger.warning("Network did not become idle in time; continuing anyway.")


async def _accept_cookie_consent(page):
    """Click the cookie 'accept all' button if it shows up within 10 seconds."""
    try:
        # wait_for_selector returns the element handle, so no second lookup
        # is needed.
        cookie_button = await page.wait_for_selector(_COOKIE_BUTTON_SELECTOR, timeout=10000)
        if cookie_button is None:
            return
        await cookie_button.click()
    except TimeoutError:
        logger.warning("Cookie consent button not found or took too long to appear.")
        return

    logger.info("Clicked cookie consent button")
    # Kept outside the try above so that a slow network after the click is
    # not misreported as a missing button.
    await _wait_for_network_idle(page)


async def scrape_immoscout24(url=_DEFAULT_SEARCH_URL, headless=False, max_pages=None):
    """Scrape apartment listings from an ImmoScout24 search-result page.

    Opens the search page in Chromium, accepts the cookie banner when one
    appears, then walks the result pages by clicking the "next" control until
    it is missing or disabled, a page fails to load, or ``max_pages`` pages
    have been collected.

    Args:
        url: Search-result URL to start from. Defaults to the Berlin
            "wohnung-kaufen" search.
        headless: Passed to ``chromium.launch``. Defaults to ``False``
            (a visible browser window).
        max_pages: Maximum number of result pages to scrape, or ``None``
            (default) for no limit.

    Returns:
        A list with one dict per listing, with the keys ``title``, ``price``,
        ``size``, ``address`` and ``link`` as produced by the in-page
        extraction script.

    Raises:
        ValueError: If ``max_pages`` is given and is smaller than 1.
        Exception: Errors other than the handled timeouts (for example a
            failed navigation) propagate to the caller; the browser is closed
            first.
    """
    if max_pages is not None and max_pages < 1:
        raise ValueError(f"max_pages must be a positive integer or None, got {max_pages!r}")

    apartments = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            context = await browser.new_context(
                user_agent=_USER_AGENT,
                viewport={'width': 1920, 'height': 1080},
            )
            page = await context.new_page()

            logger.info(f"Navigating to URL: {url}")
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            logger.info("Page loaded. Waiting for network idle...")
            await _wait_for_network_idle(page, timeout=60000)

            # Automatically accept cookie consent if the button is present
            await _accept_cookie_consent(page)

            page_num = 1
            while True:
                logger.info(f"Scraping page {page_num}...")

                try:
                    # Wait for listings to load
                    await page.wait_for_selector(_LISTING_SELECTOR, state='visible', timeout=60000)
                except TimeoutError:
                    logger.error(f"Timeout waiting for listings on page {page_num}. Taking a screenshot for debugging.")
                    await page.screenshot(path=f'debug_screenshot_page{page_num}.png')
                    logger.info(f"Current URL: {page.url}")
                    break

                # One DOM pass yields both the records and their count.
                page_apartments = await page.evaluate(_EXTRACT_LISTINGS_JS, _LISTING_SELECTOR)
                logger.info(f"Found {len(page_apartments)} listings on page {page_num}")

                if not page_apartments:
                    logger.warning("No listings found on this page. Taking a screenshot.")
                    await page.screenshot(path=f'debug_no_listings_page{page_num}.png')
                    break

                apartments.extend(page_apartments)
                logger.info(f"Extracted {len(page_apartments)} apartments from page {page_num}")

                if max_pages is not None and page_num >= max_pages:
                    logger.info(f"Reached max_pages={max_pages}; stopping.")
                    break

                # Check if there's a next page
                next_button = await page.query_selector(_NEXT_BUTTON_SELECTOR)
                if next_button is None:
                    logger.info("No more pages to scrape.")
                    break

                is_disabled = await next_button.evaluate("el => el.classList.contains('disabled')")
                if is_disabled:
                    logger.info("Next button is disabled; reached the last page.")
                    break

                logger.info("Found next page button. Clicking...")
                await next_button.click()

                try:
                    # NOTE(review): the previous page's listings match the same
                    # selector, so this fixed delay is what gives the navigation
                    # time to start; the selector wait alone cannot tell old
                    # content from new.
                    await asyncio.sleep(3)
                    await page.wait_for_selector(_LISTING_SELECTOR, state='visible', timeout=60000)
                except TimeoutError:
                    logger.error("Timeout waiting for next page to load. Taking a screenshot.")
                    await page.screenshot(path=f'debug_screenshot_nextpage{page_num}.png')
                    break

                logger.info(f"Navigated to next page. New URL: {page.url}")
                page_num += 1

                # Pause between pages before scraping the next one.
                await asyncio.sleep(5)
        finally:
            # Close the browser on every exit path, including errors raised
            # while navigating or extracting.
            await browser.close()

    logger.info(f"Total apartments scraped: {len(apartments)}")
    return apartments

async def main():
    """Run the scraper, log a preview of the results and save them as CSV.

    Returns:
        The scraped listings as a pandas DataFrame (also written to
        ``berlin_apartments.csv`` without the index column), or ``None`` when
        the scraper returned nothing.

    Raises:
        Exception: Any error from scraping or saving is logged with its
            traceback and then re-raised.
    """
    try:
        scraped = await scrape_immoscout24()

        # Nothing came back: warn and bail out before building a frame.
        if not scraped:
            logger.warning("No apartments were scraped.")
            return None

        # One row per listing dict.
        listings_df = pd.DataFrame(scraped)

        # Log a short preview of the table.
        logger.info("First few rows of scraped data:")
        preview = listings_df.head().to_string()
        logger.info(preview)

        # Persist the full table.
        listings_df.to_csv('berlin_apartments.csv', index=False)
        logger.info("Data saved to berlin_apartments.csv")

        return listings_df

    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(f"An error occurred in main: {str(e)}")
        raise

if __name__ == "__main__":
    # Drive main() to completion on the current event loop and report the
    # outcome; failures are logged with a traceback instead of propagating.
    try:
        df = asyncio.get_event_loop().run_until_complete(main())
        if df is None:
            print("No data was scraped.")
        else:
            print(f"Successfully scraped {len(df)} apartments.")
    except Exception as e:
        logger.exception(f"An error occurred while running the script: {str(e)}")

INFO:__main__:Navigating to URL: https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-kaufen
INFO:__main__:Page loaded. Waiting for network idle...
INFO:__main__:Clicked cookie consent button
INFO:__main__:Scraping page 1...
INFO:__main__:Found 20 listings on page 1
INFO:__main__:Extracted 20 apartments from page 1
INFO:__main__:Found next page button. Clicking...
INFO:__main__:Navigated to next page. New URL: https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-kaufen?pagenumber=2
INFO:__main__:Scraping page 2...
INFO:__main__:Found 20 listings on page 2
INFO:__main__:Extracted 20 apartments from page 2
INFO:__main__:Found next page button. Clicking...
INFO:__main__:Navigated to next page. New URL: https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-kaufen?pagenumber=3
INFO:__main__:Scraping page 3...
INFO:__main__:Found 20 listings on page 3
INFO:__main__:Extracted 20 apartments from page 3
INFO:__main__:Found next page button. Clicking...
INFO:__mai

Successfully scraped 7180 apartments.


In [51]:
# Show the last rows of the scraped table. `df` is the module-level variable
# assigned by the scraping cell above; it is only (re)bound when that run
# finishes without raising, so re-run that cell first on a fresh kernel.
df.tail()

Unnamed: 0,title,price,size,address,link
7175,,1.299.020 €,"150,53 m²","Georg-Hermann-Allee 122, Bornstedt, Potsdam",
7176,Am Park Potsdam,224.281 - 1.145.493 €,"27,00 - 151,00 m²","Georg-Hermann-Allee 122, Bornstedt, Potsdam",https://www.immobilienscout24.de/neubau/d-h-pr...
7177,,998.000 €,129 m²,"Geschwister Scholl Str. 55-59, Potsdam West, P...",
7178,,989.000 €,126 m²,"Geschwister Scholl Str. 55-59, Potsdam West, P...",
7179,,549.000 €,67 m²,"Geschwister Scholl Str. 55-59, Potsdam West, P...",
