## Imports

In [32]:
import requests
from requests import Session
from bs4 import BeautifulSoup
import multiprocessing
import re
import pandas as pd
import json

## Helper Functions

In [33]:
def get_soup(url: str, headers: dict[str, str], session: Session, page_number: "int | None") -> BeautifulSoup:
    """Fetch ``url`` through ``session`` and parse the response body as HTML.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        session: ``requests`` session used to perform the GET.
        page_number: Number shown in the progress message (callers in this
            notebook pass either a search page number or a line number).
            Pass ``None`` to print the fixed "compound listing" message
            instead.

    Returns:
        The response content parsed with the ``html.parser`` backend. The
        status code is only printed, never checked, so an error page is
        parsed like any other response.
    """
    response = session.get(url, headers=headers)
    # `is not None` is the reliable None test; `!= None` goes through __ne__.
    if page_number is not None:
        print(f"Request for page {page_number} - response : {response.status_code}")
    else:
        print("Solving compound listing - Getting individual url")
    return BeautifulSoup(response.content, 'html.parser')

## Getting the URLs

In [3]:
def get_url_list(page_number: int,
                 headers: dict[str, str] = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',},
                 session: Session = requests.Session()) -> list[str]:
    """Collect the listing links found on one search-result page.

    Args:
        page_number: Value substituted into the ``page`` query parameter of
            the search URL.
        headers: HTTP headers sent with the request. The default dict is
            created once, when the function is defined, and is only read here.
        session: ``requests`` session used for the request. The default
            session is likewise created once and shared by every call that
            omits the argument.

    Returns:
        The ``href`` of every ``a.card__title-link`` found inside a
        ``div.card--result__body``. Any exception raised while fetching or
        parsing is printed and the URLs gathered so far (possibly none) are
        returned.
    """
    base_url = f'https://www.immoweb.be/en/search/house-and-apartment/for-sale?countries=BE&isALifeAnnuitySale=false&page={page_number}&orderBy=relevance'

    url_list = []

    try:
        soup = get_soup(base_url, headers, session, page_number)

        for listing in soup.find_all("div", attrs={"class": "card--result__body"}):
            for link in listing.find_all("a", attrs={"class": "card__title-link"}):
                href = link.get("href")
                if href:
                    url_list.append(href)
                else:
                    print(f"not found link {link}")

    except Exception as e:
        # Best-effort scraping: one failing page must not abort a whole run.
        # Report the page that was actually requested (the previous message
        # printed page_number + 1, which pointed at the wrong page).
        print(f"Error occurred on page {page_number}: {e}")

    return url_list

## Getting the details from the listings

In [34]:
def get_dict_from_url(url: str, headers: dict[str, str], session: Session, line_number: int) -> "dict | None":
    """Download a listing page and return its ``window.classified`` data.

    Args:
        url: Address of the listing page.
        headers: HTTP headers sent with the request.
        session: ``requests`` session used for the request.
        line_number: Forwarded to ``get_soup`` for its progress message;
            ``None`` selects the "compound listing" message.

    Returns:
        The object decoded from the JSON assigned to ``window.classified``,
        or ``None`` when the page has no such script tag or the assignment is
        not followed by a JSON object (a message is printed in both cases).

    Raises:
        json.JSONDecodeError: If the text after the assignment starts with
            ``{`` but is not valid JSON.
    """
    soup = get_soup(url, headers, session, line_number)

    # Locate the <script> tag whose text assigns "window.classified".
    script_tag = soup.find('script', string=re.compile(r'window\.classified\s*='))
    if not script_tag:
        print(f"Script tag with 'window.classified' not found for {url}.")
        return None

    print(f"Got script_tag for {url}")
    script_text = script_tag.string

    # Find where the assigned object literal starts.
    match = re.search(r'window\.classified\s*=\s*(?=\{)', script_text)
    if not match:
        print(f"JSON data not found within the script tag for {url}.")
        return None

    # Let the JSON decoder find the end of the object itself. A non-greedy
    # regex up to the first "};" would cut the payload short whenever a string
    # value contains "};", and would miss it when the trailing ";" is absent.
    classified_dict, _ = json.JSONDecoder().raw_decode(script_text[match.end():])
    return classified_dict

In [35]:
def is_compound_sale(classified_dict: "dict | None") -> bool:
    """Tell whether a listing dict describes a compound (multi-unit) sale.

    Args:
        classified_dict: Parsed listing data, or ``None`` when the listing
            could not be parsed.

    Returns:
        ``True`` when the dict has a ``'cluster'`` entry that is not ``None``.
        ``False`` otherwise, including for ``None``, an empty dict, or a dict
        without a ``'cluster'`` key.
    """
    if not classified_dict:
        # Covers None (failed parse) and empty dicts without raising.
        return False
    return classified_dict.get('cluster') is not None

In [36]:
def get_compound_sale_urls(soup: BeautifulSoup) -> list[str]:
    """Collect the individual listing links shown on a compound listing page.

    Every text node containing "apartment" (case-insensitive) is looked up;
    for each one, the closest enclosing ``div`` with class ``grid`` is
    searched for anchors whose ``href`` starts with the classified URL prefix.

    Args:
        soup: Parsed HTML of the compound listing page.

    Returns:
        The matching URLs in first-seen order, each listed once. A grid is
        reached once per matching text node, so without de-duplication the
        same links would be returned (and later downloaded) several times.
    """
    classified_prefix = "https://www.immoweb.be/en/classified/"

    individual_urls = []
    seen = set()

    # Find all text nodes that mention 'apartment'.
    tags_with_text = soup.find_all(string=lambda text: "apartment" in text.lower())

    for tag in tags_with_text:
        # Closest parent 'div' with class 'grid', if there is one.
        grid = tag.find_parent('div', class_='grid')
        if grid is None:
            continue

        for link in grid.find_all('a', href=True):
            href = link['href']
            if href.startswith(classified_prefix) and href not in seen:
                seen.add(href)
                individual_urls.append(href)

    return individual_urls

In [37]:
def read_parse_listings(url_file_path: str,
                        headers: dict[str, str] = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',},
                        session: Session = requests.Session()) -> list[dict]:
    """Parse every listing in a URL file, expanding compound listings.

    Each non-blank line of ``url_file_path`` is treated as a listing URL. A
    compound listing (see ``is_compound_sale``) is replaced by the individual
    listings linked from its page. Afterwards the file is rewritten in place:
    compound URLs are removed and the individual URLs are appended.

    Args:
        url_file_path: Text file with one URL per line. It is overwritten.
        headers: HTTP headers sent with each request. The default dict is
            created once at definition time and only read here.
        session: ``requests`` session used for each request. The default
            session is created once at definition time and shared by calls
            that omit the argument.

    Returns:
        The parsed listing dicts. Listings whose data could not be extracted
        (``get_dict_from_url`` returned ``None`` and printed why) are left
        out.
    """
    result = []
    compound_urls = set()
    individual_urls = []

    # Read everything first so the file is closed before it is rewritten.
    with open(url_file_path, 'r') as file:
        urls = [line.strip() for line in file]

    # Blank lines stay in the numbering so the printed numbers still match
    # the line numbers of the file.
    for line_number, url in enumerate(urls, start=1):
        if not url:
            # An empty URL would only make the request fail.
            continue

        listing_dict = get_dict_from_url(url, headers, session, line_number)
        if listing_dict is None:
            # Nothing to classify; the URL itself stays in the file.
            continue

        if not is_compound_sale(listing_dict):
            result.append(listing_dict)
            continue

        print(f"Solving compound listing for {url}")
        compound_urls.add(url)
        soup = get_soup(url, headers, session, None)
        for listing_url in get_compound_sale_urls(soup):
            individual_urls.append(listing_url)
            individual_dict = get_dict_from_url(listing_url, headers, session, None)
            if individual_dict is not None:
                result.append(individual_dict)

    # Compare stripped URLs and write every entry with its own newline.
    # Comparing raw lines against url + "\n" missed a compound URL on a last
    # line without a trailing newline, and appending to such a file glued the
    # first new URL onto that line.
    kept_urls = [url for url in urls if url and url not in compound_urls]
    with open(url_file_path, 'w') as file:
        file.writelines(f"{url}\n" for url in kept_urls + individual_urls)

    return result

In [38]:
# Parse every listing in url.txt using the default headers and session.
# NOTE: read_parse_listings also rewrites url.txt in place (compound listing
# URLs are removed and the individual listing URLs are appended).
dicts = read_parse_listings('url.txt')

Request for page 1 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/house/for-sale/wetteren/9230/20316344
Request for page 2 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/apartment/for-sale/borgerhout/2140/20317800
Request for page 3 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/apartment/for-sale/hoboken/2660/20319728
Request for page 4 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/apartment/for-sale/ninove/9400/20318192
Request for page 5 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/apartment/for-sale/ixelles/1050/20319510
Request for page 6 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/deinze/9800/20316163
Solving compound listing for https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/deinze/9800/20316163
Solving compound listing - Getting individual ur

def get_compound_sale_urls(soup, listing_dict):
    """Find the links of the apartment items that belong to a compound listing.

    The item ids are read from ``listing_dict['cluster']['units']``: every
    unit that has ``'APARTMENT'`` among its values contributes the ``'id'`` of
    each entry in its ``'items'``. The page is then searched for anchors
    whose ``href`` contains one of those ids.

    NOTE(review): this notebook also defines a one-argument
    ``get_compound_sale_urls(soup)`` that ``read_parse_listings`` calls.
    Running this definition replaces it, and that call would then fail for a
    missing argument. Confirm which version is meant to be kept.

    Args:
        soup: Parsed HTML of the compound listing page.
        listing_dict: Parsed listing data containing a ``'cluster'`` entry.

    Returns:
        The matching ``href`` values in first-seen order, each listed once.
    """
    # Ids are converted to text because they are matched as substrings of the
    # href; an integer id would make the `in` test raise TypeError.
    listing_ids = [
        str(listing['id'])
        for unit in listing_dict['cluster']['units']
        if 'APARTMENT' in unit.values()
        for listing in unit['items']
    ]

    individual_urls = []
    seen = set()

    for listing_id in listing_ids:
        tags_with_number = soup.find_all(
            'a',
            href=lambda value, needle=listing_id: bool(value) and needle in value,
        )
        for tag in tags_with_number:
            href = tag['href']
            if href not in seen:
                seen.add(href)
                individual_urls.append(href)

    # The draft built the list but never returned it.
    return individual_urls


## Get the info from the list of dicts

In [32]:
def get_relevant_info(list_dicts_listings: list[dict]) -> list[dict]:
    """Placeholder for reducing raw listing dicts to the fields of interest.

    Not implemented yet: the loop body is ``pass`` and nothing is returned,
    so a call currently evaluates to ``None`` rather than the annotated
    ``list[dict]``.

    Args:
        list_dicts_listings: Listing dicts to reduce, e.g. the list returned
            by ``read_parse_listings``.
    """
    for listing in list_dicts_listings:
        pass
    # TODO: write the code that extracts the particular information needed
    # from each listing dict and places it into a new dict containing only the
    # relevant information.
    # Draft of the target record (from a planned get_empty_data helper):
    #def get_empty_data():
    #data = {'locality':None,
    #        'property_type':None,
    #        'property_subtype':None,
    #        'price':None,
    #        'sale_type':None,
    #        'rooms':None,
    #        'area_living':None,
    #        'equipped_kitchen':None,
    #        'furnished':None,
    #        'fire':None,
    #        'terrace':None,
    #        'terrace_area':None,
    #        'garden':None,
    #        'garden_area':None,
    #        'land_surface':None,
    #        'plot_surface':None,
    #        'number_facade':None,
    #        'swimming_pool':None,
    #        'building_state':None,
    #        'immoweb_code': None}
    
    #return data

## Parse result to pd.DataFrame

In [None]:
def parse_listing_info(relevant_info_dicts: list[dict],
                       output_path: str = 'Immoweb_scraping_result.csv') -> pd.DataFrame:
    """Build a DataFrame from the listing records and save it as CSV.

    Args:
        relevant_info_dicts: One dict per listing; passed directly to
            ``pd.DataFrame``.
        output_path: Destination of the CSV file. Defaults to the file name
            that was previously hard-coded, so existing calls are unaffected.

    Returns:
        The DataFrame that was written (with a header row, without the index
        column).
    """
    df = pd.DataFrame(relevant_info_dicts)
    df.to_csv(output_path, index=False, header=True)
    return df

## Main

In [None]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
number_pages = 333
session = requests.Session()

# Get listing URLs.
# NOTE(review): quick_get_urls is not defined anywhere in this notebook; it
# has to be defined or imported before this cell can run. TODO: confirm where
# it comes from.
quick_get_urls(number_pages, headers, session)

# Get listing details, then keep only the relevant info.
dict_list = read_parse_listings("url.txt", headers, session)
result_list = get_relevant_info(dict_list)

# Parse to pd.DataFrame. These two statements were indented although they sit
# at the top level of the cell, which is an IndentationError.
# parse_listing_info already saves 'Immoweb_scraping_result.csv'; the export
# below additionally writes 'ImmowebScrapingResult.csv' (index included).
df = parse_listing_info(result_list)
df.to_csv('ImmowebScrapingResult.csv')

## Testing section

#### Getting listing urls from the search pages - check

In [9]:
# Request settings shared by the checks in this testing section.
session = requests.Session()
number_pages = 20
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/114.0.0.0 Safari/537.36'
    ),
}
# Get listing URLs (call left disabled)
#get_url_list(number_pages, headers, session)

#### Multiprocessing - check (does not work inside a Jupyter notebook)

#### Getting info from the listed urls, and parsing to dicts

In [21]:
# Scratch check: fetch every URL listed in url.txt and collect the parsed
# `window.classified` dicts. Relies on `headers` and `session` from an
# earlier cell.
result = []

with open('url.txt', 'r') as file:
    for line_number, line in enumerate(file, start= 1):
        url = line.strip()
        # get_dict_from_url returns None when no `window.classified` data is
        # found, so `result` may contain None entries.
        listing_dict = get_dict_from_url(url, headers, session, line_number)
        result.append(listing_dict)
        print(f"Got dict for {url}")

Request for page 1 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/house/for-sale/wetteren/9230/20316344
Got dict for https://www.immoweb.be/en/classified/house/for-sale/wetteren/9230/20316344
Request for page 2 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/apartment/for-sale/borgerhout/2140/20317800
Got dict for https://www.immoweb.be/en/classified/apartment/for-sale/borgerhout/2140/20317800
Request for page 3 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/apartment/for-sale/ninove/9400/20318192
Got dict for https://www.immoweb.be/en/classified/apartment/for-sale/ninove/9400/20318192
Request for page 4 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/apartment/for-sale/ixelles/1050/20319510
Got dict for https://www.immoweb.be/en/classified/apartment/for-sale/ixelles/1050/20319510
Request for page 5 - response : 200
Got script_tag for https://www.immoweb.be/en/classified/new-real-estat

KeyboardInterrupt: 

In [30]:
# Print the ids of the items in every apartment unit of one listing's cluster.
# NOTE(review): index 6 is hard-coded and depends on the contents of `result`
# built in an earlier cell. Presumably it is a compound listing; confirm
# before re-running.
for unit in result[6]['cluster']['units']:
    # A unit is treated as an apartment when any of its values equals 'APARTMENT'.
    if 'APARTMENT' in unit.values():
        for listing in unit['items']:
            print(listing['id'])


20317715
20317708
20317704


In [28]:
# Check whether the first unit of that listing's cluster has 'APARTMENT' among its values.
'APARTMENT' in result[6]['cluster']['units'][0].values()

True