In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import itertools
from functools import partial
from tqdm.contrib.concurrent import thread_map

def get_urls_from_search_page(search_url):
    req = requests.get(search_url)
    soup = BeautifulSoup(req.content, 'html.parser')
    card_results = soup.find_all('article', class_='card--result')

    immo_links = []
    for article in card_results:
        link = article.find('a', class_='card--title-link')
        if link:
            immo_links.append(link['href'])
    return immo_links

def get_search_url_list(min_price: int, max_price: int):
    estate_types = ["house","apartment"]

    search_links = []
    for estate in estate_types:
        page = 1
        while True:
            url = f"https://www.immoweb.be/en/search/{estate}/for-sale?countries=BE&minPrice={min_price}&maxPrice={max_price}&page={page}&orderBy=relevance"
            req = requests.get(url)
            soup = BeautifulSoup(req.content, 'html.parser')
            card_results = soup.find_all('article', class_='card--result')
            if not card_results:  # if no more results found, stop iterating over pages
                break
            search_links.append(url)
            page += 1
    return search_links

def get_property(url, session): 
    try:
        req = session.get(url)
        read_html_prop = pd.read_html(req.text)
        property = pd.concat(read_html_prop).set_index(0).T
        property["id"] = url.split("/")[-1]
        property = property.set_index("id")
        property = property.loc[:, ~property.columns.duplicated()].copy()
        
        return property
    except Exception as e:
        print(type(e))
        return e
    
if __name__ == "__main__":
    min_max_price_list = [(i, i+40000) for i in range(10000, 2500000, 40000)]
    all_properties = []

    with requests.Session() as session:
        for min_price, max_price in min_max_price_list:
            search_links = get_search_url_list(min_price, max_price)
            urls = list(itertools.chain.from_iterable(thread_map(get_urls_from_search_page, search_links)))
            properties = [df for df in thread_map(partial(get_property, session=session), urls) if isinstance(df, pd.DataFrame)]
            all_properties.extend(properties)

    if all_properties:
        properties_df = pd.concat(all_properties)
        properties_df.to_csv("Full_properties.csv")
    else:
        print("No valid properties data was found.")


In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import itertools
from functools import partial
from tqdm.contrib.concurrent import thread_map

def get_urls_from_search_page(search_url):
    req = requests.get(search_url)
    soup = BeautifulSoup(req.content, 'html.parser')
    card_results = soup.find_all('article', class_='card--result')

    immo_links = []
    for article in card_results:
        link = article.find('a', class_='card--title-link')
        if link:
            immo_links.append(link['href'])
    return immo_links

def get_search_url_list(min_price: int, max_price: int):
    estate_types = ["house","apartment"]

    search_links = []
    for estate in estate_types:
        page = 1
        while True:
            url = f"https://www.immoweb.be/en/search/{estate}/for-sale?countries=BE&minPrice={min_price}&maxPrice={max_price}&page={page}&orderBy=relevance"
            req = requests.get(url)
            soup = BeautifulSoup(req.content, 'html.parser')
            card_results = soup.find_all('article', class_='card--result')
            if not card_results:  # if no more results found, stop iterating over pages
                break
            search_links.append(url)
            page += 1
    return search_links

def get_property(url, session): 
    try:
        req = session.get(url)
        read_html_prop = pd.read_html(req.text)
        property = pd.concat(read_html_prop).set_index(0).T
        property["id"] = url.split("/")[-1]
        property = property.set_index("id")
        property = property.loc[:, ~property.columns.duplicated()].copy()
        
        return property
    except Exception as e:
        print(type(e))
        return e

def scrape_properties_in_price_range(price_range: tuple, session: requests.Session):
    min_price, max_price = price_range
    search_links = get_search_url_list(min_price, max_price)
    urls = list(itertools.chain.from_iterable(thread_map(get_urls_from_search_page, search_links)))
    properties = [df for df in thread_map(partial(get_property, session=session), urls) if isinstance(df, pd.DataFrame)]
    return properties
    
if __name__ == "__main__":
    min_max_price_list = [(i, i+40000) for i in range(10000, 2500000, 40000)]
    all_properties = []

    with requests.Session() as session:
        all_properties = list(itertools.chain.from_iterable(thread_map(partial(scrape_properties_in_price_range, session=session), min_max_price_list)))

    if all_properties:
        properties_df = pd.concat(all_properties)
        properties_df.to_csv("Full_properties.csv")
    else:
        print("No valid properties data was found.")


100%|██████████| 11/11 [00:04<00:00,  2.43it/s]
0it [00:00, ?it/s]
100%|██████████| 46/46 [00:14<00:00,  3.08it/s]
0it [00:00, ?it/s]
  3%|▎         | 2/63 [03:01<1:40:19, 98.69s/it]
[A
[A
[A
[A
[A
[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A


[A[A[A
[A

[A[A


[A[A[A
[A


[A[A[A

[A[A


[A[A[A
[A

[A[A



[A[A[A[A


[A[A[A
[A

[A[A



[A[A[A[A


[A[A[A
[A

[A[A



[A[A[A[A


[A[A[A
[A




[A[A[A[A[A

[A[A


[A[A[A
[A

100%|██████████| 76/76 [00:21<00:00,  3.54it/s]



[A[A[A
100%|██████████| 68/68 [00:11<00:00,  5.90it/s]


[A[A


[A[A[A
[A

[A[A


[A[A[A
[A



[A[A[A[A




[A[A[A[A[A

[A[A


[A[A[A





[A[A[A[A[A[A
[A



[A[A[A[A

[A[A






[A[A[A[A[A[A[A


[A[A[A
100%|██████████| 68/68 [00:20<00:00,  3.37it/s]








[A[A[A[A[A[A[A[A

[A[A


0it [00:15, ?it/s]

100%|██████████

SSLError: HTTPSConnectionPool(host='www.immoweb.be', port=443): Max retries exceeded with url: /en/search/house/for-sale?countries=BE&minPrice=2290000&maxPrice=2330000&page=38&orderBy=relevance (Caused by SSLError(SSLError(1, '[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1007)')))