In [12]:
import functools
import itertools
import requests
import json
import pandas as pd
from tqdm.contrib.concurrent import thread_map

In [13]:
def get_price_range(min_price, max_price, session, properties_per_page=30):
    api_url = f"https://www.immoweb.be/en/search-results/house-and-apartment/for-sale?countries=BE&page=1&orderBy=newest&isALifeAnnuitySale=false&minPrice={min_price}&maxPrice={max_price}"
    num_pages = session.get(api_url).json()['marketingCount'] // properties_per_page + 1 # total nb of properties / nb of properties per page
    if num_pages > 333: # there is a hard limit of 333 pages
        mid_price = (min_price + max_price) // 2 
        return get_price_range(min_price, mid_price, session).union(get_price_range(mid_price, max_price, session)) # split the range in two. Union prevents duplicates.
    return get_ids_for_category(min_price, max_price, num_pages, session)

def get_ids_from_page(i, min_price, max_price, session):
    api_url = f"https://www.immoweb.be/en/search-results/house-and-apartment/for-sale?countries=BE&page={i}&orderBy=newest&isALifeAnnuitySale=false&minPrice={min_price}&maxPrice={max_price}"
    return set(result['id'] for result in session.get(api_url).json()['results'])

def get_ids_for_category(min_price, max_price, num_pages, session):
    return set(itertools.chain.from_iterable(thread_map(functools.partial(get_ids_from_page, min_price=min_price, max_price=max_price, session=session), range(1, num_pages+ 1), max_workers=64)))

def get_property(id, session):
    property_url = f"http://www.immoweb.be/en/classified/{id}"
    
    tables = pd.read_html(session.get(property_url, timeout=5).text)
    df = pd.concat(tables).set_index(0).T
    df['id'] = id
    df = df.set_index('id')
    return df.loc[:, ~df.columns.duplicated()]


def get_properties_by_id(ids, session, max_workers=64):
    return pd.concat(thread_map(functools.partial(get_property, session=session), ids, max_workers=max_workers))

def get_properties():
    with requests.Session() as session:
        ids = get_price_range(min_price=0, max_price=10**8, session=session)
        return get_properties_by_id(ids, session)

In [16]:
import time

start = time.time()
properties = get_properties()
properties.to_csv('properties_pierre.csv')
end = time.time()

print(f"Time elapsed: {end - start} seconds.")
print(f"Number of properties: {len(properties)}")
print(f"Number of properties per second: {len(properties) / (end - start)}")
print(f"Number of seconds per property: {(end - start) / len(properties)}")

100%|██████████| 52/52 [00:00<00:00, 68.95it/s]
100%|██████████| 135/135 [00:01<00:00, 102.64it/s]
100%|██████████| 270/270 [00:02<00:00, 106.86it/s]
100%|██████████| 316/316 [00:02<00:00, 126.40it/s]
100%|██████████| 218/218 [00:02<00:00, 106.05it/s]
100%|██████████| 222/222 [00:01<00:00, 128.68it/s]
100%|██████████| 214/214 [00:01<00:00, 126.18it/s]
100%|██████████| 198/198 [00:01<00:00, 112.13it/s]
100%|██████████| 313/313 [00:02<00:00, 122.76it/s]
100%|██████████| 234/234 [00:02<00:00, 100.81it/s]
100%|██████████| 178/178 [00:01<00:00, 126.67it/s]
 89%|████████▉ | 200/224 [00:11<00:01, 17.70it/s]


TypeError: list indices must be integers or slices, not str