In [1]:
import csv
import functools
import io
import itertools
import json
import re
from concurrent.futures import ThreadPoolExecutor
from time import perf_counter
from typing import List, Dict

import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.contrib.concurrent import thread_map


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def get_price_range(min_price, max_price, session, properties_per_page=30):
    """Collect the ids of all listings priced between ``min_price`` and ``max_price``.

    The search endpoint is first asked how many listings match the price
    window. If that would need more pages than the site serves, the window is
    halved and each half is collected recursively; the two id sets are then
    unioned, which also removes listings reported by both halves (the halves
    share the ``mid_price`` boundary).

    Args:
        min_price: Lower bound sent as ``minPrice``.
        max_price: Upper bound sent as ``maxPrice``.
        session: Object with a ``requests.Session``-like ``get`` method.
        properties_per_page: Number of results per search page, used to turn
            the listing count into a page count.

    Returns:
        A set of listing ids.
    """
    max_pages = 333  # there is a hard limit of 333 pages

    api_url = f"https://www.immoweb.be/en/search-results/house-and-apartment/for-sale?countries=BE&page=1&orderBy=newest&isALifeAnnuitySale=false&minPrice={min_price}&maxPrice={max_price}"
    # total nb of properties / nb of properties per page (rounded up; this
    # over-estimates by one page when the count is an exact multiple).
    num_pages = session.get(api_url).json()['marketingCount'] // properties_per_page + 1

    if num_pages > max_pages:
        mid_price = (min_price + max_price) // 2
        if mid_price > min_price:
            # Both halves are strictly narrower than the current window, so the
            # recursion is guaranteed to terminate. properties_per_page is
            # forwarded so a non-default value is honoured at every level.
            lower_ids = get_price_range(min_price, mid_price, session, properties_per_page)
            upper_ids = get_price_range(mid_price, max_price, session, properties_per_page)
            return lower_ids.union(upper_ids)
        # The window is at most one unit wide and cannot be split further.
        # Fetch what the site will serve instead of recursing forever.
        num_pages = max_pages

    return get_ids_for_category(min_price, max_price, num_pages, session)

def get_ids_from_page(i, min_price, max_price, session):
    """Return the set of listing ids found on search-results page ``i``.

    Args:
        i: Page number placed in the ``page`` query parameter.
        min_price: Value sent as ``minPrice``.
        max_price: Value sent as ``maxPrice``.
        session: Object with a ``requests.Session``-like ``get`` method.

    Returns:
        A set holding the ``'id'`` of every entry in the response's
        ``'results'`` list.
    """
    api_url = f"https://www.immoweb.be/en/search-results/house-and-apartment/for-sale?countries=BE&page={i}&orderBy=newest&isALifeAnnuitySale=false&minPrice={min_price}&maxPrice={max_price}"
    payload = session.get(api_url).json()

    page_ids = set()
    for listing in payload['results']:
        page_ids.add(listing['id'])
    return page_ids

def get_ids_for_category(min_price, max_price, num_pages, session):
    """Fetch pages 1..``num_pages`` of a price window concurrently and merge their ids.

    Args:
        min_price: Lower price bound passed to ``get_ids_from_page``.
        max_price: Upper price bound passed to ``get_ids_from_page``.
        num_pages: Number of result pages to request (pages are 1-based).
        session: Session object shared by all worker threads.

    Returns:
        A set with the ids gathered from every requested page.
    """
    # Bind everything except the page number so thread_map can feed it in.
    fetch_page = functools.partial(
        get_ids_from_page,
        min_price=min_price,
        max_price=max_price,
        session=session,
    )
    page_numbers = range(1, num_pages + 1)
    ids_per_page = thread_map(fetch_page, page_numbers, max_workers=64)
    return set(itertools.chain.from_iterable(ids_per_page))

def get_property(id, session):
    """Download one classified page and return its data as a one-row DataFrame.

    Two sources on the page are merged into a single record:

    * the ``classified`` entry of the ``window.dataLayer`` JSON embedded in the
      page, when present;
    * every HTML table on the page, read as key/value rows (first column is
      the attribute name, second column its value).

    Table attributes are written after the dataLayer fields, so on a name
    clash the table value wins. When an attribute name occurs in several
    tables only its first occurrence is kept.

    Args:
        id: Listing id; interpolated into the page URL. (The name shadows the
            builtin but is kept because it is part of the call signature.)
        session: Object with a ``requests.Session``-like ``get`` method.

    Returns:
        A DataFrame with one row indexed by ``'id'``. This is best-effort: on
        any failure (request error, no tables on the page, malformed JSON, ...)
        the error is printed and an empty DataFrame is returned.
    """
    property_url = f"http://www.immoweb.be/en/classified/{id}"

    try:
        req = session.get(property_url, timeout=10)

        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas versions.
        tables = pd.read_html(io.StringIO(req.text))
        # Stack all tables, then pivot so attribute names become the columns
        # and the values sit in the first row.
        df = pd.concat(tables).set_index(0).T
        df_prop = df.loc[:, ~df.columns.duplicated()]

        original_dict = {}

        window_data = re.findall("window.dataLayer =(.+?);\n", req.text, re.S)
        if window_data:
            list_of_property_info = json.loads(window_data[0])  # parse once
            if list_of_property_info:
                original_dict = list_of_property_info[0]['classified']

        # Merge every table attribute into the record. Rows whose name cell is
        # empty come back with a NaN label and are skipped.
        for attribute, value in df_prop.iloc[0].items():
            if pd.notna(attribute):
                original_dict[attribute] = value

        # Pages without dataLayer data carry no 'id' field of their own; fall
        # back to the requested id so the row can still be indexed.
        original_dict.setdefault('id', id)

        df_dict = pd.DataFrame([original_dict])
        df_dict = df_dict.set_index('id')

    except Exception as e:
        # Deliberate best-effort: one bad listing must not abort the whole
        # crawl. Report which listing failed and why.
        print(f"Could not fetch property {id}: {type(e)} {e}")
        return pd.DataFrame()

    return df_dict


def get_properties_by_id(ids, session, max_workers=64):
    """Fetch many listings concurrently and stack them into one DataFrame.

    Args:
        ids: Iterable of listing ids (a set, list or generator).
        session: Session object shared by all worker threads.
        max_workers: Number of threads used by ``thread_map``.

    Returns:
        A DataFrame with one row per successfully fetched listing, or an empty
        DataFrame when ``ids`` is empty or every fetch failed.
    """
    ids = list(ids)  # materialise so emptiness can be checked for any iterable
    if not ids:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()

    frames = thread_map(functools.partial(get_property, session=session), ids, max_workers=max_workers)
    # Failed fetches come back as empty frames; drop them so they neither
    # trigger pandas' empty-entry concat warning nor break an all-failed run.
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

def get_properties(min_price=0, max_price=10**6):
    """Scrape every listing in a price window and return them as one DataFrame.

    Args:
        min_price: Lower bound of the price window (default 0).
        max_price: Upper bound of the price window (default 1,000,000).
            Listings priced above this bound are not requested.

    Returns:
        The DataFrame produced by ``get_properties_by_id`` for all ids found
        in the window.
    """
    with requests.Session() as session:
        # The helpers fan out over 64 threads, but a Session keeps only 10
        # pooled connections per host by default; size the pool to match so
        # connections are reused instead of being discarded.
        adapter = requests.adapters.HTTPAdapter(pool_connections=64, pool_maxsize=64)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        ids = get_price_range(min_price=min_price, max_price=max_price, session=session)
        return get_properties_by_id(ids, session)

In [14]:
import time

# Time the full crawl, including writing the CSV. perf_counter is monotonic,
# so the measurement is not affected by wall-clock adjustments.
start = time.perf_counter()
properties = get_properties()
properties.to_csv('full_properties.csv')
end = time.perf_counter()

elapsed = end - start
num_properties = len(properties)

print(f"Time elapsed: {elapsed} seconds.")
print(f"Number of properties: {num_properties}")
# Only report rates that are defined: an empty result (or a zero-length
# interval) would otherwise raise ZeroDivisionError.
if elapsed > 0:
    print(f"Number of properties per second: {num_properties / elapsed}")
if num_properties:
    print(f"Number of seconds per property: {elapsed / num_properties}")

100%|██████████| 116/116 [00:02<00:00, 44.69it/s]
100%|██████████| 285/285 [00:08<00:00, 35.10it/s]
100%|██████████| 201/201 [00:05<00:00, 37.19it/s]
100%|██████████| 291/291 [00:12<00:00, 23.65it/s]
100%|██████████| 260/260 [00:08<00:00, 32.17it/s]
100%|██████████| 258/258 [00:08<00:00, 31.52it/s]
100%|██████████| 224/224 [00:06<00:00, 34.37it/s]
100%|██████████| 244/244 [00:07<00:00, 32.29it/s]
100%|██████████| 315/315 [00:11<00:00, 26.41it/s]
100%|██████████| 242/242 [00:08<00:00, 27.83it/s]
100%|██████████| 227/227 [00:06<00:00, 36.93it/s]
100%|██████████| 164/164 [00:04<00:00, 34.64it/s]
100%|██████████| 153/153 [00:04<00:00, 32.76it/s]
  4%|▍         | 2799/69695 [01:20<22:31, 49.50it/s]  

<class 'ValueError'>


  9%|▉         | 6251/69695 [03:07<23:37, 44.77it/s]  

<class 'ValueError'>


 22%|██▏       | 15587/69695 [07:49<2:16:45,  6.59it/s]

<class 'ValueError'>


 23%|██▎       | 16115/69695 [08:13<43:54, 20.34it/s]  

<class 'ValueError'>


 24%|██▍       | 16561/69695 [08:37<56:27, 15.69it/s]

<class 'ValueError'>


 24%|██▍       | 16860/69695 [08:58<1:20:52, 10.89it/s]

<class 'ValueError'>
<class 'ValueError'>


 24%|██▍       | 16975/69695 [09:03<41:02, 21.41it/s]  

<class 'requests.exceptions.ReadTimeout'>


 24%|██▍       | 16991/69695 [09:04<47:11, 18.61it/s]

<class 'requests.exceptions.ReadTimeout'>


 25%|██▌       | 17630/69695 [09:36<1:23:13, 10.43it/s]

<class 'ValueError'>


 26%|██▌       | 17782/69695 [09:47<58:10, 14.87it/s]  

<class 'ValueError'>


 32%|███▏      | 22236/69695 [12:38<2:11:27,  6.02it/s]

<class 'ValueError'>


 48%|████▊     | 33322/69695 [17:51<08:28, 71.58it/s]  

<class 'ValueError'>


 55%|█████▌    | 38380/69695 [20:13<07:14, 72.02it/s]

<class 'ValueError'>


 57%|█████▋    | 39650/69695 [20:48<21:03, 23.79it/s]

<class 'ValueError'>


 59%|█████▊    | 40911/69695 [21:32<17:38, 27.20it/s]  

<class 'ValueError'>


 61%|██████    | 42257/69695 [22:19<17:05, 26.75it/s]  

<class 'ValueError'>


 61%|██████    | 42514/69695 [22:37<1:28:20,  5.13it/s]

<class 'ValueError'>


 61%|██████    | 42536/69695 [22:38<56:37,  7.99it/s]  

<class 'ValueError'>
<class 'ValueError'>


 66%|██████▌   | 45653/69695 [24:26<12:47, 31.34it/s]

<class 'ValueError'>


 68%|██████▊   | 47471/69695 [25:12<06:43, 55.06it/s]

<class 'ValueError'>


 70%|███████   | 48955/69695 [25:53<09:37, 35.92it/s]

<class 'ValueError'>


 77%|███████▋  | 53382/69695 [27:49<06:58, 39.00it/s]

<class 'ValueError'>


 82%|████████▏ | 57175/69695 [29:20<05:53, 35.43it/s]  

<class 'ValueError'>


 90%|█████████ | 62845/69695 [31:39<01:29, 76.41it/s]

<class 'ValueError'>


 93%|█████████▎| 64703/69695 [32:25<01:32, 54.11it/s]

<class 'ValueError'>


 95%|█████████▍| 66193/69695 [33:03<01:09, 50.50it/s]

<class 'ValueError'>


 96%|█████████▌| 66616/69695 [33:15<03:52, 13.22it/s]

<class 'ValueError'>
<class 'ValueError'>


100%|██████████| 69695/69695 [34:29<00:00, 33.68it/s]
