In [4]:
import requests
import json
import csv
import re
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict
from time import perf_counter

start_time = perf_counter()
all_data_file = "C:/Users/Moura/git/immoweb_project/all_data7.csv"

header = [
    "Id", 'Locality', 'Type of property', 'Subtype of property', 'Price', 'Type of sale',
    'Number of rooms', 'Living Area', 'Fully equipped kitchen', 'Furnished', 'Open fire',
    'Terrace', 'Garden', 'Surface area of the plot of land', 'Number of facades',
    "Swimming pool", "State of the building", "Url"
]

def get_url_list() -> List:
    root_url = "https://www.immoweb.be/en/search/"
    estate_types = ['house', 'apartment']
    all_immo_links = []
    search_links = []

    for estate in estate_types:
        for page in range(1, 333):
            url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
            search_links.append(url)

    with ThreadPoolExecutor() as pool:
        all_immo_links = list(pool.map(get_from_search_page, search_links))
    
    return [link for sublist in all_immo_links for link in sublist]

def get_from_search_page(search_url):
    req = requests.get(search_url)
    soup = BeautifulSoup(req.content, 'html.parser')
    card_results = soup.find_all('article', class_='card--result')
    immo_links = []
    for article in card_results:
        link = article.find('a', class_='card__title-link')
        if link:
            immo_links.append(link['href'])
    return immo_links

def get_one_property_info(url_one_property: str) -> Dict:
    req = requests.get(url_one_property)
    soup = BeautifulSoup(req.content, 'html.parser')
    try:
        read_html_prop = pd.read_html(req.text)
    except ValueError:  # no tables found
        return {}

    if len(read_html_prop) == 0:
        return {}  # No tables found, return empty dictionary

    # rest of your code

    df_one_property = pd.concat(read_html_prop, ignore_index=True)
    house_dict = {}

    list_of_property_info = []
    window_data = re.findall("window.dataLayer =(.+?);\n", req.text, re.S)
    if window_data:
        list_of_property_info.append(json.loads(window_data[0])[0]['classified'])

    house_dict = list_of_property_info[0]

    for i in df_one_property.index:
        house_dict[df_one_property[0][i]] = df_one_property[1][i]

    house_dict['Url'] = url_one_property  # Add the URL to the dictionary for each property

    return house_dict

def clean_data_to_csv(original_dict) -> Dict:
    has_fireplace = int(original_dict.get('How many fireplaces?', 0))
    living_area = original_dict.get('Living area')
    living_area_value = living_area.split(' ', 1)[0] if living_area is not None else None

    surface_of_plot = original_dict.get('Surface of the plot')
    surface_of_plot_value = surface_of_plot.split(' ', 1)[0] if surface_of_plot is not None else None
    new_dict = {
        "Id": original_dict.get('id'),
        'Locality': original_dict.get('Neighbourhood or locality'),
        'Type of property': original_dict.get('type'),
        'Subtype of property': original_dict.get('subtype'),
     'Type of sale': original_dict.get('transactionType'),
        'Number of rooms': original_dict.get('Bedrooms'),
        'Living Area': living_area_value,
        'Fully equipped kitchen': original_dict['kitchen']['type'],
        'Furnished': True if original_dict.get('Furnished', '').lower() == 'yes' else False,
        'Open fire': True if has_fireplace >= 1 else False,
        'Terrace': False if original_dict['outdoor']['terrace']['exists'].lower() != 'true' else True,
        'Garden': False if int(original_dict['outdoor']['garden']['surface'] or 0) < 1 else True,
        'Surface area of the plot of land': surface_of_plot_value,
        'Number of facades': original_dict.get('Number of frontages'),
        "Swimming pool": original_dict['wellnessEquipment']['hasSwimmingPool'],
        "State of the building": original_dict.get('Building condition'),
        "Url": original_dict.get('Url')
    }
    return new_dict

def adding_one_line_into_csv(new_dict):
    with open(all_data_file, 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writerow(new_dict)

def get_collective_data():
    immo_links = get_url_list()

    with ThreadPoolExecutor() as pool:
        property_info_list = list(pool.map(get_one_property_info, immo_links))

    with ThreadPoolExecutor() as pool:
        clean_data_list = list(pool.map(clean_data_to_csv, property_info_list))

    with ThreadPoolExecutor() as pool:
        list(pool.map(adding_one_line_into_csv, clean_data_list))

get_collective_data()

print("Scraping completed.")
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")
def print_csv_content(file_path):
    df = pd.read_csv(file_path)
    print(df)


print_csv_content(all_data_file)



In [3]:
import requests
import json
import csv
import re
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict
from time import perf_counter

start_time = perf_counter()
all_data_file = "all_data4.csv"

header = [
    "Id", 'Locality', 'Type of property', 'Subtype of property', 'Price', 'Type of sale',
    'Number of rooms', 'Living Area', 'Fully equipped kitchen', 'Furnished', 'Open fire',
    'Terrace', 'Garden', 'Surface area of the plot of land', 'Number of facades',
    "Swimming pool", "State of the building", "Url"
]

def get_url_list() -> List:
    root_url = "https://www.immoweb.be/en/search/"
    estate_types = ['house', 'apartment']
    all_immo_links = []
    search_links = []

    for estate in estate_types:
        for page in range(1, 333):
            url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
            search_links.append(url)

    with ThreadPoolExecutor() as pool:
        all_immo_links = pool.map(get_from_search_page, search_links)
    
    return [link for sublist in all_immo_links for link in sublist]

def get_from_search_page(search_url):
    req = requests.get(search_url)
    soup = BeautifulSoup(req.content, 'html.parser')
    card_results = soup.find_all('article', class_='card--result')
    immo_links = []
    for article in card_results:
        link = article.find('a', class_='card__title-link')
        if link:
            immo_links.append(link['href'])
    return immo_links

def get_one_property_info(url_one_property: str) -> Dict:
    req = requests.get(url_one_property)
    soup = BeautifulSoup(req.content, 'html.parser')
    read_html_prop = pd.read_html(req.text)
    
    if len(read_html_prop) == 0:
        return {}  # No tables found, return empty dictionary
    
    df_one_property = pd.concat(read_html_prop, ignore_index=True)
    house_dict = {}

    list_of_property_info = []
    window_data = re.findall("window.dataLayer =(.+?);\n", req.text, re.S)
    if window_data:
        list_of_property_info.append(json.loads(window_data[0])[0]['classified'])

    house_dict = list_of_property_info[0]

    for i in df_one_property.index:
        house_dict[df_one_property[0][i]] = df_one_property[1][i]

    house_dict['url'] = url_one_property  # Add the URL to the dictionary for each property

    return house_dict



def clean_data_to_csv(original_dict) -> Dict:
    has_fireplace = int(original_dict.get('How many fireplaces?', 0))
    living_area = original_dict.get('Living area')
    living_area_value = living_area.split(' ', 1)[0] if living_area is not None else None

    surface_of_plot = original_dict.get('Surface of the plot')
    surface_of_plot_value = surface_of_plot.split(' ', 1)[0] if surface_of_plot is not None else None

    new_dict = {
        "Id": original_dict.get('id'),
        'Locality': original_dict.get('Neighbourhood or locality'),
        'Type of property': original_dict.get('type'),
        'Subtype of property': original_dict.get('subtype'),
        'Price': original_dict.get('price'),
        'Type of sale': original_dict.get('transactionType'),
        'Number of rooms': original_dict.get('Bedrooms'),
        'Living Area': living_area_value,
        'Fully equipped kitchen': original_dict['kitchen']['type'],
        'Furnished': True if original_dict.get('Furnished', '').lower() == 'yes' else False,
        'Open fire': True if has_fireplace >= 1 else False,
        'Terrace': False if original_dict['outdoor']['terrace']['exists'].lower() != 'true' else True,
        'Garden': False if int(original_dict['outdoor']['garden']['surface'] or 0) < 1 else True,
        'Surface area of the plot of land': surface_of_plot_value,
        'Number of facades': original_dict.get('Number of frontages'),
        "Swimming pool": original_dict['wellnessEquipment']['hasSwimmingPool'],
        "State of the building": original_dict.get('Building condition'),
        "Url": original_dict.get('url')
    }
    return new_dict

def adding_one_line_into_csv(new_dict):
    with open(all_data_file, 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writerow(new_dict)

def get_collective_data():
    immo_links = get_url_list()

    with ThreadPoolExecutor() as pool:
        property_info_list = list(pool.map(get_one_property_info, immo_links))

    for property_info in property_info_list:
        clean_data = clean_data_to_csv(property_info)
        adding_one_line_into_csv(clean_data)

get_collective_data()

print("Scraping completed.")
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")



In [None]:
import requests
import json
import re
import csv
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict
from time import perf_counter

start_time = perf_counter()
all_data_file = "all_data.csv"

def get_url_list() -> List:
    root_url = "https://www.immoweb.be/en/search/"
    estate_types = ['house', 'apartment']
    all_immo_links = []
    search_links = []

    for estate in estate_types:
        for page in range(1, 11):
            url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
            search_links.append(url)
            print(page)

    with ThreadPoolExecutor() as pool:
        all_immo_links = pool.map(get_from_search_page, search_links)
    
    return [link for sublist in all_immo_links for link in sublist]

def get_from_search_page(search_url):
    req = requests.get(search_url)
    soup = BeautifulSoup(req.content, 'html.parser')
    card_results = soup.find_all('article', class_='card--result')
    immo_links = []
    for article in card_results:
        link = article.find('a', class_='card__title-link')
        if link:
            immo_links.append(link['href'])
    return immo_links

def get_one_property_info(url_one_property: str) -> Dict:
    req = requests.get(url_one_property)
    read_html_prop = pd.read_html(req.text)
    df_one_property = pd.concat(read_html_prop, ignore_index=True)

    list_of_property_info = []
    window_data = re.findall("window.dataLayer =(.+?);\n", req.text, re.S)
    if window_data:
        list_of_property_info.append(json.loads(window_data[0])[0]['classified'])

    house_dict = list_of_property_info[0]

    for i in df_one_property.index:
        house_dict[df_one_property[0][i]] = df_one_property[1][i]

    return house_dict

def clean_data_to_csv(original_dict) -> Dict:
    has_fireplace = int(original_dict.get('How many fireplaces?', 0))
    living_area = original_dict.get('Living area')
    living_area_value = living_area.split(' ', 1)[0] if living_area is not None else None

    surface_of_plot = original_dict.get('Surface of the plot')
    surface_of_plot_value = surface_of_plot.split(' ', 1)[0] if surface_of_plot is not None else None

    new_dict = {
        "Id": original_dict.get('id'),
        'Locality': original_dict.get('Neighbourhood or locality'),
        'Type of property': original_dict.get('type'),
        'Subtype of property': original_dict.get('subtype'),
        'Price': original_dict.get('price'),
        'Type of sale': original_dict.get('transactionType'),
        'Number of rooms': original_dict.get('Bedrooms'),
        'Living Area': living_area_value,
        'Fully equipped kitchen': original_dict['kitchen']['type'],
        'Furnished': True if original_dict.get('Furnished', '').lower() == 'yes' else False,
        'Open fire': True if has_fireplace >= 1 else False,
        'Terrace': False if original_dict['outdoor']['terrace']['exists'].lower() != 'true' else True,
        'Garden': False if int(original_dict['outdoor']['garden']['surface'] or 0) < 1 else True,
        'Surface area of the plot of land': surface_of_plot_value,
        'Number of facades': original_dict.get('Number of frontages'),
        "Swimming pool": original_dict['wellnessEquipment']['hasSwimmingPool'],
        "State of the building": original_dict.get('Building condition'),
        "Url": original_dict.get('url')
    }
    return new_dict

def adding_one_line_into_csv(new_dict):
    header = [
        "Id", 'Locality', 'Type of property', 'Subtype of property', 'Price', 'Type of sale',
        'Number of rooms', 'Living Area', 'Fully equipped kitchen', 'Furnished', 'Open fire',
        'Terrace', 'Garden', 'Surface area of the plot of land', 'Number of facades',
        "Swimming pool", "State of the building", "Url"
    ]
    with open(all_data_file, 'a') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writerow(new_dict)

def get_collective_data():
    immo_links = get_url_list()

    with ThreadPoolExecutor() as pool:
        property_info_list = list(pool.map(get_one_property_info, immo_links))

    for property_info in property_info_list:
        clean_data = clean_data_to_csv(property_info)
        adding_one_line_into_csv(clean_data)

get_collective_data()

print("Scraping completed.")
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")



In [None]:
import csv
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict
from time import perf_counter
from requests_html import HTMLSession


start_time = perf_counter()
all_data_file = "all_data5.csv"

header = [
    "Id", 'Locality', 'Type of property', 'Subtype of property', 'Price', 'Type of sale',
    'Number of rooms', 'Living Area', 'Fully equipped kitchen', 'Furnished', 'Open fire',
    'Terrace', 'Garden', 'Surface area of the plot of land', 'Number of facades',
    "Swimming pool", "State of the building", "Url"
]


def get_url_list() -> List:
    root_url = "https://www.immoweb.be/en/search/"
    estate_types = ['house', 'apartment']
    search_links = []

    for estate in estate_types:
        for page in range(1, 11):
            url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
            search_links.append(url)

    return search_links


def get_from_search_page(search_url):
    session = HTMLSession()
    response = session.get(search_url)
    response.html.render()  # Render JavaScript content

    card_results = response.html.find('article.card--result')
    immo_links = []

    for article in card_results:
        link = article.find('a.card__title-link', first=True)
        if link:
            immo_links.append(link.attrs['href'])

    return immo_links


def get_one_property_info(url_one_property: str) -> Dict:
    session = HTMLSession()
    response = session.get(url_one_property)
    response.html.render()  # Render JavaScript content

    read_html_prop = pd.read_html(response.html.html)
    df_one_property = pd.concat(read_html_prop, ignore_index=True)

    list_of_property_info = []
    window_data = response.html.search("window.dataLayer = {};{}", first=True)
    if window_data:
        list_of_property_info.append(json.loads(window_data[0])['classified'])

    house_dict = list_of_property_info[0]

    for i in df_one_property.index:
        house_dict[df_one_property[0][i]] = df_one_property[1][i]

    return house_dict


def clean_data_to_csv(original_dict) -> Dict:
    has_fireplace = int(original_dict.get('How many fireplaces?', 0))
    living_area = original_dict.get('Living area')
    living_area_value = living_area.split(' ', 1)[0] if living_area is not None else None

    surface_of_plot = original_dict.get('Surface of the plot')
    surface_of_plot_value = surface_of_plot.split(' ', 1)[0] if surface_of_plot is not None else None

    new_dict = {
        "Id": original_dict.get('id'),
        'Locality': original_dict.get('Neighbourhood or locality'),
        'Type of property': original_dict.get('type'),
        'Subtype of property': original_dict.get('subtype'),
        'Price': original_dict.get('price'),
        'Type of sale': original_dict.get('transactionType'),
        'Number of rooms': original_dict.get('Bedrooms'),
        'Living Area': living_area_value,
        'Fully equipped kitchen': original_dict['kitchen']['type'],
        'Furnished': True if original_dict.get('Furnished', '').lower() == 'yes' else False,
        'Open fire': True if has_fireplace >= 1 else False,
        'Terrace': False if original_dict['outdoor']['terrace']['exists'].lower() != 'true' else True,
        'Garden': False if int(original_dict['outdoor']['garden']['surface'] or 0) < 1 else True,
        'Surface area of the plot of land': surface_of_plot_value,
        'Number of facades': original_dict.get('Number of frontages'),
        "Swimming pool": original_dict['wellnessEquipment']['hasSwimmingPool'],
        "State of the building": original_dict.get('Building condition'),
        "Url": original_dict.get('url')
    }
    return new_dict


def adding_one_line_into_csv(new_dict):
    with open(all_data_file, 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writerow(new_dict)


def get_collective_data():
    search_links = get_url_list()
    all_immo_links = []

    with ThreadPoolExecutor() as pool:
        all_immo_links = list(pool.map(get_from_search_page, search_links))

    immo_links = [link for sublist in all_immo_links for link in sublist]

    with ThreadPoolExecutor() as pool:
        property_info_list = list(pool.map(get_one_property_info, immo_links))

    for property_info in property_info_list:
        clean_data = clean_data_to_csv(property_info)
        adding_one_line_into_csv(clean_data)


get_collective_data()

print("Scraping completed.")
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")



In [15]:
import requests
import json
import csv
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict
from time import perf_counter

start_time = perf_counter()
all_data_file = "all_data.csv"

def get_url_list() -> List:
    root_url = "https://www.immoweb.be/en/search/"
    estate_types = ['house', 'apartment']
    all_immo_links = []
    search_links = []

    for estate in estate_types:
        for page in range(1, 10):  # for testing purpose I changed from 334 to 2
            url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
            search_links.append(url)

    with ThreadPoolExecutor() as pool:
        all_immo_links = list(pool.map(get_from_search_page, search_links))
    return all_immo_links[0]

def get_from_search_page(search_url):

    req = requests.get(search_url)
    soup = BeautifulSoup(req.content, 'html.parser')
    card_results = soup.find_all('article', class_='card--result')
    immo_links = []
    for article in card_results:
        link = article.find('a', class_='card__title-link')
        if link:
            immo_links.append(link['href'])
    return immo_links

def get_one_property_info(url_one_property: str) -> Dict:
    req = requests.get(url_one_property)
    # print(req.status_code)
    read_html_prop = pd.read_html(req.text)
    df_one_property = pd.concat(read_html_prop, ignore_index=True)

    list_of_property_info = []
    window_data = re.findall("window.dataLayer =(.+?);\n", req.text, re.S)
    if window_data:
        list_of_property_info.append(json.loads(window_data[0])[0]['classified'])

    house_dict = list_of_property_info[0]

    for i in df_one_property.index:
        house_dict[df_one_property[0][i]] = df_one_property[1][i]

    return house_dict

def clean_data_to_csv(original_dict) -> Dict:
    """
    Take info from get_one_prop_info() as JSON or dict,
    and add them to a CSV for only one property.
    """

    has_fireplace = int(original_dict.get('How many fireplaces?', 0))
    living_area = original_dict.get('Living area')
    living_area_value = living_area.split(' ', 1)[0] if living_area is not None else None

    surface_of_plot = original_dict.get('Surface of the plot')
    surface_of_plot_value = surface_of_plot.split(' ', 1)[0] if surface_of_plot is not None else None

    new_dict = {
        "Id": original_dict.get('id'),
        'Locality': original_dict.get('Neighbourhood or locality'),
        'Type of property': original_dict.get('type'),
        'Subtype of property': original_dict.get('subtype'),
        'Price': original_dict.get('price'),
        'Type of sale': original_dict.get('transactionType'),
        'Number of rooms': original_dict.get('Bedrooms'),
        'Living Area': living_area_value,
        'Fully equipped kitchen': original_dict['kitchen']['type'],
        'Furnished': True if original_dict.get('Furnished', '').lower() == 'yes' else False,
        'Open fire': True if has_fireplace >= 1 else False,
        'Terrace': False if original_dict['outdoor']['terrace']['exists'].lower() != 'true' else True,
        'Garden': False if int(original_dict['outdoor']['garden']['surface'] or 0) < 1 else True,
        'Surface area of the plot of land': surface_of_plot_value,
        'Number of facades': original_dict.get('Number of frontages'),
        "Swimming pool": original_dict['wellnessEquipment']['hasSwimmingPool'],
        "State of the building": original_dict.get('Building condition'),
        "Url": original_dict.get('url')
    }
    return new_dict
                                                            
def adding_one_line_into_csv(new_dict):
    header = [
        "Id", 'Locality', 'Type of property', 'Subtype of property', 'Price', 'Type of sale',
        'Number of rooms', 'Living Area', 'Fully equipped kitchen', 'Furnished', 'Open fire',
        'Terrace', 'Garden', 'Surface area of the plot of land', 'Number of facades',
        "Swimming pool", "State of the building", "Url"
    ]
    with open(all_data_file, 'w') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerow(new_dict)

def get_collective_data():
    """
    clean_data_to_csv() for all URLs from get_url_list() using pool
    """
    immo_links = get_url_list()

    with ThreadPoolExecutor() as pool:
        # Map get_one_property_info function to each link
        property_info_list = list(pool.map(get_one_property_info, immo_links))

    for property_info in property_info_list:
        clean_data = clean_data_to_csv(property_info)
        adding_one_line_into_csv(clean_data)

get_collective_data()

print("Scraping completed.")
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")    

Scraping completed.

Time spent inside the loop: 14.359548362999703 seconds.


In [None]:
import requests
from bs4 import BeautifulSoup

root_url = "https://www.immoweb.be/en/search/"
distribution_type = "BUY"
estate_type = "HOUSE"
# need to change in  a list 
# estate_type =['house', 'apartment'] 
country = "BE"

search_url = f"{root_url}{distribution_type}/{estate_type}?countries={country}"
print(search_url)
response = requests.get(search_url)
html_content = response.text
soup = BeautifulSoup(html_content, "html.parser")
house_listings = soup.find_all("a", class_="card__title-link")
house_urls = [listing["href"] for listing in house_listings]
for url in house_urls:
    print(url)


In [None]:
import requests
from bs4 import BeautifulSoup
"house/for-sale?countries=BE&priceType=SALE_PRICE&page=1&orderBy=relevance"
"https://www.immoweb.be/en/search/house/for-sale?countries=BE&priceType=SALE_PRICE&page=333&orderBy=relevance"
root_url = "https://www.immoweb.be/en/search/"
estate_types = ['house', 'apartment']
max_page = 333  # Set the maximum page number to 333
immo_link = []
for estate in estate_types:
    page = 1
    while page <= max_page:
        url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
        req = requests.get(url)
        print("Page: ", page)
        print("Status Code:", req.status_code)

        soup = BeautifulSoup(req.content, 'html.parser')
        card_results = soup.find_all('article', class_='card--result')

        href_links = []

        for article in card_results:
            link = article.find('a', class_='card__title-link')
            if link:
                href_links.append(link['href'])

        for i, link in enumerate(href_links, 1):
            immo_link = immo_link.append(link)
        page += 1




In [None]:
import re
import json
import requests
from bs4 import BeautifulSoup
import threading
from time import perf_counter

start_time = perf_counter()
root_url = "https://www.immoweb.be/en/search/"
estate_types = ['house', 'apartment']
max_page = 333  # Set the maximum page number to 333
immo_link = []

# Function to retrieve and parse data from a URL
def retrieve_data(url):
    response = requests.get(url) 
    js_content = response.text

    # Extract the content within the window.dataLayer assignment using regex
    match = re.search(r'window\.dataLayer\s*=\s*(\[.*?\]);', js_content)

    if match:
        data = match.group(1)  # Get the matched content within the first capturing group
        data_dict = json.loads(data)  # Parse the content into a Python dictionary
        
        # Replace empty values with 'none'
        def replace_empty_with_none(obj):
            for key, value in obj.items():
                if isinstance(value, dict):
                    replace_empty_with_none(value)
                elif isinstance(value, str) and not value:
                    obj[key] = 'none'
        
        replace_empty_with_none(data_dict)
        
        with lock:
            data_list.append(data_dict)

lock = threading.Lock() 
data_list = []

for estate in estate_types:
    page = 1
    while page <= max_page:
        url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
        req = requests.get(url)
        print("Page:", page)
        print("Status Code:", req.status_code)

        soup = BeautifulSoup(req.content, 'html.parser')
        card_results = soup.find_all('article', class_='card--result')

        href_links = []

        for article in card_results:
            link = article.find('a', class_='card__title-link')
            if link:
                href_links.append(link['href'])

        for link in href_links:
            immo_link.append(link)
            # print("URL:", link)
            # Retrieve and parse the data from the URL using threading
            thread = threading.Thread(target=retrieve_data, args=(link,))
            thread.start()

        page += 1

print("Scraping completed.")
for thread in threading.enumerate():
    if thread != threading.current_thread():
        thread.join()

# Print the parsed data
# for index, data_dict in enumerate(data_list):
    # print(f"Data from URL {immo_link[index]}:")
    # print(data_dict)
    # print()

print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")    


In [18]:
import re
import json
import requests
from bs4 import BeautifulSoup
from time import perf_counter
from json import loads

start_time = perf_counter()
root_url = "https://www.immoweb.be/en/search/"
estate_types = ['house', 'apartment']
max_page = 333  # Set the maximum page number to 333
immo_link = []
data_dict =  []

url = "https://www.immoweb.be/en/classified/apartment-block/for-sale/henri-chapelle/4841/10130670"
# Function to retrieve and parse data from a URL
def retrieve_data(url):
        # print(url)
        response = requests.get(url)
        js_content = response.text
        # print(js_content)

        # Extract the content within the window.dataLayer assignment using regex
        match = re.search(r'window\.dataLayer\s*=\s*(\[.*?\]);', js_content)
        if match:
            data = match.group(1)  # Get the matched content within the first capturing group
            data_dict = json.loads(data)  # Parse the content into a Python dictionary

            soup = BeautifulSoup(requests.get(url).content)
            script_text = soup.find("script", text=re.compile("var\s+dataLayer")).text.split("= ", 1)[1]
            json_data = loads(script_text[:script_text.find(";")])
            print("json_data")
            print(json_data)

            # print(data_dict)
            # Replace empty values with 'none'
            def replace_empty_with_none(dict_to_clean):
                for key, value in dict_to_clean.items():
                    if isinstance(value, dict):
                        replace_empty_with_none(value)
                    elif isinstance(value, str) and not value:
                        dict_to_clean[key] = 'none'
            replace_empty_with_none(data_dict)

            with lock:
                data_list.append(data_dict)
retrieve_data(url)
print(data_list)

[]


In [19]:
example_dict = {"Nepal": "", "Italy": "Rome", "England": "London"}

def replace_empty_with_none(dict_to_clean):
        for key, value in dict_to_clean.items():
            if isinstance(value, dict):
                replace_empty_with_none(value)
            elif isinstance(value, str) and not value:
                dict_to_clean[key] = 'none'


replace_empty_with_none(example_dict)
print(example_dict)

{'Nepal': 'none', 'Italy': 'Rome', 'England': 'London'}


In [None]:

def get_url_list():
    for estate in estate_types:
        page = 1
        while page <= max_page:
            url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
            req = requests.get(url)
            soup = BeautifulSoup(req.content, 'html.parser')
            card_results = soup.find_all('article', class_='card--result')
            immo_links = []
            for article in card_results:
                link = article.find('a', class_='card__title-link')
                if link:
                    immo_links.append(link['href'])
            page += 1
    print(immo_links)

get_url_list()


In [2]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures
from multiprocessing import Pool
import json
from time import perf_counter

root_url = "https://www.immoweb.be/en/search/"
estate_types = ['house', 'apartment']
max_page = 333  # Set the maximum page number to 333

def get_url_list(estate):
    immo_links = []
    page = 1
    while page <= max_page:
        url = f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        card_results = soup.find_all('article', class_='card--result')
        for article in card_results:
            link = article.find('a', class_='card__title-link')
            if link:
                immo_links.append(link['href'])
        page += 1
    return immo_links

def get_immo_dict(link):
    req = requests.get(link)
    soup = BeautifulSoup(req.text, 'html.parser')
    script_tags = soup.find_all('script')
    second_script = script_tags[1]
    script_content = second_script.string
    new_script_content = script_content.split('"classified": ')[1]
    new_new_cont = new_script_content.split(""",
                                    "customer": """)[0]
    dict1 = json.loads(new_new_cont)
    return dict1

def replace_empty_with_none(dict_to_clean):
    for key, value in dict_to_clean.items():
        if isinstance(value, dict):
            replace_empty_with_none(value)
        elif isinstance(value, str) and not value:
            dict_to_clean[key] = None
    return dict_to_clean

start_time = perf_counter()

immo_links = []
with Pool() as pool:
    immo_links = pool.map(get_url_list, estate_types)
    print(immo_links)

immo_dicts = []
with Pool() as pool:
    results = pool.map(get_immo_dict, immo_links)
    print(immo_dicts)
    for result in results:
        result = replace_empty_with_none(result)
        immo_dicts.append(result)

with open('immo_dump.json', 'w') as outfile:
    json.dump(immo_dicts, outfile, indent=4)

print("Scraping completed")
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")

KeyboardInterrupt: 

function :
    get_url_list
        list(url_list)

    get_immo_dict(url_list)
        list(immo_dict) or json
    
    replace_empty_with_none(immo_dict)
        list(cleaned_immo_dict) or json
        
    get_csv_file(immo_dict)
    
    