In [5]:
import concurrent.futures
import json
import multiprocessing
import re
from concurrent.futures import ThreadPoolExecutor
from io import StringIO
from time import perf_counter
from typing import Dict, List

import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def get_from_search_page(search_url, timeout=30):
    """Collect the listing links found on one search-results page.

    Downloads ``search_url`` and, for every ``<article>`` element with class
    ``card--result``, takes the ``href`` of its ``<a>`` element with class
    ``card__title-link``.

    Args:
        search_url: URL of the search-results page to download.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the calling thread indefinitely.

    Returns:
        List of ``href`` values in document order. Cards without a title
        link, or whose link has no (or an empty) ``href``, are skipped, so
        the list is empty when nothing matches.
    """
    req = requests.get(search_url, timeout=timeout)
    soup = BeautifulSoup(req.content, 'html.parser')

    immo_links = []
    for article in soup.find_all('article', class_='card--result'):
        link = article.find('a', class_='card__title-link')
        # .get() instead of ['href']: an anchor without an href is skipped
        # rather than aborting the whole page with a KeyError.
        href = link.get('href') if link else None
        if href:
            immo_links.append(href)
    return immo_links

In [10]:
def get_url_list(pages: int = 1, flatten: bool = False) -> List:
    """Gather property links from the immoweb search pages of each estate type.

    Builds the search URL for every combination of estate type ('house',
    'apartment') and page number 1..``pages``, then fetches them concurrently
    by mapping ``get_from_search_page`` over a thread pool.

    Args:
        pages: Number of result pages to request per estate type. The default
            of 1 matches the previously hard-coded ``range(1, 2)``.
        flatten: When False (default) the result keeps one inner list per
            search page, in the order the search URLs were built. When True
            the inner lists are concatenated into one flat list of links.

    Returns:
        A list of per-page link lists, or a single flat list of links when
        ``flatten`` is True.
    """
    root_url = "https://www.immoweb.be/en/search/"
    estate_types = ['house', 'apartment']

    search_links = [
        f"{root_url}{estate}/for-sale?countries=BE&page={page}&orderBy=relevance"
        for estate in estate_types
        for page in range(1, pages + 1)
    ]

    # pool.map yields results in input order, so they line up with search_links.
    with ThreadPoolExecutor() as pool:
        all_immo_links = list(pool.map(get_from_search_page, search_links))

    if flatten:
        return [link for page_links in all_immo_links for link in page_links]
    return all_immo_links

In [11]:
# Time one full get_url_list() run (wall clock, via perf_counter) and print
# the links it returns.
start = perf_counter()
print(get_url_list())
end=perf_counter()
# Elapsed seconds between the two perf_counter() readings.
pool_time = end-start
print("pool_time: ", pool_time)

[['https://www.immoweb.be/en/classified/mansion/for-sale/puurs/2870/10566398', 'https://www.immoweb.be/en/classified/mansion/for-sale/ninove/9400/10566078', 'https://www.immoweb.be/en/classified/villa/for-sale/herent/3020/10522729', 'https://www.immoweb.be/en/classified/house/for-sale/liege/4000/10564022', 'https://www.immoweb.be/en/classified/mixed-use-building/for-sale/antwerp/2060/10564982', 'https://www.immoweb.be/en/classified/mixed-use-building/for-sale/antwerp/2060/10564986', 'https://www.immoweb.be/en/classified/apartment-block/for-sale/henri-chapelle/4841/10130670', 'https://www.immoweb.be/en/classified/house/for-sale/henri-chapelle/4841/10130669', 'https://www.immoweb.be/en/classified/mansion/for-sale/nivelles/1400/10193358', 'https://www.immoweb.be/en/classified/house/for-sale/donceel/4357/10563785', 'https://www.immoweb.be/en/classified/house/for-sale/jodoigne/1370/10564815', 'https://www.immoweb.be/en/classified/house/for-sale/beveren/9120/10561682', 'https://www.immoweb.b

In [21]:
def _extract_classified(html: str):
    """Return the 'classified' entry of the page's window.dataLayer JSON, or None if it is absent or unparsable."""
    window_data = re.findall(r"window\.dataLayer =(.+?);\n", html, re.S)
    if not window_data:
        return None
    try:
        return json.loads(window_data[0])[0]['classified']
    except (ValueError, LookupError, TypeError):
        # Malformed JSON (JSONDecodeError is a ValueError) or an unexpected
        # structure: this data is only echoed, so it must not abort the scrape.
        return None


def get_one_property_info(url_one_property: str, timeout: float = 30) -> Dict:
    """Scrape one property page into a flat dict of its table rows.

    Downloads the page, prints the HTTP status code and, when it can be
    parsed, the 'classified' entry of the page's ``window.dataLayer`` JSON.
    The returned dict is built only from the URL and the HTML tables; the
    dataLayer data is printed, not returned.

    Args:
        url_one_property: URL of the property page. Its last '/'-separated
            segment is stored under the "Id" key.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error.

    Returns:
        Dict with "Id" plus one entry per table row, mapping the row's first
        column to its second column. If the page contains no HTML table the
        dict holds only "Id".
    """
    req = requests.get(url_one_property, timeout=timeout)
    print(req.status_code)

    # Guarded: the original indexed an empty list (IndexError) whenever the
    # dataLayer snippet was missing from the page.
    classified = _extract_classified(req.text)
    if classified is not None:
        print(classified)

    house_dict = {}
    house_dict["Id"] = url_one_property.split("/")[-1]

    try:
        # Wrapped in StringIO because passing literal HTML text straight to
        # read_html is deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(req.text))
    except ValueError:
        # read_html raises ValueError when the document has no tables.
        tables = []

    if tables:
        df_one_property = pd.concat(tables, ignore_index=True)
        # Assumes header-less two-column tables, so the columns are labelled
        # 0 and 1 -- TODO confirm this holds for every listing layout.
        for label, value in zip(df_one_property[0], df_one_property[1]):
            house_dict[label] = value

    return house_dict

# Smoke test on one hard-coded listing URL. The commented-out line below would
# instead print the first element of what get_url_list() returns.
# print(get_url_list()[0])
get_one_property_info("https://www.immoweb.be/en/classified/mansion/for-sale/puurs/2870/10566398")


200


{'id': '10566398',
 'type': 'house',
 'subtype': 'mansion',
 'price': '439000',
 'transactionType': 'for sale',
 'zip': '2870',
 'visualisationOption': 'xl',
 'kitchen': {'type': 'installed'},
 'building': {'constructionYear': '1909', 'condition': 'to be done up'},
 'energy': {'heatingType': 'gas'},
 'certificates': {'primaryEnergyConsumptionLevel': '240'},
 'bedroom': {'count': '4'},
 'land': {'surface': '497'},
 'atticExists': '',
 'basementExists': 'true',
 'outdoor': {'garden': {'surface': ''}, 'terrace': {'exists': 'true'}},
 'specificities': {'SME': {'office': {'exists': 'true'}}},
 'wellnessEquipment': {'hasSwimmingPool': ''},
 'parking': {'parkingSpaceCount': {'indoor': '1', 'outdoor': ''}},
 'condition': {'isNewlyBuilt': ''}}

In [None]:
def clean_data_to_csv():
    """Placeholder: not implemented yet, so a call does nothing and returns None.

    Planned behaviour, per the original note: take the info for a single
    property (as JSON or a dict) from get_one_prop_info() -- presumably
    get_one_property_info(); TODO confirm -- and add it to a CSV.

    Name on the original note: Mourad (presumably the assignee).
    """


In [None]:

def get_collective_data():
    """Placeholder: not implemented yet, so a call does nothing and returns None.

    Planned behaviour, per the original note: run clean_data_to_csv() for
    all urls from get_url_list(), using a pool.
    """

In [None]:
def cleaning_data_from_csv():
    """Placeholder: not implemented yet, so a call does nothing and returns None.

    TODO: describe the intended behaviour; the original docstring was empty.
    """

In [None]:
# Column labels for the listings frame, with "Id" first and "Url" last.
# NOTE: a later cell assigns headers_df and df again with "Id" moved to just
# before "Url"; whichever cell runs last determines the column order.
headers_df = [
    "Id",
    "Locality",
    "Type of property",
    "Subtype of property",
    "Price",
    "Type of sale",
    "Number of rooms",
    "Living Area",
    "Fully equipped kitchen",
    "Furnished",
    "Open fire",
    "Terrace",
    "Terrace area",
    "Garden",
    "Garden area",
    "Surface of the land",
    "Surface area of the plot of land",
    "Number of facades",
    "Swimming pool",
    "State of the building",
    "Url",
]

# Empty frame that carries only the column labels; it has no rows yet.
df = pd.DataFrame(columns=headers_df)

In [36]:
# Exploratory cell: download one hard-coded listing page and stack every HTML
# table pandas finds in it into a single frame.
url = 'https://www.immoweb.be/en/classified/mansion/for-sale/puurs/2870/10566398'
content = requests.get(url).content
# keep_default_na=False: pandas' default NaN markers are not applied, so cell
# text stays as parsed instead of being converted to NaN.
all_tables = pd.read_html(content,keep_default_na=False)
# ignore_index=True renumbers the rows 0..n-1 across the concatenated tables.
data_frame_houses = pd.concat(all_tables,  ignore_index=True)

In [None]:
# Display the frame built by concatenating the listing page's tables.
data_frame_houses

In [38]:
# Turn the two-column frame into a dict: column 0 holds the labels (keys),
# column 1 the matching values.
house_dict = {}
# An id could be added to this dictionary first.
labels = data_frame_houses[0]
values = data_frame_houses[1]
for row in data_frame_houses.index:
    house_dict[labels[row]] = values[row]
house_dict

{'Construction year': '1909',
 'Building condition': 'To be done up',
 'Street frontage width': '10 m',
 'Number of frontages': '2',
 'Covered parking spaces': '1',
 'Living area': '282  m² square meters',
 'Living room surface': '52  m² square meters',
 'Kitchen type': 'Installed',
 'Kitchen surface': '14  m² square meters',
 'Bedrooms': '4',
 'Bedroom 1 surface': '19  m² square meters',
 'Bedroom 2 surface': '17  m² square meters',
 'Bedroom 3 surface': '17  m² square meters',
 'Bathrooms': '1',
 'Toilets': '2',
 'Office surface': '28  m² square meters',
 'Office': 'Yes',
 'Basement': 'Yes',
 'Surface of the plot': '497  m²  square meters',
 'Width of the lot on the street': '10 m  meters',
 'Gas, water & electricity': 'No',
 'Garden': 'Yes',
 'Garden orientation': 'East',
 'Terrace surface': '40  m²  square meters',
 'Primary energy consumption': '240  kWh/m²  kilowatt hour per square meters',
 'Energy class': 'C',
 '': '',
 'Reference number of the EPC report': '20230308-0002314212

In [20]:
# house_keys = data_frame_houses.iloc[:,0]
# house_values = data_frame_houses.iloc[:,1]

In [39]:
# Re-declares headers_df and df, both already assigned in an earlier cell.
# This version places "Id" second to last, just before "Url".
headers_df = [
    "Locality",
    "Type of property",
    "Subtype of property",
    "Price",
    "Type of sale",
    "Number of rooms",
    "Living Area",
    "Fully equipped kitchen",
    "Furnished",
    "Open fire",
    "Terrace",
    "Terrace area",
    "Garden",
    "Garden area",
    "Surface of the land",
    "Surface area of the plot of land",
    "Number of facades",
    "Swimming pool",
    "State of the building",
    "Id",
    "Url",
]

# Empty frame that carries only the column labels; it has no rows yet.
df = pd.DataFrame(columns=headers_df)

In [40]:
# Display the (still empty) frame to check its column labels.
df

Unnamed: 0,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Living Area,Fully equipped kitchen,Furnished,Open fire,...,Terrace area,Garden,Garden area,Surface of the land,Surface area of the plot of land,Number of facades,Swimming pool,State of the building,Id,Url
