In [297]:
%load_ext autoreload
%autoreload 2

import json
from datetime import datetime
import os
from pathlib import Path
import re
import sys

import pandas as pd
import polars as pl
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt

ROOT_PATH = Path(os.path.abspath("")).parents[1]

sys.path.append(str(ROOT_PATH))
from src.functions.json_values import (
    get_json_values,
    get_json_values_parallel,
    select_value,
)
from src.functions.flatten_dict import flatten_dict

CLASSIFIED_PATH = ROOT_PATH / "data/classified"
SCRAPING_DAY = 22

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [298]:
# Extract date of classified
def get_day(file_name: str):
    ymd_groups = r"^classified_.+_(\d+)_(\d+).json$"
    year_string = re.findall(ymd_groups, file_name)[0][
        0
    ]  # First element of list, first group
    return datetime.strptime(year_string, "%Y%m%d").day

In [299]:
# Keep classified files that were captured on the 15th of May
list_filenames = os.listdir(CLASSIFIED_PATH)
new_list_filenames = []

for filename in list_filenames:
    try:
        filename_day = get_day(filename)
    except IndexError:
        print(
            filename
        )  # Some filenames are changed by mistakes <- shouldn't happen too often
        continue
    if filename_day == SCRAPING_DAY:
        new_list_filenames.append(filename)

list_filenames = new_list_filenames

classified_10000290apartmentforrent8600Diksmuide_20230522_212157.json~


In [300]:
# Read one file and extract useful variables
with open(CLASSIFIED_PATH / list_filenames[0]) as json_file:
    class_dict = json.load(json_file)

class_dict.keys()

dict_keys(['user', 'classified', 'customer', 'screen', 'other_defined', 'classified_table'])

In [301]:
# Find and select renting locations
list_transactions = get_json_values(
    list_filenames, CLASSIFIED_PATH, "classified", "transactionType"
)

In [302]:
# Get unique
np.unique(np.array(list_transactions))

array(['for rent', 'for sale'], dtype='<U8')

In [303]:
## Filter for rent and apartments
list_types = get_json_values(list_filenames, CLASSIFIED_PATH, "classified", "type")

In [304]:
np.array(np.unique(list_types))

array(['apartment', 'apartment group', 'house', 'house group'],
      dtype='<U15')

In [305]:
## Filter JSONS about apartment to rent
bl_type = np.isin(np.array(list_types), ["apartment", "apartment group"])
bl_transact = np.isin(np.array(list_transactions), "for rent")
bl = np.logical_and(bl_type, bl_transact)

list_apartrent = [filename for filename, b in zip(list_filenames, bl) if b]

In [306]:
# Possible values
pl.DataFrame(flatten_dict(select_value(list_apartrent[0], CLASSIFIED_PATH))).shape

(1, 75)

In [307]:
select_value(list_apartrent[0], CLASSIFIED_PATH)

{'user': {'loginStatus': 'logged out', 'id': '', 'personal': {'language': ''}},
 'classified': {'id': '10578471',
  'type': 'apartment',
  'subtype': 'apartment',
  'price': '1050',
  'transactionType': 'for rent',
  'zip': '1060',
  'visualisationOption': 'xl',
  'kitchen': {'type': 'installed'},
  'building': {'constructionYear': '1929', 'condition': 'good'},
  'energy': {'heatingType': 'gas'},
  'certificates': {'primaryEnergyConsumptionLevel': '324'},
  'bedroom': {'count': '2'},
  'land': {'surface': ''},
  'atticExists': '',
  'basementExists': '',
  'outdoor': {'garden': {'surface': ''}, 'terrace': {'exists': ''}},
  'specificities': {'SME': {'office': {'exists': 'true'}}},
  'wellnessEquipment': {'hasSwimmingPool': ''},
  'parking': {'parkingSpaceCount': {'indoor': '', 'outdoor': ''}},
  'condition': {'isNewlyBuilt': ''}},
 'customer': {'id': '2575488',
  'name': 'we invest bruxelles',
  'family': 'agency',
  'groupInfo': {'id': '', 'name': ''},
  'networkInfo': {'id': '', 'nam

In [308]:
# Defining component I want to extract from the apartments to rent
component_dict = {
    "listing_id": ["classified", "id"],
    "price": ["classified", "price"],
    "contact_type": ["customer", "family"],
    "contact_name": ["customer", "name"],
    "subtype": ["classified", "subtype"],
    "zip_code": ["classified", "zip"],
    "construction_year": ["building", "constructionYear"],
    "building_condition": ["building", "condition"],
    "energy_consumption": ["certificates", "primaryEnergyConsumptionLevel"],
    "energy_class": ["classified_table", "energy_class"],
    "n_bedrooms": ["bedroom", "count"],
    "land_surface": ["land", "surface"],
    "surroundings_type": ["classified_table", "surroundings_type"],
    "living_area": ["classified_table", "living_area"],
}

In [309]:
pl_apart = pl.DataFrame()

for key in component_dict.keys():
    data_arr = get_json_values(list_apartrent, CLASSIFIED_PATH, *component_dict[key])
    pl_apart = pl_apart.with_columns(pl.lit(data_arr).alias(key))


pl_apart.head()

listing_id,price,contact_type,contact_name,subtype,zip_code,construction_year,building_condition,energy_consumption,energy_class,n_bedrooms,land_surface,surroundings_type,living_area
str,str,str,str,str,str,f32,f32,f32,str,f32,f32,str,str
"""10578471""","""1050""","""agency""","""we invest brux…","""apartment""","""1060""",,,,"""f""",,,"""urban""","""92"""
"""10579694""","""2750""","""agency""","""bathim &amp; c…","""penthouse""","""1050""",,,,"""d""",,,"""urban""","""200"""
"""10578602""","""1250""","""agency""","""bathim &amp; c…","""apartment""","""1000""",,,,"""b""",,,"""living area (r…","""50"""
"""10527337""","""1350""","""agency""","""place 4 you""","""duplex""","""1150""",,,,"""g""",,,"""living area (r…","""80"""
"""10578603""","""1250""","""agency""","""bathim &amp; c…","""apartment""","""1030""",,,,"""d""",,,"""isolated""","""85"""


In [310]:
# With price, save only max price, parse
price_list = pl_apart.select("price").to_numpy().ravel().tolist()

In [311]:
# Investigating strange price patterns
pattern = r"(\d+)(.*)"
for price in price_list:
    found = re.findall(pattern, price)
    try:
        if found[0][1] != "":
            print(found[0])
    except IndexError:
        print(found)

('900', ' - 922')
('950', ' - 1050')
('950', ' - 1390')
('915', ' - 915')
('860', ' - 1180')
[]
('825', ' - 995')
('765', ' - 853')
('950', ' - 1150')


In [312]:
adjusted_prices = []

pattern = r"(\d+)( - )?(\d+)"
for price in price_list:
    found = re.findall(pattern, price)
    try:
        if found[0][1] != "":
            adjusted_prices.append(found[0][2])
        else:
            adjusted_prices.append(price)
    except IndexError:
        adjusted_prices.append(None)

In [313]:
# Replace adjusted prices (we retain the max rent price)
pl_apart = pl_apart.with_columns(pl.lit(adjusted_prices).cast(pl.Int64).alias("price"))

In [314]:
# Cast rows as relevant
q = [
    pl.col("price").cast(pl.Int64),
    pl.col("n_bedrooms").cast(pl.Int64),
    pl.col("energy_consumption").cast(pl.Int64),
    pl.col("construction_year").cast(pl.Int64),
    pl.col("zip_code").cast(pl.Int64),
    pl.col("living_area").cast(pl.Float64),
]

pl_apart = pl_apart.with_columns(q)

In [315]:
pl_apart.head()

listing_id,price,contact_type,contact_name,subtype,zip_code,construction_year,building_condition,energy_consumption,energy_class,n_bedrooms,land_surface,surroundings_type,living_area
str,i64,str,str,str,i64,i64,f32,i64,str,i64,f32,str,f64
"""10578471""",1050,"""agency""","""we invest brux…","""apartment""",1060,,,,"""f""",,,"""urban""",92.0
"""10579694""",2750,"""agency""","""bathim &amp; c…","""penthouse""",1050,,,,"""d""",,,"""urban""",200.0
"""10578602""",1250,"""agency""","""bathim &amp; c…","""apartment""",1000,,,,"""b""",,,"""living area (r…",50.0
"""10527337""",1350,"""agency""","""place 4 you""","""duplex""",1150,,,,"""g""",,,"""living area (r…",80.0
"""10578603""",1250,"""agency""","""bathim &amp; c…","""apartment""",1030,,,,"""d""",,,"""isolated""",85.0


In [316]:
# Contact types
pl_apart.groupby("contact_type").count()

contact_type,count
str,u32
"""agency_paying_…",127
"""real_estate_ag…",121
"""property_devel…",163
"""notary""",1
"""company""",32
"""private""",2434
"""agency""",7041
"""company_paying…",33


In [317]:
# Load zipcodes and coordinates data (and merge)
pl_zip = pl.read_csv(ROOT_PATH / "data/zipcode-belgium.csv", has_header=False)
pl_zip.columns = ["zip_code", "commune", "geo_long", "geo_lat"]

pl_zip.head()

zip_code,commune,geo_long,geo_lat
i64,str,f64,f64
1000,"""Bruxelles""",4.351697,50.846557
1020,"""Laeken""",4.3487134,50.883392
1030,"""Schaerbeek""",4.3737121,50.867604
1040,"""Etterbeek""",4.3895104,50.836851
1050,"""Ixelles""",4.3815707,50.822285


In [318]:
pl_apart = pl_apart.join(pl_zip, on="zip_code", how="left")

In [319]:
# Any coordinate data missing?
pl_apart.select(pl.col("geo_long").is_null().sum())

geo_long
u32
0


In [320]:
# Check for duplicate ids -> A lot
# Filter them out
q = pl.col("listing_id").is_duplicated().is_not()
pl_apart = pl_apart.filter(q)

In [321]:
# Read municipality shape file
gpd_municipalities = gpd.read_file(
    ROOT_PATH / "data/adminvector_3812.gpkg", layer="municipality"
)
gpd_municipalities.head()

Unnamed: 0,tgid,modifdate,arrondissementcapital,provincecapital,regioncapital,countrycapital,niscode,city,languagestatute,nameger,namefre,namedut,geometry
0,{8BF44CB0-B8FD-44F6-A64F-1307610DA4C9},2007-01-05,False,False,False,False,72004,1,1,Bree,Bree,Bree,"MULTIPOLYGON Z (((735277.942 700725.863 0.000,..."
1,{54A85359-4967-4318-AA63-D234DDED2FD7},2007-01-05,False,False,False,False,63004,0,2,Baelen,Baelen,Baelen,"MULTIPOLYGON Z (((767808.071 646176.387 0.000,..."
2,{95E2AAE2-F9DB-456C-A113-2A21ED6F932F},2007-01-05,False,False,False,False,13003,0,1,Balen,Balen,Balen,"MULTIPOLYGON Z (((708505.571 703629.811 0.000,..."
3,{4487B4B8-4422-4856-97C5-33174BF84028},2007-01-05,False,False,False,False,62011,0,2,Bassenge,Bassenge,Bitsingen,"MULTIPOLYGON Z (((738180.089 660714.898 0.000,..."
4,{68A224E9-B2E7-4225-9FDA-03AE2B9C8C41},2007-01-05,False,False,False,False,85046,0,2,Habay,Habay,Habay,"MULTIPOLYGON Z (((741033.075 543058.412 0.000,..."


In [322]:
# Convert to pandas
df_apart = pl_apart.to_pandas()

In [323]:
# Spatial join
apart_geom = [Point(x, y) for x, y in zip(df_apart["geo_long"], df_apart["geo_lat"])]
df_apart = df_apart.drop(["geo_long", "geo_lat"], axis=1)
gpd_apart = gpd.GeoDataFrame(df_apart, geometry=apart_geom)

In [324]:
gpd_apart = gpd_apart.sjoin(
    gpd_municipalities.to_crs("EPSG:4326"), how="left", predicate="within"
)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  return geopandas.sjoin(left_df=self, right_df=df, *args, **kwargs)
