In [20]:
%load_ext autoreload
%autoreload 2

import json
from datetime import datetime
import os
from pathlib import Path
import re
import sys

import pandas as pd
import polars as pl
import numpy as np

# Project root: two levels above the current working directory.
# NOTE(review): this depends on the directory the kernel was started in — confirm.
ROOT_PATH = Path.cwd().parents[1]

# Make the project's `src` package importable. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
if str(ROOT_PATH) not in sys.path:
    sys.path.append(str(ROOT_PATH))
from src.functions.json_values import (
    get_json_values,
    get_json_values_parallel,
    select_value,
)
from src.functions.flatten_dict import flatten_dict

# Directory whose entries are listed and loaded as classified JSON files
CLASSIFIED_PATH = ROOT_PATH.joinpath("data", "classified")
# Day of month compared against get_day() when selecting files
SCRAPING_DAY = 22

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Extract date of classified
def get_day(file_name: str):
    """Return the day of month encoded in a classified file name.

    The expected pattern is ``classified_<anything>_<YYYYMMDD>_<digits>.json``;
    the first of the two trailing digit groups is parsed as ``%Y%m%d``.

    Args:
        file_name: Base name of the file (no directory part).

    Returns:
        The day of month as an ``int``, or ``None`` when ``file_name`` does not
        follow the pattern (for example unrelated entries returned by
        ``os.listdir``), so the function can be used directly as a filter.

    Raises:
        ValueError: If the name matches the pattern but the date group is not
            a valid ``%Y%m%d`` date.
    """
    # The dot before "json" is escaped so that it only matches a literal ".".
    ymd_groups = r"^classified_.+_(\d+)_(\d+)\.json$"
    match = re.match(ymd_groups, file_name)
    if match is None:
        return None
    date_string = match.group(1)  # First capture group: the YYYYMMDD part
    return datetime.strptime(date_string, "%Y%m%d").day

In [4]:
# Keep classified files that were captured on day SCRAPING_DAY of the month.
# get_day() only yields the day of month, so month and year are not checked here.
# The result is sorted because os.listdir() returns entries in arbitrary order
# and a later cell indexes into this list (list_filenames[0]).
list_filenames = sorted(
    filename
    for filename in os.listdir(CLASSIFIED_PATH)
    if get_day(filename) == SCRAPING_DAY
)

In [5]:
# Read one file and extract useful variables
# The encoding is passed explicitly so the result does not depend on the
# platform's default text encoding.
with open(CLASSIFIED_PATH / list_filenames[0], encoding="utf-8") as json_file:
    class_dict = json.load(json_file)

# Top-level sections of one classified record
class_dict.keys()

dict_keys(['user', 'classified', 'customer', 'screen', 'other_defined', 'classified_table'])

In [6]:
# Find and select renting locations: look up classified -> transactionType
# NOTE(review): presumably get_json_values returns one value per file — confirm
transaction_keys = ("classified", "transactionType")
list_transactions = get_json_values(list_filenames, CLASSIFIED_PATH, *transaction_keys)

In [7]:
# Distinct transaction types found in the selected files
np.unique(list_transactions)

array(['for rent', 'for sale'], dtype='<U8')

In [8]:
## Collect classified -> type for every selected file (used for filtering below)
type_keys = ("classified", "type")
list_types = get_json_values(list_filenames, CLASSIFIED_PATH, *type_keys)

In [10]:
# Distinct property types found in the selected files
np.unique(list_types)

array(['apartment', 'apartment group', 'house', 'house group'],
      dtype='<U15')

In [12]:
## Filter JSONS about apartment to rent
# Boolean masks with one entry per element of list_types / list_transactions
bl_type = np.isin(np.array(list_types), ["apartment", "apartment group"])
bl_transact = np.isin(np.array(list_transactions), "for rent")
bl = np.logical_and(bl_type, bl_transact)

# zip() silently stops at the shortest input, so a length mismatch would pair
# file names with the wrong flags; fail loudly instead.
assert len(bl) == len(list_filenames), (
    f"mask has {len(bl)} entries but there are {len(list_filenames)} file names"
)

list_apartrent = [filename for filename, b in zip(list_filenames, bl) if b]

In [22]:
# Possible values: shape of the flattened first apartment-for-rent record
first_apartrent_record = select_value(list_apartrent[0], CLASSIFIED_PATH)
pl.DataFrame(flatten_dict(first_apartrent_record)).shape

(1, 75)

In [23]:
# Display the nested record of the first apartment-for-rent file
first_apartrent_file = list_apartrent[0]
select_value(first_apartrent_file, CLASSIFIED_PATH)

{'user': {'loginStatus': 'logged out', 'id': '', 'personal': {'language': ''}},
 'classified': {'id': '10578471',
  'type': 'apartment',
  'subtype': 'apartment',
  'price': '1050',
  'transactionType': 'for rent',
  'zip': '1060',
  'visualisationOption': 'xl',
  'kitchen': {'type': 'installed'},
  'building': {'constructionYear': '1929', 'condition': 'good'},
  'energy': {'heatingType': 'gas'},
  'certificates': {'primaryEnergyConsumptionLevel': '324'},
  'bedroom': {'count': '2'},
  'land': {'surface': ''},
  'atticExists': '',
  'basementExists': '',
  'outdoor': {'garden': {'surface': ''}, 'terrace': {'exists': ''}},
  'specificities': {'SME': {'office': {'exists': 'true'}}},
  'wellnessEquipment': {'hasSwimmingPool': ''},
  'parking': {'parkingSpaceCount': {'indoor': '', 'outdoor': ''}},
  'condition': {'isNewlyBuilt': ''}},
 'customer': {'id': '2575488',
  'name': 'we invest bruxelles',
  'family': 'agency',
  'groupInfo': {'id': '', 'name': ''},
  'networkInfo': {'id': '', 'nam

In [None]:
# Defining component I want to extract from the apartments to rent
# Maps an output field name to a list of keys; each list looks like a key path
# into a classified record (e.g. ["classified", "id"]).
# NOTE(review): in the sample record printed by the previous cell, "building",
# "certificates", "bedroom" and "land" appear nested under "classified", and the
# top-level keys shown earlier are user / classified / customer / screen /
# other_defined / classified_table. Confirm that whatever consumes this dict
# resolves the paths starting with those four keys as intended.
component_dict = {
    "listing_id": ["classified", "id"],
    "price": ["classified", "price"],
    "contact_type": ["customer", "family"],
    "contact_name": ["customer", "name"],
    "subtype": ["classified", "subtype"],
    "zip_code": ["classified", "zip"],
    # The next paths do not start at a top-level key of the sample record (see note above)
    "construction_year": ["building", "constructionYear"],
    "building_condition": ["building", "condition"],
    "energy_consumption": ["certificates", "primaryEnergyConsumptionLevel"],
    "energy_class": ["classified_table", "energy_class"],
    "n_bedrooms": ["bedroom", "count"],
    "land_surface": ["land", "surface"],
    "surroundings_type": ["classified_table", "surroundings_type"],
}