# Data Prep - IPTU Data
IPTU is the real estate property tax that owners pay annually to the local government administrations throughout the country. In the case of the city of São Paulo, this dataset is made publicly available and lists several relevant attributes of real estate properties across the entire city. At the lowest level of geospatial reference, the properties are encoded with their zip codes, which, in the city of São Paulo, roughly correspond to the street where the property is located. 

The strategy for this data preparation phase is to prepare the features for the properties at the `zip_code` level and match it to the `zip_code` polygons / (lat, long) points so that these features can be normalized at the district level.

In [1]:
# installing dependencies for data preparation
!pip install -r ../configs/dependencies/dataprep_requirements.txt >> ../configs/dependencies/package_installation.txt

In [2]:
# loading the magic commands:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [3]:
###### Loading the necessary libraries #########

# PySpark dependencies:s
import pyspark
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
import pyspark.sql.types as T
from pyspark.sql.window import Window

# Sedona dependencies:
from sedona.utils.adapter import Adapter
from sedona.register import SedonaRegistrator
from sedona.utils import KryoSerializer, SedonaKryoRegistrator
from sedona.core.SpatialRDD import SpatialRDD
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.core.formatMapper import GeoJsonReader

# database utilities:
from sqlalchemy import create_engine
import sqlite3 as db
import pandas as pd
from tqdm import tqdm
import geopandas as gpd
import fiona

# plotting and data visualization:
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, Image

# other relevant libraries:
import warnings
import unidecode
import inflection
import unicodedata
from datetime import datetime, timedelta
from functools import partial
import json
import re
import os
from glob import glob
import shutil
import itertools
import chardet

# importing the atlas utilities:
from atlasutils import (
    save_to_filesystem,
    save_as_table,
    rotate_xticks,
    get_file_encoding,
    normalize_entities,
    normalize_column_name,
    apply_category_map,
    standardize_variable_names,
    get_null_columns,
    replace_decimal_separator,
    convert_to_geopandas,
    drop_invalid_census_columns,
    clean_census_column_name,
    get_file_crs,
    get_column_values,
)


# setting global parameters for the notebook output:
# silence every warning for the remainder of the session
warnings.filterwarnings("ignore")
# pandas display options: precision of 4 and floats rendered through "%.2f"
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

# 0. Configuring Spark

In [4]:
# function to encapsulate standard spark configurations:
def init_spark(app_name):
    """Build (or reuse) a Hive-enabled SparkSession and register Sedona on it.

    Parameters
    ----------
    app_name : str
        Name passed to ``SparkSession.builder.appName``.

    Returns
    -------
    SparkSession
        The session returned by ``getOrCreate()``, after
        ``SedonaRegistrator.registerAll`` has been called on it.
    """
    # (key, value) pairs applied to the builder in this exact order
    settings = [
        ("spark.files.overwrite", "true"),
        ("spark.serializer", KryoSerializer.getName),
        ("spark.kryo.registrator", SedonaKryoRegistrator.getName),
        (
            "spark.jars.packages",
            "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,"
            "org.datasyslab:geotools-wrapper:geotools-24.1",
        ),
        ("spark.sql.repl.eagerEval.enabled", True),
        ("spark.sql.repl.eagerEval.maxNumRows", 5),
        ("spark.sql.legacy.timeParserPolicy", "LEGACY"),
        ("spark.sql.parquet.compression.codec", "gzip"),
        ("sedona.global.charset", "utf8"),
        ("sedona.global.index", "true"),
    ]

    builder = SparkSession.builder.appName(app_name)
    for key, value in settings:
        builder = builder.config(key, value)

    session = builder.enableHiveSupport().getOrCreate()

    # make the Sedona types and SQL functions available on the session
    SedonaRegistrator.registerAll(session)

    return session

In [5]:
# init the spark session:
spark = init_spark("SP Atlas - IPTU Data")

In [6]:
# verifying the session status:
spark

# 1. Loading and Inspecting the Data

In [7]:
# location of the raw datasets:
RAW_DATA_DIR = "../data/raw/"

# probing the IPTU file to guess its encoding from a leading sample of bytes
# (only a portion of the file is read):
probe_start, probe_end = 0, 20000
with open(RAW_DATA_DIR + "sp_iptu/iptu_raw.csv", "rb") as raw_file:
    raw_file.seek(probe_start)
    sample = raw_file.read(probe_end - probe_start)
    print(chardet.detect(sample))

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


The IPTU raw datasets are large files with multiple columns. By analyzing a sample of the dataset previously, I selected the columns that will be necessary for our project and defined the schema.

In [8]:
# columns of the raw IPTU file kept for this project. The list is sorted
# alphabetically, and that order becomes the column order of the dataframe
# selected below:
IPTU_COLUMNS = sorted(
    [
        "CODLOG DO IMOVEL",
        "NOME DE LOGRADOURO DO IMOVEL",
        "NUMERO DO IMOVEL",
        "NUMERO DO CONTRIBUINTE",
        "CEP DO IMOVEL",
        "BAIRRO DO IMOVEL",
        "FRACAO IDEAL",
        "AREA DO TERRENO",
        "AREA CONSTRUIDA",
        "AREA OCUPADA",
        "VALOR DO M2 DO TERRENO",
        "VALOR DO M2 DE CONSTRUCAO",
        "ANO DA CONSTRUCAO CORRIGIDO",
        "QUANTIDADE DE PAVIMENTOS",
        "TIPO DE USO DO IMOVEL",
        "ANO DE INICIO DA VIDA DO CONTRIBUINTE",
        "TESTADA PARA CALCULO",
        "FATOR DE OBSOLESCENCIA",
        "QUANTIDADE DE ESQUINAS/FRENTES",
        "TIPO DE CONTRIBUINTE 1",
        "TIPO DE CONTRIBUINTE 2",
        "TIPO DE PADRAO DA CONSTRUCAO",
        "TIPO DE TERRENO",
    ]
)

# reading the ";"-separated file (header row, ISO-8859-1 encoding) and keeping
# only the selected columns; no schema is supplied to the reader:
df_iptu = spark.read.csv(
    RAW_DATA_DIR + "sp_iptu/iptu_raw.csv",
    header=True,
    sep=";",
    encoding="ISO-8859-1",
).select(*IPTU_COLUMNS)

In [9]:
# building an id for each property by concatenating three of the raw columns:
id_source_columns = [
    "CODLOG DO IMOVEL",
    "NOME DE LOGRADOURO DO IMOVEL",
    "NUMERO DO IMOVEL",
]
property_id = F.concat(*[F.col(name) for name in id_source_columns])
df_iptu = df_iptu.withColumn("property_id", property_id)

# the source columns are dropped once the id exists:
df_iptu = df_iptu.drop(*id_source_columns)

In [10]:
# verifying some of the results:
df_iptu.count()

3383561

In [11]:
# verifying the schema:
df_iptu.printSchema()

root
 |-- ANO DA CONSTRUCAO CORRIGIDO: string (nullable = true)
 |-- ANO DE INICIO DA VIDA DO CONTRIBUINTE: string (nullable = true)
 |-- AREA CONSTRUIDA: string (nullable = true)
 |-- AREA DO TERRENO: string (nullable = true)
 |-- AREA OCUPADA: string (nullable = true)
 |-- BAIRRO DO IMOVEL: string (nullable = true)
 |-- CEP DO IMOVEL: string (nullable = true)
 |-- FATOR DE OBSOLESCENCIA: string (nullable = true)
 |-- FRACAO IDEAL: string (nullable = true)
 |-- NUMERO DO CONTRIBUINTE: string (nullable = true)
 |-- QUANTIDADE DE ESQUINAS/FRENTES: string (nullable = true)
 |-- QUANTIDADE DE PAVIMENTOS: string (nullable = true)
 |-- TESTADA PARA CALCULO: string (nullable = true)
 |-- TIPO DE CONTRIBUINTE 1: string (nullable = true)
 |-- TIPO DE CONTRIBUINTE 2: string (nullable = true)
 |-- TIPO DE PADRAO DA CONSTRUCAO: string (nullable = true)
 |-- TIPO DE TERRENO: string (nullable = true)
 |-- TIPO DE USO DO IMOVEL: string (nullable = true)
 |-- VALOR DO M2 DE CONSTRUCAO: string (nullab

In [12]:
# verifying null values:
get_null_columns(df_iptu, normalize=True)

-RECORD 0-----------------------------------------------------
 ANO DA CONSTRUCAO CORRIGIDO           | 0.0                  
 ANO DE INICIO DA VIDA DO CONTRIBUINTE | 0.0                  
 AREA CONSTRUIDA                       | 0.0                  
 AREA DO TERRENO                       | 0.0                  
 AREA OCUPADA                          | 0.0                  
 BAIRRO DO IMOVEL                      | 0.3584179507920797   
 CEP DO IMOVEL                         | 0.0                  
 FATOR DE OBSOLESCENCIA                | 0.0                  
 FRACAO IDEAL                          | 0.0                  
 NUMERO DO CONTRIBUINTE                | 0.0                  
 QUANTIDADE DE ESQUINAS/FRENTES        | 0.0                  
 QUANTIDADE DE PAVIMENTOS              | 0.0                  
 TESTADA PARA CALCULO                  | 0.0                  
 TIPO DE CONTRIBUINTE 1                | 0.019362440931314673 
 TIPO DE CONTRIBUINTE 2                | 0.792349539434

## 1.1 Changes identified
There are several changes we need to make to the dataset. 

1. The `CEP DO IMOVEL` column requires the removal of the `-` character;
2. The `BAIRRO DO IMOVEL` column is null in about `36%` of the rows (see the null ratios above). We can't do much about this from the raw data perspective. Later on, we can use the zip code layers to recover the neighborhood and other geospatial encodings of the dataset;
3. All column names will be processed to remove spaces and unnecessary characters;
4. All columns are encoded as strings, and some of the columns need to be converted to numerical types;

In [13]:
# dropping the compromised columns:
df_iptu = df_iptu.drop("BAIRRO DO IMOVEL")

# 2. Fixing Column names and converting data types

In [8]:
# defining an UDF for replacing the decimal separator
replace_decimal_separator_udf = F.udf(replace_decimal_separator)

In [15]:
# target column name -> target data type.
# NOTE: the pairing with the dataframe is positional, so the entries must
# follow the current column order of df_iptu.
new_cols = {
    "construction_year": "integer",
    "owner_start_contribution_year": "integer",
    "area_built": "integer",
    "area_lot": "integer",
    "area_occupied": "integer",
    "zipcode": "string",
    "obsolescence_factor": "float",
    "fraction_factor": "float",
    "owner_id": "string",
    "number_fronts_corners": "integer",
    "number_floors": "integer",
    "front_factor": "float",
    "owner_type_1": "string",
    "owner_type_2": "string",
    "construction_type": "string",
    "lot_type": "string",
    "property_type": "string",
    "construction_square_meter_value": "float",
    "lot_square_meter_value": "float",
    "property_id": "string",
}

# saving the previous columns for later
previous_columns = df_iptu.columns

# types list:
types = list(new_cols.values())
new_columns = list(new_cols.keys())

# the columns are paired by position, so fail loudly when the counts differ
# instead of raising an opaque IndexError or leaving columns unconverted:
if len(previous_columns) != len(new_cols):
    raise ValueError(
        f"new_cols describes {len(new_cols)} columns, "
        f"but df_iptu has {len(previous_columns)}: {previous_columns}"
    )

# converting data types and renaming each column:
for old_name, new_name, new_type in zip(previous_columns, new_columns, types):

    if new_type == "float":
        # the decimal separator is rewritten by the helper UDF before the cast
        df_iptu = df_iptu.withColumn(
            old_name,
            replace_decimal_separator_udf(F.col(old_name)),
        ).withColumn(old_name, F.col(old_name).cast("float"))

    elif new_type == "integer":
        df_iptu = df_iptu.withColumn(old_name, F.col(old_name).cast("integer"))

    # "string" columns keep the type they were read with and are only renamed
    df_iptu = df_iptu.withColumnRenamed(old_name, new_name)

In [16]:
# verifying the results:
df_iptu

construction_year,owner_start_contribution_year,area_built,area_lot,area_occupied,zipcode,obsolescence_factor,fraction_factor,owner_id,number_fronts_corners,number_floors,front_factor,owner_type_1,owner_type_2,construction_type,lot_type,property_type,construction_square_meter_value,lot_square_meter_value,property_id
1924,1963,135,136,108,01104-001,0.2,1.0,0010030001-4,1,1,13.0,PESSOA FISICA (CPF),,Comercial horizon...,De esquina,Loja,1566.0,2103.0,03812-1R S CAETANO13
1944,1963,67,90,67,01104-001,0.2,1.0,0010030002-2,0,1,6.0,PESSOA FISICA (CPF),,Comercial horizon...,Normal,Loja,1566.0,2103.0,03812-1R S CAETANO19
1965,1963,140,105,84,01104-001,0.35,1.0,0010030003-0,0,2,7.85,PESSOA FISICA (CPF),,Comercial horizon...,Normal,Loja,1566.0,2103.0,03812-1R S CAETANO27
1944,1963,103,108,86,01104-001,0.2,1.0,0010030004-9,0,1,6.05,PESSOA FISICA (CPF),,Comercial horizon...,Normal,Loja,1566.0,2103.0,03812-1R S CAETANO33
1944,1963,98,120,96,01104-001,0.2,1.0,0010030005-7,0,1,6.7,PESSOA FISICA (CPF),,Comercial horizon...,Normal,Loja,1566.0,2103.0,03812-1R S CAETANO39


# 3. Fixing Zipcodes

In [17]:
# the change to the zip code column will be simple: replace '-' with '':
df_iptu = df_iptu.withColumn("zipcode", F.regexp_replace("zipcode", "-", ""))

# 4. Preparing the Text-based columns

There are three columns that need to be handled:

1. `lot_type`: we can convert to more readable names;
2. `owner_type_1`: same as the `lot_type`;
3. `construction_type`: will require more careful considerations;

In [9]:
# registering an udf for text normalization:
normalize_entities_udf = F.udf(normalize_entities)

## 4.1 Lot Types

In [19]:
# normalizing the lot_type column:
df_iptu = df_iptu.withColumn("lot_type", normalize_entities_udf("lot_type"))

In [20]:
# verifying which types of lot are there:
df_iptu.select("lot_type").distinct().show(truncate=False)

+---------------+
|lot_type       |
+---------------+
|de_duas_ou_mais|
|de_esquina     |
|normal         |
|terreno_interno|
|lote_de_fundos |
|lote_de_esquina|
+---------------+



In [21]:
# translating the normalized lot types into English labels. The chain has no
# fallback branch, so a value missing from this table becomes null:
lot_type_labels = [
    ("de_esquina", "corner_lot"),
    ("lote_de_fundos", "back_lot"),
    ("lote_de_esquina", "corner_lot"),
    ("normal", "regular_lot"),
    ("terreno_interno", "internal_lot"),
    ("de_duas_ou_mais", "multiple_type_lot"),
]

(first_value, first_label), *other_labels = lot_type_labels
lot_type_expr = F.when(F.col("lot_type") == first_value, first_label)
for raw_value, label in other_labels:
    lot_type_expr = lot_type_expr.when(F.col("lot_type") == raw_value, label)

df_iptu = df_iptu.withColumn("lot_type", lot_type_expr)

## 4.2 Owner Types

In [22]:
# verifying the number different types of owners:
df_iptu.select("owner_type_1").distinct().show(truncate=False)

+----------------------+
|owner_type_1          |
+----------------------+
|PESSOA FISICA (CPF)   |
|null                  |
|PESSOA JURIDICA (CNPJ)|
+----------------------+



In [23]:
# relabelling owner_type_1; anything other than the two known values
# (nulls included) becomes "N/A":
owner_type_1 = F.col("owner_type_1")
is_person = owner_type_1 == "PESSOA FISICA (CPF)"
is_company = owner_type_1 == "PESSOA JURIDICA (CNPJ)"

df_iptu = df_iptu.withColumn(
    "owner_type_1",
    F.when(is_person, "person").when(is_company, "company").otherwise("N/A"),
)

In [24]:
# listing the types of owners for column owner_type_2:
df_iptu.select("owner_type_2").distinct().show(truncate=False)

+--------------------+
|owner_type_2        |
+--------------------+
|PESSOA FISICA (CPF) |
|PESSOA JURIDICA (CNP|
|null                |
+--------------------+



In [25]:
# relabelling owner_type_2; the company value is matched in the truncated form
# in which it appears in this column. Everything else becomes "N/A":
owner_type_2 = F.col("owner_type_2")
is_person = owner_type_2 == "PESSOA FISICA (CPF)"
is_company = owner_type_2 == "PESSOA JURIDICA (CNP"

df_iptu = df_iptu.withColumn(
    "owner_type_2",
    F.when(is_person, "person").when(is_company, "company").otherwise("N/A"),
)

## 4.3 Property age and time of ownership

In [26]:
# both durations are measured against 2017, the year the dataset was collected:
reference_year = F.lit(2017)

property_age = reference_year - F.col("construction_year")
years_of_ownership = reference_year - F.col("owner_start_contribution_year")

df_iptu = df_iptu.withColumn("property_age", property_age)
df_iptu = df_iptu.withColumn("years_of_ownership", years_of_ownership)

## 4.4 Property Types

In [27]:
# need to normalize property_type to avoid noisy data as well:
df_iptu = df_iptu.withColumn(
    "property_type", normalize_entities_udf(F.col("property_type"))
)

In [28]:
# extracting the distinct values of the property_type column:
property_types = get_column_values(df_iptu, "property_type")

# sorting the values:
property_types.sort()

In [29]:
# new category for each distinct property type, listed in the same order as
# the sorted property_types list.
# NOTE(review): the pairing is purely positional and zip() stops at the
# shorter sequence, so an extra or missing distinct value in the data silently
# shifts or drops labels -- confirm that len(categories) == len(property_types).
categories = [
    "apartment_buildings",
    "commercial_buildings",
    "other",
    "entertainment_venues",
    "sports_venues",
    "other",
    "school_buildings",
    "office_buildings",
    "other",
    "hotel_buildings",
    "apartment_buildings",
    "residential_parking_lots",
    "commercial_parking_lots",
    "commercial_parking_lots",
    "residential_parking_lots",
    "hospital_buildings",
    "hotel_buildings",
    "industrial_buildings",
    "stores",
    "stores",
    "stores",
    "other",
    "other",
    "commercial_buildings",
    "commercial_buildings",
    "other",
    "other",
    "apartment_buildings",
    "apartment_buildings",
    "office_buildings",
    "office_buildings",
    "houses",
    "houses",
    "houses",
    "religious_venues",
    "empty_lots",
]

# mapping each original property type to its new category:
cat_map = dict(zip(property_types, categories))

In [30]:
# applying the conversion map:
df_iptu = df_iptu.withColumn(
    "property_type", apply_category_map(cat_map)(F.col("property_type"))
)

## 4.5 Construction Types

In [31]:
# extracting the distinct values of the construction_type column:
construction_types = get_column_values(df_iptu, "construction_type")

# sorting the values:
construction_types.sort()

In [32]:
# listing all values
for col in construction_types:
    print(col)

Barracão/Telheiro/Oficina - padrão
Barracão/Telheiro/Oficina/Posto de
Comercial horizontal - padrão A
Comercial horizontal - padrão B
Comercial horizontal - padrão C
Comercial horizontal - padrão D
Comercial horizontal - padrão E
Comercial vertical - padrão A
Comercial vertical - padrão B
Comercial vertical - padrão C
Comercial vertical - padrão D
Comercial vertical - padrão E
Edifício de garagens - padrão A
Indústria - padrão E
Oficina/Posto de serviço/Armazém/D
Residencial horizontal - padrão A
Residencial horizontal - padrão B
Residencial horizontal - padrão C
Residencial horizontal - padrão D
Residencial horizontal - padrão E
Residencial horizontal - padrão F
Residencial vertical - padrão A
Residencial vertical - padrão B
Residencial vertical - padrão C
Residencial vertical - padrão D
Residencial vertical - padrão E
Residencial vertical - padrão F
TERRENO
Templo/Clube/Ginásio ou Estádio es


By taking a look at the [law that defines the IPTU](www.google.com), we can observe that the items related to `padrão` are in fact special denominations that can be useful later for building specific features, as they represent characteristics of the real estate properties that are not commonly described (architectural style, for example).

In [33]:
# (construction type, construction standard) for every distinct raw value of
# construction_type, in the same order as the sorted construction_types list.
# The trailing comment on each row names the raw value it labels.
#
# The previous version had 27 entries for the 29 distinct values, with no rows
# for "Indústria - padrão E" and "Oficina/Posto de serviço/Armazém/D". Every
# label after "Edifício de garagens" was therefore shifted by two positions and
# the last two raw values received no label at all.
# NOTE(review): the labels of the two added rows follow the existing naming;
# adjust them if a different category is preferred.
construction_labels = [
    ("warehouses", "A"),  # Barracão/Telheiro/Oficina - padrão
    ("warehouses", "N"),  # Barracão/Telheiro/Oficina/Posto de
    ("commercial_horizontal", "A"),  # Comercial horizontal - padrão A
    ("commercial_horizontal", "B"),  # Comercial horizontal - padrão B
    ("commercial_horizontal", "C"),  # Comercial horizontal - padrão C
    ("commercial_horizontal", "D"),  # Comercial horizontal - padrão D
    ("commercial_horizontal", "E"),  # Comercial horizontal - padrão E
    ("commercial_vertical", "A"),  # Comercial vertical - padrão A
    ("commercial_vertical", "B"),  # Comercial vertical - padrão B
    ("commercial_vertical", "C"),  # Comercial vertical - padrão C
    ("commercial_vertical", "D"),  # Comercial vertical - padrão D
    ("commercial_vertical", "E"),  # Comercial vertical - padrão E
    ("garage_buildings", "A"),  # Edifício de garagens - padrão A
    ("industrial_buildings", "E"),  # Indústria - padrão E
    ("warehouses", "N"),  # Oficina/Posto de serviço/Armazém/D
    ("residential_horizontal", "A"),  # Residencial horizontal - padrão A
    ("residential_horizontal", "B"),  # Residencial horizontal - padrão B
    ("residential_horizontal", "C"),  # Residencial horizontal - padrão C
    ("residential_horizontal", "D"),  # Residencial horizontal - padrão D
    ("residential_horizontal", "E"),  # Residencial horizontal - padrão E
    ("residential_horizontal", "F"),  # Residencial horizontal - padrão F
    ("residential_vertical", "A"),  # Residencial vertical - padrão A
    ("residential_vertical", "B"),  # Residencial vertical - padrão B
    ("residential_vertical", "C"),  # Residencial vertical - padrão C
    ("residential_vertical", "D"),  # Residencial vertical - padrão D
    ("residential_vertical", "E"),  # Residencial vertical - padrão E
    ("residential_vertical", "F"),  # Residencial vertical - padrão F
    ("empty_lot", "N"),  # TERRENO
    ("entertainment_venue", "N"),  # Templo/Clube/Ginásio ou Estádio es
]

# the pairing below is positional and zip() stops at the shorter sequence, so
# a count mismatch must not pass silently:
if len(construction_labels) != len(construction_types):
    raise ValueError(
        f"{len(construction_labels)} labels defined for "
        f"{len(construction_types)} distinct construction types: "
        f"{construction_types}"
    )

types = [construction_type for construction_type, _ in construction_labels]
standards = [standard for _, standard in construction_labels]

type_map = dict(zip(construction_types, types))
std_map = dict(zip(construction_types, standards))

In [34]:
# applying the conversion maps. The standard is derived first because the
# second step overwrites construction_type, the column both lookups read:
raw_construction_type = F.col("construction_type")

construction_standard = apply_category_map(std_map)(raw_construction_type)
df_iptu = df_iptu.withColumn("construction_standard", construction_standard)

construction_type = apply_category_map(type_map)(raw_construction_type)
df_iptu = df_iptu.withColumn("construction_type", construction_type)

# 5. Exporting preprocessed dataset

In [35]:
# let's save the resulting dataframe to a processed stage for further handling:
PROCESSED_IPTU_DIR = "../data/processed/sp_iptu"

# using the helper function to save the file:
save_to_filesystem(df_iptu, PROCESSED_IPTU_DIR, "tb_iptu", "tb_iptu.parquet")

True

# 6. Level of Interest Features
Now that we preprocessed the dataset, we can start generating features at the specified levels of interest. As a reminder, the levels of interest we are working on in this project are the following:

1. `sector`: the lowest unit of measurement for the Brazilian Census, which is one of the most important geospatially referenced datasets we will be working with;
2. `zipcode`: zip codes in the city of São Paulo can be roughly approximated to an entire street (also called a logradouro);
3. `area_of_ponderation`: areas of ponderation are contiguous groups of census sectors;
4. `neighborhoods`: areas that are often (but not directly) related to the neighborhoods of the city;
5. `districts`: districts are administrative regions defined by law (and thus, won't change much over time), used to allocate resources by the City Hall;

When comparing the **IPTU** dataset to the census-related ones, we have a significant twist. Instead of `sectors` being the lowest level of interest, we have `zipcodes`. This gives us a situation that is similar to when we processed census data at the zipcode level, as the zipcodes are represented as `linestrings` instead of `points` or `polygons`. 

In [37]:
# reading the preprocessed dataset:
df_iptu = spark.read.parquet("../data/processed/sp_iptu/tb_iptu.parquet")

## 6.1 Zipcode Features

In [13]:
# reading the street (logradouros) shapefiles, whose records carry the zipcode
# columns and geometries queried in the next cell. The reader takes the
# SparkContext, so it is taken from the session instead of passing the
# SparkSession itself:
zipcode_rdd = ShapefileReader.readToGeometryRDD(
    sc=spark.sparkContext, inputPath=RAW_DATA_DIR + "sp_layers/logradouros/*"
)

# exposing the spatial RDD as a dataframe and as the tb_zipcode temp view:
df_zipcode = Adapter.toDf(zipcode_rdd, spark)
df_zipcode.createOrReplaceTempView("tb_zipcode")

In [14]:
# converting the coordinate system in the zipcode file.
# Every row of tb_zipcode carries two zipcode columns, CEP_E and CEP_D. The
# query selects the rows once per column (zip_left / zip_right) and unions the
# two results, so either zipcode can be used as the key. Geometries are
# transformed from EPSG:29193 to EPSG:4326 and then have their coordinates
# flipped.
Q_ZIPCODE_CONVERSION = """
WITH zip_left as (
  SELECT 
    DISTINCT A.CEP_E as zipcode,
    ST_FlipCoordinates(ST_Transform(A.geometry, 'epsg:29193','epsg:4326')) as geometry,
    A.NAME as street_name,
    A.LENGTH as street_length
    FROM tb_zipcode as A  
),

zip_right as (
    SELECT
        DISTINCT A.CEP_D as zipcode,
        ST_FlipCoordinates(ST_Transform(A.geometry, 'epsg:29193','epsg:4326')) as geometry,
        A.NAME as street_name,
        A.LENGTH as street_length
        FROM tb_zipcode as A      
)

SELECT * FROM zip_left 
UNION 
SELECT * FROM zip_right
"""

df_zipcode = spark.sql(Q_ZIPCODE_CONVERSION)

# keeping a single row per zipcode.
# NOTE(review): nothing here controls which of the duplicated rows (geometry,
# street name, length) is kept for a zipcode -- confirm this is acceptable.
df_zipcode = df_zipcode.drop_duplicates(subset=["zipcode"])
# the view name is reused: tb_zipcode now refers to the converted table
df_zipcode.createOrReplaceTempView("tb_zipcode")

### 6.1.1 Aggregating Zipcode level features

In [8]:
# numeric columns summarised per zipcode; this order fixes the order of the
# resulting feature columns:
numeric_columns = [
    "construction_year",
    "owner_start_contribution_year",
    "area_built",
    "area_lot",
    "area_occupied",
    "obsolescence_factor",
    "fraction_factor",
    "number_fronts_corners",
    "number_floors",
    "front_factor",
    "construction_square_meter_value",
    "lot_square_meter_value",
    "property_age",
    "years_of_ownership",
]

# (alias prefix, aggregate builder) applied, in this order, to every column:
summary_statistics = [
    ("average", F.avg),
    ("median", lambda values: F.percentile_approx(values, 0.5)),
    ("max", F.max),
    ("min", F.min),
    ("std", F.stddev),
]

# count-based aggregations come first:
zipcode_aggregations = [
    F.countDistinct(F.col("owner_id")).alias("number_unique_owners"),
    F.countDistinct(F.col("property_id")).alias("number_property"),
]

# statistics about the variables:
for column_name in numeric_columns:
    for prefix, build_aggregate in summary_statistics:
        zipcode_aggregations.append(
            build_aggregate(F.col(column_name)).alias(f"{prefix}_{column_name}")
        )

df_zip_iptu_num_features = df_iptu.groupby("zipcode").agg(*zipcode_aggregations)

# fixing the column names: every column receives the "zipcode_" prefix...
aggregated_columns = df_zip_iptu_num_features.columns
new_columns = [f"zipcode_{name}" for name in aggregated_columns]

for old_name, new_name in zip(aggregated_columns, new_columns):
    df_zip_iptu_num_features = df_zip_iptu_num_features.withColumnRenamed(
        old_name, new_name
    )

# ...and the grouping key gets its plain name back:
df_zip_iptu_num_features = df_zip_iptu_num_features.withColumnRenamed(
    "zipcode_zipcode", "zipcode"
)

### 6.1.2 Exporting Zipcode level features

In [9]:
# verifying the results:
df_zip_iptu_num_features

zipcode_zipcode,zipcode_number_unique_owners,zipcode_number_property,zipcode_average_construction_year,zipcode_median_construction_year,zipcode_max_construction_year,zipcode_min_construction_year,zipcode_std_construction_year,zipcode_average_owner_start_contribution_year,zipcode_median_owner_start_contribution_year,zipcode_max_owner_start_contribution_year,zipcode_min_owner_start_contribution_year,zipcode_std_owner_start_contribution_year,zipcode_average_area_built,zipcode_median_area_built,zipcode_max_area_built,zipcode_min_area_built,zipcode_std_area_built,zipcode_average_area_lot,zipcode_median_area_lot,zipcode_max_area_lot,zipcode_min_area_lot,zipcode_std_area_lot,zipcode_average_area_occupied,zipcode_median_area_occupied,zipcode_max_area_occupied,zipcode_min_area_occupied,zipcode_std_area_occupied,zipcode_average_obsolescence_factor,zipcode_median_obsolescence_factor,zipcode_max_obsolescence_factor,zipcode_min_obsolescence_factor,zipcode_std_obsolescence_factor,zipcode_average_fraction_factor,zipcode_median_fraction_factor,zipcode_max_fraction_factor,zipcode_min_fraction_factor,zipcode_std_fraction_factor,zipcode_average_number_fronts_corners,zipcode_median_number_fronts_corners,zipcode_max_number_fronts_corners,zipcode_min_number_fronts_corners,zipcode_std_number_fronts_corners,zipcode_average_number_floors,zipcode_median_number_floors,zipcode_max_number_floors,zipcode_min_number_floors,zipcode_std_number_floors,zipcode_average_front_factor,zipcode_median_front_factor,zipcode_max_front_factor,zipcode_min_front_factor,zipcode_std_front_factor,zipcode_average_construction_square_meter_value,zipcode_median_construction_square_meter_value,zipcode_max_construction_square_meter_value,zipcode_min_construction_square_meter_value,zipcode_std_construction_square_meter_value,zipcode_average_lot_square_meter_value,zipcode_median_lot_square_meter_value,zipcode_max_lot_square_meter_value,zipcode_min_lot_square_meter_value,zipcode_std_lot_square_meter_value,zipcode_average_p
roperty_age,zipcode_median_property_age,zipcode_max_property_age,zipcode_min_property_age,zipcode_std_property_age,zipcode_average_years_of_ownership,zipcode_median_years_of_ownership,zipcode_max_years_of_ownership,zipcode_min_years_of_ownership,zipcode_std_years_of_ownership
1233001,855,55,1984.515789473684,1993,2014,1929,15.059893572953651,1985.898245614035,1994,2006,1963,14.686343900124342,169.08070175438596,96,6610,23,264.5316116710392,799.072514619883,692,2576,60,472.6525570652757,545.4152046783626,491,2301,50,403.3151637543701,0.6475672505229537,0.78,0.98,0.2,0.2136433699863181,0.0502345030059852,0.0257,1.0,0.008,0.1446293543222227,0.1602339181286549,0,1,0,0.3670375632723403,14.04327485380117,15,33,1,5.714868117464387,20.776268947891325,17.5,54.96,0.0,10.455761980217712,2193.856140350877,2007.0,3377.0,1079.0,369.9941503830005,5261.8421052631575,4670.0,6527.0,4621.0,644.2697150048115,32.48421052631579,24,88,3,15.059893572953785,31.101754385964917,23,54,11,14.686343900124305
1415004,574,18,1978.947735191637,1973,2000,1944,11.83720523060521,1978.5627177700349,1974,2011,1963,13.718005273784508,115.00522648083624,75,987,23,94.02976566638335,927.7369337979094,1012,1220,59,273.56736865983964,504.3257839721254,400,1012,52,270.75720877828024,0.5703832812685169,0.49,0.85,0.2,0.1618162461450923,0.031497386807433,0.0095,1.0,0.0033,0.1018506469857827,0.0278745644599303,0,1,0,0.164756984130015,14.775261324041812,16,18,1,3.071006313005965,19.518362473112365,20.0,26.5,4.5,3.934480599115575,2152.677700348432,2007.0,3377.0,1311.0,211.1275694134468,7425.254355400697,7353.0,7781.0,7334.0,171.6232168404358,38.05226480836237,44,73,17,11.837205230605086,38.43728222996516,43,54,6,13.718005273784737
1543070,7,7,1970.571428571429,1970,1976,1962,5.59336341441485,1963.0,1963,1963,1963,0.0,95.85714285714286,80,145,80,27.63107291576262,102.71428571428572,85,144,85,24.702997697407,48.42857142857143,40,75,40,14.740614447359862,0.44857143504279,0.44,0.54,0.3,0.0949436024761706,1.0,1.0,1.0,1.0,0.0,0.0,0,0,0,0.0,2.0,2,2,2,0.0,6.421428544180734,5.0,13.0,4.95,2.9960887357684407,1647.0,1647.0,1647.0,1647.0,0.0,2109.0,2109.0,2109.0,2109.0,0.0,46.42857142857143,47,55,41,5.593363414414825,54.0,54,54,54,0.0
1549010,235,175,1971.36170212766,1970,2014,1929,14.80734619585442,1970.1659574468083,1963,2008,1963,11.08986748966618,162.37021276595743,130,612,42,89.38729693037284,562.1063829787234,250,1700,42,627.2962683552458,171.2808510638298,121,536,33,119.1187273813308,0.4536595721194084,0.44,0.97,0.2,0.2025568667157485,0.7532370214449599,1.0,1.0,0.0192,0.4227829881528008,0.0723404255319148,0,2,0,0.2906683483687343,4.574468085106383,2,14,1,5.08115624187241,7.933744685193326,8.0,24.75,0.0,3.0700210848484493,1701.4127659574467,1647.0,2564.0,1067.0,291.92323485808566,1429.1404255319148,1438.0,1603.0,1310.0,89.07560329507895,45.63829787234042,47,88,3,14.807346195854418,46.83404255319149,54,54,9,11.089867489666068
2161020,201,193,1979.766169154229,1980,2014,1960,8.419622898398968,1974.885572139304,1974,2008,1963,11.360098626157244,178.37313432835822,126,5231,40,367.2879184980456,190.20398009950247,138,2293,60,166.3531580225623,120.49253731343283,100,2106,34,148.357781036351,0.4832835814253015,0.51,0.97,0.2,0.1929434228862956,1.0,1.0,1.0,1.0,0.0,0.1243781094527363,0,2,0,0.3734337107418235,1.63681592039801,2,3,1,0.5851818614287985,7.716517421143565,5.75,26.5,0.0,3.8439094401155782,1117.0945273631842,1241.0,1635.0,788.0,162.0595138210051,564.8009950248756,506.0,693.0,435.0,91.9726056986806,37.233830845771145,37,57,3,8.419622898398941,42.11442786069652,43,54,9,11.360098626157257


In [None]:
# caching the zip code level aggregation (features only, no geometries) so
# that the following sections can reload it instead of redoing the groupby:
PROCESSED_IPTU_DIR = "../data/processed/sp_iptu"

# The cache is written under the "_no_geo" name because that is the file read
# back in section 6.1.3; the plain "tb_zipcode_iptu" name is used there for the
# table that has the geometries joined in, and would otherwise be overwritten.
save_to_filesystem(
    df_zip_iptu_num_features,
    PROCESSED_IPTU_DIR,
    "tb_zipcode_iptu_no_geo",
    "tb_zipcode_iptu_no_geo.parquet",
)

### 6.1.3 Joining features to Geometries

In [11]:
# reloading the cached zip code level features (the copy stored without
# geometries):
df_zip_agg = spark.read.parquet(
    f"{PROCESSED_IPTU_DIR}/tb_zipcode_iptu_no_geo.parquet"
)

In [12]:
# the key column is stored as "zipcode_zipcode"; it is renamed to "zipcode" so
# that it can be used as the join key against the geometries:
df_zip_agg = df_zip_agg.withColumnRenamed(existing="zipcode_zipcode", new="zipcode")

In [130]:
# left join that keeps every row of the zip code geometries and attaches the
# aggregated features wherever the zip code matches:
df_zip_full = df_zipcode.join(df_zip_agg, on="zipcode", how="left")

In [132]:
# persisting the zip code table that now carries both features and geometries:
PROCESSED_IPTU_DIR = "../data/processed/sp_iptu"
ZIP_FULL_TABLE = "tb_zipcode_iptu"

# delegating the write to the shared helper:
save_to_filesystem(
    df_zip_full, PROCESSED_IPTU_DIR, ZIP_FULL_TABLE, f"{ZIP_FULL_TABLE}.parquet"
)

True

## 6.2 Neighborhood Features

In [10]:
# base folders for the processed IPTU outputs and for the raw inputs; note that
# both are written with a trailing slash here:
PROCESSED_IPTU_DIR = "../data/processed/sp_iptu/"
RAW_DATA_DIR = "../data/raw/"

In [134]:
# loading every file of the neighborhoods shapefile layer as a spatial RDD:
nb_rdd = ShapefileReader.readToGeometryRDD(
    sc=spark,
    inputPath=f"{RAW_DATA_DIR}sp_layers/neighborhoods/*",
)

# converting the RDD to a dataframe and exposing it to Spark SQL:
df_nb = Adapter.toDf(nb_rdd, spark)
df_nb.createOrReplaceTempView("tb_neighborhood")

# displaying the neighborhoods geometry dataset as a sanity check:
df_nb

geometry,Name,descriptio
MULTIPOLYGON (((-...,Alto da Riviera,ALTO DA RIVIERA
POLYGON ((-46.589...,Alto da Mooca,ALTO DA MOOCA
POLYGON ((-46.719...,Alto da Lapa,ALTO DA LAPA
POLYGON ((-46.629...,Vila Agua Funda,VILA AGUA FUNDA
POLYGON ((-46.622...,Agua Fria,AGUA FRIA


In [23]:
# exposing the neighborhoods, the zip code geometries and the zip code features
# to Spark SQL under the names used by the queries below:
for frame, view_name in (
    (df_nb, "tb_neighborhood"),
    (df_zipcode, "tb_zipcode"),
    (df_zip_agg, "tb_zip_features"),
):
    frame.createOrReplaceTempView(view_name)

In [137]:
# Spatial join of the zip codes onto the neighborhoods: every (neighborhood,
# zip code) pair whose geometries intersect yields one row, so a zip code that
# intersects several neighborhoods appears once for each of them.
Q_NEIGHBORHOOD_ZIPCODE_MATCH = """
SELECT 
    B.Name as neighborhood_name,
    A.zipcode
FROM tb_zipcode as A, tb_neighborhood as B
WHERE ST_Intersects(B.geometry, A.geometry)
"""

# running the match and exposing it to Spark SQL as "tb_nb_match":
df_nb_match = spark.sql(Q_NEIGHBORHOOD_ZIPCODE_MATCH)
df_nb_match.createOrReplaceTempView("tb_nb_match")

In [138]:
# attaching the zip code level features to every (neighborhood, zip code)
# match; the LEFT JOIN keeps matched zip codes that have no feature row:
Q_NB_RAW_FEATURES = """
SELECT
    A.neighborhood_name,
    B.*
FROM tb_nb_match as A
LEFT JOIN tb_zip_features as B
ON A.zipcode = B.zipcode
"""

# building the dataframe with one row per matched zip code:
df_nb_raw_features = spark.sql(Q_NB_RAW_FEATURES)

In [140]:
# dropping the "zipcode_" marker from every column name in one pass; the
# "zipcode" key itself is left as it is:
df_nb_raw_features = df_nb_raw_features.toDF(
    *[
        column if column == "zipcode" else column.replace("zipcode_", "")
        for column in df_nb_raw_features.columns
    ]
)

### 6.2.1 Aggregating Neighborhood Features

In [142]:
# zip code level counts that are added up per neighborhood, mapped to the name
# they receive in the output:
nb_summed_columns = {
    "number_unique_owners": "total_unique_owners",
    "number_property": "total_number_properties",
}

# zip code level averages that are averaged again per neighborhood and keep
# their name:
nb_averaged_columns = [
    "average_area_built",
    "average_area_lot",
    "average_area_occupied",
    "average_construction_year",
    "average_construction_square_meter_value",
    "average_fraction_factor",
    "average_front_factor",
    "average_lot_square_meter_value",
    "average_number_floors",
    "average_number_fronts_corners",
    "average_obsolescence_factor",
    "average_owner_start_contribution_year",
    "average_property_age",
    "average_years_of_ownership",
]

# NOTE(review): F.avg gives every matched row the same weight, so the results
# are unweighted means of the zip code means - confirm that this is intended.
nb_aggregations = [
    F.sum(F.col(source)).alias(target)
    for source, target in nb_summed_columns.items()
] + [F.avg(F.col(name)).alias(name) for name in nb_averaged_columns]

df_iptu_neighborhood = df_nb_raw_features.groupby("neighborhood_name").agg(
    *nb_aggregations
)

# prefixing every column with the aggregation level in a single pass; the
# grouping key then receives its final, shorter name:
new_columns = [f"neighborhood_{name}" for name in df_iptu_neighborhood.columns]
df_iptu_neighborhood = df_iptu_neighborhood.toDF(*new_columns).withColumnRenamed(
    "neighborhood_neighborhood_name", "neighborhood"
)

### 6.2.2 Joining features to Geometries

In [146]:
# exposing the aggregated neighborhood features to Spark SQL:
df_iptu_neighborhood.createOrReplaceTempView("tb_neighborhood_features")

# attaching each neighborhood's geometry and the centroid of that geometry to
# the aggregated IPTU features, matching on the neighborhood name:
Q_NB_GEOM = """
SELECT
    A.*,
    B.geometry,
    ST_Centroid(B.geometry) as neighborhood_centroid
FROM tb_neighborhood_features as A
LEFT JOIN tb_neighborhood as B 
ON A.neighborhood = B.Name
"""

df_nb_final = spark.sql(Q_NB_GEOM)

### 6.2.3 Exporting Neighborhood level features

In [148]:
# exporting the neighborhood features together with their geometry and
# centroid columns:
PROCESSED_IPTU_FEATURES = "../data/processed/sp_iptu/"
NB_FEATURES = "tb_neighborhood_iptu"

save_to_filesystem(
    df_nb_final,
    PROCESSED_IPTU_FEATURES,
    NB_FEATURES,
    f"{NB_FEATURES}.parquet",
)

True

In [149]:
# exporting a second copy of the neighborhood features with the geometry and
# centroid columns removed:
NB_NO_GEO = "tb_neighborhood_iptu_no_geo"

nb_geometry_columns = ["geometry", "neighborhood_centroid"]
df_nb_no_geo = df_nb_final.drop(*nb_geometry_columns)

save_to_filesystem(
    df_nb_no_geo,
    PROCESSED_IPTU_FEATURES,
    NB_NO_GEO,
    f"{NB_NO_GEO}.parquet",
)

True

## 6.3 District Features

In [166]:
# loading every file of the districts shapefile layer as a spatial RDD:
district_rdd = ShapefileReader.readToGeometryRDD(
    sc=spark,
    inputPath=f"{RAW_DATA_DIR}sp_layers/districts/*",
)

# converting to a dataframe and discarding the columns that are not needed:
unused_district_columns = [
    "CLASSID",
    "FEATID",
    "REVISIONNU",
    "DATA_CRIAC",
    "USUARIO_ID",
]
df_district = Adapter.toDf(district_rdd, spark).drop(*unused_district_columns)

# the raw layer is registered first so that Spark SQL can reproject it:
df_district.createOrReplaceTempView("tb_district")

# transforming the geometries from EPSG:29193 to EPSG:4326, flipping their
# coordinate order and renaming the attributes that are kept:
Q_DISTRICT_CONVERSION = """
SELECT 
    ST_FlipCoordinates(ST_Transform(A.geometry, 'epsg:29193','epsg:4326')) as geometry,
    A.NOME_DIST as district_name,
    A.SIGLA_DIST as district_abbreviation,
    A.COD_DIST as district_code,
    A.COD_SUB as subdistrict_code
FROM tb_district as A
"""

# the converted dataframe takes over the "tb_district" view name:
df_district = spark.sql(Q_DISTRICT_CONVERSION)
df_district.createOrReplaceTempView("tb_district")

# verifying the results:
df_district

geometry,district_name,district_abbreviation,district_code,subdistrict_code
POLYGON ((-46.446...,JOSE BONIFACIO,JBO,47,27
POLYGON ((-46.758...,JD SAO LUIS,JDS,46,18
POLYGON ((-46.475...,ARTUR ALVIM,AAL,5,21
POLYGON ((-46.756...,JAGUARA,JAG,40,8
POLYGON ((-46.581...,VILA PRUDENTE,VPR,93,29


In [167]:
# Spatial join of the zip codes onto the districts: every (district, zip code)
# pair whose geometries intersect yields one row, so a zip code that intersects
# several districts appears once for each of them.
Q_DISTRICT_MATCH = """
SELECT 
    B.district_name,
    A.zipcode
FROM tb_zipcode as A, tb_district as B
WHERE ST_Intersects(B.geometry, A.geometry)
"""

# running the match and exposing it to Spark SQL as "tb_district_match":
df_district_match = spark.sql(Q_DISTRICT_MATCH)
df_district_match.createOrReplaceTempView("tb_district_match")

In [168]:
# attaching the zip code level features to every (district, zip code) match;
# the LEFT JOIN keeps matched zip codes that have no feature row:
Q_DISTRICT_RAW_FEATURES = """
SELECT
    A.district_name,
    B.*
FROM tb_district_match as A
LEFT JOIN tb_zip_features as B
ON A.zipcode = B.zipcode
"""

# building the dataframe with one row per matched zip code:
df_district_raw_features = spark.sql(Q_DISTRICT_RAW_FEATURES)

In [169]:
# dropping the "zipcode_" marker from every column name in one pass; the
# "zipcode" key itself is left as it is:
df_district_raw_features = df_district_raw_features.toDF(
    *[
        column if column == "zipcode" else column.replace("zipcode_", "")
        for column in df_district_raw_features.columns
    ]
)

### 6.3.1 Aggregating District Features

In [170]:
# zip code level counts that are added up per district, mapped to the name
# they receive in the output:
district_summed_columns = {
    "number_unique_owners": "total_unique_owners",
    "number_property": "total_number_properties",
}

# zip code level averages that are averaged again per district and keep their
# name:
district_averaged_columns = [
    "average_area_built",
    "average_area_lot",
    "average_area_occupied",
    "average_construction_year",
    "average_construction_square_meter_value",
    "average_fraction_factor",
    "average_front_factor",
    "average_lot_square_meter_value",
    "average_number_floors",
    "average_number_fronts_corners",
    "average_obsolescence_factor",
    "average_owner_start_contribution_year",
    "average_property_age",
    "average_years_of_ownership",
]

# NOTE(review): F.avg gives every matched row the same weight, so the results
# are unweighted means of the zip code means - confirm that this is intended.
district_aggregations = [
    F.sum(F.col(source)).alias(target)
    for source, target in district_summed_columns.items()
] + [F.avg(F.col(name)).alias(name) for name in district_averaged_columns]

df_iptu_district = df_district_raw_features.groupby("district_name").agg(
    *district_aggregations
)

# prefixing every column with the aggregation level in a single pass; the
# grouping key then receives its final, shorter name:
new_columns = [f"district_{name}" for name in df_iptu_district.columns]
df_iptu_district = df_iptu_district.toDF(*new_columns).withColumnRenamed(
    "district_district_name", "district"
)

### 6.3.2 Joining features to Geometries

In [171]:
# exposing the aggregated district features to Spark SQL:
df_iptu_district.createOrReplaceTempView("tb_district_features")

# attaching each district's geometry and the centroid of that geometry to the
# aggregated IPTU features, matching on the district name:
Q_DISTRICT_GEOM = """
SELECT
    A.*,
    B.geometry,
    ST_Centroid(B.geometry) as district_centroid
FROM tb_district_features as A
LEFT JOIN tb_district as B 
ON A.district = B.district_name
"""

df_district_final = spark.sql(Q_DISTRICT_GEOM)

### 6.3.3 Exporting District level features

In [172]:
# exporting the district features together with their geometry and centroid
# columns:
PROCESSED_IPTU_FEATURES = "../data/processed/sp_iptu/"
DIST_FEATURES = "tb_district_iptu"

save_to_filesystem(
    df_district_final, PROCESSED_IPTU_FEATURES, DIST_FEATURES, f"{DIST_FEATURES}.parquet"
)

True

In [173]:
# exporting a second copy of the district features with the geometry and
# centroid columns removed:
DIST_NO_GEO = "tb_district_iptu_no_geo"

district_geometry_columns = ["geometry", "district_centroid"]
df_district_no_geo = df_district_final.drop(*district_geometry_columns)

save_to_filesystem(
    df_district_no_geo,
    PROCESSED_IPTU_FEATURES,
    DIST_NO_GEO,
    f"{DIST_NO_GEO}.parquet",
)

True

## 6.4 Area of Ponderation Features

In [15]:
# loading every file of the census areas of ponderation layer as a spatial RDD:
ap_rdd = ShapefileReader.readToGeometryRDD(
    sc=spark,
    inputPath=f"{RAW_DATA_DIR}sp_layers/census_ponderations/*",
)

# the raw layer is registered first so that Spark SQL can reproject it:
df_ap = Adapter.toDf(ap_rdd, spark)
df_ap.createOrReplaceTempView("tb_ponderation")

# transforming the geometries from EPSG:29193 to EPSG:4326, flipping their
# coordinate order and renaming the attributes that are kept:
Q_AP_CONVERSION = """
SELECT 
    ST_FlipCoordinates(ST_Transform(A.geometry, 'epsg:29193','epsg:4326')) as geometry,
    A.AREA as ponderation_area,
    A.COD_AED as ponderation_area_code
FROM tb_ponderation as A
"""

# the converted dataframe takes over the "tb_ponderation" view name:
df_ap = spark.sql(Q_AP_CONVERSION)
df_ap.createOrReplaceTempView("tb_ponderation")

In [16]:
# verifying the results: displays the converted areas of ponderation dataframe
# (geometry, ponderation_area, ponderation_area_code):
df_ap

geometry,ponderation_area,ponderation_area_code
POLYGON ((-46.782...,10.450821,3550308005243
POLYGON ((-46.755...,13.241315,3550308005242
POLYGON ((-46.708...,4.180473,3550308005309
POLYGON ((-46.700...,11.936847,3550308005189
POLYGON ((-46.677...,7.321331,3550308005127


In [17]:
# Spatial join of the zip codes onto the areas of ponderation: every (area, zip
# code) pair whose geometries intersect yields one row, so a zip code that
# intersects several areas appears once for each of them.
Q_PONDERATION_MATCH = """
SELECT 
    B.ponderation_area_code,
    A.zipcode
FROM tb_zipcode as A, tb_ponderation as B
WHERE ST_Intersects(B.geometry, A.geometry)
"""

# running the match and exposing it to Spark SQL as "tb_ponderation_match":
df_ap_match = spark.sql(Q_PONDERATION_MATCH)
df_ap_match.createOrReplaceTempView("tb_ponderation_match")

In [24]:
# attaching the zip code level features to every (area of ponderation, zip
# code) match; the LEFT JOIN keeps matched zip codes that have no feature row:
Q_PONDERATION_RAW_FEATURES = """
SELECT
    A.ponderation_area_code,
    B.*
FROM tb_ponderation_match as A
LEFT JOIN tb_zip_features as B
ON A.zipcode = B.zipcode
"""

# building the dataframe with one row per matched zip code:
df_ap_raw_features = spark.sql(Q_PONDERATION_RAW_FEATURES)

In [26]:
# dropping the "zipcode_" marker from every column name in one pass; the
# "zipcode" key itself is left as it is:
df_ap_raw_features = df_ap_raw_features.toDF(
    *[
        column if column == "zipcode" else column.replace("zipcode_", "")
        for column in df_ap_raw_features.columns
    ]
)

### 6.4.1 Aggregating Area of Ponderation Features

In [28]:
# zip code level counts that are added up per area of ponderation, mapped to
# the name they receive in the output:
ap_summed_columns = {
    "number_unique_owners": "total_unique_owners",
    "number_property": "total_number_properties",
}

# zip code level averages that are averaged again per area of ponderation and
# keep their name:
ap_averaged_columns = [
    "average_area_built",
    "average_area_lot",
    "average_area_occupied",
    "average_construction_year",
    "average_construction_square_meter_value",
    "average_fraction_factor",
    "average_front_factor",
    "average_lot_square_meter_value",
    "average_number_floors",
    "average_number_fronts_corners",
    "average_obsolescence_factor",
    "average_owner_start_contribution_year",
    "average_property_age",
    "average_years_of_ownership",
]

# NOTE(review): F.avg gives every matched row the same weight, so the results
# are unweighted means of the zip code means - confirm that this is intended.
ap_aggregations = [
    F.sum(F.col(source)).alias(target)
    for source, target in ap_summed_columns.items()
] + [F.avg(F.col(name)).alias(name) for name in ap_averaged_columns]

df_iptu_ponderation = df_ap_raw_features.groupby("ponderation_area_code").agg(
    *ap_aggregations
)

# prefixing every column with the aggregation level in a single pass; the
# grouping key then gets its original name back:
new_columns = [f"ponderation_area_{name}" for name in df_iptu_ponderation.columns]
df_iptu_ponderation = df_iptu_ponderation.toDF(*new_columns).withColumnRenamed(
    "ponderation_area_ponderation_area_code", "ponderation_area_code"
)

### 6.4.2 Joining features to Geometries

In [31]:
# exposing the aggregated area of ponderation features to Spark SQL:
df_iptu_ponderation.createOrReplaceTempView("tb_ponderation_features")

# attaching each area's geometry and the centroid of that geometry to the
# aggregated IPTU features, matching on the area of ponderation code:
Q_PONDERATION_GEOM = """
SELECT
    A.*,
    B.geometry,
    ST_Centroid(B.geometry) as ponderation_area_centroid
FROM tb_ponderation_features as A
LEFT JOIN tb_ponderation as B 
ON A.ponderation_area_code = B.ponderation_area_code
"""

df_ponderation_final = spark.sql(Q_PONDERATION_GEOM)

### 6.4.3 Exporting Ponderation level features

In [32]:
# exporting the area of ponderation features together with their geometry and
# centroid columns:
PROCESSED_IPTU_FEATURES = "../data/processed/sp_iptu/"
PONDERATION_FEATURES = "tb_ponderation_iptu"

ponderation_file_name = f"{PONDERATION_FEATURES}.parquet"
save_to_filesystem(
    df_ponderation_final,
    PROCESSED_IPTU_FEATURES,
    PONDERATION_FEATURES,
    ponderation_file_name,
)

True

In [33]:
# exporting a second copy of the area of ponderation features with the geometry
# and centroid columns removed:
PONDERATION_NO_GEO = "tb_ponderation_iptu_no_geo"

ponderation_geometry_columns = ["geometry", "ponderation_area_centroid"]
df_ponderation_no_geo = df_ponderation_final.drop(*ponderation_geometry_columns)

save_to_filesystem(
    df_ponderation_no_geo,
    PROCESSED_IPTU_FEATURES,
    PONDERATION_NO_GEO,
    f"{PONDERATION_NO_GEO}.parquet",
)

True