# Data Prep - Census Data
This notebook is the second part to the data preparation phase for this project. I will perform a series of manipulations on sociodemographic data from the 2010's Brazilian Census, the latest census conducted in the country. This is a complex process with many stages and will involve both careful manipulations of the raw data as well as handling of geospatial data.

In order to perform these manipulations, we will use both `pyspark` and an extension called `Apache Sedona` (formerly known as `geospark`).

In [112]:
# installing the requirements:
!pip install -r ../configs/dependencies/dataprep_requirements.txt >> ../configs/dependencies/package_installation.txt

In [113]:
# loading the magic commands:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [114]:
###### Loading the necessary libraries #########

# PySpark dependencies:s
import pyspark
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
import pyspark.sql.types as T
from pyspark.sql.window import Window

# Sedona dependencies:
from sedona.utils.adapter import Adapter
from sedona.register import SedonaRegistrator
from sedona.utils import KryoSerializer, SedonaKryoRegistrator
from sedona.core.SpatialRDD import SpatialRDD
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.core.formatMapper import GeoJsonReader

# database utilities:
from sqlalchemy import create_engine
import sqlite3 as db
import pandas as pd
from tqdm import tqdm
import geopandas as gpd
import fiona

# plotting and data visualization:
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, Image

# other relevant libraries:
import warnings
import unidecode
import inflection
import unicodedata
from datetime import datetime, timedelta
from functools import partial
import json
import re
import os
from glob import glob
import shutil
import itertools
import chardet

# importing the atlas utilities:
from atlasutils import (
    save_to_filesystem,
    save_as_table,
    rotate_xticks,
    get_file_encoding,
    normalize_entities,
    normalize_column_name,
    apply_category_map,
    standardize_variable_names,
    get_null_columns,
    replace_decimal_separator,
    convert_to_geopandas,
    drop_invalid_census_columns,
    clean_census_column_name,
    get_file_crs,
    get_column_values,
)


# setting global parameters for visualizationsss:
warnings.filterwarnings("ignore")
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

# 0. Configuring Spark

In [115]:
# function to encapsulate standard spark configurations:
def init_spark(app_name):

    spark = (
        SparkSession.builder.appName(app_name)
        .config("spark.files.overwrite", "true")
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config(
            "spark.jars.packages",
            "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,"
            "org.datasyslab:geotools-wrapper:geotools-24.1",
        )
        .config("spark.sql.repl.eagerEval.enabled", True)
        .config("spark.sql.repl.eagerEval.maxNumRows", 5)
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .config("sedona.global.charset", "utf8")
        .config("sedona.global.index", "true")
        .enableHiveSupport()
        .getOrCreate()
    )

    SedonaRegistrator.registerAll(spark)

    return spark

In [116]:
# init the spark session:
spark = init_spark("SP Atlas - IBGE Census")

In [117]:
# verifying the session status:
spark

# 1. Loading and Inspecting the Data
We will first load the Polygon geospatial data that we will later use to match to the `(lat, long)` paths in the listings datasets.

## 1.1 Sector Polygons
The lowest level of data aggregation in the Census is a Sector. A sector is a specific area in which researchers collect census data. These sectors are contiguous areas and IBGE provides the Shapefiles for them. This will allow us to match the Listings data to the census sector where they can be located.

In [35]:
# loading the raw dataset:
RAW_DATA_DIR = "../data/raw/"

# shapefiles for sectors:
sector_rdd = ShapefileReader.readToGeometryRDD(
    sc=spark, inputPath=RAW_DATA_DIR + "sp_layers/census_sectors/*"
)

df_sector = Adapter.toDf(sector_rdd, spark)

# verifying the sectors:
df_sector

geometry,ID,CD_GEOCODI,TIPO,CD_GEOCODS,NM_SUBDIST,CD_GEOCODD,NM_DISTRIT,CD_GEOCODM,NM_MUNICIP,NM_MICRO,NM_MESO,CD_GEOCODB,NM_BAIRRO,ID1
POLYGON ((-46.410...,98237,354100005000009,URBANO,35410000500,,354100005,PRAIA GRANDE,3541000,PRAIA GRANDE,SANTOS,METROPOLITANA DE ...,354100005001,Boqueirão,1
POLYGON ((-46.416...,98232,354100005000004,URBANO,35410000500,,354100005,PRAIA GRANDE,3541000,PRAIA GRANDE,SANTOS,METROPOLITANA DE ...,354100005001,Boqueirão,2
POLYGON ((-46.412...,98230,354100005000002,URBANO,35410000500,,354100005,PRAIA GRANDE,3541000,PRAIA GRANDE,SANTOS,METROPOLITANA DE ...,354100005001,Boqueirão,3
POLYGON ((-46.411...,98229,354100005000001,URBANO,35410000500,,354100005,PRAIA GRANDE,3541000,PRAIA GRANDE,SANTOS,METROPOLITANA DE ...,354100005001,Boqueirão,4
POLYGON ((-46.413...,98231,354100005000003,URBANO,35410000500,,354100005,PRAIA GRANDE,3541000,PRAIA GRANDE,SANTOS,METROPOLITANA DE ...,354100005001,Boqueirão,5


In [36]:
# selecting the relevant columns:
df_sector = df_sector.select(
    F.col("ID").alias("id"),
    F.col("geometry"),
    F.col("CD_GEOCODI").alias("sector_code"),
    F.col("CD_GEOCODM").alias("city_code"),
    F.col("CD_GEOCODB").alias("neighborhood_code"),
    F.col("NM_MUNICIP").alias("city"),
    F.col("NM_BAIRRO").alias("neighborhood"),
    F.col("TIPO").alias("sector_type"),
)

# filtering just São Paulo:
df_sector = df_sector.filter(F.col("city") == "SÃO PAULO")

## 1.2 Area of Ponderation

In [37]:
# reading the area of ponderation files:
ap_rdd = ShapefileReader.readToGeometryRDD(
    sc=spark, inputPath=RAW_DATA_DIR + "sp_layers/census_ponderations/*"
)

df_ap = Adapter.toDf(ap_rdd, spark)

# adding the geometry dataframes to the SQL Context:
df_ap.createOrReplaceTempView("tb_ponderation")

# verifying the sectors:
df_ap

geometry,ID,AREA,COD_AED,COD_AED_S
POLYGON ((317854....,158,10.450821,3550308005243,243
POLYGON ((320682....,159,13.241315,3550308005242,242
POLYGON ((325497....,160,4.180473,3550308005309,309
POLYGON ((326306....,117,11.936847,3550308005189,189
POLYGON ((328654....,1,7.321331,3550308005127,127


In [38]:
Q_AP_CONVERSION = """
SELECT 
    ST_FlipCoordinates(ST_Transform(A.geometry, 'epsg:29193','epsg:4326')) as geometry,
    A.AREA as ponderation_area,
    A.COD_AED as ponderation_area_code
FROM tb_ponderation as A
"""

# converting the CRS to the standard format:
df_ap = spark.sql(Q_AP_CONVERSION)

# adding the geometry dataframes to the SQL Context:
df_ap.createOrReplaceTempView("tb_ponderation")

In [39]:
# verifying the results:
df_ap

geometry,ponderation_area,ponderation_area_code
POLYGON ((-46.782...,10.450821,3550308005243
POLYGON ((-46.755...,13.241315,3550308005242
POLYGON ((-46.708...,4.180473,3550308005309
POLYGON ((-46.700...,11.936847,3550308005189
POLYGON ((-46.677...,7.321331,3550308005127


## 1.3 Neighborhoods

In [40]:
# reading the neighborhoods files:
nb_rdd = ShapefileReader.readToGeometryRDD(
    sc=spark, inputPath=RAW_DATA_DIR + "sp_layers/neighborhoods/*"
)

df_nb = Adapter.toDf(nb_rdd, spark)
df_nb.createOrReplaceTempView("tb_neighborhood")

# verifying the sectors:
df_nb

geometry,Name,descriptio
MULTIPOLYGON (((-...,Alto da Riviera,ALTO DA RIVIERA
POLYGON ((-46.589...,Alto da Mooca,ALTO DA MOOCA
POLYGON ((-46.719...,Alto da Lapa,ALTO DA LAPA
POLYGON ((-46.629...,Vila Agua Funda,VILA AGUA FUNDA
POLYGON ((-46.622...,Agua Fria,AGUA FRIA


## 1.4 Districts

In [41]:
# reading the neighborhoods files:
district_rdd = ShapefileReader.readToGeometryRDD(
    sc=spark, inputPath=RAW_DATA_DIR + "sp_layers/districts/*"
)

df_district = Adapter.toDf(district_rdd, spark)

# verifying the sectors:
df_district

geometry,CLASSID,FEATID,REVISIONNU,NOME_DIST,SIGLA_DIST,COD_DIST,COD_SUB,DATA_CRIAC,USUARIO_ID
POLYGON ((352436....,4.0,8583485.0,1.0,JOSE BONIFACIO,JBO,47,27,20070319,0.0
POLYGON ((320696....,4.0,8583484.0,1.0,JD SAO LUIS,JDS,46,18,20070319,0.0
POLYGON ((349461....,4.0,8583445.0,1.0,ARTUR ALVIM,AAL,5,21,20070319,0.0
POLYGON ((320731....,4.0,8583479.0,1.0,JAGUARA,JAG,40,8,20070319,0.0
POLYGON ((338651....,4.0,8583437.0,1.0,VILA PRUDENTE,VPR,93,29,20070319,0.0


In [42]:
# it looks like the file's geometry is not the standard we want (ESPG:4326), let's find out what is it:
get_file_crs(RAW_DATA_DIR + "sp_layers/districts/")

<Projected CRS: EPSG:29193>
Name: SAD69 / UTM zone 23S
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Brazil - between 48°W and 42°W, northern and southern hemispheres, onshore and offshore.
- bounds: (-48.0, -33.5, -42.0, 5.13)
Coordinate Operation:
- name: UTM zone 23S
- method: Transverse Mercator
Datum: South American Datum 1969
- Ellipsoid: GRS 1967 Modified
- Prime Meridian: Greenwich

We will need to convert the dataset to a standard CRS (coordinate reference system), which, in our case, is `espg:4326`.

In [43]:
# dropping the unnecessary columns:
df_district = df_district.drop(
    "CLASSID", "FEATID", "REVISIONNU", "DATA_CRIAC", "USUARIO_ID"
)

df_district.createOrReplaceTempView("tb_district")

# converting the coordinate system in the district file:
Q_DISTRICT_CONVERSION = """
SELECT 
    ST_FlipCoordinates(ST_Transform(A.geometry, 'epsg:29193','epsg:4326')) as geometry,
    A.NOME_DIST as district_name,
    A.SIGLA_DIST as district_abbreviation,
    A.COD_DIST as district_code,
    A.COD_SUB as subdistrict_code
FROM tb_district as A
"""

df_district = spark.sql(Q_DISTRICT_CONVERSION)

## 1.5 Census Codebook
The codebook is a master file that contains the name of the variables in respect to their original locations and meaning in the raw data. We will use these to give more meaningful names to the many census variables.

In [44]:
# loading the codebook from the census data
DATA_DOC_DIR = "../references/documentation/"

df_codebook = spark.read.json(DATA_DOC_DIR + "ibge/codebook_features_selected.json")

In [45]:
# we will normalize the data in the codebook such that we can use it to match the variables in the other datasets:
new_cols = {
    "Descrição da Variável": "variable_description",
    "Nome da Variável": "variable_name",
    "Tabela": "dataset",
}

for original_name, new_name in new_cols.items():
    df_codebook = df_codebook.withColumnRenamed(original_name, new_name)

In [46]:
# codebook yields:
df_codebook.count()  # there are a total of 4107 variables in all the datasets

4107

In [47]:
# verifying the dataset:
df_codebook

dataset_name_pt,is_selected,normalized_variable,simplified_variable_name,variable_description_en,variable_description_pt,variable_name
Básico,1,sector_code,cod_setor,Sector code,Código do setor,Cod_setor
Básico,1,code_large_region...,cod_grandes_regioes,Code of Large Reg...,Código das Grande...,Cod_Grandes Regiões
Básico,1,name_large_region...,nome_grande_regiao,Name of large reg...,Nome das Grandes ...,Nome_Grande_Regiao
Básico,1,federation_unit_code,cod_uf,Federation Unit Code,Código da Unidade...,Cod_UF
Básico,1,name_federation_unit,nome_da_uf,Name of the Feder...,Nome da Unidade d...,Nome_da_UF


In [48]:
# building udfs from the utilities:
normalize_entities_udf = F.udf(normalize_entities)

df_codebook = df_codebook.withColumn(
    "dataset_normalized", normalize_entities_udf(F.col("dataset_name_pt"))
)

In [49]:
# fixing the order of columns for the codebook:
df_codebook_subset = df_codebook.select(
    "dataset_normalized", "simplified_variable_name", "normalized_variable"
)

# generating a lookup table base on the dataset : original_name relation
map_rdd = df_codebook_subset.rdd.groupBy(lambda row: row["dataset_normalized"]).map(
    lambda row: (row[0], {variable[1]: variable[2] for variable in row[1]})
)

codebook_lookup = dict(map_rdd.collect())

## 1.6 Census Data
The Census data itself is broken up in several different files that represent the different entities of the data. For example:

1. Domicilio{N}: refers to data at the `household`. Things like number of households in a sector, number of residents and such is located here;
2. Pessoa{N}: refers to the `person` data. Here we can find information about the sociodemographic make-up of the sector, such as gender distributions, number of children, race profiles, et cetera;

... and so on.

I will process all the files related to the city of São Paulo and gather them into the appropriate entities. I also went ahead and pre-selected several features from the codebook, such that we won't use all of the available features (`4107` according to the codebook).

# 2. Processing the Census Data

In [579]:
# defining a blacklist of column names to not convert data types:

blacklist = [
    "cod_bairro",
    "cod_distrito",
    "cod_grandes_regioes",
    "cod_meso",
    "cod_micro",
    "cod_municipio",
    "cod_rm",
    "cod_setor",
    "cod_subdistrito",
    "cod_uf",
    "nome_da_meso",
    "nome_da_micro",
    "nome_da_rm",
    "nome_da_uf",
    "nome_do_bairro",
    "nome_do_distrito",
    "nome_do_municipio",
    "nome_do_subdistrito",
    "nome_grande_regiao",
    "situacao_setor",
    "tipo_setor",
    "situacao",
    "setor_precoleta",
]

In [580]:
# list all the files available on the census data directory:
census_files = sorted(glob(RAW_DATA_DIR + "sp_census/universe_results/*.csv"))
BASE_PATH = RAW_DATA_DIR + "sp_census/universe_results/"

# saving the resulting dataframes to a processed stage for further handling:
PROCESSED_CENSUS_DATA_DIR = "../data/processed/sp_census/raw_datasets/"

# instantiating the UDF for replacing decimal separators:
replace_decimal_separator_udf = F.udf(replace_decimal_separator)
census_data = []

In [581]:
for file in tqdm(census_files):

    print(f"Processing file: {file}")

    # fixing names of the original tables
    dataset_name = (
        file.split("/")[-1].replace(".csv", "").replace("SP1", "").replace("_", "")
    )

    # normalizes the text data for the dataset
    dataset_name = normalize_entities(dataset_name)

    # loading the raw csv file
    df_temp = spark.read.csv(
        file,
        header=True,
        sep=";",
        inferSchema=False,
        encoding="ISO-8859-1",
    )

    # dropping duplicated columns (we can't do much about these)
    df_temp = drop_invalid_census_columns(df_temp, codebook_lookup, dataset_name)

    # retrieving the columns:
    original_columns = sorted(df_temp.columns)

    # normalize column names:
    normalized_columns = list(map(clean_census_column_name, original_columns))

    # looking up column names:
    new_columns = list(
        map(lambda col: codebook_lookup[dataset_name][col], normalized_columns)
    )

    # get columns with duplicates:
    cols_map = dict(zip(normalized_columns, new_columns))

    # iterating over the columns to replace with the new ones:
    for i in range(len(original_columns)):

        input_col = original_columns[i]
        check_col = clean_census_column_name(input_col)
        output_col = new_columns[i]

        try:

            if check_col not in blacklist:

                # renames the column:
                df_temp = df_temp.withColumnRenamed(input_col, output_col)

                # in this case, convert the data types to float after replacing decimal separators:
                df_temp = df_temp.withColumn(
                    output_col, replace_decimal_separator_udf(F.col(output_col))
                ).withColumn(output_col, F.col(output_col).cast("double"))

            else:
                df_temp = df_temp.withColumnRenamed(input_col, output_col)

        except Exception as e:
            print(f"Dropping column {col} as it could not be processed: {e}")

            df_temp.drop(input_col)

    # using the helper function to save the file:
    OUTPUT_NAME = f"tb_{dataset_name}"

    save_to_filesystem(
        df_temp, PROCESSED_CENSUS_DATA_DIR, OUTPUT_NAME, OUTPUT_NAME + ".parquet"
    )

  0%|          | 0/26 [00:00<?, ?it/s]

Processing file: ../data/raw/sp_census/universe_results/Basico_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_basico
    


  4%|▍         | 1/26 [00:00<00:23,  1.07it/s]

../data/processed/sp_census//tb_basico/part-00000-e17276ce-6668-4f09-9559-473044f13e0c-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_basico/part-00000-e17276ce-6668-4f09-9559-473044f13e0c-c000.gz.parquet 
    to ../data/processed/sp_census//tb_basico.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_basico
    
Processing file: ../data/raw/sp_census/universe_results/Domicilio01_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns ['V125', 'V204', 'V202', 'V035', 'V120', 'V139', 'V119', 'V025', 'V026', 'V208', 'V041', 'V206', 'V126', 'V203', 'V039', 'V004', 'V198'] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_domicilio01
    


  8%|▊         | 2/26 [00:42<09:51, 24.66s/it]

../data/processed/sp_census//tb_domicilio01/part-00000-e06c8b05-7fe0-4670-8113-e27176d090d8-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_domicilio01/part-00000-e06c8b05-7fe0-4670-8113-e27176d090d8-c000.gz.parquet 
    to ../data/processed/sp_census//tb_domicilio01.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_domicilio01
    
Processing file: ../data/raw/sp_census/universe_results/Domicilio02_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns ['V098', 'V046', 'V054', 'V090', 'V010', 'V002'] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_domicilio02
    


 12%|█▏        | 3/26 [00:52<06:56, 18.12s/it]

../data/processed/sp_census//tb_domicilio02/part-00000-35f694d9-356a-4448-a930-deffb20fd9ef-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_domicilio02/part-00000-35f694d9-356a-4448-a930-deffb20fd9ef-c000.gz.parquet 
    to ../data/processed/sp_census//tb_domicilio02.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_domicilio02
    
Processing file: ../data/raw/sp_census/universe_results/DomicilioRenda_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_domicilio_renda
    


 15%|█▌        | 4/26 [00:52<04:05, 11.14s/it]

../data/processed/sp_census//tb_domicilio_renda/part-00000-31c2218a-8eec-4ffc-8549-93b6e08e831b-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_domicilio_renda/part-00000-31c2218a-8eec-4ffc-8549-93b6e08e831b-c000.gz.parquet 
    to ../data/processed/sp_census//tb_domicilio_renda.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_domicilio_renda
    
Processing file: ../data/raw/sp_census/universe_results/Entorno01_SP1.csv
Dropping columns ['Cod_Grandes Regiões', 'Cod_RM', 'Cod_UF', 'Cod_bairro', 'Cod_distrito', 'Cod_meso', 'Cod_micro', 'Cod_municipio', 'Cod_subdistrito', 'Nome_Grande_Regiao', 'Nome_da_RM', 'Nome_da_UF ', 'Nome_da_meso', 'Nome_da_micro', 'Nome_do_bairro', 'Nome_do_distrito', 'Nome_do_municipio', 'Nome_do_subdistrito', 'Setor_Precoleta'] - not present in the codebook
Dropping columns ['V102', 'V062', 'Cod_setor', 'V023', 'V041', 'V016', 'V163', 'V103', 'V006', 'V003', 'V063', 'V021', 'V059', 'V162', 'V020', 'Situacao_setor', 

 19%|█▉        | 5/26 [01:16<05:26, 15.54s/it]

../data/processed/sp_census//tb_entorno01/part-00000-518627ff-ce8b-48b4-b6b9-3555f3e879d9-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_entorno01/part-00000-518627ff-ce8b-48b4-b6b9-3555f3e879d9-c000.gz.parquet 
    to ../data/processed/sp_census//tb_entorno01.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_entorno01
    
Processing file: ../data/raw/sp_census/universe_results/Entorno02_SP1.csv
Dropping columns ['Cod_Grandes Regiões', 'Cod_RM', 'Cod_UF', 'Cod_bairro', 'Cod_distrito', 'Cod_meso', 'Cod_micro', 'Cod_municipio', 'Cod_subdistrito', 'Nome_Grande_Regiao', 'Nome_da_RM', 'Nome_da_UF ', 'Nome_da_meso', 'Nome_da_micro', 'Nome_do_bairro', 'Nome_do_distrito', 'Nome_do_municipio', 'Nome_do_subdistrito', 'Setor_Precoleta'] - not present in the codebook
Dropping columns ['V202', 'Cod_setor', 'V207', 'V206', 'V203', 'Situacao_setor'] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_entorno02
    


 23%|██▎       | 6/26 [01:53<07:40, 23.04s/it]

../data/processed/sp_census//tb_entorno02/part-00000-286e867c-19f0-4ea5-8107-14aef4a2ccae-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_entorno02/part-00000-286e867c-19f0-4ea5-8107-14aef4a2ccae-c000.gz.parquet 
    to ../data/processed/sp_census//tb_entorno02.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_entorno02
    
Processing file: ../data/raw/sp_census/universe_results/Entorno03_SP1.csv
Dropping columns ['Cod_Grandes Regiões', 'Cod_RM', 'Cod_UF', 'Cod_bairro', 'Cod_distrito', 'Cod_meso', 'Cod_micro', 'Cod_municipio', 'Cod_subdistrito', 'Nome_Grande_Regiao', 'Nome_da_RM', 'Nome_da_UF ', 'Nome_da_meso', 'Nome_da_micro', 'Nome_do_bairro', 'Nome_do_distrito', 'Nome_do_municipio', 'Nome_do_subdistrito', 'Setor_Precoleta'] - not present in the codebook
Dropping columns ['Cod_setor', 'V475', 'V477', 'V479', 'V480', 'V429', 'V453', 'V478', 'V431', 'V471', 'V457', 'Situacao_setor'] for being duplicates

    Saving dataframe to ../data/pr

 27%|██▋       | 7/26 [02:21<07:45, 24.48s/it]

../data/processed/sp_census//tb_entorno03/part-00000-a71f32e9-4c3a-45b1-9125-e4e517b128dd-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_entorno03/part-00000-a71f32e9-4c3a-45b1-9125-e4e517b128dd-c000.gz.parquet 
    to ../data/processed/sp_census//tb_entorno03.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_entorno03
    
Processing file: ../data/raw/sp_census/universe_results/Entorno04_SP1.csv
Dropping columns ['Cod_Grandes Regiões', 'Cod_RM', 'Cod_UF', 'Cod_bairro', 'Cod_distrito', 'Cod_meso', 'Cod_micro', 'Cod_municipio', 'Cod_subdistrito', 'Nome_Grande_Regiao', 'Nome_da_RM', 'Nome_da_UF ', 'Nome_da_meso', 'Nome_da_micro', 'Nome_do_bairro', 'Nome_do_distrito', 'Nome_do_municipio', 'Nome_do_subdistrito', 'Setor_Precoleta'] - not present in the codebook
Dropping columns ['Cod_setor', 'Situacao_setor', 'V693', 'V694'] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_entorno04
    


 31%|███       | 8/26 [03:00<08:42, 29.03s/it]

../data/processed/sp_census//tb_entorno04/part-00000-185d82b2-3ae9-45d9-b483-6165b2167838-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_entorno04/part-00000-185d82b2-3ae9-45d9-b483-6165b2167838-c000.gz.parquet 
    to ../data/processed/sp_census//tb_entorno04.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_entorno04
    
Processing file: ../data/raw/sp_census/universe_results/Entorno05_SP1.csv
Dropping columns ['Cod_Grandes Regiões', 'Cod_RM', 'Cod_UF', 'Cod_bairro', 'Cod_distrito', 'Cod_meso', 'Cod_micro', 'Cod_municipio', 'Cod_subdistrito', 'Nome_Grande_Regiao', 'Nome_da_RM', 'Nome_da_UF ', 'Nome_da_meso', 'Nome_da_micro', 'Nome_do_bairro', 'Nome_do_distrito', 'Nome_do_municipio', 'Nome_do_subdistrito', 'Setor_Precoleta'] - not present in the codebook
Dropping columns ['Cod_setor', 'Situacao_setor'] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_entorno05
    


 35%|███▍      | 9/26 [03:39<09:10, 32.36s/it]

../data/processed/sp_census//tb_entorno05/part-00000-bae291b5-0af2-4ca0-8a7b-e2c295824b8d-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_entorno05/part-00000-bae291b5-0af2-4ca0-8a7b-e2c295824b8d-c000.gz.parquet 
    to ../data/processed/sp_census//tb_entorno05.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_entorno05
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa01_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa01
    


 38%|███▊      | 10/26 [03:44<06:19, 23.75s/it]

../data/processed/sp_census//tb_pessoa01/part-00000-134e9900-b4e1-48b2-93dd-af573356c6ea-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa01/part-00000-134e9900-b4e1-48b2-93dd-af573356c6ea-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa01.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa01
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa02_SP1.csv
Dropping columns ['V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V2

 42%|████▏     | 11/26 [03:48<04:27, 17.82s/it]

../data/processed/sp_census//tb_pessoa02/part-00000-7c9d6791-8b43-4636-8bf5-1f05904ebf74-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa02/part-00000-7c9d6791-8b43-4636-8bf5-1f05904ebf74-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa02.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa02
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa03_SP1.csv
Dropping columns ['V002'] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa03
    


 46%|████▌     | 12/26 [04:43<06:48, 29.16s/it]

../data/processed/sp_census//tb_pessoa03/part-00000-aa3ed58b-09fd-4f31-9097-bc2ca707fa8c-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa03/part-00000-aa3ed58b-09fd-4f31-9097-bc2ca707fa8c-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa03.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa03
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa04_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa04
    


 50%|█████     | 13/26 [05:00<05:28, 25.31s/it]

../data/processed/sp_census//tb_pessoa04/part-00000-c7da13e7-91fd-4cae-8be7-4305420b8f90-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa04/part-00000-c7da13e7-91fd-4cae-8be7-4305420b8f90-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa04.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa04
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa05_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa05
    


 54%|█████▍    | 14/26 [05:00<03:33, 17.76s/it]

../data/processed/sp_census//tb_pessoa05/part-00000-7cc60fd9-6c27-48cb-9460-d96cb2c0ff03-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa05/part-00000-7cc60fd9-6c27-48cb-9460-d96cb2c0ff03-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa05.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa05
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa06_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa06
    


 58%|█████▊    | 15/26 [05:36<04:16, 23.30s/it]

../data/processed/sp_census//tb_pessoa06/part-00000-a5901c19-5c6b-42fe-a34e-11e7cc175df4-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa06/part-00000-a5901c19-5c6b-42fe-a34e-11e7cc175df4-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa06.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa06
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa07_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns ['V050', 'V202', 'V168', 'V049', 'V042', 'V008', 'V189', 'V194', 'V036', 'V161', 'V195', 'V203', 'V041', 'V015'] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa07
    


 62%|██████▏   | 16/26 [06:03<04:04, 24.42s/it]

../data/processed/sp_census//tb_pessoa07/part-00000-23d468ed-ca73-4648-abc8-b3b003a83093-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa07/part-00000-23d468ed-ca73-4648-abc8-b3b003a83093-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa07.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa07
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa08_SP1.csv
Dropping columns ['Cod_setor', 'V255'] - not present in the codebook
Dropping columns ['V100', 'V205', 'V062', 'V054', 'V080', 'V242', 'V220', 'V089', 'V212', 'V084', 'V251', 'V223', 'V211', 'V083', 'V098', 'V052', 'V213', 'V227', 'V229', 'V237', 'V064', 'V218', 'V244', 'V096', 'V053', 'V249', 'V070', 'V055', 'V207', 'V210', 'V058', 'V209', 'V086', 'V226', 'V063', 'V250', 'V245', 'V060', 'V208', 'V059', 'V224', 'V073', 'V236', 'V071', 'V091', 'V253', 'V075', 'V215', 'V076', 'V056', 'V216', 'V057', 'V217', 'V239', 'V067', 'V228', 'V092', 'V097', 'V206'

 65%|██████▌   | 17/26 [06:38<04:07, 27.52s/it]

../data/processed/sp_census//tb_pessoa08/part-00000-e5e9ec12-54f1-4c4e-a1c1-b944cee69531-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa08/part-00000-e5e9ec12-54f1-4c4e-a1c1-b944cee69531-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa08.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa08
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa09_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns ['V157', 'V161', 'V172', 'V168'] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa09
    


 69%|██████▉   | 18/26 [07:25<04:27, 33.43s/it]

../data/processed/sp_census//tb_pessoa09/part-00000-8f9c7f8c-ffce-478f-9991-ae4aeffdba51-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa09/part-00000-8f9c7f8c-ffce-478f-9991-ae4aeffdba51-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa09.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa09
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa10_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa10
    


 73%|███████▎  | 19/26 [07:25<02:44, 23.45s/it]

../data/processed/sp_census//tb_pessoa10/part-00000-f7983842-c530-4875-a74f-b5fafe7e133b-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa10/part-00000-f7983842-c530-4875-a74f-b5fafe7e133b-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa10.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa10
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa11_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa11
    


 77%|███████▋  | 20/26 [07:37<01:59, 19.89s/it]

../data/processed/sp_census//tb_pessoa11/part-00000-372b7357-3c01-4573-9b7d-a4455f4da54e-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa11/part-00000-372b7357-3c01-4573-9b7d-a4455f4da54e-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa11.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa11
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa12_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa12
    


 81%|████████  | 21/26 [07:48<01:26, 17.37s/it]

../data/processed/sp_census//tb_pessoa12/part-00000-64e7fe2a-14b1-494b-b154-d0f4974cc15f-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa12/part-00000-64e7fe2a-14b1-494b-b154-d0f4974cc15f-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa12.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa12
    
Processing file: ../data/raw/sp_census/universe_results/Pessoa13_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa13
    


 85%|████████▍ | 22/26 [08:00<01:02, 15.67s/it]

../data/processed/sp_census//tb_pessoa13/part-00000-90837121-a595-48ae-817b-9ae26215c39d-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa13/part-00000-90837121-a595-48ae-817b-9ae26215c39d-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa13.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa13
    
Processing file: ../data/raw/sp_census/universe_results/PessoaRenda_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_pessoa_renda
    


 88%|████████▊ | 23/26 [08:12<00:43, 14.50s/it]

../data/processed/sp_census//tb_pessoa_renda/part-00000-ec1f32a4-a42c-4358-8724-1ba6f72bba96-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_pessoa_renda/part-00000-ec1f32a4-a42c-4358-8724-1ba6f72bba96-c000.gz.parquet 
    to ../data/processed/sp_census//tb_pessoa_renda.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_pessoa_renda
    
Processing file: ../data/raw/sp_census/universe_results/Responsavel01_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_responsavel01
    


 92%|█████████▏| 24/26 [08:19<00:24, 12.32s/it]

../data/processed/sp_census//tb_responsavel01/part-00000-8e073358-048e-4e4e-b0a3-51a2ae4fcf21-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_responsavel01/part-00000-8e073358-048e-4e4e-b0a3-51a2ae4fcf21-c000.gz.parquet 
    to ../data/processed/sp_census//tb_responsavel01.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_responsavel01
    
Processing file: ../data/raw/sp_census/universe_results/Responsavel02_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns [] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_responsavel02
    


 96%|█████████▌| 25/26 [08:57<00:19, 19.85s/it]

../data/processed/sp_census//tb_responsavel02/part-00000-225f9d8d-4e73-4a65-93ef-e7deac429016-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_responsavel02/part-00000-225f9d8d-4e73-4a65-93ef-e7deac429016-c000.gz.parquet 
    to ../data/processed/sp_census//tb_responsavel02.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_responsavel02
    
Processing file: ../data/raw/sp_census/universe_results/ResponsavelRenda_SP1.csv
Dropping columns [] - not present in the codebook
Dropping columns ['V118', 'V052', 'V019', 'V063', 'V085', 'V129'] for being duplicates

    Saving dataframe to ../data/processed/sp_census//tb_responsavel_renda
    


100%|██████████| 26/26 [09:07<00:00, 21.06s/it]

../data/processed/sp_census//tb_responsavel_renda/part-00000-02bb960a-223c-4e6e-a591-138cb39ff88c-c000.gz.parquet

    Saving
    ../data/processed/sp_census//tb_responsavel_renda/part-00000-02bb960a-223c-4e6e-a591-138cb39ff88c-c000.gz.parquet 
    to ../data/processed/sp_census//tb_responsavel_renda.parquet
    

    Deleting leftover directory at ../data/processed/sp_census//tb_responsavel_renda
    





# 3. Summarizing the Polygon Data
In order to simplify the operations with Polygon data at the lowest level (sector), we will add a helper column as an approximation of the geospatial reference of the sector by calculating its centroid. This will allow simpler joins later on at a reasonable level of approximation due to the comparatively small area of the census sectors.

We will also use these to generate the features themselves at different levels. 

In [50]:
# registering dataframes as temporary table views -- will allow us to access them in SQL Context:
df_sector.createOrReplaceTempView("tb_census_sector")

In [51]:
# adding the centroid and area of the polygons:
Q_ADD_GEOM = """
SELECT 
    A.*,
    ST_Centroid(A.geometry) as sector_centroid,
    ST_X(ST_Centroid(A.geometry)) as centroid_longitude,
    ST_Y(ST_Centroid(A.geometry)) as centroid_latitude,
    ST_Area(A.geometry) as sector_area
FROM tb_census_sector as A
"""

# adding the geometry:
df_sector = spark.sql(Q_ADD_GEOM)

In [52]:
# verifying the resulting schema:
df_sector.printSchema()  # looks good to go

root
 |-- id: string (nullable = true)
 |-- geometry: geometry (nullable = true)
 |-- sector_code: string (nullable = true)
 |-- city_code: string (nullable = true)
 |-- neighborhood_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- sector_type: string (nullable = true)
 |-- sector_centroid: geometry (nullable = false)
 |-- centroid_longitude: double (nullable = true)
 |-- centroid_latitude: double (nullable = true)
 |-- sector_area: double (nullable = false)



# 4. Selecting the target features

In the codebook file provided, I had already selected several features of interest as a basis for this project. As it is originally intended for use in the Real State industry, the selection reflect that. Anyone interested in other kinds of features is free to create a selection of his or her own and use the following code as reference. The original files that hold the variables of interest are the following:

1. `Basico`
2. `Domicilio01`
3. `Domicilio02`
4. `Pessoa11`
5. `Pessoa12`
6. `Pessoa13`
7. `Responsavel02`

In [53]:
# specifying a directory for the census data:
CENSUS_DATA_DIR = "../data/processed/sp_census/raw_datasets/"

# basico - basic entity:
tb_basico = spark.read.parquet(CENSUS_DATA_DIR + "tb_basico.parquet")

# domicilio - housing entities:
tb_domicilio01 = spark.read.parquet(CENSUS_DATA_DIR + "tb_domicilio01.parquet")
tb_domicilio02 = spark.read.parquet(CENSUS_DATA_DIR + "tb_domicilio02.parquet")

# pessoa - person entities:
tb_pessoa11 = spark.read.parquet(CENSUS_DATA_DIR + "tb_pessoa11.parquet")
tb_pessoa12 = spark.read.parquet(CENSUS_DATA_DIR + "tb_pessoa12.parquet")
tb_pessoa13 = spark.read.parquet(CENSUS_DATA_DIR + "tb_pessoa13.parquet")

# resposavel - household responsible entities:
tb_responsible02 = spark.read.parquet(CENSUS_DATA_DIR + "tb_responsavel02.parquet")

In [54]:
# listing the number of census sectors:
tb_basico.count()  # joins show end up with the same number of rows (distinct sectors)

18363

In [55]:
# joining all the datasets before selecting:
tb_census = tb_basico.join(
    tb_domicilio01, on=["sector_code", "sector_situation_code"], how="left"
)
tb_census = tb_census.join(
    tb_domicilio02, on=["sector_code", "sector_situation_code"], how="left"
)
tb_census = tb_census.join(
    tb_pessoa11, on=["sector_code", "sector_situation_code"], how="left"
)
tb_census = tb_census.join(
    tb_pessoa12, on=["sector_code", "sector_situation_code"], how="left"
)
tb_census = tb_census.join(
    tb_responsible02, on=["sector_code", "sector_situation_code"], how="left"
)

In [56]:
# verifying the data integrity:
assert (
    tb_basico.count() == tb_census.count()
), "There are more rows than expected in the final file"

In [57]:
# selecting the columns that are relevant from the codebook:
col_selection = df_codebook.filter(F.col("is_selected") == 1)

In [58]:
# keeping the columns of interest:
cols_to_keep = get_column_values(col_selection, "normalized_variable")

In [59]:
# dropping the columns that are not part of the selection:
tb_census = tb_census.drop(
    *[col for col in tb_census.columns if col not in cols_to_keep]
)

# 5. Preparing features at the units of interest

As previously described in the `1.0 Data Understanding - All Datasets.ipynb` notebook, we have several units of interest for the datasets we are working with. These are the following:

1. `sector`: the lowest unit of measurement for the Brazilian Census, which is one of most important geospatially referenced datasets we will be working with;
2. `zipcode`: zip codes in the city of São Paulo can be roughly approximated to an entire street (also called a logradouro);
3. `area_of_ponderation`: areas of ponderation are contiguous groups of census sectors;
4. `neighborhoods`: areas that are often (but not directly) related to the neighborhoods of the city;
5. `districts`: districts are administrative regions defined by law (and thus, won't change much over time), used to allocate resources by the City Hall;

We will aggregate the features at the levels above and maintain them in their raw state. Note: the zip code data will be processed in a different notebook as it requires a different processing approach.

In [60]:
# adding the geometry dataframes to the SQL Context:
df_ap.createOrReplaceTempView("tb_ponderation")
df_nb.createOrReplaceTempView("tb_neighborhood")
df_district.createOrReplaceTempView("tb_district")

In [61]:
# adding the raw features to the geometry:
df_sector_features = (
    df_sector.join(
        tb_census, (df_sector.sector_code == tb_census.sector_code), how="left"
    )
    .drop(tb_census.neighborhood_code)
    .drop(tb_census.sector_code)
)

In [62]:
# retrieving the number of rows after the join:
df_sector_features.count()

18953

In [63]:
# listing columns with null values:
get_null_columns(df_sector_features)

-RECORD 0----------------------------------------------------------------------------------------------------------------
 id                                                                                                                | 0   
 geometry                                                                                                          | 0   
 sector_code                                                                                                       | 0   
 city_code                                                                                                         | 0   
 neighborhood_code                                                                                                 | 0   
 city                                                                                                              | 0   
 neighborhood                                                                                                      | 0   
 sector_type            

In [64]:
# dropping null rows from the join (represent sectors that are not present in the area of study):
df_sector_features = df_sector_features.na.drop(
    subset=["code_large_regions_geographical_regions"]
)

In [65]:
# removing the missing values gets us back to the correct number (the ones that match the census exactly)
df_sector_features.count()

18363

In [66]:
# adding the resulting dataframe to the SQL Context:
df_sector_features.createOrReplaceTempView("tb_sector_features")

## 5.1 Area of Ponderation

In [176]:
# adding the centroid and area of the polygons:
Q_AP_MATCH = """
SELECT 
    A.ponderation_area_code,
    B.sector_code
FROM tb_ponderation as A, tb_sector_features as B
WHERE ST_Contains(A.geometry, B.sector_centroid)
"""

# matching the areas of ponderation to their sectors:
df_ap_match = spark.sql(Q_AP_MATCH)

In [177]:
# adding the resulting dataframe to the SQL Context:
df_ap_match.createOrReplaceTempView("tb_ap_match")

In [178]:
# adding the features to the ponderation area back:
Q_AP_RAW_FEATURES = """
SELECT
    A.ponderation_area_code,
    A.sector_code,
    B.*
FROM tb_ap_match as A
LEFT JOIN tb_sector_features as B
ON A.sector_code = B.sector_code
"""

# reading back the dataset:
df_ap_raw_features = spark.sql(Q_AP_RAW_FEATURES)

# dropping the geometry columns and looking at the results:
df_ap_raw_features = df_ap_raw_features.drop("geometry", "sector_centroid")

df_ap_raw_features

ponderation_area_code,sector_code,id,sector_code.1,city_code,neighborhood_code,city,neighborhood,sector_type,centroid_longitude,centroid_latitude,sector_area,sector_situation_code,code_large_regions_geographical_regions,name_large_regions_geographical_regions,federation_unit_code,name_federation_unit,mesoregion_code,name_mesoregion,microregion_code,name_microregion,metropolitan_region_code_or_ride,name_metropolitan_region_or_ride,code_municipality,name_municipality,district_code,district_name,subdistrict_code,name_sub_district,name_neighborhood,sector_type_code,permanent_private_households_or_persons_responsible_for_permanent_private_households,residents_permanent_private_households_or_population_residing_permanent_private_households,average_number_dwellers_permanent_private_households_obtained_by_var2_division_by_var1,value_average_monthly_nominal_income_persons_responsible_for_permanent_private_households_with_and_without_income,permanent_private_households_and_disposed,permanent_private_households_acquisition,permanent_private_households_rented,permanent_private_households_with_3_bathrooms_residents,permanent_private_households_without_exclusive_use_bathroom_residents,permanent_private_households_with_electricity,permanent_private_households_with_1_resident,permanent_private_households_with_2_residents,permanent_private_households_with_3_residents,permanent_private_households_with_4_residents,permanent_private_households_with_5_residents,permanent_private_households_with_6_residents,permanent_private_households_with_7_residents,permanent_private_households_with_8_residents,permanent_private_households_with_9_residents,permanent_private_households_with_10_or_more_residents,permanent_private_home_type_households_and_clear,permanent_private_home_type_households_and_acquisition,permanent_private_household_homes_rented,residents_private_households_and_collective_households,residents_permanent_household_private_households,residents_permanent_private_households_apartment_type,residents_permanent_permanent_households,dwellers_permanent_private_households_and_acquisition,residents_permanent_private_households_rented,children_only_from_person_responsible_private_households,household_employees_private_households_males,relatives_household_employees_private_households_male,household_employees_private_households,relatives_domestic_employees_as_private_households_female,literacy_responsible_persons
3550308005039,355030801000005,108943,355030801000005,3550308,,SÃO PAULO,,URBANO,-46.57065666912533,-23.571444624386825,7.278722154733566E-6,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030801,ÁGUA RASA,35503080100,ÁGUA RASA,SÃO PAULO (todos ...,0,240.0,754.0,3.14,1416.9,105.0,4.0,112.0,20.0,1.0,240.0,30.0,60.0,65.0,38.0,34.0,10.0,1.0,1.0,0.0,1.0,98.0,4.0,105.0,754.0,716.0,30.0,340.0,15.0,354.0,46.0,0.0,0.0,1.0,0.0,232.0
3550308005106,355030802000028,109096,355030802000028,3550308,,SÃO PAULO,,URBANO,-46.69939093174092,-23.54192792142038,2.741913499999751...,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030802,ALTO DE PINHEIROS,35503080200,ALTO DE PINHEIROS,SÃO PAULO (todos ...,0,177.0,394.0,2.23,5017.33,120.0,10.0,36.0,31.0,0.0,177.0,46.0,69.0,42.0,16.0,4.0,0.0,0.0,0.0,0.0,0.0,37.0,3.0,5.0,394.0,120.0,274.0,276.0,19.0,80.0,12.0,0.0,0.0,4.0,0.0,176.0
3550308005029,355030806000016,109560,355030806000016,3550308,,SÃO PAULO,,URBANO,-46.67432775353449,-23.52900256909393,5.131327499995703E-6,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030806,BARRA FUNDA,35503080600,BARRA FUNDA,SÃO PAULO (todos ...,0,135.0,356.0,2.64,3564.83,80.0,10.0,38.0,22.0,0.0,135.0,27.0,45.0,27.0,25.0,9.0,1.0,1.0,0.0,0.0,0.0,36.0,4.0,6.0,356.0,156.0,200.0,214.0,32.0,88.0,11.0,0.0,0.0,3.0,0.0,135.0
3550308005016,355030808000029,109753,355030808000029,3550308,,SÃO PAULO,,URBANO,-46.60066022961847,-23.52987229395476,1.101778800000770...,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030808,BELÉM,35503080800,BELÉM,SÃO PAULO (todos ...,0,259.0,739.0,2.85,1867.73,117.0,2.0,139.0,7.0,0.0,259.0,50.0,66.0,63.0,52.0,20.0,3.0,2.0,1.0,0.0,2.0,33.0,0.0,28.0,739.0,221.0,518.0,316.0,5.0,416.0,30.0,2.0,0.0,1.0,0.0,259.0
3550308005016,355030808000045,109769,355030808000045,3550308,,SÃO PAULO,,URBANO,-46.60511826311402,-23.52436697910189,1.054084249999352E-5,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030808,BELÉM,35503080800,BELÉM,SÃO PAULO (todos ...,0,193.0,577.0,2.99,2075.0,103.0,14.0,68.0,15.0,0.0,193.0,28.0,54.0,46.0,41.0,13.0,7.0,2.0,0.0,2.0,0.0,101.0,14.0,54.0,577.0,537.0,38.0,303.0,51.0,210.0,21.0,0.0,0.0,1.0,0.0,188.0


In [179]:
# verifying the data integrity:
assert (
    df_ap_raw_features.count() == df_ap_match.count()
), "There are more rows than expected in the final file"

### 5.1.1 Aggregating Raw Area of Ponderation Features

In [184]:
# let's aggregate the data from the census sectors onto the area of ponderation level
df_ap_agg = df_ap_raw_features.groupby("ponderation_area_code").agg(
    F.sum(
        F.col(
            "permanent_private_households_or_persons_responsible_for_permanent_private_households"
        )
    ).alias("total_private_households"),
    F.sum(
        F.col(
            "residents_permanent_private_households_or_population_residing_permanent_private_households"
        )
    ).alias("total_population_private_households"),
    F.avg(
        F.col(
            "average_number_dwellers_permanent_private_households_obtained_by_var2_division_by_var1"
        )
    ).alias("average_number_residents_per_household"),
    F.avg(
        F.col(
            "value_average_monthly_nominal_income_persons_responsible_for_permanent_private_households_with_and_without_income"
        )
    ).alias("average_monthly_income"),
    F.sum(F.col("permanent_private_households_and_disposed")).alias(
        "total_fully_owned_properties"
    ),
    F.sum(F.col("permanent_private_households_acquisition")).alias(
        "total_properties_in_acquisition"
    ),
    F.sum(F.col("permanent_private_households_rented")).alias(
        "total_rented_properties"
    ),
    F.sum(F.col("permanent_private_households_with_3_bathrooms_residents")).alias(
        "total_properties_with_3_bathrooms"
    ),
    F.sum(
        F.col("permanent_private_households_without_exclusive_use_bathroom_residents")
    ).alias("total_properties_without_private_bathrooms"),
    F.sum(F.col("permanent_private_households_with_electricity")).alias(
        "total_households_with_electricity"
    ),
    F.sum(F.col("permanent_private_households_with_1_resident")).alias(
        "total_households_with_1_resident"
    ),
    F.sum(F.col("permanent_private_households_with_2_residents")).alias(
        "total_households_with_2_residents"
    ),
    F.sum(F.col("permanent_private_households_with_3_residents")).alias(
        "total_households_with_3_residents"
    ),
    F.sum(F.col("permanent_private_households_with_4_residents")).alias(
        "total_households_with_4_residents"
    ),
    F.sum(F.col("permanent_private_households_with_5_residents")).alias(
        "total_households_with_5_residents"
    ),
    F.sum(F.col("permanent_private_households_with_6_residents")).alias(
        "total_households_with_6_residents"
    ),
    F.sum(F.col("permanent_private_households_with_7_residents")).alias(
        "total_households_with_7_residents"
    ),
    F.sum(F.col("permanent_private_households_with_8_residents")).alias(
        "total_households_with_8_residents"
    ),
    F.sum(F.col("permanent_private_households_with_9_residents")).alias(
        "total_households_with_9_residents"
    ),
    F.sum(F.col("permanent_private_households_with_10_or_more_residents")).alias(
        "total_households_with_10_residents_or_more"
    ),
    F.sum(F.col("permanent_private_home_type_households_and_clear")).alias(
        "total_fully_owned_houses"
    ),
    F.sum(F.col("permanent_private_home_type_households_and_acquisition")).alias(
        "total_houses_in_acquisition"
    ),
    F.sum(F.col("permanent_private_household_homes_rented")).alias(
        "total_rented_houses"
    ),
    F.sum(F.col("residents_private_households_and_collective_households")).alias(
        "total_residents_in_area"
    ),
    F.sum(F.col("residents_permanent_household_private_households")).alias(
        "total_residents_in_permanent_households"
    ),
    F.sum(F.col("residents_permanent_private_households_apartment_type")).alias(
        "total_residents_in_private_apartments"
    ),
    F.sum(F.col("dwellers_permanent_private_households_and_acquisition")).alias(
        "total_residents_properties_in_acquisition"
    ),
    F.sum(F.col("residents_permanent_private_households_rented")).alias(
        "total_residents_rented_properties"
    ),
    F.sum(F.col("children_only_from_person_responsible_private_households")).alias(
        "total_number_children"
    ),
    F.sum(
        F.col("household_employees_private_households_males")
        + F.col("household_employees_private_households")
    ).alias("total_number_house_workers"),
    F.sum(
        F.col("relatives_domestic_employees_as_private_households_female")
        + F.col("relatives_household_employees_private_households_male")
    ).alias("total_number_house_workers_parents"),
    F.sum(F.col("literacy_responsible_persons")).alias(
        "total_literate_household_leaders"
    ),
)

# fixing the column names:
new_columns = list(map(lambda col: f"ponderation_area_{col}", df_ap_agg.columns))

for i in range(len(df_ap_agg.columns)):
    df_ap_agg = df_ap_agg.withColumnRenamed(df_ap_agg.columns[i], new_columns[i])

In [185]:
for col in sorted(df_ap_agg.columns):
    print(col)

ponderation_area_average_monthly_income
ponderation_area_average_number_residents_per_household
ponderation_area_ponderation_area_code
ponderation_area_total_fully_owned_houses
ponderation_area_total_fully_owned_properties
ponderation_area_total_households_with_10_residents_or_more
ponderation_area_total_households_with_1_resident
ponderation_area_total_households_with_2_residents
ponderation_area_total_households_with_3_residents
ponderation_area_total_households_with_4_residents
ponderation_area_total_households_with_5_residents
ponderation_area_total_households_with_6_residents
ponderation_area_total_households_with_7_residents
ponderation_area_total_households_with_8_residents
ponderation_area_total_households_with_9_residents
ponderation_area_total_households_with_electricity
ponderation_area_total_houses_in_acquisition
ponderation_area_total_literate_household_leaders
ponderation_area_total_number_children
ponderation_area_total_number_house_workers
ponderation_area_total_number_

In [186]:
# verifying the results:
df_ap_agg

ponderation_area_ponderation_area_code,ponderation_area_total_private_households,ponderation_area_total_population_private_households,ponderation_area_average_number_residents_per_household,ponderation_area_average_monthly_income,ponderation_area_total_fully_owned_properties,ponderation_area_total_properties_in_acquisition,ponderation_area_total_rented_properties,ponderation_area_total_properties_with_3_bathrooms,ponderation_area_total_properties_without_private_bathrooms,ponderation_area_total_households_with_electricity,ponderation_area_total_households_with_1_resident,ponderation_area_total_households_with_2_residents,ponderation_area_total_households_with_3_residents,ponderation_area_total_households_with_4_residents,ponderation_area_total_households_with_5_residents,ponderation_area_total_households_with_6_residents,ponderation_area_total_households_with_7_residents,ponderation_area_total_households_with_8_residents,ponderation_area_total_households_with_9_residents,ponderation_area_total_households_with_10_residents_or_more,ponderation_area_total_fully_owned_houses,ponderation_area_total_houses_in_acquisition,ponderation_area_total_rented_houses,ponderation_area_total_residents_in_area,ponderation_area_total_residents_in_permanent_households,ponderation_area_total_residents_in_private_apartments,ponderation_area_total_residents_properties_in_acquisition,ponderation_area_total_residents_rented_properties,ponderation_area_total_number_children,ponderation_area_total_number_house_workers,ponderation_area_total_number_house_workers_parents,ponderation_area_total_literate_household_leaders
3550308005048,15002.0,38235.0,2.5446874999999998,3969.7853125,9386.0,952.0,3988.0,2225.0,40.0,15000.0,3365.0,4888.0,3414.0,2319.0,705.0,202.0,63.0,27.0,11.0,8.0,3544.0,125.0,1323.0,38331.0,15543.0,21665.0,2400.0,9727.0,1490.0,143.0,6.0,14880.0
3550308005045,8400.0,23519.0,2.801219512195122,3996.8287804878046,5266.0,938.0,1936.0,1359.0,104.0,8400.0,1303.0,2554.0,2197.0,1637.0,464.0,148.0,56.0,22.0,10.0,9.0,1982.0,97.0,1071.0,23546.0,10093.0,12905.0,2631.0,5466.0,1000.0,50.0,9.0,8346.0
3550308005031,8835.0,26619.0,3.082682926829268,2033.2802439024392,5135.0,622.0,2649.0,838.0,115.0,8830.0,1311.0,2339.0,2239.0,1743.0,739.0,253.0,100.0,54.0,23.0,34.0,3262.0,90.0,1929.0,26730.0,17762.0,7808.0,1814.0,8588.0,1327.0,30.0,1.0,8685.0
3550308005158,8666.0,26741.0,3.0878787878787883,1294.025151515151,5434.0,410.0,2413.0,538.0,41.0,8664.0,1138.0,2198.0,2254.0,1800.0,787.0,294.0,109.0,40.0,24.0,22.0,5171.0,150.0,2298.0,26774.0,24925.0,1734.0,1277.0,7321.0,1324.0,17.0,0.0,8436.0
3550308005102,9382.0,20499.0,2.219795918367347,5673.932040816328,5634.0,576.0,2625.0,2097.0,20.0,9382.0,3161.0,3151.0,1726.0,992.0,267.0,57.0,18.0,7.0,1.0,2.0,1020.0,16.0,276.0,20548.0,3842.0,16260.0,1211.0,5496.0,793.0,206.0,13.0,9365.0


In [187]:
# adding the resulting dataframe to the SQL Context:
df_ap_agg.createOrReplaceTempView("tb_ponderation_features")

In [77]:
# generating the final level of aggregation for the census features:
Q_AP_GEOM = """
SELECT
    A.*,
    B.geometry,
    ST_Centroid(B.geometry) as ponderation_area_centroid,
    ST_Area(B.geometry) as ponderation_area_value
FROM tb_ponderation_features as A
LEFT JOIN tb_ponderation as B 
ON A.area_ponderation_area_code = B.ponderation_area_code
"""

df_ap_final = spark.sql(Q_AP_GEOM)

In [78]:
# looking at the results:
df_ap_final

ponderation_area_code,total_private_households,total_population_private_households,average_number_residents_per_household,average_monthly_income,total_fully_owned_properties,total_properties_in_acquisition,total_rented_properties,total_properties_with_3_bathrooms,total_properties_without_private_bathrooms,total_households_with_electricity,total_households_with_1_resident,total_households_with_2_residents,total_households_with_3_residents,total_households_with_4_residents,total_households_with_5_residents,total_households_with_6_residents,total_households_with_7_residents,total_households_with_8_residents,total_households_with_9_residents,total_households_with_10_residents_or_more,total_fully_owned_houses,total_houses_in_acquisition,total_rented_houses,total_residents_in_area,total_residents_in_permanent_households,total_residents_in_private_apartments,total_residents_properties_in_acquisition,total_residents_rented_properties,total_number_children,total_number_house_workers,total_number_house_workers_parents,total_literate_household_leaders,geometry,ponderation_area_centroid,ponderation_area_value
3550308005045,8400.0,23519.0,2.801219512195122,3996.8287804878046,5266.0,938.0,1936.0,1359.0,104.0,8400.0,1303.0,2554.0,2197.0,1637.0,464.0,148.0,56.0,22.0,10.0,9.0,1982.0,97.0,1071.0,23546.0,10093.0,12905.0,2631.0,5466.0,1000.0,50.0,9.0,8346.0,POLYGON ((-46.620...,POINT (-46.619835...,1.217477032570667...
3550308005048,15002.0,38235.0,2.5446874999999998,3969.7853125,9386.0,952.0,3988.0,2225.0,40.0,15000.0,3365.0,4888.0,3414.0,2319.0,705.0,202.0,63.0,27.0,11.0,8.0,3544.0,125.0,1323.0,38331.0,15543.0,21665.0,2400.0,9727.0,1490.0,143.0,6.0,14880.0,POLYGON ((-46.627...,POINT (-46.637962...,2.032630229985242E-4
3550308005031,8835.0,26619.0,3.082682926829268,2033.2802439024392,5135.0,622.0,2649.0,838.0,115.0,8830.0,1311.0,2339.0,2239.0,1743.0,739.0,253.0,100.0,54.0,23.0,34.0,3262.0,90.0,1929.0,26730.0,17762.0,7808.0,1814.0,8588.0,1327.0,30.0,1.0,8685.0,POLYGON ((-46.596...,POINT (-46.607031...,4.670095618082835E-4
3550308005102,9382.0,20499.0,2.219795918367347,5673.932040816328,5634.0,576.0,2625.0,2097.0,20.0,9382.0,3161.0,3151.0,1726.0,992.0,267.0,57.0,18.0,7.0,1.0,2.0,1020.0,16.0,276.0,20548.0,3842.0,16260.0,1211.0,5496.0,793.0,206.0,13.0,9365.0,POLYGON ((-46.684...,POINT (-46.677750...,1.720310464927754...
3550308005158,8666.0,26741.0,3.0878787878787883,1294.025151515151,5434.0,410.0,2413.0,538.0,41.0,8664.0,1138.0,2198.0,2254.0,1800.0,787.0,294.0,109.0,40.0,24.0,22.0,5171.0,150.0,2298.0,26774.0,24925.0,1734.0,1277.0,7321.0,1324.0,17.0,0.0,8436.0,POLYGON ((-46.526...,POINT (-46.540679...,1.598476872192706E-4


In [80]:
# looking at the schema:
df_ap_final.printSchema()

root
 |-- ponderation_area_code: string (nullable = true)
 |-- total_private_households: double (nullable = true)
 |-- total_population_private_households: double (nullable = true)
 |-- average_number_residents_per_household: double (nullable = true)
 |-- average_monthly_income: double (nullable = true)
 |-- total_fully_owned_properties: double (nullable = true)
 |-- total_properties_in_acquisition: double (nullable = true)
 |-- total_rented_properties: double (nullable = true)
 |-- total_properties_with_3_bathrooms: double (nullable = true)
 |-- total_properties_without_private_bathrooms: double (nullable = true)
 |-- total_households_with_electricity: double (nullable = true)
 |-- total_households_with_1_resident: double (nullable = true)
 |-- total_households_with_2_residents: double (nullable = true)
 |-- total_households_with_3_residents: double (nullable = true)
 |-- total_households_with_4_residents: double (nullable = true)
 |-- total_households_with_5_residents: double (nullab

### 5.1.2 Saving Area of Ponderation Results

In [166]:
# save the results to the specified directory:
AP_OUTPUT = f"areas_of_ponderation/tb_area_ponderation_census"
PROCESSED_UNITS_OF_INTEREST = "../data/processed/sp_census/units_of_interest/"

save_to_filesystem(
    df_ap_final, PROCESSED_GEO_DATASETS, AP_OUTPUT, AP_OUTPUT + ".parquet"
)


    Saving dataframe to ../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census
    
../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census/part-00000-92cc7da8-664e-4ec8-86af-4a2fe4b19e23-c000.gz.parquet

    Saving
    ../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census/part-00000-92cc7da8-664e-4ec8-86af-4a2fe4b19e23-c000.gz.parquet 
    to ../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census.parquet
    

    Deleting leftover directory at ../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census
    


'Done!'

In [170]:
# saving without the geometry:
df_ap_no_geo = df_ap_final.drop("geometry", "ponderation_area_centroid")

AP_NO_GEO = f"areas_of_ponderation/tb_area_ponderation_census_no_geo"

save_to_filesystem(
    df_ap_final, PROCESSED_GEO_DATASETS, AP_OUTPUT, AP_OUTPUT + ".parquet"
)


    Saving dataframe to ../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census
    
../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census/part-00000-619979f1-aaac-4796-8627-6c42bf2de7a3-c000.gz.parquet

    Saving
    ../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census/part-00000-619979f1-aaac-4796-8627-6c42bf2de7a3-c000.gz.parquet 
    to ../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census.parquet
    

    Deleting leftover directory at ../data/processed/sp_census/geospatial_datasets//areas_of_ponderation/tb_area_ponderation_census
    


'Done!'

## 5.2 Neighborhoods

In [163]:
# adding the centroid and area of the polygons:
Q_NEIGHBORHOOD_MATCH = """
SELECT 
    A.Name as neighborhood_name,
    B.sector_code
FROM tb_neighborhood as A, tb_sector_features as B
WHERE ST_Contains(A.geometry, B.sector_centroid)
"""

# matching the areas of ponderation to their sectors:
df_nb_match = spark.sql(Q_NEIGHBORHOOD_MATCH)

# adding the resulting dataframe to the SQL Context:
df_nb_match.createOrReplaceTempView("tb_nb_match")

In [164]:
# adding the features to the ponderation area back:
Q_NB_RAW_FEATURES = """
SELECT
    A.neighborhood_name,
    A.sector_code,
    B.*
FROM tb_nb_match as A
LEFT JOIN tb_sector_features as B
ON A.sector_code = B.sector_code
"""

# reading back the dataset:
df_nb_raw_features = spark.sql(Q_NB_RAW_FEATURES)

# dropping the geometry columns and looking at the results:
df_nb_raw_features = df_nb_raw_features.drop("geometry", "sector_centroid")

In [165]:
df_nb_raw_features

neighborhood_name,sector_code,id,sector_code.1,city_code,neighborhood_code,city,neighborhood,sector_type,centroid_longitude,centroid_latitude,sector_area,sector_situation_code,code_large_regions_geographical_regions,name_large_regions_geographical_regions,federation_unit_code,name_federation_unit,mesoregion_code,name_mesoregion,microregion_code,name_microregion,metropolitan_region_code_or_ride,name_metropolitan_region_or_ride,code_municipality,name_municipality,district_code,district_name,subdistrict_code,name_sub_district,name_neighborhood,sector_type_code,permanent_private_households_or_persons_responsible_for_permanent_private_households,residents_permanent_private_households_or_population_residing_permanent_private_households,average_number_dwellers_permanent_private_households_obtained_by_var2_division_by_var1,value_average_monthly_nominal_income_persons_responsible_for_permanent_private_households_with_and_without_income,permanent_private_households_and_disposed,permanent_private_households_acquisition,permanent_private_households_rented,permanent_private_households_with_3_bathrooms_residents,permanent_private_households_without_exclusive_use_bathroom_residents,permanent_private_households_with_electricity,permanent_private_households_with_1_resident,permanent_private_households_with_2_residents,permanent_private_households_with_3_residents,permanent_private_households_with_4_residents,permanent_private_households_with_5_residents,permanent_private_households_with_6_residents,permanent_private_households_with_7_residents,permanent_private_households_with_8_residents,permanent_private_households_with_9_residents,permanent_private_households_with_10_or_more_residents,permanent_private_home_type_households_and_clear,permanent_private_home_type_households_and_acquisition,permanent_private_household_homes_rented,residents_private_households_and_collective_households,residents_permanent_household_private_households,residents_permanent_private_households_apartment_type,residents_permanent_permanent_households,dwellers_permanent_private_households_and_acquisition,residents_permanent_private_households_rented,children_only_from_person_responsible_private_households,household_employees_private_households_males,relatives_household_employees_private_households_male,household_employees_private_households,relatives_domestic_employees_as_private_households_female,literacy_responsible_persons
Vila Canero,355030801000005,108943,355030801000005,3550308,,SÃO PAULO,,URBANO,-46.57065666912533,-23.571444624386825,7.278722154733566E-6,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030801,ÁGUA RASA,35503080100,ÁGUA RASA,SÃO PAULO (todos ...,0,240.0,754.0,3.14,1416.9,105.0,4.0,112.0,20.0,1.0,240.0,30.0,60.0,65.0,38.0,34.0,10.0,1.0,1.0,0.0,1.0,98.0,4.0,105.0,754.0,716.0,30.0,340.0,15.0,354.0,46.0,0.0,0.0,1.0,0.0,232.0
Vila Madalena,355030802000028,109096,355030802000028,3550308,,SÃO PAULO,,URBANO,-46.69939093174092,-23.54192792142038,2.741913499999751...,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030802,ALTO DE PINHEIROS,35503080200,ALTO DE PINHEIROS,SÃO PAULO (todos ...,0,177.0,394.0,2.23,5017.33,120.0,10.0,36.0,31.0,0.0,177.0,46.0,69.0,42.0,16.0,4.0,0.0,0.0,0.0,0.0,0.0,37.0,3.0,5.0,394.0,120.0,274.0,276.0,19.0,80.0,12.0,0.0,0.0,4.0,0.0,176.0
Agua Branca,355030806000016,109560,355030806000016,3550308,,SÃO PAULO,,URBANO,-46.67432775353449,-23.52900256909393,5.131327499995703E-6,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030806,BARRA FUNDA,35503080600,BARRA FUNDA,SÃO PAULO (todos ...,0,135.0,356.0,2.64,3564.83,80.0,10.0,38.0,22.0,0.0,135.0,27.0,45.0,27.0,25.0,9.0,1.0,1.0,0.0,0.0,0.0,36.0,4.0,6.0,356.0,156.0,200.0,214.0,32.0,88.0,11.0,0.0,0.0,3.0,0.0,135.0
Belenzinho,355030808000029,109753,355030808000029,3550308,,SÃO PAULO,,URBANO,-46.60066022961847,-23.52987229395476,1.101778800000770...,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030808,BELÉM,35503080800,BELÉM,SÃO PAULO (todos ...,0,259.0,739.0,2.85,1867.73,117.0,2.0,139.0,7.0,0.0,259.0,50.0,66.0,63.0,52.0,20.0,3.0,2.0,1.0,0.0,2.0,33.0,0.0,28.0,739.0,221.0,518.0,316.0,5.0,416.0,30.0,2.0,0.0,1.0,0.0,259.0
Alto do Pari,355030808000045,109769,355030808000045,3550308,,SÃO PAULO,,URBANO,-46.60511826311402,-23.52436697910189,1.054084249999352E-5,1,3,Região Sudeste,35,São Paulo,3515,Metropolitana de ...,35061,São Paulo,20,RM São Paulo,3550308,SÃO PAULO,355030808,BELÉM,35503080800,BELÉM,SÃO PAULO (todos ...,0,193.0,577.0,2.99,2075.0,103.0,14.0,68.0,15.0,0.0,193.0,28.0,54.0,46.0,41.0,13.0,7.0,2.0,0.0,2.0,0.0,101.0,14.0,54.0,577.0,537.0,38.0,303.0,51.0,210.0,21.0,0.0,0.0,1.0,0.0,188.0
