In [1]:
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import DataFrame, Series
from pandera.typing.geopandas import GeoSeries
from pathlib import Path
import janitor

In [2]:
# --- Notebook configuration -------------------------------------------------
# Directory that holds both the input and the output GeoJSON files.
PATH_TO_PROCESSED_FILES = Path("../data/processed/")

# Input: the urban-zone features file read by this notebook.
ROME_GEOJSON = PATH_TO_PROCESSED_FILES.joinpath(
    "Rome_urban_zones_with_features__epsg4326.geojson"
)

# Output: where the cleaned frame is written; the write is skipped when
# SAVE_OUTPUT is False.
SAVE_OUTPUT = True
OUTPUT_ROME_GEOJSON = PATH_TO_PROCESSED_FILES.joinpath(
    "Rome_urban_zones_with_features_clean__epsg4326.geojson"
)

## Loading the spatial tessellation

In [3]:
# Read the input GeoJSON into a GeoDataFrame and report its (rows, columns)
# shape as a quick sanity check.
raw_gdf_rome = gpd.read_file(ROME_GEOJSON)
print(raw_gdf_rome.shape)

(155, 13)


## Data transformation and validation

In [4]:
class FeaturesGeoDataFrameSchema(pa.SchemaModel):
    """Pandera schema for the per-area features GeoDataFrame.

    Every column is declared non-nullable. Count columns are coerced to
    ``int``; the remaining numeric columns must be non-negative floats, and
    the two ``perc_*`` columns are additionally capped at 100.
    """

    # Identifiers: both must be unique; the code must start with one or more
    # digits followed by an uppercase letter.
    area_code: Series[str] = pa.Field(nullable=False, unique=True, str_matches=r"\d+[A-Z]")
    area_name: Series[str] = pa.Field(nullable=False, unique=True)
    # Shape measurements (non-negative).
    SHAPE_Leng: Series[float] = pa.Field(ge=0, nullable=False)
    SHAPE_Area: Series[float] = pa.Field(ge=0, nullable=False)
    # Counts are coerced to int; densities stay float. All must be >= 0.
    n_traffic_lights: Series[int] = pa.Field(ge=0, nullable=False, coerce=True)
    density_traffic_lights: Series[float] = pa.Field(ge=0, nullable=False)
    n_shops: Series[int] = pa.Field(ge=0, nullable=False, coerce=True)
    density_shops: Series[float] = pa.Field(ge=0, nullable=False)
    # Bounded to the closed interval [0, 100].
    perc_roads_bc: Series[float] = pa.Field(ge=0, le=100, nullable=False)
    # Coerced to int, so a NaN in this column makes validation fail.
    car_accidents: Series[int] = pa.Field(ge=0, nullable=False, coerce=True)
    # Bounded to the closed interval [0, 100].
    perc_people_far_from_public_transport: Series[float] = pa.Field(ge=0, le=100, nullable=False)
    pop_density: Series[float] = pa.Field(ge=0, nullable=False)
    # Geometry column; must not contain missing geometries.
    geometry: GeoSeries = pa.Field(nullable=False)

In [5]:
# Validate the raw frame against the schema. A schema violation is printed
# rather than raised so the notebook keeps running and the failure stays
# visible in the cell output.
try:
    FeaturesGeoDataFrameSchema(raw_gdf_rome)
except pa.errors.SchemaError as exc:
    print(exc)

Error while coercing 'car_accidents' to type int64: Could not coerce <class 'pandas.core.series.Series'> data_container into type int64:
   index  failure_case
0     97           NaN


In [6]:
@pa.check_types()
def fill_nans(gdf_rome: DataFrame) -> DataFrame[FeaturesGeoDataFrameSchema]:
    """Return a copy of ``gdf_rome`` with the NaNs of area "20O" replaced by 0.

    Only the ``car_accidents`` and ``perc_roads_bc`` cells of the row whose
    ``area_code`` equals ``"20O"`` are touched, and only when they are NaN.

    Parameters
    ----------
    gdf_rome
        Frame with at least the ``area_code``, ``car_accidents`` and
        ``perc_roads_bc`` columns. It is not modified.

    Returns
    -------
    A new frame; ``pa.check_types`` validates it against
    ``FeaturesGeoDataFrameSchema``.

    Raises
    ------
    AssertionError
        If the frame does not contain exactly one row with area code "20O".
    """
    # Work on a copy: writing into the argument would silently alter the raw
    # frame held by the caller and make re-running earlier cells misleading.
    filled = gdf_rome.copy()

    indices = filled.index[filled["area_code"] == "20O"]
    assert len(indices) == 1, (
        f'expected exactly one row with area_code "20O", found {len(indices)}'
    )
    index_area20O = indices[0]

    # Same rule for both columns: a missing value becomes 0.
    for column in ("car_accidents", "perc_roads_bc"):
        if np.isnan(filled.loc[index_area20O, column]):
            filled.loc[index_area20O, column] = 0
    return filled

@pa.check_types()
def clean_data(gdf: DataFrame[FeaturesGeoDataFrameSchema]) -> DataFrame[FeaturesGeoDataFrameSchema]:
    """Cast the identifier columns to ``str`` and the count columns to ``int``.

    Parameters
    ----------
    gdf
        Frame validated by ``pa.check_types`` against
        ``FeaturesGeoDataFrameSchema``. It is not modified.

    Returns
    -------
    A new frame with ``area_code`` and ``area_name`` as ``str`` and
    ``n_traffic_lights``, ``n_shops`` and ``car_accidents`` as ``int``; all
    other columns keep their dtype.
    """
    # A single ``astype`` with a column -> dtype mapping replaces the chain of
    # janitor ``change_type`` calls: it performs the same casts but returns a
    # new frame instead of assigning the converted columns into the input.
    return gdf.astype(
        {
            "area_code": str,
            "area_name": str,
            "n_traffic_lights": int,
            "n_shops": int,
            "car_accidents": int,
        }
    )

# Cleaning pipeline: fill the known missing values first, then cast the
# columns to their final dtypes.
gdf_rome = clean_data(fill_nans(raw_gdf_rome))

In [7]:
# Write the cleaned frame to disk as GeoJSON; skipped when SAVE_OUTPUT is False.
if SAVE_OUTPUT:
    gdf_rome.to_file(OUTPUT_ROME_GEOJSON, driver='GeoJSON')

  pd.Int64Index,


# Watermark

In [8]:
# Record when and on which interpreter/machine the notebook was executed.
%load_ext watermark
%watermark

Last updated: 2022-02-25T23:12:16.511236+01:00

Python implementation: CPython
Python version       : 3.8.12
IPython version      : 8.0.1

Compiler    : GCC 9.4.0
OS          : Linux
Release     : 4.4.0-210-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 40
Architecture: 64bit



In [9]:
# Record the versions of the packages imported in this notebook.
%watermark --iversions

geopandas : 0.10.2
janitor   : 0.22.0
sys       : 3.8.12 | packaged by conda-forge | (default, Jan 30 2022, 23:42:07) 
[GCC 9.4.0]
pandera   : 0.9.0
numpy     : 1.22.2
matplotlib: 3.5.1
pandas    : 1.4.1

