In [0]:
# ==============================
# Imports y SparkSession
# ==============================
from pathlib import Path
import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    to_timestamp, date_format,
    year, month, dayofmonth,
    hour, minute, second, col
)
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

from shapely.geometry import Point, shape
import pandas as pd

# Obtain the notebook's SparkSession: getOrCreate() returns the already-active
# session when there is one, otherwise it builds a new one with this app name.
spark = (
    SparkSession.builder
    .appName("02_ingest_bronze")
    .getOrCreate()
)

In [0]:
# ==============================
# Recarga de config y geometrías
# ==============================
import json
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape

# 1) Simulation parameters: parse the JSON config stored next to the simulation notebook.
with open(
    "/Workspace/Users/santiagobustosp@gmail.com/medellin-bigdata-poc/notebooks/1_simulation/sim_config.json"
) as config_file:
    cfg = json.load(config_file)

base = Path(cfg["base_path"])
# Placeholder value: it has to be replaced by hand with the run timestamp that
# notebook 1 generated. `input_dir` is derived from it but is not referenced by
# any of the later cells visible in this notebook.
run_ts = "<el mismo timestamp generado en NB1>"
input_dir = base.joinpath("data", "sim-events", run_ts)

# 2) Load the geometries through GeoPandas (original author's note: this avoids
#    the 'bytes' error).
raw_dir = base.joinpath("data", "raw")
gdf_neigh = gpd.read_parquet(f"{raw_dir}/medellin_neighborhoods.parquet")
gdf_mask = gpd.read_parquet(f"{raw_dir}/50001.parquet")

# 3) Structures consumed by the UDFs: one dict per neighborhood row, plus the
#    geometry of the mask row whose index label is 0.
neigh_list = gdf_neigh.to_dict(orient="records")
mask_geom = shape(gdf_mask.loc[0, "geometry"])

In [0]:
# ==============================
# UDFs para district y neighborhood
# ==============================
def _lookup_neighborhood_attr(lat, lon, attr):
    """Return field `attr` of the first polygon in `neigh_list` containing the point.

    Args:
        lat: Latitude of the event, or None when the source value is null.
        lon: Longitude of the event, or None when the source value is null.
        attr: Key to read from the matching neighborhood record.

    Returns:
        The value stored under `attr` in the first record whose geometry
        contains the point, or None when a coordinate is missing or no
        polygon contains the point.
    """
    # Spark hands SQL NULLs to Python UDFs as None; building a Point from None
    # raises and would abort the whole job, so treat it like "no match".
    if lat is None or lon is None:
        return None

    # The point is built in (x, y) order, i.e. longitude first.
    pt = Point(lon, lat)
    # NOTE(review): this is a linear scan over every record for every row, and
    # shape() rebuilds the geometry each time — consider a spatial join if the
    # event volume grows.
    for record in neigh_list:
        if shape(record["geometry"]).contains(pt):
            return record[attr]
    return None


def find_district(lat, lon):
    """Return the "IDENTIFICACION" value of the polygon containing (lat, lon).

    Returns None when either coordinate is None or no polygon contains the point.
    """
    return _lookup_neighborhood_attr(lat, lon, "IDENTIFICACION")


def find_neighborhood(lat, lon):
    """Return the "NOMBRE" value of the polygon containing (lat, lon).

    Returns None when either coordinate is None or no polygon contains the point.
    """
    return _lookup_neighborhood_attr(lat, lon, "NOMBRE")

# Wrap the plain-Python lookups as Spark UDFs whose result column is a string.
find_district_udf = udf(find_district, returnType=StringType())
find_neighborhood_udf = udf(find_neighborhood, returnType=StringType())

In [0]:
# ==============================
# Celda 4 – Leer tabla raw_events_temp y enriquecer
# ==============================
# (1) Load the simulated events directly from the raw_events_temp table.
df_raw = spark.table("unalwater.raw_events_temp")

# (2) Apply the spatial UDFs: both receive the row's latitude and longitude
#     and add one column each.
district_expr = find_district_udf(col("latitude"), col("longitude"))
neighborhood_expr = find_neighborhood_udf(col("latitude"), col("longitude"))

df_events = df_raw.withColumn("district", district_expr)
df_events = df_events.withColumn("neighborhood", neighborhood_expr)


In [0]:
# ==============================
# Transformar al esquema Bronze
# ==============================
# Parse the raw "date" string once; every derived calendar/time field below is
# computed from this single timestamp expression.
event_ts_expr = to_timestamp(col("date"), "dd/MM/yyyy HH:mm:ss")

# Project straight into the Bronze column order. The original "date" string is
# kept as-is under the name event_date; the parsed timestamp itself is not kept.
df_bronze = df_events.select(
    date_format(event_ts_expr, "ddMMyyyy").alias("partition_date"),
    col("order_id"),
    col("neighborhood"),
    col("customer_id"),
    col("employee_id"),
    col("date").alias("event_date"),
    dayofmonth(event_ts_expr).alias("event_day"),
    hour(event_ts_expr).alias("event_hour"),
    minute(event_ts_expr).alias("event_minute"),
    month(event_ts_expr).alias("event_month"),
    second(event_ts_expr).alias("event_second"),
    year(event_ts_expr).alias("event_year"),
    col("latitude"),
    col("longitude"),
    col("district"),
    col("quantity_products"),
)


In [0]:
# ==============================
# Guardar en Delta Lake
# ==============================
# Make sure the target database exists before writing into it.
spark.sql("CREATE DATABASE IF NOT EXISTS unalwater")

# Configure a Delta writer that appends to the existing table contents and
# partitions the data by partition_date.
# NOTE(review): because the mode is "append", re-running this cell adds the
# rows again — confirm that is the intended behavior for this notebook.
bronze_writer = df_bronze.write.format("delta")
bronze_writer = bronze_writer.mode("append").partitionBy("partition_date")
bronze_writer.saveAsTable("unalwater.bronze_events")