In [0]:
# PASO 1: Imports & Config
import json, random, uuid, time
from pathlib import Path
from datetime import datetime
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# Read the simulation settings from the JSON config file stored in the workspace.
with open("/Workspace/Users/santiagobustosp@gmail.com/medellin-bigdata-poc/notebooks/1_simulation/sim_config.json") as config_file:
    cfg = json.load(config_file)

# Root folder that every entry of `paths` is joined onto.
base = Path(cfg["base_path"])
# Mapping of logical names (inputs, output dir) to locations under `base`.
paths = cfg["paths"]
# Value of "interval_seconds"; not referenced by the cells shown in this notebook.
interval = cfg["interval_seconds"]
# Inclusive bounds passed to random.randint for the quantity of products per order.
qty_min, qty_max = cfg["quantity_range"]

In [0]:
# STEP 2: Load the input datasets
gdf_neigh = gpd.read_parquet(base / paths["neighborhoods"])  # neighbourhood polygons

# Only the first geometry of the city-mask file is kept; it is used as the Medellín outline.
city_mask = gpd.read_parquet(base / paths["city_mask"])
mask_geom = city_mask.geometry.iloc[0]

df_cust = pd.read_parquet(base / paths["customers"])  # customers
df_emp = pd.read_parquet(base / paths["employees"])  # employees

print(f"✅ Barrios: {len(gdf_neigh)} | Clientes: {len(df_cust)} | Empleados: {len(df_emp)}")

In [0]:
# PASO 3: Funciones de muestreo y generación de evento
def sample_point(poly, max_tries=10_000):
    """Draw a random point that lies inside both ``poly`` and the city mask.

    Rejection sampling: candidates are drawn uniformly from the bounding box
    of ``poly`` and the first one contained in ``poly`` and in the
    module-level ``mask_geom`` is returned.

    Args:
        poly: Shapely geometry to sample from.
        max_tries: Upper bound on the number of candidates drawn. Without a
            bound the loop never ends when ``poly`` and ``mask_geom`` share
            no interior area (e.g. a neighbourhood outside the city mask).

    Returns:
        A ``shapely.geometry.Point`` contained in ``poly`` and ``mask_geom``.

    Raises:
        ValueError: If ``poly`` is empty, so it has no usable bounding box.
        RuntimeError: If no valid point was found within ``max_tries`` draws.
    """
    if poly.is_empty:
        raise ValueError("sample_point: cannot sample from an empty geometry")

    minx, miny, maxx, maxy = poly.bounds
    for _ in range(max_tries):
        candidate = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if poly.contains(candidate) and mask_geom.contains(candidate):
            return candidate

    raise RuntimeError(
        f"sample_point: no point inside both the polygon and the city mask "
        f"after {max_tries} attempts; check that they overlap"
    )

def gen_event():
    """Build one synthetic order event as a plain dict.

    A neighbourhood row is picked at random from ``gdf_neigh`` and a point is
    sampled inside its geometry; customer and employee ids are picked at
    random from ``df_cust`` and ``df_emp``. The ``neighborhood`` field carries
    the row's ``IDENTIFICACION`` value.
    """
    barrio = gdf_neigh.sample(1).iloc[0]
    location = sample_point(barrio.geometry)

    # Evaluated in the same order as the dict fields so the sequence of random draws is unchanged.
    order_id = str(uuid.uuid4())
    created_at = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    customer = int(df_cust.customer_id.sample(1).iloc[0])
    employee = int(df_emp.employee_id.sample(1).iloc[0])
    quantity = random.randint(qty_min, qty_max)

    return {
        "order_id": order_id,
        "date": created_at,
        "customer_id": customer,
        "employee_id": employee,
        "quantity_products": quantity,
        "latitude": location.y,
        "longitude": location.x,
        "neighborhood": barrio["IDENTIFICACION"],
    }


In [0]:
# STEP 4: Create the output folder for this run, named after the current timestamp
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir = base.joinpath(paths["output_dir"], ts)
# parents/exist_ok: create missing parent folders and do not fail if the folder is already there.
out_dir.mkdir(parents=True, exist_ok=True)
print("▶️ Carpeta de simulación:", out_dir.name)

In [0]:
# STEP 5 (quick-test variant): write a small fixed batch of events, one JSON file per event
N = 20
for _ in range(N):
    e = gen_event()
    payload = json.dumps(e)
    # The file is named after the event's order_id.
    (out_dir / f"{e['order_id']}.json").write_text(payload)
print(f"✅ Generados {N} eventos en {out_dir.name}")


In [0]:
# STEP 6: Read the JSON files of this run back and build a Spark DataFrame from them
files = list(out_dir.glob("*.json"))

events = []
for json_path in files:
    events.append(json.loads(json_path.read_text()))

df_raw = spark.createDataFrame(events)

In [0]:
# STEP 7: Quick look at the raw events: rows, inferred schema and row count
display(df_raw)
df_raw.printSchema()
raw_count = df_raw.count()
print("Total registros:", raw_count)

In [0]:
# Geo-join: derive 'district' and 'neighborhood' for every event from its coordinates
from shapely.geometry import Point  # already imported in the first cell; no longer used in this cell

# 1) Collect the events to pandas once (the extra toPandas() call that was here ran a
#    second Spark job whose result was thrown away).
#    'neighborhood' is dropped because the join below recomputes it; 'district' is
#    dropped too (when present) so that re-running this cell on the already-enriched
#    df_raw does not end up with two 'district' columns.
pdf = df_raw.toPandas().drop(columns=["neighborhood", "district"], errors="ignore")

# 2) Build a point GeoDataFrame from lon/lat. points_from_xy is vectorised and also
#    copes with an empty frame, unlike a row-wise apply.
#    The points were sampled in gdf_neigh's coordinates, so they share its CRS.
gpdf = gpd.GeoDataFrame(
    pdf,
    geometry=gpd.points_from_xy(pdf["longitude"], pdf["latitude"]),
    crs=gdf_neigh.crs,
)

# 3) Spatial join: each point receives the attributes of the polygon that contains it.
#    Points not within any polygon keep null attributes because of how="left".
#    Requires gdf_neigh to have the columns 'IDENTIFICACION' and 'NOMBRE'.
gpdf = gpd.sjoin(
    gpdf,
    gdf_neigh[["IDENTIFICACION", "NOMBRE", "geometry"]],
    how="left",
    predicate="within",
)

# 4) Rename the joined columns to the target names and drop the join's helper index column
gpdf = gpdf.rename(
    columns={"IDENTIFICACION": "district", "NOMBRE": "neighborhood"}
).drop(columns=["index_right"])

# 5) Back to Spark, without the geometry column
df_raw = spark.createDataFrame(gpdf.drop(columns="geometry"))


In [0]:
# Quick look at df_raw after the geo-join cell has replaced it: rows, schema and row count
display(df_raw)
df_raw.printSchema()
enriched_count = df_raw.count()
print("Total registros:", enriched_count)

In [0]:
# PASO 8: Transformar df_raw al esquema Bronze
from pyspark.sql.functions import (
    to_timestamp, date_format,
    year, month, dayofmonth,
    hour, minute, second
)

# Parse the "dd/MM/yyyy HH:mm:ss" text held in `date`; every derived column below reuses this expression.
event_ts_col = to_timestamp("date", "dd/MM/yyyy HH:mm:ss")

# Build the Bronze layout in one projection, keeping the column order required by the spec.
df_bronze = df_raw.select(
    date_format(event_ts_col, "ddMMyyyy").alias("partition_date"),  # daily key formatted as ddMMyyyy
    "order_id",
    "neighborhood",
    "customer_id",
    "employee_id",
    event_ts_col.alias("event_ts"),
    # Date/time components of the event timestamp
    year(event_ts_col).alias("event_year"),
    month(event_ts_col).alias("event_month"),
    dayofmonth(event_ts_col).alias("event_day"),
    hour(event_ts_col).alias("event_hour"),
    minute(event_ts_col).alias("event_minute"),
    second(event_ts_col).alias("event_second"),
    "latitude",
    "longitude",
    "district",
    "quantity_products",
)

# Quick inspection of the result
display(df_bronze.limit(5))
df_bronze.printSchema()


In [0]:
%sql
-- Make sure the unalwater schema exists before any table is written into it
CREATE SCHEMA IF NOT EXISTS unalwater;

In [0]:
# STEP 9: Persist to Delta as a managed table
# The table is created when it does not exist yet; otherwise the rows are appended.

# 1) Save as a Delta table registered in the metastore
df_bronze.write.saveAsTable("unalwater.bronze_events", format="delta", mode="append")

# 2) Check the final row count and show a few rows
bronze_tbl = spark.table("unalwater.bronze_events")
total = bronze_tbl.count()
print(f"✅ Bronze listo. Total registros: {total}")
bronze_tbl.show(5, truncate=False)


In [0]:
%sql
-- Ad-hoc checks on the bronze table.
-- NOTE(review): a multi-statement %sql cell presumably renders only the last statement's
-- result in Databricks; confirm, and split into separate cells if all three outputs are needed.

-- Table-level Delta details for the bronze table.
DESCRIBE DETAIL unalwater.bronze_events;
-- Distinct partition_date values. They are ddMMyyyy strings, so this ORDER BY is
-- lexicographic (by day-of-month first), not chronological.
SELECT DISTINCT partition_date FROM unalwater.bronze_events ORDER BY partition_date;
-- Up to five rows for one hard-coded day ('18072025' = 18 July 2025); edit the literal to inspect another day.
SELECT * FROM unalwater.bronze_events WHERE partition_date = '18072025' LIMIT 5;
