In [0]:
# ==============================
# Sección 1: Imports y SparkSession
# ==============================
from pathlib import Path
import json, random, uuid, time
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, current_timestamp, expr, floor, rand, to_timestamp,
    date_format, year, month, dayofmonth, hour, minute, second,
    struct, to_json
)
from pyspark.sql.types import (
    StringType, IntegerType, DoubleType, StructType, StructField,
    TimestampType
)
from shapely.geometry import Point, shape

# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("01_setup_load_sparkified")
    .getOrCreate()
)

# Load the simulation settings from the JSON config file in the workspace repo
config_file = Path("/Workspace/Users/santiagobustosp@gmail.com/medellin-bigdata-poc/notebooks/1_simulation/sim_config.json")
cfg = json.loads(config_file.read_text())

# Unpack the settings the later cells rely on
base = Path(cfg["base_path"])
paths = cfg["paths"]
interval = cfg["interval_seconds"]
qty_min, qty_max = cfg["quantity_range"]  # inclusive bounds passed to random.randint later


In [0]:
# ==============================
# Sección 2: Leer insumos (GeoPandas + Spark)
# ==============================
import geopandas as gpd
import pandas as pd

# 1) Local path to the repo's raw data (GeoPandas can read from here)
raw_dir = "/Workspace/Users/santiagobustosp@gmail.com/medellin-bigdata-poc/data/raw"

# 2) Read the neighborhoods and the mask layer with GeoPandas
gdf_neigh = gpd.read_parquet(f"{raw_dir}/medellin_neighborhoods.parquet")
gdf_mask  = gpd.read_parquet(f"{raw_dir}/50001.parquet")

# 3) Convert to plain Python structures for the spatial UDFs
neigh_list = gdf_neigh.to_dict("records")
if gdf_mask.empty:
    # Fail with a clear message instead of an opaque indexing error below
    raise ValueError(f"Mask layer is empty: {raw_dir}/50001.parquet")
# Positional access: take the first row whatever its index label is
# (label-based .loc[0, ...] raises KeyError when the stored index has no label 0).
mask_geom  = shape(gdf_mask["geometry"].iloc[0])

# 4) Customers and employees: per the original author's note, Spark cannot read
#    these directly from /Workspace (it can from DBFS or S3), so they are loaded
#    with pandas first.
pdf_cust = pd.read_parquet(f"{raw_dir}/customers.parquet")
pdf_emp  = pd.read_parquet(f"{raw_dir}/employees.parquet")

# 5) Convert the pandas frames to Spark DataFrames
cust_df = spark.createDataFrame(pdf_cust)
emp_df  = spark.createDataFrame(pdf_emp)

print(f"✅ Barrios: {len(neigh_list)} | Clientes: {cust_df.count()} | Empleados: {emp_df.count()}")


In [0]:
# ==============================
# Sección 3: Simulación de eventos con estructura final deseada
# ==============================
from shapely.geometry import Point
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from collections import OrderedDict

# Number of events to generate
N = 20

# Safety cap on rejection-sampling draws per event, so a neighborhood that never
# yields a point inside the mask fails loudly instead of looping forever.
MAX_SAMPLING_ATTEMPTS = 10_000

# Candidate IDs to sample from
cust_ids = pdf_cust["customer_id"].tolist()
emp_ids  = pdf_emp["employee_id"].tolist()

# Build the list of events with the exact target structure
events = []
for _ in range(N):
    b = random.choice(neigh_list)
    # Build the neighborhood geometry once per event rather than on every draw
    neigh_geom = shape(b["geometry"])
    minx, miny, maxx, maxy = neigh_geom.bounds
    # Rejection sampling: draw uniformly inside the bounding box until the point
    # falls inside both the neighborhood polygon and the mask geometry.
    for _attempt in range(MAX_SAMPLING_ATTEMPTS):
        lon = random.uniform(minx, maxx)
        lat = random.uniform(miny, maxy)
        pt = Point(lon, lat)
        if neigh_geom.contains(pt) and mask_geom.contains(pt):
            break
    else:
        # No draw was accepted: the neighborhood most likely does not overlap the mask
        raise RuntimeError(
            f"No point inside both the selected neighborhood and the mask after "
            f"{MAX_SAMPLING_ATTEMPTS} attempts (bounds: {(minx, miny, maxx, maxy)})"
        )
    event = OrderedDict([
        ("latitude",           lat),
        ("longitude",          lon),
        ("date",               datetime.now().strftime("%d/%m/%Y %H:%M:%S")),
        ("customer_id",        random.choice(cust_ids)),
        ("employee_id",        random.choice(emp_ids)),
        ("quantity_products",  random.randint(qty_min, qty_max)),
        ("order_id",           str(uuid.uuid4()))
    ])
    events.append(event)

# Create the Spark DataFrame from the events; every field is declared non-nullable
schema_ev = StructType([
    StructField("latitude", DoubleType(), False),
    StructField("longitude", DoubleType(), False),
    StructField("date", StringType(), False),
    StructField("customer_id", IntegerType(), False),
    StructField("employee_id", IntegerType(), False),
    StructField("quantity_products", IntegerType(), False),
    StructField("order_id", StringType(), False),
])
df_raw = spark.createDataFrame(events, schema=schema_ev)
print(f"✅ Generados {df_raw.count()} eventos con estructura correcta")
df_raw.show(5, truncate=False)


In [0]:
# ==============================
# Sección 3.1: Registrar cada evento simulado en su propio JSON
# ==============================
import json
from pathlib import Path
from datetime import datetime

# 1) Timestamp identifying this run (second resolution)
run_ts = datetime.now().strftime("%Y%m%d_%H%M%S")

# 2) Destination folder under the configured base path: <base>/data/sim-events/<run_ts>
output_base = base / "data" / "sim-events" / run_ts

# 3) Create the folder; exist_ok=False makes this raise if the folder already
#    exists, so an earlier run's output is never overwritten.
output_base.mkdir(parents=True, exist_ok=False)

# 4) Write one JSON file per event, named after its order_id
for ev in events:
    file_path = output_base / f"{ev['order_id']}.json"
    with file_path.open("w", encoding="utf-8") as fh:
        fh.write(json.dumps(ev, ensure_ascii=False, indent=2))

print(f"✅ {len(events)} archivos JSON de eventos escritos en: {output_base}")


In [0]:
# ==============================
# Sección 4: Spatial‐join con UDF (Shapely) — con district y neighborhood
# ==============================
from shapely.geometry import Point, shape
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Build each neighborhood's Shapely geometry once, paired with the two attributes
# the UDFs return, instead of re-running shape() for every polygon on every row.
_neigh_index = [
    (shape(b["geometry"]), b["IDENTIFICACION"], b["NOMBRE"])
    for b in neigh_list
]


def _find_neighborhood_attr(lat, lon, attr_pos):
    """Return one attribute of the first neighborhood whose polygon contains the point.

    Args:
        lat: Latitude of the point (used as the y coordinate).
        lon: Longitude of the point (used as the x coordinate).
        attr_pos: Position inside a ``_neigh_index`` entry to return
            (1 = "IDENTIFICACION", 2 = "NOMBRE").

    Returns:
        The requested attribute, or None when a coordinate is missing or no
        neighborhood contains the point.
    """
    if lat is None or lon is None:
        return None
    pt = Point(lon, lat)
    for entry in _neigh_index:
        if entry[0].contains(pt):
            return entry[attr_pos]
    return None


# UDF that assigns the neighborhood code (district) from lat/lon
def find_district(lat, lon):
    """Return the "IDENTIFICACION" value of the neighborhood containing (lat, lon), or None."""
    return _find_neighborhood_attr(lat, lon, 1)

find_district_udf = udf(find_district, StringType())

# UDF that assigns the neighborhood name from lat/lon
def find_neighborhood_name(lat, lon):
    """Return the "NOMBRE" value of the neighborhood containing (lat, lon), or None."""
    return _find_neighborhood_attr(lat, lon, 2)

find_name_udf = udf(find_neighborhood_name, StringType())

# Apply both UDFs to enrich df_raw with the district code and neighborhood name
df_events = (
    df_raw
      .withColumn("district", find_district_udf("latitude", "longitude"))
      .withColumn("neighborhood", find_name_udf("latitude", "longitude"))
)

# Check the result
print("✅ Spatial‐join completo: código y nombre de barrio asignados")
df_events.show(5, truncate=False)


In [0]:
# ==============================
# Sección 5: Transformar a esquema Bronze con columnas reordenadas
# ==============================
from pyspark.sql.functions import (
    to_timestamp, date_format,
    year, month, dayofmonth, hour, minute, second,
    col
)

# Parse the "dd/MM/yyyy HH:mm:ss" string into a timestamp so its parts can be extracted
events_with_ts = df_events.withColumn(
    "event_ts", to_timestamp("date", "dd/MM/yyyy HH:mm:ss")
)

# Columns derived from event_ts: the partition key (formatted ddMMyyyy) plus
# the individual date/time components.
derived_columns = [
    ("partition_date", date_format("event_ts", "ddMMyyyy")),
    ("event_year",     year("event_ts")),
    ("event_month",    month("event_ts")),
    ("event_day",      dayofmonth("event_ts")),
    ("event_hour",     hour("event_ts")),
    ("event_minute",   minute("event_ts")),
    ("event_second",   second("event_ts")),
]
events_enriched = events_with_ts
for column_name, column_expr in derived_columns:
    events_enriched = events_enriched.withColumn(column_name, column_expr)

# Final column order; the original "date" string is exposed as "event_date"
bronze_columns = [
    "partition_date",
    "order_id",
    "neighborhood",
    "customer_id",
    "employee_id",
    col("date").alias("event_date"),
    "event_day",
    "event_hour",
    "event_minute",
    "event_month",
    "event_second",
    "event_year",
    "latitude",
    "longitude",
    "district",
    "quantity_products",
]
df_bronze = events_enriched.select(*bronze_columns)

# Inspect the resulting schema and a sample of rows
print("✅ Bronze reordenado según el esquema original")
df_bronze.printSchema()
df_bronze.show(5, truncate=False)


In [0]:
# ==============================
# Sección 6: Persistir en Delta y verificar
# ==============================
# Make sure the target database exists before writing
spark.sql("CREATE DATABASE IF NOT EXISTS unalwater")

# Append this run's events to the Delta table, partitioned by partition_date
bronze_writer = (
    df_bronze.write
    .format("delta")
    .mode("append")
    .partitionBy("partition_date")
)
bronze_writer.saveAsTable("unalwater.bronze_events")

# Verification: read the table back, report its total row count, show a sample
bronze_table = spark.table("unalwater.bronze_events")
total = bronze_table.count()
print(f"✅ Bronze listo. Total registros en tabla: {total}")
bronze_table.show(5, truncate=False)


In [0]:
%sql

-- Preview up to 21 rows of the unalwater.bronze_events table.
SELECT * 
FROM unalwater.bronze_events
LIMIT 21;