# 01_setup_and_simulate

Este notebook:
- Carga la configuración de simulación.
- Define funciones de muestreo y generación.
- Genera los eventos simulados, los persiste en la tabla Delta `unalwater.raw_events_temp` y guarda los JSON de eventos en DBFS.

In [0]:
# ==============================
# Imports y SparkSession
# ==============================
from pathlib import Path
import json, random, uuid
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField,
    StringType, IntegerType, DoubleType
)

# Start the Spark session for this notebook; getOrCreate() reuses an
# already-active session when one exists.
spark = (
    SparkSession.builder
    .appName("01_setup_and_simulate")
    .getOrCreate()
)


In [0]:
# ==============================
# Carga de configuración y datos de insumo
# ==============================
import pandas as pd
import geopandas as gpd

# 1) Simulation parameters
# NOTE(review): absolute, user-specific workspace path; consider making it
# configurable so other users can run the notebook unchanged.
CONFIG_PATH = Path("/Workspace/Users/santiagobustosp@gmail.com/medellin-bigdata-poc/sim_config.json")

# Read the JSON config with an explicit encoding instead of the platform default.
cfg       = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
base      = Path(cfg["base_path"])
raw_dir   = base / "data" / "raw"
qty_min, qty_max = cfg["quantity_range"]
# Fail fast with a clear message: random.randint(qty_min, qty_max) in the
# simulation cell raises on an inverted range.
if qty_min > qty_max:
    raise ValueError(
        f"quantity_range must be [min, max] with min <= max, got [{qty_min}, {qty_max}]"
    )
interval  = cfg["interval_seconds"]

# 2) Read the input data with pandas/geopandas
pdf_cust  = pd.read_parquet(f"{raw_dir}/customers.parquet")
pdf_emp   = pd.read_parquet(f"{raw_dir}/employees.parquet")
gdf_neigh = gpd.read_parquet(f"{raw_dir}/medellin_neighborhoods.parquet")
gdf_mask  = gpd.read_parquet(f"{raw_dir}/50001.parquet")

In [0]:
# ==============================
# Preparar listas para sampling
# ==============================
from shapely.geometry import Point, shape

# Candidate customer / employee IDs that the simulation draws from at random.
cust_ids = pdf_cust["customer_id"].tolist()
emp_ids = pdf_emp["employee_id"].tolist()

# One dict per neighborhood row; the simulation reads each record's "geometry".
neigh_list = gdf_neigh.to_dict(orient="records")

# Mask geometry taken from the row whose index label is 0.
# NOTE(review): only that single row is used; confirm the mask file holds one row.
mask_row_geometry = gdf_mask.loc[0, "geometry"]
mask_geom = shape(mask_row_geometry)

In [0]:
# ==============================
# Celda 4 – Simulación de N eventos y persistencia en Delta
# ==============================
from datetime import datetime
import uuid

# 1) Generate the list of simulated events
N = 20
# Cap on rejection-sampling draws inside one neighborhood's bounding box, so a
# neighborhood that does not overlap the mask cannot hang the cell forever.
MAX_DRAWS_PER_NEIGHBORHOOD = 1_000
# Cap on how many neighborhoods are tried before giving up on a single event.
MAX_NEIGHBORHOOD_PICKS = 100

events = []
for _ in range(N):
    for _pick in range(MAX_NEIGHBORHOOD_PICKS):
        b = random.choice(neigh_list)
        # Build the neighborhood geometry once per pick instead of once per draw.
        neigh_geom = shape(b["geometry"])
        minx, miny, maxx, maxy = neigh_geom.bounds
        found = False
        for _draw in range(MAX_DRAWS_PER_NEIGHBORHOOD):
            # Draw uniformly in the bounding box; keep the point only when it
            # falls inside both the neighborhood and the mask geometry.
            lon = random.uniform(minx, maxx)
            lat = random.uniform(miny, maxy)
            pt = Point(lon, lat)
            if neigh_geom.contains(pt) and mask_geom.contains(pt):
                found = True
                break
        if found:
            break
        # No valid point in this neighborhood: pick another one.
    else:
        raise RuntimeError(
            "Could not sample a point inside both a neighborhood and the mask; "
            "check that the neighborhood and mask geometries overlap."
        )
    events.append({
        "latitude":           lat,
        "longitude":          lon,
        "date":               datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
        "customer_id":        random.choice(cust_ids),
        "employee_id":        random.choice(emp_ids),
        "quantity_products":  random.randint(qty_min, qty_max),
        "order_id":           str(uuid.uuid4())
    })

# 2) Build the Spark DataFrame from the generated events
# (column name, Spark type) pairs in output column order; every column is
# declared non-nullable.
event_columns = [
    ("latitude", DoubleType()),
    ("longitude", DoubleType()),
    ("date", StringType()),
    ("customer_id", IntegerType()),
    ("employee_id", IntegerType()),
    ("quantity_products", IntegerType()),
    ("order_id", StringType()),
]
schema_ev = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in event_columns]
)
df_raw = spark.createDataFrame(events, schema=schema_ev)

# 3) Persist to Delta Lake as a temporary table
spark.sql("CREATE DATABASE IF NOT EXISTS unalwater")
(
    df_raw
      .write
      .format("delta")
      # An empty string is not a valid save mode and Spark rejects it.
      # "overwrite" replaces the temp table on each run so the notebook can be
      # re-run from a fresh kernel; use "append" instead if events should
      # accumulate across runs.
      .mode("overwrite")
      .saveAsTable("unalwater.raw_events_temp")
)
