In [0]:
# PASO 1: Imports & Config
import json, random, uuid, time
import os
from datetime import datetime
from pathlib import Path

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# Location of the simulation config. Set the SIM_CONFIG_PATH environment
# variable to run the notebook from another workspace or user folder without
# editing code; when it is unset the original workspace location is used.
CONFIG_PATH = Path(os.environ.get(
    "SIM_CONFIG_PATH",
    "/Workspace/Users/santiagobustosp@gmail.com/medellin-bigdata-poc/notebooks/1_simulation/sim_config.json",
))

# Read the JSON as UTF-8 explicitly so parsing does not depend on the
# platform's default text encoding.
cfg      = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
base     = Path(cfg["base_path"])          # root folder that the entries of `paths` are joined onto
paths    = cfg["paths"]                    # mapping: dataset name -> path fragment under `base`
interval = cfg["interval_seconds"]         # value of "interval_seconds"; not referenced by the cells visible here
qty_min, qty_max = cfg["quantity_range"]   # bounds later handed to random.randint (inclusive on both ends)

# Fail here with a clear message rather than on the first random.randint call.
if qty_min > qty_max:
    raise ValueError(
        f"quantity_range must be [min, max] with min <= max, got [{qty_min}, {qty_max}]"
    )

In [0]:
# STEP 2: Load the input datasets
# Neighborhood polygons.
neighborhoods_file = base / paths["neighborhoods"]
gdf_neigh = gpd.read_parquet(neighborhoods_file)

# Medellín city outline: only the geometry of the first row is kept as the mask.
city_mask_file = base / paths["city_mask"]
mask_geom = gpd.read_parquet(city_mask_file).geometry.iloc[0]

# Customer and employee tables.
customers_file = base / paths["customers"]
employees_file = base / paths["employees"]
df_cust = pd.read_parquet(customers_file)
df_emp = pd.read_parquet(employees_file)

print(f"✅ Barrios: {len(gdf_neigh)} | Clientes: {len(df_cust)} | Empleados: {len(df_emp)}")

In [0]:
# STEP 3: Sampling helpers and event generation
def sample_point(poly, max_tries=10_000):
    """Return a random point that lies inside both ``poly`` and the city mask.

    Rejection sampling: candidates are drawn uniformly (in coordinate space)
    from the bounding box of ``poly`` and the first one contained by both
    ``poly`` and the module-level ``mask_geom`` is returned.

    Args:
        poly: Geometry to sample from; must expose ``is_empty``, ``bounds``
            and ``contains``.
        max_tries: Upper limit on the number of candidates drawn. Bounds the
            loop so a polygon that never yields an accepted point cannot hang
            the notebook.

    Returns:
        The first accepted ``Point``.

    Raises:
        ValueError: If ``poly`` is ``None`` or an empty geometry.
        RuntimeError: If no candidate is accepted within ``max_tries`` draws,
            e.g. because ``poly`` does not overlap ``mask_geom``.
    """
    if poly is None or poly.is_empty:
        raise ValueError("sample_point: cannot sample from a missing or empty geometry")

    minx, miny, maxx, maxy = poly.bounds
    for _ in range(max_tries):
        p = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        # The bounding box is larger than the polygon, and the polygon may
        # stick out of the city mask, so both containment checks are needed.
        if poly.contains(p) and mask_geom.contains(p):
            return p

    raise RuntimeError(
        f"sample_point: no point inside both the polygon and the city mask "
        f"after {max_tries} attempts"
    )

def gen_event():
    """Build one synthetic order event as a plain dict.

    One neighborhood row is drawn at random from ``gdf_neigh`` and a point is
    sampled inside its geometry with ``sample_point``. Customer and employee
    ids are drawn at random from ``df_cust`` and ``df_emp``; the quantity is a
    random integer between ``qty_min`` and ``qty_max`` (both inclusive).

    Returns:
        dict with the keys ``order_id`` (UUID4 string), ``date`` (current
        local time formatted as ``DD/MM/YYYY HH:MM:SS``), ``customer_id``,
        ``employee_id``, ``quantity_products``, ``latitude`` (point ``y``),
        ``longitude`` (point ``x``) and ``neighborhood`` (the row's
        ``NOMBRE`` value).
    """
    neighborhood_row = gdf_neigh.sample(1).iloc[0]
    location = sample_point(neighborhood_row.geometry)

    # Fields are filled in the same order the keys appear in the output dict.
    event = {}
    event["order_id"] = str(uuid.uuid4())
    event["date"] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    event["customer_id"] = int(df_cust.customer_id.sample(1).iloc[0])
    event["employee_id"] = int(df_emp.employee_id.sample(1).iloc[0])
    event["quantity_products"] = random.randint(qty_min, qty_max)
    event["latitude"] = location.y
    event["longitude"] = location.x
    event["neighborhood"] = neighborhood_row["NOMBRE"]
    return event


In [0]:
# STEP 4: Prepare the timestamped output folder
# Folder name is the current local time as YYYYMMDD_HHMMSS.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir = base.joinpath(paths["output_dir"], ts)
# Create missing parent folders too; an already existing folder is not an error.
out_dir.mkdir(parents=True, exist_ok=True)
print("▶️ Carpeta de simulación:", out_dir.name)

In [0]:
# STEP 5: Quick-test version: generate a small batch of events
N = 20
for _ in range(N):
    e = gen_event()
    # One JSON file per event, named after the event's order_id.
    event_file = out_dir / f"{e['order_id']}.json"
    event_file.write_text(json.dumps(e))
print(f"✅ Generados {N} eventos en {out_dir.name}")


In [0]:
# STEP 6: Read the JSON files back and build a Spark DataFrame
files = list(out_dir.glob("*.json"))

events = []
for json_file in files:
    events.append(json.loads(json_file.read_text()))

# NOTE(review): `spark` is not defined in the visible cells; presumably the
# session object provided by the notebook runtime. Confirm.
df_raw = spark.createDataFrame(events)

In [0]:
# STEP 7: Quick inspection
display(df_raw)
df_raw.printSchema()

# Row count of the DataFrame built from the generated event files.
total_records = df_raw.count()
print("Total registros:", total_records)