In [0]:
# Bare expression: the notebook renders the value of `spark`. The name is not
# defined anywhere in this notebook — presumably it is the SparkSession injected
# by the Databricks runtime (TODO confirm).
spark

In [0]:
import time

# Repeatedly trigger the `setup_optimize` notebook, pausing between runs.
# The tunables are named here so they are easy to find and adjust.
TOTAL_ITERACIONES = 1000   # how many times the child notebook is run
TIMEOUT_SEGUNDOS = 200     # per-run timeout handed to dbutils.notebook.run
PAUSA_SEGUNDOS = 3         # sleep after every run, including the last one

for i in range(TOTAL_ITERACIONES):
    print(f"▶️ Ejecutando iteración {i+1}")
    dbutils.notebook.run(
        "./setup_optimize",
        timeout_seconds=TIMEOUT_SEGUNDOS,
    )
    time.sleep(PAUSA_SEGUNDOS)

In [0]:
# %run ./setup_optimize

In [0]:
# Load the bronze test data collected after one hour of simulation.
data = spark.table("poctesting.bronze_events")
# Show a five-row sample, then report the total row count.
# NOTE: `data.count()` is a full Spark action over the table.
display(data.limit(5))
print(f"El archivo tiene {data.count()} registros")

In [0]:
from pyspark.sql.functions import col


# Load the bronze table.
df_bronze = spark.table("poctesting.bronze_events")

# A record is "complete" only when both location fields are present.
# isNotNull() never yields NULL, so negating this predicate is exactly
# "neighborhood is null OR district is null".
ubicacion_completa = col("neighborhood").isNotNull() & col("district").isNotNull()

# Complete records.
df_completos = df_bronze.where(ubicacion_completa)

# Incomplete records (at least one of the two fields is missing).
df_incompletos = df_bronze.where(~ubicacion_completa)

In [0]:
# Display the number of rows in the bronze DataFrame (last expression of the cell).
df_bronze.count()
# Disabled alternative: count only the incomplete records.
# df_incompletos.count()

In [0]:
from pyspark.sql.functions import col, row_number, count
from pyspark.sql.window import Window
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree
import numpy as np

def corregir_con_sjoin_y_balltree(df_spark, path_parquet_neigh, schema):
    """Assign ``neighborhood`` and ``district`` to events from their coordinates.

    Each row is first matched to the polygon that contains its point (spatial
    join with the ``within`` predicate). Rows that fall inside no polygon take
    the values of the polygon whose centroid is nearest by great-circle
    (haversine) distance. Rows that still lack either field are dropped.

    Args:
        df_spark: Spark DataFrame with ``longitude`` and ``latitude`` columns.
            It is collected to the driver with ``toPandas()``.
        path_parquet_neigh: Path of a GeoParquet file that provides the
            ``NOMBRE``, ``IDENTIFICACION`` and ``geometry`` columns.
        schema: Spark schema used to build the empty result.

    Returns:
        A Spark DataFrame with the corrected rows (without the ``geometry``
        column), or an empty DataFrame built from ``schema`` when there is
        nothing to return.
    """
    df = df_spark.toPandas()
    if df.empty:
        return spark.createDataFrame([], schema)

    # Vectorised point construction instead of a row-wise apply().
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
        crs="EPSG:4326",
    )

    gdf_neigh = gpd.read_parquet(path_parquet_neigh)[["NOMBRE", "IDENTIFICACION", "geometry"]]
    gdf_neigh = gdf_neigh.dropna(subset=["geometry"])
    # Empty geometries have no usable centroid and would poison the BallTree.
    gdf_neigh = gdf_neigh[~gdf_neigh.geometry.is_empty]
    if gdf_neigh.crs is None:
        # Only declare a CRS when the file carries none; set_crs() raises if
        # the data already has a different CRS, whereas to_crs() below
        # reprojects it correctly.
        gdf_neigh = gdf_neigh.set_crs("EPSG:4326")

    # Projected CRS: planar containment test and planar centroids.
    gdf = gdf.to_crs("EPSG:3857")
    gdf_neigh = gdf_neigh.to_crs("EPSG:3857")

    gdf_joined = gpd.sjoin(gdf, gdf_neigh, how="left", predicate="within")
    # A point inside overlapping polygons appears once per match; keep the
    # first so the result aligns one-to-one with gdf's index.
    gdf_joined = gdf_joined[~gdf_joined.index.duplicated(keep="first")]
    gdf["neighborhood"] = gdf_joined["NOMBRE"]
    gdf["district"] = gdf_joined["IDENTIFICACION"]

    # One mask, computed once, drives both assignments so their lengths always
    # agree with the BallTree result. Rows without coordinates cannot be
    # resolved and are left for the final dropna().
    sin_barrio = (
        gdf["neighborhood"].isna()
        & gdf["latitude"].notna()
        & gdf["longitude"].notna()
    )
    if sin_barrio.any() and not gdf_neigh.empty:
        # Haversine needs [lat, lon] in radians, so convert the projected
        # centroids back to EPSG:4326 degrees and use the original
        # latitude/longitude columns for the query points.
        centroides = gdf_neigh.geometry.centroid.to_crs("EPSG:4326")
        neigh_coords = np.column_stack([centroides.y, centroides.x])
        point_coords = gdf.loc[sin_barrio, ["latitude", "longitude"]].to_numpy(dtype=float)

        tree = BallTree(np.deg2rad(neigh_coords), metric="haversine")
        _, idx = tree.query(np.deg2rad(point_coords), k=1)

        vecino = gdf_neigh.iloc[idx.ravel()]
        gdf.loc[sin_barrio, "neighborhood"] = vecino["NOMBRE"].to_numpy()
        gdf.loc[sin_barrio, "district"] = vecino["IDENTIFICACION"].to_numpy()

    gdf = gdf.dropna(subset=["neighborhood", "district"])

    if gdf.empty:
        return spark.createDataFrame([], schema)
    return spark.createDataFrame(gdf.drop(columns=["geometry"]))

def _deduplicar_por_order_id(df):
    """Keep one row per ``order_id``: the one with the largest ``quantity_products``."""
    # NOTE: ties on quantity_products are broken arbitrarily by row_number().
    ventana = Window.partitionBy("order_id").orderBy(col("quantity_products").desc())
    return df.withColumn("rn", row_number().over(ventana)).filter("rn = 1").drop("rn")


def actualizar_silver_eventos(df_completos, df_incompletos, path_parquet_neigh):
    """Rebuild the ``silver_eventos`` table from complete and corrected records.

    Steps:
      1. Correct ``df_incompletos`` with ``corregir_con_sjoin_y_balltree``.
      2. Union the corrected rows with ``df_completos`` and deduplicate by
         ``order_id`` (keeping the largest ``quantity_products``).
      3. If the table already exists, union with its current contents and
         deduplicate again.
      4. Overwrite the table and verify that no ``order_id`` is duplicated.

    Args:
        df_completos: Spark DataFrame of records that already have location data.
        df_incompletos: Spark DataFrame of records to correct; its schema is
            used for the empty-result case of the correction step.
        path_parquet_neigh: Path to the neighborhoods parquet file.

    Returns:
        None. Side effects: overwrites the ``silver_eventos`` table, prints a
        summary and displays the offending ``order_id`` values if duplicates
        are found.
    """
    nombre_tabla_silver = "silver_eventos"
    schema = df_incompletos.schema

    # Correct the incomplete records.
    df_corregido = corregir_con_sjoin_y_balltree(df_incompletos, path_parquet_neigh, schema)
    cantidad_corregidos = df_corregido.count()
    cantidad_completos = df_completos.count()

    # Merge with the complete records, reusing the count already computed
    # instead of launching another Spark job to test for emptiness.
    if cantidad_corregidos == 0:
        print(f"⚠️ No se corrigieron registros incompletos. Se usará solo df_completos ({cantidad_completos}).")
        df_union = df_completos
    else:
        print(f"✅ Se corrigieron {cantidad_corregidos} registros. Total con completos: {cantidad_completos + cantidad_corregidos}")
        df_union = df_completos.unionByName(df_corregido)

    # Deduplicate the new batch by order_id.
    df_final = _deduplicar_por_order_id(df_union)

    # Fold in the existing table contents (if any) and deduplicate again.
    if spark.catalog.tableExists(nombre_tabla_silver):
        df_existente = spark.table(nombre_tabla_silver)
        df_final = _deduplicar_por_order_id(df_existente.unionByName(df_final))

    # The table is always overwritten, whether or not it already existed.
    # NOTE(review): when the table exists, df_final reads from the same table
    # it overwrites; this relies on the table format allowing it (e.g. Delta)
    # — confirm.
    df_final.write.mode("overwrite").saveAsTable(nombre_tabla_silver)

    # Final duplicate check against what was actually written.
    df_verif = spark.table(nombre_tabla_silver)
    df_check = df_verif.groupBy("order_id").agg(count("*").alias("cantidad")).filter("cantidad > 1")
    cantidad_duplicados = df_check.count()

    if cantidad_duplicados > 0:
        print(f"❌ Duplicados encontrados en 'order_id': {cantidad_duplicados}")
        display(df_check)
    else:
        # Count the stored table rather than re-running df_final's lineage.
        print(f"✅ Tabla '{nombre_tabla_silver}' actualizada correctamente con {df_verif.count()} registros únicos por orden.")


In [0]:
# Location of the neighborhoods parquet file used by the silver update and by
# the map cells further down.
# NOTE(review): hard-coded, user-specific absolute path containing a doubled
# "//" — consider moving it to a configuration cell.
path_parquet_neigh = "/Workspace/Users/danielale22rojas@gmail.com//medellin-bigdata-poc/data/raw/medellin_neighborhoods.parquet"
# Correct the incomplete records and rewrite the silver table.
actualizar_silver_eventos(df_completos, df_incompletos, path_parquet_neigh)

In [0]:
# Read back the silver table written by actualizar_silver_eventos.
df_silver = spark.table("silver_eventos")
# Disabled: write to a Delta table under the poctesting schema (references `silver`,
# a name not defined in this notebook).
# silver.write.format("delta").mode("overwrite").saveAsTable("poctesting.silver_events")
# Report the difference between the bronze and silver row counts.
# NOTE(review): the silver table may also hold rows from earlier runs, so this
# difference is not necessarily "records dropped in this run" — confirm.
print(f"Descartamos {df_bronze.count() - df_silver.count()} registros")

In [0]:
# Ventanas para Gold

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Window spanning every row of the same neighborhood.
window_neigh = Window.partitionBy("neighborhood")

# Per-neighborhood total of quantity_products, repeated on every row of that
# neighborhood.
total_por_barrio = F.sum("quantity_products").over(window_neigh)
df_gold = df_silver.withColumn("total_by_neighborhood", total_por_barrio)


In [0]:
# Preview the first ten rows of the gold DataFrame.
display(df_gold.limit(10))

In [0]:
# Load the neighborhoods file and strip the "CORREGIMIENTO DE " prefix from the names.
# NOTE: the pattern contains no regex metacharacters, so regex=True behaves like a
# literal replacement here.
gdf = gpd.read_parquet(path_parquet_neigh) 
gdf["NOMBRE"] = gdf["NOMBRE"].str.replace("CORREGIMIENTO DE ", "", regex=True)
# Display the cleaned name column (last expression of the cell).
gdf["NOMBRE"]

In [0]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load the neighborhood polygons (only the columns needed here).
gdf = gpd.read_parquet(path_parquet_neigh)
gdf = gdf[["NOMBRE", "geometry"]]

# 2. Collapse Gold to one (neighborhood, total) row per neighborhood.
pdf_gold = df_gold.select("neighborhood", "total_by_neighborhood").distinct().toPandas()

# 3. Join on the raw polygon name first, then strip the "CORREGIMIENTO DE "
#    prefix so it only affects the labels drawn on the map, not the join key.
#    The prefix is a literal string, hence regex=False.
gdf_merged = gdf.merge(pdf_gold, left_on="NOMBRE", right_on="neighborhood", how="left")
gdf_merged["NOMBRE"] = gdf_merged["NOMBRE"].str.replace("CORREGIMIENTO DE ", "", regex=False)

# 4. Draw the choropleth. Polygons without a total are drawn in grey; without
#    `missing_kwds` geopandas skips them and their labels float over blank space.
fig, ax = plt.subplots(figsize=(16, 10))
gdf_merged.plot(
    column="total_by_neighborhood",
    cmap="OrRd",
    edgecolor="black",
    legend=True,
    missing_kwds={"color": "lightgrey"},
    ax=ax,
)

# 5. Label each neighborhood at its centroid, skipping missing or empty
#    geometries (an empty geometry has no centroid coordinates).
for _, fila in gdf_merged.iterrows():
    geom = fila["geometry"]
    if geom is None or geom.is_empty:
        continue
    centro = geom.centroid
    ax.text(
        centro.x,
        centro.y,
        fila["NOMBRE"],
        fontsize=6,
        ha="center",
        va="center",
    )

ax.set_title("Total de Productos por Barrio - Medellín", fontsize=15)
ax.set_axis_off()
plt.show()

In [0]:
import os

import folium
import geopandas as gpd
import pandas as pd

# 1. Load the polygons. The previous version read from `base/paths[...]`,
#    names that are never defined in this notebook; `path_parquet_neigh` is the
#    same file the other cells use. An earlier merge built from the previous
#    cell's `gdf` was removed because everything it produced was overwritten here.
gdf = gpd.read_parquet(path_parquet_neigh)[["NOMBRE", "geometry"]]
gdf = gdf.rename(columns={"NOMBRE": "neighborhood"})
gdf["neighborhood"] = gdf["neighborhood"].astype(str)
if gdf.crs is None:
    # to_crs() below fails on geometries without a CRS; the correction step of
    # this notebook also treats this file as EPSG:4326.
    gdf = gdf.set_crs(epsg=4326)

# 2. Read Gold (one row per neighborhood) and normalise the key type.
pdf_gold = df_gold.select("neighborhood", "total_by_neighborhood").distinct().toPandas()
pdf_gold["neighborhood"] = pdf_gold["neighborhood"].astype(str)

# 3. Join and make sure the result is in lat/lon for folium.
gdf_merged = gdf.merge(pdf_gold, on="neighborhood", how="left")
gdf_merged = gdf_merged.to_crs(epsg=4326).reset_index(drop=True)

# 4. Create the base map (centre coordinates and zoom as in the original cell).
m = folium.Map(location=[6.25184, -75.56359], zoom_start=12)

# 5. Choropleth keyed on the `neighborhood` property of the GeoJSON features.
folium.Choropleth(
    geo_data=gdf_merged.__geo_interface__,  # direct access to the GeoJSON mapping
    data=pdf_gold,
    columns=["neighborhood", "total_by_neighborhood"],
    key_on="feature.properties.neighborhood",
    fill_color="YlOrRd",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Total de Productos por Barrio"
).add_to(m)

# 6. One marker with a popup per neighborhood that has a total and a usable geometry.
for _, row in gdf_merged.iterrows():
    geom = row["geometry"]
    if pd.isna(row["total_by_neighborhood"]) or geom is None or geom.is_empty:
        continue
    centro = geom.centroid
    folium.Marker(
        location=[centro.y, centro.x],
        popup=f"{row['neighborhood']}<br>Total: {int(row['total_by_neighborhood'])}",
        icon=folium.Icon(color='blue', icon='info-sign')
    ).add_to(m)

# 7. Save the HTML. The user folder previously read "@gmail/", which does not
#    match the "@gmail.com/" workspace folder used for the input data.
#    NOTE(review): confirm the intended output folder.
ruta_html = "/Workspace/Users/danielale22rojas@gmail.com/medellin-bigdata-poc/mapa/medellin_interactivo.html"
os.makedirs(os.path.dirname(ruta_html), exist_ok=True)
m.save(ruta_html)