In [119]:
import pandas as pd
import numpy as np

from pathlib import Path

In [120]:
# Input locations, relative to the notebook's working directory.
DATA_DIR_ECU911 = Path("../data/processed/ecu911")
DATA_DIR_APRENDIDOS = Path("../data/interim/aprehendidos_detenidos")

# Folder where the assembled training dataset is written at the end.
DATA_DIR_DEST = Path("../data/processed")

df_ecu = pd.read_csv(DATA_DIR_ECU911 / "ecu911_con_coords.csv")

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the saved output of this cell shows a warning raised on this
# call; it is presumably the mixed-types DtypeWarning - confirm on re-run.
df_apr = pd.read_csv(
    DATA_DIR_APRENDIDOS / "aprehendidos_detenidos_limpio.csv",
    low_memory=False,
)

# Quick sanity check of (rows, columns) for both inputs.
print(df_ecu.shape)
print(df_apr.shape)

(1773335, 9)
(61149, 34)


  df_apr = pd.read_csv(DATA_DIR_APRENDIDOS / "aprehendidos_detenidos_limpio.csv")


In [121]:
# Arrests: derive a day-level date (time of day set to midnight) from the
# `datetime` column and store the parish code as whitespace-trimmed text.
# NOTE: astype(str) turns a missing value into literal text rather than
# keeping it missing.
arrest_timestamp = pd.to_datetime(df_apr["datetime"])
df_apr["fecha"] = arrest_timestamp.dt.normalize()
arrest_parish = df_apr["codigo_parroquia"].astype(str)
df_apr["codigo_parroquia"] = arrest_parish.str.strip()

# ECU911 calls: same treatment. The parish code is read from `cod_parroquia`
# and stored under `codigo_parroquia`, the column name used on the arrests side.
call_timestamp = pd.to_datetime(df_ecu["fecha"])
df_ecu["fecha"] = call_timestamp.dt.normalize()
call_parish = df_ecu["cod_parroquia"].astype(str)
df_ecu["codigo_parroquia"] = call_parish.str.strip()

# Upper-case and trim the text columns that are later compared against
# upper-case literals.
for text_col in ("servicio", "subtipo"):
    df_ecu[text_col] = df_ecu[text_col].astype(str).str.upper().str.strip()

In [122]:
# GRID RESOLUTION

GRID_SIZE = 0.01  # ~1km (tunable)


def compute_grid_id(lat, lon, grid_size=GRID_SIZE):
    """Label each coordinate pair with the id of the grid cell that contains it.

    Each coordinate is divided by ``grid_size`` and floored to an integer cell
    index; the id is the text ``"<lat index>_<lon index>"``.

    Parameters
    ----------
    lat, lon
        Coordinate values. The notebook passes pandas Series; presumably these
        are decimal degrees (verify against the source data).
    grid_size
        Side length of a cell, in the same units as ``lat`` / ``lon``.

    Returns
    -------
    The string ids, one per input coordinate pair.
    """
    lat_cell = np.floor(lat / grid_size).astype(int).astype(str)
    lon_cell = np.floor(lon / grid_size).astype(int).astype(str)
    return lat_cell + "_" + lon_cell

# Tag every arrest record with the id of its grid cell, then preview the
# cell id next to the day-level date.
df_apr["grid_id"] = compute_grid_id(lat=df_apr["latitud"], lon=df_apr["longitud"])

df_apr.loc[:, ["grid_id", "fecha"]].head()

Unnamed: 0,grid_id,fecha
0,-372_-7962,2025-02-03
1,-105_-7775,2025-05-16
2,-227_-7988,2025-05-20
3,-328_-7996,2025-04-11
4,-96_-7936,2025-09-18


In [123]:
# Offense categories (upper-case labels) whose arrests are counted as the target.
DELITOS_INTERES = [
    "DELITOS CONTRA EL DERECHO A LA PROPIEDAD",
    "DELITOS POR LA PRODUCCIÓN O TRÁFICO ILÍCITO DE SUSTANCIAS CATALOGADAS SUJETAS A FISCALIZACIÓN",
    "DELITOS CONTRA LA SEGURIDAD PÚBLICA",
    "DELITOS CONTRA LA EFICIENCIA DE LA ADMINISTRACIÓN PÚBLICA",
    "DELITOS DE VIOLENCIA CONTRA LA MUJER O MIEMBROS DEL NÚCLEO FAMILIAR",
]

# Normalize the offense label (text, upper-case, trimmed) so that it can be
# compared with the literals above.
offense_label = df_apr["presunta_infraccion"].astype(str)
df_apr["presunta_infraccion"] = offense_label.str.upper().str.strip()

# Independent copy holding only the arrests whose label is in the list.
is_offense_of_interest = df_apr["presunta_infraccion"].isin(DELITOS_INTERES)
df_apr_del = df_apr[is_offense_of_interest].copy()

In [124]:
# Target variable: number of selected-offense arrest rows per (grid cell, day).
rows_per_cell_day = df_apr_del.groupby(["grid_id", "fecha"]).size()
target = rows_per_cell_day.rename("conteo_delitos").reset_index()

target.head()

Unnamed: 0,grid_id,fecha,conteo_delitos
0,-100_-7781,2025-02-09,1
1,-100_-7781,2025-05-14,1
2,-100_-7781,2025-07-28,2
3,-100_-7781,2025-09-20,1
4,-100_-7782,2025-02-01,3


In [125]:
# Every grid cell that appears in the arrests data (before the offense filter).
all_grids = df_apr["grid_id"].unique()

# Every calendar day between the first and last arrest date, inclusive.
first_day = df_apr["fecha"].min()
last_day = df_apr["fecha"].max()
all_dates = pd.date_range(start=first_day, end=last_day, freq="D")

# Dense panel skeleton: one row for each (grid cell, day) combination.
panel_index = pd.MultiIndex.from_product(
    [all_grids, all_dates], names=["grid_id", "fecha"]
)
base = panel_index.to_frame(index=False)

In [126]:
# Attach the observed counts to the dense panel; (cell, day) pairs that have
# no row in `target` come out of the left join as NaN and are set to 0.
df = pd.merge(base, target, how="left", on=["grid_id", "fecha"])
df = df.fillna({"conteo_delitos": 0})

In [127]:
# Number of arrest rows for each (grid cell, parish code) pair.
cell_parish_counts = (
    df_apr.groupby(["grid_id", "codigo_parroquia"]).size().reset_index(name="n")
)

# Within each cell, put the parish with the most rows first ...
cell_parish_ranked = cell_parish_counts.sort_values(
    ["grid_id", "n"], ascending=[True, False]
)

# ... and keep only that first row, giving one parish code per cell.
grid_to_parroquia = cell_parish_ranked.drop_duplicates("grid_id")[
    ["grid_id", "codigo_parroquia"]
]

# Add the parish code to every panel row of the cell.
df = df.merge(grid_to_parroquia, on="grid_id", how="left")

In [128]:
# Independent copy with only the calls whose service is "SEGURIDAD CIUDADANA".
is_security_call = df_ecu["servicio"] == "SEGURIDAD CIUDADANA"
df_ecu_sec = df_ecu[is_security_call].copy()

# Total number of those calls per (parish, day).
calls_per_parish_day = df_ecu_sec.groupby(["codigo_parroquia", "fecha"]).size()
ecu_tot = calls_per_parish_day.rename("llamadas_totales").reset_index()

# Call subtypes (upper-case labels) assigned to each coarse group.
# NOTE(review): in the saved outputs of this notebook `llamadas_contexto` is 0
# in every displayed row; verify that these labels match the values actually
# present in the `subtipo` column.
SUBTIPOS_VIOLENCIA = [
    "DISPAROS",
    "HOMICIDIO",
    "ASESINATO",
    "MUERTE VIOLENTA",
    "AGRESIÓN FÍSICA",
    "VIOLENCIA INTRAFAMILIAR",
    "SECUESTRO",
    "VIOLACIÓN",
]

SUBTIPOS_PROPIEDAD = [
    "ROBO",
    "ASALTO",
    "HURTO",
    "ROBO DE VEHÍCULO",
    "ROBO DE MOTOCICLETA",
    "INTENTO DE ROBO",
]

SUBTIPOS_CONTEXTO = [
    "PERSONAS SOSPECHOSAS",
    "PRESENCIA DE ARMAS",
    "PORTACIÓN DE ARMAS",
    "INTIMIDACIÓN",
    "EXTORSIÓN",
    "MICROTRÁFICO",
    "TRÁFICO DE DROGAS",
]


def grupo_subtipo(s):
    """Return the coarse group name for a call subtype label.

    The label is checked against the violence, property and context lists, in
    that order, using exact matching; the first list that contains it decides
    the result. Anything not found in any list is labelled "otro".
    """
    ordered_groups = (
        ("violencia", SUBTIPOS_VIOLENCIA),
        ("propiedad", SUBTIPOS_PROPIEDAD),
        ("contexto", SUBTIPOS_CONTEXTO),
    )
    for group_name, members in ordered_groups:
        if s in members:
            return group_name
    return "otro"

# Coarse group of every security call, derived from its subtype label.
df_ecu_sec["grupo"] = df_ecu_sec["subtipo"].map(grupo_subtipo)

# Long format: number of calls per (parish, day, group).
calls_long = (
    df_ecu_sec.groupby(["codigo_parroquia", "fecha", "grupo"])
    .size()
    .reset_index(name="conteo")
)

# Wide format: one column per group, 0 where a group had no calls that day.
# Only groups that actually occur in the data become columns.
calls_wide = calls_long.pivot_table(
    index=["codigo_parroquia", "fecha"],
    columns="grupo",
    values="conteo",
    fill_value=0,
)
ecu_grp = calls_wide.reset_index()

# Give the group columns their final feature names.
feature_names = {
    "violencia": "llamadas_violencia",
    "propiedad": "llamadas_propiedad",
    "contexto": "llamadas_contexto",
    "otro": "llamadas_otro",
}
ecu_grp = ecu_grp.rename(columns=feature_names)


ecu_grp.head(10)


grupo,codigo_parroquia,fecha,llamadas_contexto,llamadas_otro,llamadas_propiedad,llamadas_violencia
0,100150,2025-01-01,0.0,230.0,11.0,1.0
1,100150,2025-01-02,0.0,69.0,4.0,1.0
2,100150,2025-01-03,0.0,57.0,2.0,2.0
3,100150,2025-01-04,0.0,90.0,2.0,1.0
4,100150,2025-01-05,0.0,85.0,4.0,0.0
5,100150,2025-01-06,0.0,98.0,2.0,1.0
6,100150,2025-01-07,0.0,69.0,4.0,0.0
7,100150,2025-01-08,0.0,99.0,1.0,0.0
8,100150,2025-01-09,0.0,80.0,4.0,2.0
9,100150,2025-01-10,0.0,127.0,5.0,0.0


In [129]:
# Attach the parish-level call features to every (cell, day) row through the
# cell's parish code and the day.
parish_day_keys = ["codigo_parroquia", "fecha"]
df = df.merge(ecu_tot, how="left", on=parish_day_keys)
df = df.merge(ecu_grp, how="left", on=parish_day_keys)

cols_ctx = [
    "llamadas_totales",
    "llamadas_violencia",
    "llamadas_propiedad",
    "llamadas_contexto",
    "llamadas_otro",
]

# Rows without a matching (parish, day) come out of the left joins as NaN;
# treat them as zero calls. Feature columns missing from the frame are skipped.
zero_fill = {col: 0 for col in cols_ctx if col in df.columns}
df = df.fillna(zero_fill)

df.head(10)

Unnamed: 0,grid_id,fecha,conteo_delitos,codigo_parroquia,llamadas_totales,llamadas_contexto,llamadas_otro,llamadas_propiedad,llamadas_violencia
0,-372_-7962,2025-01-01,0.0,71150,4.0,0.0,4.0,0.0,0.0
1,-372_-7962,2025-01-02,0.0,71150,4.0,0.0,4.0,0.0,0.0
2,-372_-7962,2025-01-03,0.0,71150,1.0,0.0,1.0,0.0,0.0
3,-372_-7962,2025-01-04,0.0,71150,0.0,0.0,0.0,0.0,0.0
4,-372_-7962,2025-01-05,0.0,71150,4.0,0.0,4.0,0.0,0.0
5,-372_-7962,2025-01-06,0.0,71150,2.0,0.0,2.0,0.0,0.0
6,-372_-7962,2025-01-07,0.0,71150,2.0,0.0,2.0,0.0,0.0
7,-372_-7962,2025-01-08,0.0,71150,1.0,0.0,1.0,0.0,0.0
8,-372_-7962,2025-01-09,0.0,71150,3.0,0.0,3.0,0.0,0.0
9,-372_-7962,2025-01-10,0.0,71150,3.0,0.0,3.0,0.0,0.0


In [130]:
# Order rows chronologically inside each grid cell; the history features
# below depend on this ordering.
df = df.sort_values(["grid_id", "fecha"])

counts_by_cell = df.groupby("grid_id")["conteo_delitos"]

# Count from the previous row of the same cell (NaN on each cell's first row).
df["delitos_lag_1"] = counts_by_cell.shift(1)

# Mean of the 7 rows preceding the current one, computed strictly inside each
# cell's own series. Chaining .rolling() after groupby(...).shift() would roll
# over the flat, concatenated Series; that only avoids mixing neighbouring
# cells because shift(1) leaves a NaN at every cell start and min_periods
# defaults to the window size. Doing both steps per group keeps the feature
# leak-free if the shift, window or min_periods are changed later.
df["delitos_7d"] = counts_by_cell.transform(
    lambda cell_counts: cell_counts.shift(1).rolling(7).mean()
)

# Calendar features: weekday is 0 (Monday) to 6 (Sunday), so 5 and 6 mark
# the weekend.
df["dia_semana"] = df["fecha"].dt.weekday
df["es_fin_semana"] = df["dia_semana"].isin([5, 6]).astype(int)

df.head(15)


Unnamed: 0,grid_id,fecha,conteo_delitos,codigo_parroquia,llamadas_totales,llamadas_contexto,llamadas_otro,llamadas_propiedad,llamadas_violencia,delitos_lag_1,delitos_7d,dia_semana,es_fin_semana
375136,-100_-7774,2025-01-01,0.0,150156,2.0,0.0,2.0,0.0,0.0,,,2,0
375137,-100_-7774,2025-01-02,0.0,150156,1.0,0.0,0.0,0.0,1.0,0.0,,3,0
375138,-100_-7774,2025-01-03,0.0,150156,1.0,0.0,1.0,0.0,0.0,0.0,,4,0
375139,-100_-7774,2025-01-04,0.0,150156,2.0,0.0,2.0,0.0,0.0,0.0,,5,1
375140,-100_-7774,2025-01-05,0.0,150156,2.0,0.0,2.0,0.0,0.0,0.0,,6,1
375141,-100_-7774,2025-01-06,0.0,150156,1.0,0.0,0.0,1.0,0.0,0.0,,0,0
375142,-100_-7774,2025-01-07,0.0,150156,0.0,0.0,0.0,0.0,0.0,0.0,,1,0
375143,-100_-7774,2025-01-08,0.0,150156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0
375144,-100_-7774,2025-01-09,0.0,150156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0
375145,-100_-7774,2025-01-10,0.0,150156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0


In [131]:
# Keep only rows with no missing value in any column (the history features
# are NaN at the start of each cell's series), then renumber the rows from 0.
complete_rows = df.dropna(axis="index", how="any")
df_model = complete_rows.reset_index(drop=True)


In [132]:
# Number of non-missing values in each column of the modelling frame.
df_model.notna().sum()

grid_id               2202849
fecha                 2202849
conteo_delitos        2202849
codigo_parroquia      2202849
llamadas_totales      2202849
llamadas_contexto     2202849
llamadas_otro         2202849
llamadas_propiedad    2202849
llamadas_violencia    2202849
delitos_lag_1         2202849
delitos_7d            2202849
dia_semana            2202849
es_fin_semana         2202849
dtype: int64

In [133]:
# Frequency table of the target: how many (cell, day) rows have each count
# value, most frequent first.
target_frequencies = df_model["conteo_delitos"].value_counts()
count = target_frequencies.reset_index()

print(count)

    conteo_delitos    count
0              0.0  2182623
1              1.0    14492
2              2.0     3817
3              3.0     1109
4              4.0      445
5              5.0      170
6              6.0       97
7              7.0       34
8              8.0       19
9              9.0       12
10            13.0        5
11            10.0        5
12            11.0        4
13            12.0        4
14            15.0        3
15            40.0        1
16            14.0        1
17            26.0        1
18            16.0        1
19            25.0        1
20            18.0        1
21            22.0        1
22            52.0        1
23            20.0        1
24            24.0        1


In [134]:
# Write the modelling frame to the destination folder, leaving out the
# positional row index.
output_path = DATA_DIR_DEST / "dataset_entrenamiento_grid.csv"
df_model.to_csv(output_path, index=False)