In [6]:
# ============================================
# 0) Imports y configuración
# ============================================
import pandas as pd
import numpy as np
import unicodedata
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

DATA_PATH = Path("../data/interim/ecu911/ecu911_limpio.csv")
OUT_DIR   = Path("../data/processed/ecu911/ecu911_clustering")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Rango de K a probar
K_MIN, K_MAX = 3, 12

# Filtrar filas (parroquia-mes) con muy pocos eventos para no meter ruido
MIN_EVENTS_PARR_MES = 5

In [7]:
# ============================================
# 1) Lista de subtipos de interés
#    (IMPORTANTE: en esta lista ponlos SIN tildes)
# ============================================
SUBTIPOS_INTERES = [
    "DISPAROS", "HOMICIDIO", "ASESINATO", "MUERTE VIOLENTA",
    "AGRESION FISICA", "VIOLENCIA INTRAFAMILIAR",
    "SECUESTRO", "VIOLACION",
    "ROBO", "ASALTO", "HURTO",
    "ROBO DE VEHICULO", "ROBO DE MOTOCICLETA",
    "INTENTO DE ROBO",
    "PERSONAS SOSPECHOSAS", "PRESENCIA DE ARMAS",
    "PORTACION DE ARMAS", "INTIMIDACION",
    "EXTORSION", "MICROTRAFICO", "TRAFICO DE DROGAS"
]
SUBTIPOS_SET = set(SUBTIPOS_INTERES)

In [8]:
# ============================================
# 2) Función de normalización de texto (quita tildes, trim, upper)
# ============================================
def norm_txt(s: str) -> str:
    s = str(s).strip().upper()
    s = "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )
    return s

In [9]:
# ============================================
# 3) Cargar + limpieza mínima
# ============================================
df = pd.read_csv(DATA_PATH)

# Fecha
df["fecha"] = pd.to_datetime(df["fecha"], errors="coerce")
df = df.dropna(subset=["fecha", "cod_parroquia"])

# Normalizar texto
for c in ["provincia", "canton", "parroquia", "subtipo", "servicio"]:
    if c in df.columns:
        df[c] = df[c].astype(str).apply(norm_txt)

# Verificación rápida de servicio
# (Si siempre es "SEGURIDAD CIUDADANA", esta columna no aporta)
print("Servicios únicos:", df["servicio"].nunique())
print(df["servicio"].value_counts().head(10))

Servicios únicos: 1
servicio
SEGURIDAD CIUDADANA    1773973
Name: count, dtype: int64


In [10]:
# ============================================
# 4) Filtrar solo subtipos de interés
# ============================================
df_del = df[df["subtipo"].isin(SUBTIPOS_SET)].copy()

print("Filas totales:", len(df), "| Filas con subtipos interés:", len(df_del))
print("Subtipos encontrados:", sorted(df_del["subtipo"].unique()))

Filas totales: 1773973 | Filas con subtipos interés: 73090
Subtipos encontrados: ['AGRESION FISICA', 'ASESINATO', 'DISPAROS', 'EXTORSION', 'HOMICIDIO', 'HURTO', 'ROBO', 'SECUESTRO', 'VIOLACION', 'VIOLENCIA INTRAFAMILIAR']


In [11]:
# ============================================
# 5) Crear variable de mes y la "zona"
# ============================================
zona_cols = ["provincia", "canton", "cod_parroquia", "parroquia"]
df_del["ym"] = df_del["fecha"].dt.to_period("M").astype(str)

In [12]:
# ============================================
# 6) MATRIZ para clustering: filas = (parroquia, mes), columnas = subtipos (conteos)
# ============================================
X_counts = (
    df_del.groupby(zona_cols + ["ym", "subtipo"])
          .size()
          .unstack(fill_value=0)
)

# Total de eventos por fila (parroquia-mes)
row_total = X_counts.sum(axis=1)
X_counts = X_counts[row_total >= MIN_EVENTS_PARR_MES].copy()

print("Filas (parroquia-mes) después de MIN_EVENTS:", X_counts.shape[0])
print("Columnas (subtipos):", X_counts.shape[1])

# (Opcional) agregar feature de intensidad total
X = X_counts.copy()
X["TOTAL_EVENTOS"] = X_counts.sum(axis=1)

Filas (parroquia-mes) después de MIN_EVENTS: 1636
Columnas (subtipos): 10


In [13]:
# ============================================
# 7) Transformación + escalado
# ============================================
X_log = np.log1p(X)  # reduce outliers
X_scaled = StandardScaler().fit_transform(X_log)

In [14]:
# ============================================
# 8) Elegir K (prueba rápida)
# ============================================
scores = []
for k in range(K_MIN, K_MAX + 1):
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = km.fit_predict(X_scaled)
    sil = silhouette_score(X_scaled, labels)
    dbi = davies_bouldin_score(X_scaled, labels)
    scores.append((k, sil, dbi))

df_scores = pd.DataFrame(scores, columns=["k", "silhouette", "davies_bouldin"]) \
              .sort_values("silhouette", ascending=False)

print(df_scores)

# Escoge el mejor por silhouette (simple)
best_k = int(df_scores.iloc[0]["k"])
print("Best K:", best_k)

    k  silhouette  davies_bouldin
0   3    0.370559        1.372461
1   4    0.212048        1.600948
5   8    0.168899        1.599587
3   6    0.164726        1.639879
4   7    0.161039        1.643354
6   9    0.158573        1.607916
8  11    0.153753        1.570093
7  10    0.151197        1.605678
2   5    0.146650        1.691067
9  12    0.138785        1.612914
Best K: 3


In [15]:
# ============================================
# 9) Entrenar modelo final
# ============================================
model = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
labels = model.fit_predict(X_scaled)

In [16]:
# ============================================
# 10) Resultados por (parroquia, mes)
# ============================================
res = X.reset_index()[zona_cols + ["ym"]].copy()
res["cluster_id"] = labels

# Métrica de volumen para contexto
res["total_eventos"] = X["TOTAL_EVENTOS"].values

# Guardar asignaciones
res.to_csv(OUT_DIR / "parroquia_mes_clusterizadas.csv", index=False)

In [17]:
# ============================================
# 11) Interpretación: Top subtipos por cluster
# ============================================
subtipo_cols = [c for c in X.columns if c != "TOTAL_EVENTOS"]

tmp = X.reset_index()[zona_cols + ["ym"] + subtipo_cols + ["TOTAL_EVENTOS"]].copy()
tmp["cluster_id"] = labels

# Promedio por cluster
cluster_means = tmp.groupby("cluster_id")[subtipo_cols].mean()

def top_subtipos(row, n=5):
    return list(row.sort_values(ascending=False).head(n).index)

top5 = cluster_means.apply(lambda r: top_subtipos(r, n=5), axis=1) \
                    .rename("top5_subtipos") \
                    .reset_index()

resumen = (
    tmp.groupby("cluster_id")
       .agg(
           filas_parroquia_mes=("TOTAL_EVENTOS", "size"),
           total_eventos_prom=("TOTAL_EVENTOS", "mean"),
           total_eventos_med=("TOTAL_EVENTOS", "median"),
       )
       .reset_index()
       .merge(top5, on="cluster_id", how="left")
)

resumen.to_csv(OUT_DIR / "resumen_clusters.csv", index=False)

print("Guardado en:", OUT_DIR)
print(resumen)

Guardado en: ..\data\processed\ecu911\ecu911_clustering
   cluster_id  filas_parroquia_mes  total_eventos_prom  total_eventos_med  \
0           0                   60          527.366667              170.5   
1           1                 1250           10.831200                9.0   
2           2                  326           66.766871               52.0   

                                       top5_subtipos  
0  [ROBO, VIOLENCIA INTRAFAMILIAR, EXTORSION, HUR...  
1  [ROBO, VIOLENCIA INTRAFAMILIAR, HURTO, AGRESIO...  
2  [HURTO, ROBO, VIOLENCIA INTRAFAMILIAR, AGRESIO...  


In [18]:
res.merge(tmp[zona_cols+["ym","cluster_id","TOTAL_EVENTOS"]], on=zona_cols+["ym","cluster_id"], how="left") \
   .query("cluster_id == 0") \
   .sort_values("TOTAL_EVENTOS", ascending=False) \
   .tail(20)


subtipo,provincia,canton,cod_parroquia,parroquia,ym,cluster_id,total_eventos,TOTAL_EVENTOS
253,EL ORO,MACHALA,70150,"MACHALA, CABECERA CANTONAL Y CAPITAL PROVINCIAL",2025-10,0,139,139
1478,SANTO DOMINGO DE LOS TSACHILAS,SANTO DOMINGO,230150,"SANTO DOMINGO DE LOS COLORADOS, CABECERA CANTO...",2025-01,0,137,137
318,ESMERALDAS,ESMERALDAS,80150,"ESMERALDAS, CABECERA CANTONAL Y CAPITAL PROVIN...",2025-10,0,136,136
696,LOS RIOS,QUEVEDO,120550,"QUEVEDO, CABECERA CANTONAL",2025-10,0,127,127
688,LOS RIOS,QUEVEDO,120550,"QUEVEDO, CABECERA CANTONAL",2025-02,0,120,120
317,ESMERALDAS,ESMERALDAS,80150,"ESMERALDAS, CABECERA CANTONAL Y CAPITAL PROVIN...",2025-09,0,117,117
313,ESMERALDAS,ESMERALDAS,80150,"ESMERALDAS, CABECERA CANTONAL Y CAPITAL PROVIN...",2025-05,0,111,111
309,ESMERALDAS,ESMERALDAS,80150,"ESMERALDAS, CABECERA CANTONAL Y CAPITAL PROVIN...",2025-01,0,107,107
314,ESMERALDAS,ESMERALDAS,80150,"ESMERALDAS, CABECERA CANTONAL Y CAPITAL PROVIN...",2025-06,0,103,103
315,ESMERALDAS,ESMERALDAS,80150,"ESMERALDAS, CABECERA CANTONAL Y CAPITAL PROVIN...",2025-07,0,101,101


In [19]:
import joblib

joblib.dump(model, '../models/trained/modelo_ecu911_clustering.pkl')

['../models/trained/modelo_ecu911_clustering.pkl']