## IMPORTAR DATOS

In [None]:
import pandas as pd
# Switch between the Colab layout (CSV next to the notebook) and the local repository layout.
colab = True
csv_path = 'rome_clean.csv' if colab else 'data/processed/rome_clean.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the 'dis' column, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
df['dis'].hist(bins=50, edgecolor='black', ax=ax)
ax.set_title('Histogram of Distances (dis)')
ax.set_xlabel('Distance')
ax.set_ylabel('Frequency')
ax.grid(False)
plt.show()

In [None]:
# Independent copy of the origin/destination coordinates plus the idS and dis columns,
# so the columns added below do not write back into df.
coords_df = df[['lonO', 'latO', 'lonD', 'latD', 'idS', 'dis']].copy()

def categorize_distance(dis):
    """Label a trip distance as 'corto', 'medio' or 'largo'.

    Values below 2500 are 'corto', values from 2500 up to (but excluding)
    5000 are 'medio', and everything else is 'largo'.
    """
    if dis < 2500:
        return 'corto'
    return 'medio' if dis < 5000 else 'largo'

def wait(dis):
    """Return the wait value associated with a distance category.

    Despite its name, ``dis`` is the category label: 'corto' gives 300,
    'medio' gives 420 and any other value gives 600.
    """
    if dis == 'corto':
        return 300
    return 420 if dis == 'medio' else 600

# Tag each trip with its distance category, then with the wait value of that category.
coords_df['type'] = coords_df['dis'].map(categorize_distance)
coords_df['wait'] = coords_df['type'].map(wait)
coords_df.head()

In [None]:
# Take independent copies: later cells add cluster_<k> columns to these frames, and
# writing into a bare slice of coords_df raises pandas' SettingWithCopyWarning.
coords_origin_df = coords_df[['lonO', 'latO', 'idS', 'type', 'wait']].copy()
coords_destination_df = coords_df[['lonD', 'latD', 'idS', 'type', 'wait']].copy()

In [None]:
coords_origin_df.head()

In [None]:
coords_destination_df.head()

x Aplicamos HDBSCAN por separado sobre los puntos de origen y de destino (al final no utilizamos esta técnica porque no nos interesa tanto la densidad de los puntos sino la distancia a los centroides)

x Aplicamos Fuzzy C-Means, que agrupa según la distancia a los centroides con un grado de pertenencia (tarda mucho en ejecutar y queremos probar varios valores para encontrar el mejor k)

Aplicamos K-Means porque es más rápido y queremos encontrar el mejor valor de k

## TÉCNICAS PARA LA CORRECTA SELECCIÓN DE K (K-MEANS)

In [None]:
from sklearn.cluster import KMeans
from tqdm import tqdm

def clusterizar_coordenadas(df, coords_cols, k_values=[100,150,200,250,300,350,400], random_state=42):
    """
    Cluster a DataFrame with KMeans for several values of k, showing a progress bar.

    Args:
        df (pd.DataFrame): DataFrame holding the coordinates. It is not modified.
        coords_cols (list): Names of the coordinate columns, e.g. ['lat', 'lon'].
        k_values (list): Values of k to try. The default list is only iterated,
            never mutated.
        random_state (int): Seed passed to KMeans for reproducibility.

    Returns:
        pd.DataFrame: Copy of ``df`` with one extra ``cluster_<k>`` column per value of k.
    """
    # Work on a copy so the caller's frame is left untouched and so that a frame that
    # is itself a slice of another one does not raise SettingWithCopyWarning.
    clustered = df.copy()
    # Snapshot of the features taken before any cluster_<k> column is added.
    features = clustered[coords_cols]

    for k in tqdm(k_values, desc="Clustering"):
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        clustered[f'cluster_{k}'] = kmeans.fit_predict(features)
    return clustered

In [None]:
# Candidate k values 150, 160, ..., 240 (range() excludes the upper bound 250).
k_values = list(range(150, 250, 10))
# Adds one cluster_<k> column per candidate k to the origin points.
coords_origin_df = clusterizar_coordenadas(coords_origin_df, coords_cols=['latO','lonO'], k_values=k_values)

In [None]:
coords_origin_df.head()

Ahora vamos a calcular el número de viajes que se 'descartarían' para cada valor de k. Esto depende de la distancia del viaje que se va a realizar.

In [None]:
import numpy as np

def haversine_dist(lat1, lon1, lat2, lon2):
    """
    Haversine distance in metres between (lat1, lon1) and (lat2, lon2).

    Coordinates are given in degrees; the computation uses numpy functions
    on the arguments as passed.
    """
    earth_radius_m = 6371000  # Earth radius in metres
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2

    hav = np.sin(half_dlat)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_m * central_angle

def viajes_fuera_de_radio(df, coords_cols=['latO','lonO'], k_values=[100,150,200,250,300,350,400], velocidad_kmh=5):
    """
    Count, for each k, the points lying farther from their cluster centroid than
    the distance that can be covered at ``velocidad_kmh`` during the row's ``wait``.

    Args:
        df (pd.DataFrame): Frame with the coordinate columns, a ``wait`` column and
            one ``cluster_<k>`` column per value in ``k_values``.
        coords_cols (list): [latitude column, longitude column]; only the first two
            entries are used.
        k_values (list): Values of k whose ``cluster_<k>`` column is evaluated.
        velocidad_kmh (float): Speed in km/h, converted to m/s internally.

    Returns:
        dict: {k: number of rows whose distance to the centroid exceeds the limit}.
    """
    lat_col, lon_col = coords_cols[0], coords_cols[1]
    lat = df[lat_col].to_numpy()
    lon = df[lon_col].to_numpy()

    # Per-row limit: speed in m/s times the row's wait value. It does not depend on k.
    max_dist = (velocidad_kmh * 1000 / 3600) * df['wait'].to_numpy()

    resultados = {}
    for k in k_values:
        # transform('mean') broadcasts each cluster's centroid back onto its member rows,
        # which replaces the per-row centroid lookup with a single vectorised pass.
        centroides = df.groupby(f'cluster_{k}')[[lat_col, lon_col]].transform('mean')
        distancia_m = haversine_dist(
            lat, lon,
            centroides[lat_col].to_numpy(), centroides[lon_col].to_numpy()
        )
        resultados[k] = int((distancia_m > max_dist).sum())

    return resultados


In [None]:
coords_origin_results = viajes_fuera_de_radio(coords_origin_df, k_values=k_values)
coords_origin_results

Vamos a hacer una gráfica para comparar el valor de silhouette score con los viajes descartados en función del valor de número de clústers

In [None]:
import plotly.graph_objects as go

def plot_silhouette_vs_viajes(k_values, silhouette_scores, viajes_fuera):
    """
    Plot the silhouette score and the number of out-of-radius trips against k.

    Both series share the x axis; the silhouette score uses the left y axis and
    the trip count the right one.

    Args:
        k_values (list): Values of k to plot, in plotting order.
        silhouette_scores (dict): {k: silhouette score}. It is looked up by k, so it
            must be keyed by the values in ``k_values`` (a plain list will not work).
        viajes_fuera (dict): {k: number of out-of-radius trips}.
    """
    # Bring both mappings into the order of k_values without rebinding the arguments.
    viajes = [viajes_fuera[k] for k in k_values]
    scores = [silhouette_scores[k] for k in k_values]

    fig = go.Figure()

    # Silhouette score line (left axis)
    fig.add_trace(go.Scatter(
        x=k_values,
        y=scores,
        name='Silhouette Score',
        mode='lines+markers',
        yaxis='y1'
    ))

    # Out-of-radius trips line (right axis)
    fig.add_trace(go.Scatter(
        x=k_values,
        y=viajes,
        name='Viajes fuera de radio',
        mode='lines+markers',
        yaxis='y2'
    ))

    # Axis configuration: the second y axis overlays the first and sits on the right
    fig.update_layout(
        title='Silhouette Score vs Viajes fuera de radio',
        xaxis=dict(title='Número de clusters k'),
        yaxis=dict(title='Silhouette Score', side='left'),
        yaxis2=dict(title='Viajes fuera de radio', overlaying='y', side='right'),
        legend=dict(x=0.1, y=1.1, orientation='h')
    )

    fig.show()

In [None]:
from sklearn.metrics import silhouette_score

def calcular_silhouette(df, coords_cols=['latO','lonO'], k_values=[100,150,200,250,300,350,400]):
    """
    Compute the silhouette score of each clustering stored in a ``cluster_<k>`` column.

    Args:
        df (pd.DataFrame): Frame with the coordinates and the ``cluster_<k>`` columns.
        coords_cols (list): Coordinate columns used as features.
        k_values (list): Values of k to evaluate.

    Returns:
        dict: {k: silhouette score}; 0 is stored when the labels hold a single cluster.
    """
    points = df[coords_cols].values

    def _score_for(k):
        labels = df[f'cluster_{k}'].values
        # The silhouette score needs at least two distinct clusters.
        if len(set(labels)) <= 1:
            return 0
        return silhouette_score(points, labels)

    return {k: _score_for(k) for k in k_values}


In [None]:
sil_scores = calcular_silhouette(coords_origin_df, coords_cols=['latO','lonO'], k_values=k_values)

In [None]:
plot_silhouette_vs_viajes(k_values, sil_scores, coords_origin_results)

Después de este análisis sobre los viajes de origen, concluimos que el k de las estaciones debe ser 200 (SS=0.4836977, viajes descartados=151)

In [None]:
coords_origin_df = coords_origin_df[['lonO', 'latO', 'idS', 'type', 'cluster_200']]
coords_origin_df.head()

In [None]:
import folium
import webbrowser
import tempfile
import pandas as pd

cluster_col = 'cluster_200'
coords_cols = ['latO', 'lonO']

# Centroid of each cluster: mean coordinate of its member points
centroides = coords_origin_df.groupby(cluster_col)[coords_cols].mean().reset_index()

# Map centred on the mean of all origin points
mapa = folium.Map(location=[coords_origin_df['latO'].mean(), coords_origin_df['lonO'].mean()], zoom_start=13)

# Palette is shorter than the number of clusters, so colours repeat modulo its length
colores = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightblue', 'cadetblue', 'darkgreen', 'pink']

# Centroid markers. Columns are iterated directly (instead of iterrows) so the cluster
# label keeps its integer type: the popup reads "Cluster 5" rather than "Cluster 5.0",
# and the colour is derived from the label exactly as it is for the points below.
for label, lat, lon in zip(centroides[cluster_col], centroides['latO'], centroides['lonO']):
    folium.Marker(
        location=[lat, lon],
        popup=f"Cluster {int(label)}",
        icon=folium.Icon(color=colores[int(label) % len(colores)], icon='star')
    ).add_to(mapa)

# Member points, coloured like their cluster
for lat, lon, label in zip(coords_origin_df['latO'], coords_origin_df['lonO'], coords_origin_df[cluster_col]):
    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color=colores[int(label) % len(colores)],
        fill=True,
        fill_opacity=0.6
    ).add_to(mapa)

mapa


Ahora vamos a repetir el mismo proceso pero para los puntos de destino

In [None]:
k_values = list(range(180, 240, 10))
coords_destination_df = clusterizar_coordenadas(coords_destination_df, coords_cols=['latD','lonD'], k_values=k_values)

En este caso, los viajes fuera de radio los interpretamos de manera diferente. No son lo que andaría una persona para coger la bici, sino lo que andaría tras dejarla en la estación. Los valores de distancia los dejaremos iguales.

In [None]:
coords_destination_results = viajes_fuera_de_radio(coords_destination_df, coords_cols=['latD', 'lonD'], k_values=k_values)
coords_destination_results

In [None]:
sil_scores = calcular_silhouette(coords_destination_df, coords_cols=['latD','lonD'], k_values=k_values)

In [None]:
plot_silhouette_vs_viajes(k_values, sil_scores, coords_destination_results)

En este caso elegimos k = 200 (SS=0.4375309, Viajes descartados=168)

In [None]:
coords_destination_df = coords_destination_df[['lonD', 'latD', 'idS', 'type', 'cluster_200']]
coords_destination_df.head()

In [None]:
import folium
import webbrowser
import tempfile
import pandas as pd

cluster_col = 'cluster_200'
coords_cols = ['latD', 'lonD']

# Centroid of each cluster: mean coordinate of its member points
centroides = coords_destination_df.groupby(cluster_col)[coords_cols].mean().reset_index()

# Map centred on the mean of all destination points
mapa = folium.Map(location=[coords_destination_df['latD'].mean(), coords_destination_df['lonD'].mean()], zoom_start=13)

# Palette is shorter than the number of clusters, so colours repeat modulo its length
colores = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightblue', 'cadetblue', 'darkgreen', 'pink']

# Centroid markers. Columns are iterated directly (instead of iterrows) so the cluster
# label keeps its integer type: the popup reads "Cluster 5" rather than "Cluster 5.0",
# and the colour is derived from the label exactly as it is for the points below.
for label, lat, lon in zip(centroides[cluster_col], centroides['latD'], centroides['lonD']):
    folium.Marker(
        location=[lat, lon],
        popup=f"Cluster {int(label)}",
        icon=folium.Icon(color=colores[int(label) % len(colores)], icon='star')
    ).add_to(mapa)

# Member points, coloured like their cluster
for lat, lon, label in zip(coords_destination_df['latD'], coords_destination_df['lonD'], coords_destination_df[cluster_col]):
    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color=colores[int(label) % len(colores)],
        fill=True,
        fill_opacity=0.6
    ).add_to(mapa)

mapa

## CLUSTERIZACIÓN FINAL

Ahora tenemos que agrupar las estaciones de origen y de destino en unas estaciones globales.

In [None]:
from sklearn.cluster import KMeans
# Rebuild the per-endpoint tables from coords_df and cluster each with k=200.
# .copy() is required because a new column is written into each frame right away;
# assigning into a bare slice of coords_df raises SettingWithCopyWarning.
coords_origin_df = coords_df[['lonO', 'latO', 'idS', 'type', 'wait']].copy()
kmeans = KMeans(n_clusters=200, random_state=42)
coords_origin_df['clusterO'] = kmeans.fit_predict(coords_origin_df[['latO', 'lonO']])

coords_destination_df = coords_df[['lonD', 'latD', 'idS', 'type', 'wait']].copy()
kmeans = KMeans(n_clusters=200, random_state=42)
coords_destination_df['clusterD'] = kmeans.fit_predict(coords_destination_df[['latD', 'lonD']])

In [None]:
coords_origin_df

In [None]:
# Mean coordinate per cluster, with the cluster label kept as a regular column.
origin_centroids = coords_origin_df.groupby('clusterO', as_index=False)[['latO','lonO']].mean()
destination_centroids = coords_destination_df.groupby('clusterD', as_index=False)[['latD','lonD']].mean()

In [None]:
import folium
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Example: if coords_origin_df and coords_destination_df already exist
# Compute cluster centroids for plotting


# Base map centred on the mean of the origin points
map_center = [coords_origin_df['latO'].mean(), coords_origin_df['lonO'].mean()]
m = folium.Map(location=map_center, zoom_start=13)

# Origin centroids: a marker plus a circle of radius 150 around each one
for centroid in origin_centroids.itertuples(index=False):
    folium.Marker(
        location=(centroid.latO, centroid.lonO),
        popup=f"Cluster Origin {int(centroid.clusterO)}",
        icon=folium.Icon(color='blue', icon='home')
    ).add_to(m)
    folium.Circle(
        location=[centroid.latO, centroid.lonO],
        radius=150,
        color='green',
        fill=True,
        fill_opacity=0.15
    ).add_to(m)

# Destination centroids: marker only
for centroid in destination_centroids.itertuples(index=False):
    folium.Marker(
        location=(centroid.latD, centroid.lonD),
        popup=f"Cluster Destination {int(centroid.clusterD)}",
        icon=folium.Icon(color='red', icon='flag')
    ).add_to(m)

m


Observamos que hay muchas estaciones cuyos centroides prácticamente coinciden.

In [None]:
!pip install haversine

In [None]:
import pandas as pd
from haversine import haversine, Unit

RADIUS = 200  # metres

fused_rows = []

# =========================================================
# Next free id for isolated destinations
# =========================================================
# Fused stations reuse the origin cluster ids, so new ids only have to be larger
# than the largest origin id (destination ids are never used as station ids).
max_origin_id = int(origin_centroids["clusterO"].max())
next_new_id = max_origin_id + 1


# =========================================================
# 1) CIRCLES AROUND THE ORIGINS
# =========================================================
# itertuples keeps each column's dtype; iterrows would upcast the integer cluster
# ids to float and store them as 0.0, 1.0, ... in cluster_id / old_id.
for origin in origin_centroids.itertuples(index=False):
    o_id = int(origin.clusterO)
    o_lat, o_lon = origin.latO, origin.lonO

    # Destination centroids lying within RADIUS metres of this origin centroid
    nearby_dests = []
    nearby_dests_id = []
    for dest in destination_centroids.itertuples(index=False):
        dist_m = haversine((o_lat, o_lon), (dest.latD, dest.lonD), unit=Unit.METERS)
        if dist_m <= RADIUS:
            nearby_dests.append((dest.latD, dest.lonD))
            nearby_dests_id.append(int(dest.clusterD))

    # Fused position: plain mean of the origin and its nearby destinations.
    # With no nearby destination this is the origin position itself.
    all_lats = [o_lat] + [lat for lat, _ in nearby_dests]
    all_lons = [o_lon] + [lon for _, lon in nearby_dests]

    fused_rows.append({
        "origin": True,
        "cluster_id": o_id,
        "lat": sum(all_lats) / len(all_lats),
        "lon": sum(all_lons) / len(all_lons),
        "num_destinations": len(nearby_dests),
        "old_id": nearby_dests_id
    })


# =========================================================
# 2) CIRCLES AROUND THE DESTINATIONS
# =========================================================
for dest in destination_centroids.itertuples(index=False):
    d_lat, d_lon = dest.latD, dest.lonD

    found_origin = any(
        haversine((d_lat, d_lon), (origin.latO, origin.lonO), unit=Unit.METERS) <= RADIUS
        for origin in origin_centroids.itertuples(index=False)
    )

    # No origin within the radius -> keep the destination as its own station with a new id
    if not found_origin:
        fused_rows.append({
            "origin": False,
            "cluster_id": next_new_id,
            "lat": d_lat,
            "lon": d_lon,
            "num_destinations": 0,
            # NOTE(review): scalar here but a list for origin rows; kept as in the
            # original in case later cells depend on it.
            "old_id": int(dest.clusterD)
        })
        next_new_id += 1  # advance the counter for the next isolated destination


# =========================================================
# 3) FINAL DATAFRAME
# =========================================================
fused_centroids = pd.DataFrame(fused_rows)


In [None]:
fused_centroids.shape

In [None]:
fused_centroids[fused_centroids["origin"]==True]

A continuación visualizamos cómo quedan los clusters fusionados.

In [None]:
import folium
from folium.plugins import MarkerCluster

# =========================================================
# CONFIGURACIÓN
# =========================================================
colores = ['red', 'blue', 'green', 'purple', 'orange',
           'darkred', 'lightblue', 'cadetblue', 'darkgreen', 'pink']

# Colour per fused cluster, assigned by row position; colours repeat once the palette runs out
color_map = {
    cluster_id: colores[idx % len(colores)]
    for idx, cluster_id in enumerate(fused_centroids["cluster_id"])
}

# =========================================================
# CREATE MAP
# =========================================================
m = folium.Map(location=[41.90, 12.48], zoom_start=12)

# Layer holding one marker per fused centroid
fused_layer = folium.FeatureGroup(name="Fused Centroids").add_to(m)

# Layer holding the circles around isolated destinations. The circle drawn below has
# radius=200, so the layer name now says 200m instead of the stale "50m".
dest_radius_layer = folium.FeatureGroup(name="Destinations 200m Radius").add_to(m)


# =========================================================
# DRAW FUSED CENTROIDS
# =========================================================
for _, row in fused_centroids.iterrows():
    cid = row["cluster_id"]
    lat = row["lat"]
    lon = row["lon"]
    num_dest = row["num_destinations"]
    is_origin = row["origin"]  # True = fused origin, False = isolated destination

    # --- MARKER ---
    folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(
            icon="bolt",
            prefix="fa",
            color=color_map[cid]
        ),
        popup=(
            f"<b>Cluster {cid}</b><br>"
            f"{'Origen fusionado' if is_origin else 'Destino aislado'}<br>"
            f"Destinos fusionados: {num_dest}<br>"
            f"lat: {lat:.5f}<br>"
            f"lon: {lon:.5f}"
        )
    ).add_to(fused_layer)

    # --- CIRCLE AROUND DESTINATIONS THAT WERE KEPT AS THEIR OWN STATION ---
    if not is_origin:
        folium.Circle(
            location=[lat, lon],
            radius=200,
            color=color_map[cid],
            fill=True,
            fill_opacity=0.15
        ).add_to(dest_radius_layer)


# =========================================================
# LAYERS
# =========================================================
folium.LayerControl().add_to(m)

# Display
m


In [None]:
# Give both endpoint tables the same lat/lon column names and tag each row with its role.
# The two names are rebound to the renamed frames because later cells read their lat/lon columns.
coords_origin_df = (
    coords_origin_df
    .assign(point_type='origin')
    .rename(columns={'latO': 'lat', 'lonO': 'lon'})
)
coords_destination_df = (
    coords_destination_df
    .rename(columns={'latD': 'lat', 'lonD': 'lon'})
    .assign(point_type='destination')
)
final_clusters = pd.concat([coords_origin_df, coords_destination_df])
final_clusters.head()

In [None]:
! pip install shapely osmnx networkx

Mover los centroides fusionados según las restricciones urbanísticas de la ciudad de Roma

In [None]:
import json
import osmnx as ox
import networkx as nx
from shapely.geometry import Point, Polygon, MultiPolygon
from shapely.ops import nearest_points
import pandas as pd

# =========================================================
# 1. LOAD AND PREPARE RESTRICTED ZONES
# =========================================================

# Load the restricted-zone definitions. The context manager closes the file handle,
# which the previous json.load(open(...)) left open.
with open("restricted_zones.json", encoding="utf-8") as zones_file:
    json_data = json.load(zones_file)

# The JSON coordinates are [Lon, Lat] (GeoJSON order); each ring becomes a Shapely Polygon.
polygons_list = []
for poly_coords in json_data["restricted_polygons"]:
    # Close the ring in place when the first and last points differ
    if poly_coords[0] != poly_coords[-1]:
        poly_coords.append(poly_coords[0])
    polygons_list.append(Polygon(poly_coords))

# Single MultiPolygon so one containment check covers every zone
restricted_area = MultiPolygon(polygons_list)

# =========================================================
# 2. DOWNLOAD STREET NETWORK (The "Reality" Layer)
# =========================================================
print("Downloading street network from OpenStreetMap...")

# The download is centred on the mean position of the fused centroids.
center_lat = fused_centroids["lat"].mean()
center_lon = fused_centroids["lon"].mean()

# Fetch the 'walk' network around that centre with dist=8000
# (metres according to the OSMnx documentation; adjust if the area is larger).
G = ox.graph_from_point((center_lat, center_lon), dist=8000, network_type='walk')

# Node table of the downloaded graph.
# NOTE(review): gdf_nodes is not referenced again in the visible part of the notebook;
# the snapping step queries G directly.
gdf_nodes = ox.graph_to_gdfs(G, edges=False)

# =========================================================
# 3. LOGIC TO MOVE CENTROIDS
# =========================================================

def correct_centroid_location(row):
    """Move a centroid out of the restricted zones and snap it to the street network.

    Args:
        row: Row with 'lat' and 'lon' entries.

    Returns:
        pd.Series: [new_lat, new_lon, status], where status is "Moved out of Zone"
        when the centroid started inside ``restricted_area`` and "Zone OK" otherwise.
    """
    # Shapely points are (x, y) = (lon, lat)
    current_point = Point(row['lon'], row['lat'])

    # 1. RESTRICTED ZONES
    if restricted_area.contains(current_point):
        # nearest_points returns one point per input geometry, in input order.
        # Measuring against the area itself gives back the original point (it lies
        # inside, so the distance is zero); the boundary has to be used instead,
        # and the point wanted is the second one, the one on that boundary.
        _, boundary_point = nearest_points(current_point, restricted_area.boundary)
        current_point = boundary_point
        status = "Moved out of Zone"
    else:
        status = "Zone OK"

    # 2. SNAP TO THE STREET NETWORK
    # NOTE(review): the nearest node to a boundary point may itself lie inside the
    # restricted area; nothing here re-checks that.
    nearest_node_id = ox.distance.nearest_nodes(G, current_point.x, current_point.y)
    node_data = G.nodes[nearest_node_id]

    # Graph nodes store longitude in 'x' and latitude in 'y'
    return pd.Series([node_data['y'], node_data['x'], status])

# =========================================================
# 4. APPLY TO DATAFRAME
# =========================================================
print("Adjusting centroids...")

# Overwrite lat/lon with the corrected position and record the outcome of each row.
# NOTE(review): fused_centroids is rewritten in place, so re-running this cell
# adjusts coordinates that were already adjusted.
fused_centroids[['lat', 'lon', 'adjustment_status']] = fused_centroids.apply(correct_centroid_location, axis=1)

print("Done! Centroids moved out of zones and snapped to nearest streets.")
fused_centroids.head()

Asignación de los puntos (tanto de origen como de destino) a su cluster más cercano: como los centroides se han movido, ahora puede ser conveniente asignar algunos puntos a otro cluster

In [None]:
from sklearn.metrics.pairwise import haversine_distances
import numpy as np

# 1. scikit-learn's haversine_distances expects [latitude, longitude] in radians
points_rad = np.radians(final_clusters[['lat', 'lon']].to_numpy())
centroids_rad = np.radians(fused_centroids[['lat', 'lon']].to_numpy())

# 2. Pairwise distances in radians, shape (n_points, n_centroids)
dist_matrix = haversine_distances(points_rad, centroids_rad)

# 3. Position of the closest centroid for every point; the unit is irrelevant
#    when only the minimum is needed
closest_centroid_indices = dist_matrix.argmin(axis=1)

# 4. Translate centroid positions into their actual cluster ids
centroid_ids = fused_centroids["cluster_id"].to_numpy()
final_clusters['clusterFinal'] = centroid_ids[closest_centroid_indices]

final_clusters.head()

In [None]:
max(final_clusters['clusterFinal'])

In [None]:
fused_centroids

In [None]:
import folium

# =========================================================
# CONFIGURACIÓN DE COLORES
# =========================================================
colores = ['red', 'blue', 'green', 'purple', 'orange',
           'darkred', 'lightblue', 'cadetblue', 'darkgreen', 'pink']

# Colour per cluster id, derived from the row's index label so the palette repeats
# when there are more clusters than colours
color_map = {
    cluster_id: colores[idx % len(colores)]
    for idx, cluster_id in fused_centroids["cluster_id"].items()
}

# =========================================================
# CREATE MAP
# =========================================================
# Centred on the mean position of the centroids
center_lat = fused_centroids["lat"].mean()
center_lon = fused_centroids["lon"].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=12)

# Layers
fused_layer = folium.FeatureGroup(name="Fused Centroids").add_to(m)
points_layer = folium.FeatureGroup(name="Associated Points").add_to(m)

# =========================================================
# DRAW CENTROIDS AND THEIR ASSOCIATED POINTS
# =========================================================
for _, centroid in fused_centroids.iterrows():
    cid = centroid["cluster_id"]
    lat_c, lon_c = centroid["lat"], centroid["lon"]
    is_origin = centroid["origin"]
    num_dest = centroid["num_destinations"]

    # Unknown ids fall back to gray
    color = color_map.get(cid, 'gray')

    # 1. Centroid marker
    centroid_popup = (
        f"<b>Cluster ID: {cid}</b><br>"
        f"Tipo: {'Origen' if is_origin else 'Destino'}<br>"
        f"Lat: {lat_c:.5f}<br>"
        f"Lon: {lon_c:.5f}<br>"
        f"Num Destinos Agrupados: {num_dest}"
    )
    folium.Marker(
        location=[lat_c, lon_c],
        icon=folium.Icon(icon="bolt", prefix="fa", color=color),
        popup=centroid_popup
    ).add_to(fused_layer)

    # 2. Points of final_clusters assigned to this centroid, in the centroid's colour
    subset = final_clusters[final_clusters["clusterFinal"] == cid]
    for p_lat, p_lon, p_type in zip(subset["lat"], subset["lon"], subset["point_type"]):
        folium.CircleMarker(
            location=[p_lat, p_lon],
            radius=3,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
            popup=f"Tipo: {p_type}<br>Cluster Final: {cid}"
        ).add_to(points_layer)

# =========================================================
# LAYER CONTROL
# =========================================================
folium.LayerControl().add_to(m)

m

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# 1) Number of rows of final_clusters assigned to each clusterFinal
cluster_sizes = final_clusters["clusterFinal"].value_counts()

# 2) Bin edges every 50, covering the largest cluster
bins = np.arange(0, cluster_sizes.max() + 50, 50)

# 3) Histogram drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(12,6))
ax.hist(cluster_sizes, bins=bins, edgecolor='black')
ax.set_xlabel("Número de viajes por cluster (en intervalos de 50)")
ax.set_ylabel("Número de clusters")
ax.set_title("Histograma del tamaño de los clusters (viajes por cluster)")
ax.set_xticks(bins)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:
# The column label must be a string; the bare name clusterFinal raised NameError.
final_clusters['clusterFinal']

In [None]:
coords_destination_df

In [None]:
fused_centroids

In [None]:
# Final cluster of each trip's origin and destination point
orig = final_clusters.loc[final_clusters["point_type"] == "origin", ["idS", "clusterFinal"]]
dest = final_clusters.loc[final_clusters["point_type"] == "destination", ["idS", "clusterFinal"]]

# The columns below are stitched together by position, so every source must hold
# exactly one row per trip. This replaces the bare orig.shape / dest.shape expressions,
# which were evaluated but never displayed.
assert len(orig) == len(dest) == len(coords_origin_df) == len(coords_destination_df), \
    "origin/destination tables are not aligned row by row"

trips = pd.DataFrame({
    "ids": coords_origin_df["idS"].values,
    "latO" : coords_origin_df["lat"].values,
    "lonO" : coords_origin_df["lon"].values,
    "latD" : coords_destination_df["lat"].values,
    "lonD" : coords_destination_df["lon"].values,
    "wait" : coords_origin_df["wait"].values,
    "cluster_start_final": orig["clusterFinal"].values,
    "cluster_end_final": dest["clusterFinal"].values
})
trips

In [None]:
# Criterion 1: trips whose origin and destination were assigned to the same final cluster.
lost_real = trips[trips["cluster_start_final"] == trips["cluster_end_final"]]

num_lost_real = len(lost_real)
# Index labels of those trips, kept as a set so they can be combined with the second criterion.
lost_ids_real = set(lost_real.index)

def viajes_fuera_de_radio(trips, centroids, velocidad_kmh=5, return_indices=False):
    """Count trips whose origin or destination is too far from its assigned centroid.

    A trip is counted when the haversine distance from its origin to the centroid of
    ``cluster_start_final``, or from its destination to the centroid of
    ``cluster_end_final``, exceeds ``velocidad_kmh`` (converted to m/s) times the
    trip's ``wait`` value.

    NOTE(review): an earlier cell defines a function with this same name and a
    different signature; this definition replaces it once this cell has run.

    Args:
        trips (pd.DataFrame): Needs latO, lonO, latD, lonD, wait,
            cluster_start_final and cluster_end_final columns.
        centroids (pd.DataFrame): Needs lat and lon columns. Centroids are looked up
            by the cluster_id column when present, otherwise by the frame's index.
        velocidad_kmh (float): Speed in km/h.
        return_indices (bool): When True, also return the index labels of the counted trips.

    Returns:
        int, or (int, list) when ``return_indices`` is True.
    """
    # Look centroids up by cluster id rather than relying on the row index happening
    # to be equal to the id.
    by_id = centroids.set_index("cluster_id") if "cluster_id" in centroids.columns else centroids

    max_dist = (velocidad_kmh * 1000 / 3600) * trips["wait"].to_numpy()

    def _distancia_al_centroide(cluster_col, lat_col, lon_col):
        # One centroid row per trip, in trip order
        asignados = by_id.loc[trips[cluster_col].to_numpy(), ["lat", "lon"]]
        return haversine_dist(
            trips[lat_col].to_numpy(), trips[lon_col].to_numpy(),
            asignados["lat"].to_numpy(dtype=float), asignados["lon"].to_numpy(dtype=float)
        )

    origen_fuera = _distancia_al_centroide("cluster_start_final", "latO", "lonO") > max_dist
    # The same per-trip limit is applied at the destination end
    destino_fuera = _distancia_al_centroide("cluster_end_final", "latD", "lonD") > max_dist

    fuera_mask = origen_fuera | destino_fuera
    fuera = int(fuera_mask.sum())
    indices = trips.index[fuera_mask].tolist()

    return (fuera, indices) if return_indices else fuera


# Criterion 2: trips whose walk to or from the assigned centroid exceeds the limit
num_lost_walk, lost_ids_walk_list = viajes_fuera_de_radio(
    trips, fused_centroids, return_indices=True
)
lost_ids_walk = set(lost_ids_walk_list)

# A trip failing both criteria is counted once
total_unico = len(lost_ids_real | lost_ids_walk)
# Share of all trips; guarded so that an empty trips table does not divide by zero
fraccion_perdida = total_unico / len(trips) if len(trips) else 0.0

print("===== RESULTADOS =====")
print(f"Criterio 1 — Viajes perdidos por (origen==destino): {num_lost_real}")
print(f"Criterio 2 — Viajes perdidos por caminata al centroide: {num_lost_walk}")
print(f"TOTAL acumulado: {total_unico}")
# Formatted as a percentage: the label says "Porcentaje" but a raw fraction was printed
print(f"Porcentaje perdido: {fraccion_perdida:.2%}")

In [None]:
# The filter below reads CLOSE_THRESHOLD; the constant was misspelled SCLOSE_THRESHOLD,
# which left that name undefined.
CLOSE_THRESHOLD = 250
LOW_DEMAND = 60

# NOTE(review): `merged` is not created in the visible part of the notebook; it must
# provide cluster_id, nearest_cluster, nearest_distance_m and num_trips columns.
candidates = merged[
    (merged["nearest_distance_m"] < CLOSE_THRESHOLD) &
    (merged["num_trips"] < LOW_DEMAND)
]

candidate_ids = candidates["cluster_id"].tolist()
nearest_ids = candidates["nearest_cluster"].unique().tolist()
m = folium.Map(location=[41.90, 12.48], zoom_start=12)

# Layers
layer_candidates = folium.FeatureGroup(name="Clusters candidatos").add_to(m)
layer_trips = folium.FeatureGroup(name="Viajes clusters candidatos").add_to(m)
layer_neighbors = folium.FeatureGroup(name="Nearest clusters").add_to(m)
layer_neighbor_trips = folium.FeatureGroup(name="Viajes nearest clusters").add_to(m)


def _centroid_position(cluster_id):
    """Return [lat, lon] of the fused centroid with the given cluster_id."""
    match = fused_centroids.loc[fused_centroids["cluster_id"] == cluster_id, ["lat", "lon"]].iloc[0]
    return [match["lat"], match["lon"]]


def _add_trip_points(cluster_ids, palette, layer):
    """Draw the origin points whose clusterO is in cluster_ids, coloured by clusterO."""
    # coords_origin_df was renamed to lat/lon in an earlier cell, so latO/lonO no longer exist.
    # NOTE(review): this matches the pre-fusion clusterO labels against fused cluster ids,
    # as the original did; confirm that clusterFinal is not what was intended.
    selected = coords_origin_df[coords_origin_df["clusterO"].isin(cluster_ids)]
    for lat, lon, label in zip(selected["lat"], selected["lon"], selected["clusterO"]):
        folium.CircleMarker(
            location=[lat, lon],
            radius=3,
            color=palette[int(label) % len(palette)],
            fill=True,
            fill_opacity=0.6
        ).add_to(layer)


# -------------------------------
#  Candidate clusters and their nearest neighbours
# -------------------------------
for _, cand in candidates.iterrows():

    cid = cand["cluster_id"]
    nearest = cand["nearest_cluster"]

    # ---- Candidate (red)
    folium.Marker(
        location=_centroid_position(cid),
        icon=folium.Icon(icon="bolt", prefix="fa", color="red"),
        popup=(
            f"<b>Cluster candidato {cid}</b><br>"
            f"Viajes: {cand['num_trips']}<br>"
            f"Nearest cluster: {nearest}<br>"
            f"Distancia mínima: {cand['nearest_distance_m']:.1f} m"
        )
    ).add_to(layer_candidates)

    # ---- Nearest cluster (blue, so it can be told apart from the candidate;
    #      it was drawn red despite being described as blue)
    folium.Marker(
        location=_centroid_position(nearest),
        icon=folium.Icon(icon="bolt", prefix="fa", color="blue"),
        popup=(
            f"<b>Nearest cluster</b><br>"
            f"ID: {nearest}<br>"
            f"Viajes: {cand['num_trips']}<br>"
            f"Nearest cluster {cid}<br>"
            f"Distancia: {cand['nearest_distance_m']:.1f} m"
        )
    ).add_to(layer_neighbors)

# -------------------------------
# Trips of the candidate clusters
# -------------------------------
_add_trip_points(candidate_ids, colores, layer_trips)

# -------------------------------
# Trips of the nearest clusters
# -------------------------------
# Same palette minus its last colour, held in a local name so the shared `colores`
# list is not shortened every time the cell runs.
neighbor_palette = colores[:-1]
_add_trip_points(nearest_ids, neighbor_palette, layer_neighbor_trips)

# Layer control
folium.LayerControl().add_to(m)

m

In [None]:
candidates