In [None]:
import pandas as pd
# Switch between the Colab working directory and the local processed-data path.
colab = True
df = pd.read_csv('rome_clean.csv' if colab else '/data/processed/rome_clean.csv')
df

In [None]:
# Independent copy holding only the origin/destination coordinates plus the idS and dis columns.
coords_df = df[['lonO', 'latO', 'lonD', 'latD', 'idS', 'dis']].copy()

def categorize_distance(dis):
    """Label a distance as 'corto', 'medio' or 'largo'.

    Values below 2500 are 'corto', values from 2500 up to (but not
    including) 5000 are 'medio', and anything else is 'largo'.
    The unit is whatever the 'dis' column uses (presumably metres - TODO confirm).
    """
    if dis < 2500:
        return 'corto'
    return 'medio' if dis < 5000 else 'largo'

def wait(dis):
    """Return the wait allowance in seconds for a distance category.

    'corto' -> 300, 'medio' -> 420, any other value -> 600.
    """
    if dis == 'corto':
        return 300
    return 420 if dis == 'medio' else 600

# Derive the distance category of every row, then the wait allowance (seconds) for that category.
coords_df['type'] = coords_df['dis'].apply(categorize_distance)
coords_df['wait'] = coords_df['type'].apply(wait)
coords_df

In [None]:
# Origin endpoints: generic lon/lat column names, flagged with origin=True.
origin_df = (
    coords_df[['lonO', 'latO', 'idS', 'dis', 'type', 'wait']]
    .rename(columns={'lonO': 'lon', 'latO': 'lat'})
    .assign(origin=True)
)

# Destination endpoints: same layout, flagged with origin=False.
destination_df = (
    coords_df[['lonD', 'latD', 'idS', 'dis', 'type', 'wait']]
    .rename(columns={'lonD': 'lon', 'latD': 'lat'})
    .assign(origin=False)
)

# Stack all origins first, then all destinations, under a fresh 0..N-1 index.
combined_coords_df = pd.concat([origin_df, destination_df], ignore_index=True)
combined_coords_df

In [None]:
from sklearn.cluster import KMeans
from tqdm import tqdm

def clusterizar_coordenadas(df, coords_cols, k_values=[100,150,200,250,300,350,400], random_state=42):
    """
    Fit one KMeans model per value of k and store the labels on the DataFrame.

    The DataFrame is modified in place: a column named ``cluster_<k>`` is
    added (or overwritten) for every k, and the same object is returned.
    A progress bar is shown over the k values.

    Args:
        df (pd.DataFrame): DataFrame holding the coordinates.
        coords_cols (list): Names of the coordinate columns, e.g. ['lat', 'lon'].
        k_values (list): Numbers of clusters to fit.
        random_state (int): Seed passed to KMeans for reproducibility.

    Returns:
        pd.DataFrame: The input DataFrame with one cluster_<k> column per k.
    """
    for n_clusters in tqdm(k_values, desc="Clustering"):
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = model.fit_predict(df[coords_cols])
        df[f'cluster_{n_clusters}'] = labels
    return df

In [None]:
# Full sweep of candidate k values: 150, 160, ..., 300.
k_values = list(range(150, 310, 10))
# Only k=190 is clustered here, so only the 'cluster_190' column is added.
combined_coords_df = clusterizar_coordenadas(combined_coords_df, coords_cols=['lat','lon'], k_values=[190])

In [None]:
# Inspect the points together with their cluster_190 labels.
combined_coords_df

In [None]:
import numpy as np
from tqdm import tqdm

def haversine_dist(lat1, lon1, lat2, lon2):
    """
    Haversine (great-circle) distance in metres between two (lat, lon) points.

    Inputs are in degrees. Only NumPy ufuncs are used, so scalars and
    equally shaped arrays are both accepted.
    """
    earth_radius_m = 6371000  # Earth radius in metres
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    delta_lat = np.radians(lat2 - lat1)
    delta_lon = np.radians(lon2 - lon1)

    sin_half_lat = np.sin(delta_lat/2)
    sin_half_lon = np.sin(delta_lon/2)
    a = sin_half_lat**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * sin_half_lon**2
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return earth_radius_m * central_angle

def viajes_fuera_de_radio(df, coords_cols=['lat','lon'], k_values=[100,150,200,250,300,350,400], velocidad_kmh=5):
    """
    Count, for each k, the points lying farther from their cluster centroid
    than the distance that can be walked within the point's wait allowance.

    The centroid of a cluster is the mean of ``coords_cols`` over its members.
    The walkable distance of a point is ``velocidad_kmh`` converted to m/s
    multiplied by the point's 'wait' value (seconds). Distances are computed
    with ``haversine_dist``, treating ``coords_cols[0]`` as latitude and
    ``coords_cols[1]`` as longitude.

    Args:
        df (pd.DataFrame): Must contain ``coords_cols``, a 'wait' column and a
            ``cluster_<k>`` column for every k in ``k_values``.
        coords_cols (list): [latitude column, longitude column].
        k_values (list): Values of k whose cluster_<k> column is evaluated.
        velocidad_kmh (float): Walking speed in km/h.

    Returns:
        dict: {k: number of points outside walking range of their centroid}.
    """
    lat_col, lon_col = coords_cols[0], coords_cols[1]
    lat = df[lat_col].to_numpy()
    lon = df[lon_col].to_numpy()
    # Walkable distance in metres for every point: speed (m/s) * wait (s)
    max_dist = (velocidad_kmh * 1000 / 3600) * df['wait'].to_numpy()

    resultados = {}

    for k in tqdm(k_values, total=len(k_values)):
        cluster_col = f'cluster_{k}'

        # transform('mean') broadcasts each cluster's centroid back onto its
        # member rows, which replaces the per-row centroid lookup.
        centroides = df.groupby(cluster_col)[coords_cols].transform('mean')

        distancia_m = haversine_dist(
            lat, lon,
            centroides[lat_col].to_numpy(), centroides[lon_col].to_numpy()
        )

        resultados[k] = int((distancia_m > max_dist).sum())

    return resultados


In [None]:
# Count, for k=190 only, the points outside walking range of their centroid ({190: count}).
combined_coords_results = viajes_fuera_de_radio(combined_coords_df, k_values=[190])
combined_coords_results

In [None]:
from sklearn.metrics import silhouette_score
from tqdm import tqdm

def calcular_silhouette(df, coords_cols=['lat','lon'], k_values=[100,150,200,250,300,350,400]):
    """
    Compute the silhouette score of every clustering stored in a cluster_<k> column.

    Args:
        df (pd.DataFrame): DataFrame with the coordinates and the cluster_<k> columns.
        coords_cols (list): Coordinate columns used as features.
        k_values (list): Values of k to score.

    Returns:
        dict: {k: silhouette score}; 0 is stored when a clustering has a single label.
    """
    features = df[coords_cols].values
    scores = {}

    for k in tqdm(k_values, total=len(k_values)):
        labels = df[f'cluster_{k}'].values
        # The silhouette score is only computed when there are at least two distinct labels
        has_multiple_clusters = len(set(labels)) > 1
        scores[k] = silhouette_score(features, labels) if has_multiple_clusters else 0
    return scores

In [None]:
# Silhouette score of the k=190 clustering only (returned as {190: score}).
sil_scores = calcular_silhouette(combined_coords_df, k_values=[190])

In [None]:
import plotly.graph_objects as go

def plot_silhouette_vs_viajes(k_values, silhouette_scores, viajes_fuera):
    """
    Plot the silhouette score and the out-of-range trip count against k.

    Both mappings are indexed with every value in ``k_values``, so each k
    must be present in both of them.

    Args:
        k_values (list): Values of k to plot, in plotting order.
        silhouette_scores (dict): {k: silhouette score}.
        viajes_fuera (dict): {k: number of trips outside walking range}.
    """
    # Turn both mappings into lists ordered like k_values
    viajes = [viajes_fuera[k] for k in k_values]
    sil_values = [silhouette_scores[k] for k in k_values]

    # Silhouette score on the left axis
    silhouette_trace = go.Scatter(
        x=k_values,
        y=sil_values,
        name='Silhouette Score',
        mode='lines+markers',
        yaxis='y1'
    )

    # Out-of-range trips on the right axis
    viajes_trace = go.Scatter(
        x=k_values,
        y=viajes,
        name='Viajes fuera de radio',
        mode='lines+markers',
        yaxis='y2'
    )

    fig = go.Figure()
    for trace in (silhouette_trace, viajes_trace):
        fig.add_trace(trace)

    # Two y axes sharing the same x axis
    layout_options = dict(
        title='Silhouette Score vs Viajes fuera de radio',
        xaxis=dict(title='Número de clusters k'),
        yaxis=dict(title='Silhouette Score', side='left'),
        yaxis2=dict(title='Viajes fuera de radio', overlaying='y', side='right'),
        legend=dict(x=0.1, y=1.1, orientation='h')
    )
    fig.update_layout(**layout_options)

    fig.show()

In [None]:
# Plot only the k values that were actually evaluated. sil_scores and
# combined_coords_results are keyed by k, and k_values may list values that
# were never computed; indexing the dicts with those raises KeyError.
evaluated_k = [k for k in k_values if k in sil_scores and k in combined_coords_results]
plot_silhouette_vs_viajes(evaluated_k, sil_scores, combined_coords_results)

Elegimos 190 porque es donde parece que se estanca un poco el silhouette score y la curva de viajes fuera de radio. A partir de 220, empieza a crecer pero consideramos que ya es un número muy elevado de estaciones.

In [None]:
# Keep only the columns used from here on. .copy() makes the result an
# independent frame, so the later column assignment and drop on it do not
# operate on a slice of the previous frame (SettingWithCopyWarning).
combined_coords_df = combined_coords_df[['lon', 'lat', 'idS', 'type', 'origin', 'cluster_190']].copy()
combined_coords_df

In [None]:
import folium
import pandas as pd

cluster_col = 'cluster_190'
coords_cols = ['lat', 'lon']

# Centroid of each cluster: mean latitude/longitude of its member points
centroides = combined_coords_df.groupby(cluster_col)[coords_cols].mean().reset_index()

# Map centred on the mean coordinate of all points
mapa = folium.Map(location=[combined_coords_df['lat'].mean(), combined_coords_df['lon'].mean()], zoom_start=13)

# Colour palette; it is indexed modulo its length below, so colours repeat across clusters
colores = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightblue', 'cadetblue', 'darkgreen', 'pink']

# One star marker per centroid, coloured by the row position in `centroides`
for i, row in centroides.iterrows():
    folium.Marker(
        location=[row['lat'], row['lon']],
        popup=f"Cluster {row[cluster_col]}",
        icon=folium.Icon(color=colores[i % len(colores)], icon='star')
    ).add_to(mapa)

# One circle per point, coloured by its cluster id
for i, row in combined_coords_df.iterrows():
    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=3,
        color=colores[row['cluster_190'] % len(colores)],
        fill=True,
        fill_opacity=0.6
    ).add_to(mapa)

mapa


Ahora vamos a agrupar las estaciones que estén dentro de un radio de cercanía: si dos estaciones están muy juntas, se fusionan en una sola. Para calcular el punto medio de la estación fusionada, ponderamos cada centroide por el número de viajes de su cluster.

In [None]:
# Rename the id column to the generic name 'cluster' (modifies `centroides` in place).
centroides.rename({'cluster_190': 'cluster'}, axis=1, inplace=True)

In [None]:
# Number of rows with a non-null 'origin' flag in each cluster (origins and destinations together).
combined_coords_df.groupby('cluster_190')['origin'].agg('count')

In [None]:
# Attach the number of points per cluster by explicit cluster id, instead of
# relying on the positional index of `centroides` coinciding with the ids.
points_per_cluster = combined_coords_df.groupby('cluster_190')['origin'].agg('count')
centroides['num_points'] = centroides['cluster'].map(points_per_cluster)
centroides.head()

In [None]:
# !pip install haversine

In [None]:
from haversine import haversine, Unit

RADIUS = 200  # Merge radius in metres (the distances below are computed with Unit.METERS)

centroides_new = []      # one record per surviving (possibly merged) centroid
combined_centroids = []  # ids of centroids already absorbed into another one

for _, row in centroides.iterrows():
  # Named cluster_id rather than `id` so the builtin id() is not shadowed
  cluster_id = row['cluster']
  # Centroids already folded into an earlier one are not emitted again
  if cluster_id in combined_centroids:
    continue

  lat, lon = row['lat'], row['lon']
  nearby_dests = []

  # Absorb every not-yet-absorbed centroid within RADIUS metres (excluding itself)
  for _, row2 in centroides.iterrows():
    other_id = row2['cluster']
    if other_id == cluster_id or other_id in combined_centroids:
      continue
    distance = haversine((lat, lon), (row2['lat'], row2['lon']), unit=Unit.METERS)
    if distance <= RADIUS:
      nearby_dests.append(row2)
      combined_centroids.append(other_id)

  # Position of the merged centroid: mean of the member centroids weighted by
  # their number of points
  weighted_lat_sum = lat * row['num_points']
  weighted_lon_sum = lon * row['num_points']
  total_weight = row['num_points']

  for nearby_row in nearby_dests:
    weighted_lat_sum += nearby_row['lat'] * nearby_row['num_points']
    weighted_lon_sum += nearby_row['lon'] * nearby_row['num_points']
    total_weight += nearby_row['num_points']

  if total_weight > 0:
    new_lat = weighted_lat_sum / total_weight
    new_lon = weighted_lon_sum / total_weight
  else:  # Fallback for a zero total weight: keep the original position
    new_lat = lat
    new_lon = lon

  # The merged centroid keeps the id of the absorbing cluster and the summed weight
  centroides_new.append({'cluster': cluster_id, 'lat': new_lat, 'lon': new_lon, 'num_points': total_weight})



In [None]:
# Number of centroids left after merging nearby ones.
len(centroides_new)

In [None]:
# Inspect the point table.
combined_coords_df

In [None]:
# Turn the list of merged-centroid records into a DataFrame (the name is reused for the DataFrame).
centroides_new = pd.DataFrame(centroides_new)

In [None]:
import folium
from folium.plugins import MarkerCluster

# =========================================================
# CONFIGURATION
# =========================================================
colores = ['red', 'blue', 'green', 'purple', 'orange',
           'darkred', 'lightblue', 'cadetblue', 'darkgreen', 'pink']

# Radius in metres of the circle drawn around every fused centroid
CIRCLE_RADIUS_M = 200

# Assign a colour to every cluster id, cycling through the palette when there
# are more clusters than colours
color_map = {}
for idx, row1 in centroides_new.iterrows():
    color_map[row1['cluster']] = colores[idx % len(colores)]

# =========================================================
# CREATE MAP
# =========================================================
m = folium.Map(location=[41.90, 12.48], zoom_start=12)

# Layer with one marker per fused centroid
fused_layer = folium.FeatureGroup(name="Fused Centroids").add_to(m)

# Layer with the circle around each fused centroid. The layer name now states
# the radius that is actually drawn (it previously said 50m).
dest_radius_layer = folium.FeatureGroup(name=f"Centroids {CIRCLE_RADIUS_M}m Radius").add_to(m)


# =========================================================
# DRAW FUSED CENTROIDS
# =========================================================
for _, row in centroides_new.iterrows():
    cid = row["cluster"]
    lat = row["lat"]
    lon = row["lon"]

    # --- MARKER ---
    folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(
            icon="bolt",
            prefix="fa",
            color=color_map[cid]
        ),
        popup=(
            f"<b>Cluster {cid}</b><br>"
            f"lat: {lat:.5f}<br>"
            f"lon: {lon:.5f}"
        )
    ).add_to(fused_layer)

    # --- CIRCLE --- folium.Circle takes its radius in metres
    folium.Circle(
        location=[lat, lon],
        radius=CIRCLE_RADIUS_M,
        color=color_map[cid],
        fill=True,
        fill_opacity=0.15
    ).add_to(dest_radius_layer)


# =========================================================
# LAYERS
# =========================================================
folium.LayerControl().add_to(m)

# Display
m


In [None]:
!pip install shapely osmnx networkx

In [None]:
#!pip install shapely osmnx networkx
import json
import osmnx as ox
import networkx as nx
from shapely.geometry import Point, Polygon, MultiPolygon
from shapely.ops import nearest_points
import pandas as pd
from tqdm import tqdm

# =========================================================
# 1. LOAD AND PREPARE RESTRICTED ZONES
# =========================================================

# Read the restricted-zone definitions; the context manager closes the file
# handle as soon as the JSON is parsed.
with open("restricted_zones.json") as zones_file:
    json_data = json.load(zones_file)

# The JSON coordinates are [Lon, Lat] (GeoJSON standard).
# We convert them to Shapely Polygons.
polygons_list = []
for poly_coords in json_data["restricted_polygons"]:
    # Ensure the polygon is closed (first point == last point)
    if poly_coords[0] != poly_coords[-1]:
        poly_coords.append(poly_coords[0])
    polygons_list.append(Polygon(poly_coords))

# Create a single MultiPolygon object for easier checking
restricted_area = MultiPolygon(polygons_list)

# =========================================================
# 2. DOWNLOAD STREET NETWORK (The "Reality" Layer)
# =========================================================
print("Downloading street network from OpenStreetMap...")

# The network is downloaded around the mean centroid coordinates.
# network_type='walk' is used here, i.e. the pedestrian network; switch to
# 'drive' if the centroids must sit on roads open to cars.
center_lat = centroides_new["lat"].mean()
center_lon = centroides_new["lon"].mean()

# Download the graph within dist=8000 metres of the centre (adjust if the area is larger)
G = ox.graph_from_point((center_lat, center_lon), dist=8000, network_type='walk')

# Node GeoDataFrame of the (unprojected, lat/lon) graph
gdf_nodes = ox.graph_to_gdfs(G, edges=False)

# =========================================================
# 3. LOGIC TO MOVE CENTROIDS
# =========================================================

def correct_centroid_location(row):
    """Move a centroid out of the restricted zones and snap it to the street network.

    Uses the module-level ``restricted_area`` (Shapely geometry in lon/lat
    order) and ``G`` (OSMnx graph).

    Args:
        row: Mapping with 'lon' and 'lat' entries (a centroid row).

    Returns:
        pd.Series: [new_lat, new_lon, status], where status is
        "Moved out of Zone" or "Zone OK".
    """
    # Create Point (Shapely uses Lon, Lat order)
    current_point = Point(row['lon'], row['lat'])

    # 1. CHECK RESTRICTED ZONES
    if restricted_area.contains(current_point):
        # nearest_points returns one point per input geometry, in input order.
        # The search must run against the boundary: the centroid lies inside the
        # filled area, so its nearest point on the area itself is the centroid.
        _, boundary_point = nearest_points(current_point, restricted_area.boundary)

        # Continue from the closest point on the zone boundary
        current_point = boundary_point
        status = "Moved out of Zone"
    else:
        status = "Zone OK"

    # 2. SNAP TO REALITY (Avoid rivers/houses)
    # Take the (potentially moved) point and find the nearest street node.
    # NOTE(review): the nearest node is not re-checked against restricted_area,
    # so it could still fall inside a zone - confirm whether that matters.
    nearest_node_id = ox.distance.nearest_nodes(G, current_point.x, current_point.y)
    node_data = G.nodes[nearest_node_id]

    # Node attributes: 'y' is latitude, 'x' is longitude
    new_lat = node_data['y']
    new_lon = node_data['x']

    return pd.Series([new_lat, new_lon, status])

# =========================================================
# 4. APPLY TO DATAFRAME
# =========================================================
print("Adjusting centroids...")

# Register progress_apply on pandas objects so the row-wise apply shows a progress bar
tqdm.pandas()
# Overwrite lat/lon with the adjusted coordinates and store the status per centroid
centroides_new[['lat', 'lon', 'adjustment_status']] = centroides_new.progress_apply(correct_centroid_location, axis=1)

print("Done! Centroids moved out of zones and snapped to nearest streets.")
centroides_new.head()

In [None]:
from sklearn.metrics.pairwise import haversine_distances
import numpy as np

# 1. Coordinates in radians, ordered [latitude, longitude], as the
#    haversine_distances call below is fed
points_rad = np.radians(combined_coords_df[['lat', 'lon']].to_numpy())
centroids_rad = np.radians(centroides_new[['lat', 'lon']].to_numpy())

# 2. Pairwise distance matrix of shape (n_points, n_centroids), in radians
dist_matrix = haversine_distances(points_rad, centroids_rad)

# 3. Position of the closest centroid for every point
#    (no conversion to metres is needed just to find the minimum)
closest_centroid_indices = dist_matrix.argmin(axis=1)

# 4. Translate positions into the real cluster ids
centroid_ids = centroides_new["cluster"].to_numpy()
combined_coords_df['clusterFinal'] = centroid_ids[closest_centroid_indices]

combined_coords_df

In [None]:
import matplotlib.pyplot as plt

# Histogram of the number of points assigned to each final cluster.
points_per_final_cluster = combined_coords_df.groupby('clusterFinal')['lat'].agg('count')
plt.hist(points_per_final_cluster, bins=190)
plt.show()

In [None]:
# Drop the intermediate KMeans labels. Reassigning (instead of inplace=True)
# and errors='ignore' make the cell safe to re-run after the column is gone.
combined_coords_df = combined_coords_df.drop(columns='cluster_190', errors='ignore')

In [None]:
# Inspect the points with their final cluster assignment.
combined_coords_df

In [None]:
# Inspect the adjusted centroids.
centroides_new

In [None]:
# Persist the clustered points; with to_csv defaults the index is written as the first, unnamed column.
combined_coords_df.to_csv('points_with_clusters2.csv')

In [None]:
# Persist the centroids; with to_csv defaults the index is written as the first, unnamed column.
centroides_new.to_csv('centroides2.csv')

## VIAJES CLUSTERIZADOS, EVALUACIÓN DE ESTOS:

In [None]:
# index_col=0: both files were written with their index as the first column;
# reading it back as the index avoids a spurious 'Unnamed: 0' data column.
# float_precision='round_trip': the coordinates are used later as merge keys
# against the original trips, so they are parsed with the round-trip converter
# to get back exactly the values that were written.
combined_coords_df = pd.read_csv('points_with_clusters2.csv', index_col=0, float_precision='round_trip')
centroides_new = pd.read_csv('centroides2.csv', index_col=0, float_precision='round_trip')

In [None]:
import folium
from folium.plugins import MarkerCluster

# =========================================================
# CONFIGURATION
# =========================================================
# NOTE(review): folium.Icon lists 'gray' among its colours, not 'grey' - confirm this entry renders as intended
colores = ['red', 'blue', 'green', 'grey', 'purple', 'orange',
           'darkred', 'lightblue', 'cadetblue', 'darkgreen', 'pink']

# Assign a colour to every cluster id, cycling through the palette when there
# are more clusters than colours
color_map = {}
for idx, row1 in centroides_new.iterrows():
    color_map[row1['cluster']] = colores[idx % len(colores)]

# =========================================================
# CREATE MAP
# =========================================================
m = folium.Map(location=[41.90, 12.48], zoom_start=12)

# Layer with one marker per fused centroid
fused_layer = folium.FeatureGroup(name="Fused Centroids").add_to(m)

# Radius layer: created and registered here, but nothing is added to it in this cell
dest_radius_layer = folium.FeatureGroup(name="Destinations 50m Radius").add_to(m)

# Layer with the individual trip endpoints
trips_layer = folium.FeatureGroup(name="Trips").add_to(m)

# =========================================================
# DRAW FUSED CENTROIDS
# =========================================================
for _, row in centroides_new.iterrows():
    cid = row["cluster"]
    lat = row["lat"]
    lon = row["lon"]

    # --- MARKER --- for the centroid
    folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(
            icon="bolt",
            prefix="fa",
            color=color_map[cid]
        ),
        popup=(f"<b>Cluster {cid}</b><br>lat: {lat:.5f}<br>lon: {lon:.5f}")
    ).add_to(fused_layer)

# =========================================================
# DRAW THE TRIP ENDPOINTS OF EACH CLUSTER
# =========================================================
for _, row in combined_coords_df.iterrows():
    trip_cluster = row["clusterFinal"]  # Cluster id of this endpoint
    trip_lat = row["lat"]
    trip_lon = row["lon"]

    # Only draw endpoints whose cluster has a colour assigned
    if trip_cluster in color_map:
        # One circle per endpoint, coloured like its cluster
        folium.CircleMarker(
            location=[trip_lat, trip_lon],
            radius=5,
            color=color_map[trip_cluster],
            fill=True,
            fill_opacity=0.7,
            popup=(f"<b>Cluster: {trip_cluster}</b><br>Lat: {trip_lat:.5f}<br>Lon: {trip_lon:.5f}")
        ).add_to(trips_layer)

# =========================================================
# LAYERS
# =========================================================
folium.LayerControl().add_to(m)

# Display
m

## Distribución de anclajes/desanclajes por cluster

In [None]:
# Size of each cluster, taken from the num_points column of the centroid table.
cluster_sizes = centroides_new['num_points']
print("Extracted cluster sizes:")
print(cluster_sizes.head())

In [None]:
bin_width = 20
# Lower edge: smallest size rounded down to a multiple of bin_width
min_val = int(cluster_sizes.min() // bin_width * bin_width)
# Upper edge: the next multiple of bin_width strictly above the largest size
max_val = int((cluster_sizes.max() // bin_width + 1) * bin_width)
# Bin edges from min_val to max_val inclusive, in steps of bin_width
bins = list(range(min_val, max_val + bin_width, bin_width))

# One label per bin, e.g. "100-119" for the half-open interval [100, 120)
bin_labels = [f"{i}-{i+bin_width-1}" for i in bins[:-1]]

# right=False makes every bin closed on the left and open on the right
binned_data = pd.cut(cluster_sizes, bins=bins, labels=bin_labels, right=False)

# Number of clusters per bin, ordered by bin
bin_counts = binned_data.value_counts().sort_index()

print("Cluster counts per bin:")
print(bin_counts)


**Next step**:
The cluster sizes are now binned and counted. The next cell draws a bar chart of `bin_counts` with `matplotlib.pyplot` to show the distribution of `num_points` per cluster.



In [None]:
import matplotlib.pyplot as plt

# Bar chart: number of clusters falling in each size bin.
fig, ax = plt.subplots(figsize=(15, 7))
ax.bar(bin_counts.index, bin_counts.values, color='skyblue')
ax.set_xlabel('Number of Points Range (bin_width = 20)')
ax.set_ylabel('Frequency of Clusters')
ax.set_title('Distribution of Number of Points per Cluster (Bin Width = 20)')
ax.tick_params(axis='x', labelrotation=90)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Mean of the num_points column across the centroids.
centroides_new['num_points'].mean()

## VIAJES PERDIDOS

In [None]:
!pip install haversine

In [None]:
from haversine import haversine, Unit

# Maximum time in seconds for a distance category
def wait(dis):
    if dis == 'corto':
        return 300  # 5 minutes
    elif dis == 'medio':
        return 420  # 7 minutes
    else:
        return 600  # 10 minutes

# Maximum distance in metres a person can walk in time_sec seconds at 5 km/h
def max_distance(time_sec):
    velocity = 5 * 1000 / 3600  # 5 km/h in metres per second (about 1.39 m/s)
    return velocity * time_sec

# Centroid coordinates keyed by cluster id, built once instead of filtering
# centroides_new for every point. setdefault keeps the first row per id.
centroid_lookup = {}
for c_id, c_lat, c_lon in zip(centroides_new['cluster'], centroides_new['lat'], centroides_new['lon']):
    centroid_lookup.setdefault(c_id, (c_lat, c_lon))

# Points (origins and destinations alike) that are too far from their centroid
lost_trips = []

point_columns = zip(
    combined_coords_df["clusterFinal"],
    combined_coords_df["lat"],
    combined_coords_df["lon"],
    combined_coords_df["type"],
)
for trip_cluster, trip_lat, trip_lon, trip_type in point_columns:
    # Walkable distance within the time allowed for this category (corto, medio or largo)
    max_walk_distance = max_distance(wait(trip_type))

    # Distance in metres from the point to the centroid of its final cluster
    distance_to_cluster = haversine((trip_lat, trip_lon), centroid_lookup[trip_cluster], unit=Unit.METERS)

    # A point farther away than the walkable distance counts as lost
    if distance_to_cluster > max_walk_distance:
        lost_trips.append({
            'trip_cluster': trip_cluster,
            'trip_lat': trip_lat,
            'trip_lon': trip_lon,
            'distance': distance_to_cluster,
            'max_walk_distance': max_walk_distance
        })

# Show the lost points
lost_trips_df = pd.DataFrame(lost_trips)
print(f"Total de viajes perdidos: {len(lost_trips_df)}")
lost_trips_df


In [None]:
from haversine import haversine, Unit
import pandas as pd

# Maximum time in seconds for a distance category
def wait(dis):
    if dis == 'corto':
        return 300  # 5 minutes
    elif dis == 'medio':
        return 420  # 7 minutes
    else:
        return 600  # 10 minutes

# Maximum distance in metres a person can walk in time_sec seconds at 5 km/h
def max_distance(time_sec):
    velocity = 5 * 1000 / 3600  # 5 km/h in metres per second (about 1.39 m/s)
    return velocity * time_sec

# Columns of the result, fixed up front so an empty result still has them
LOST_TRIP_COLUMNS = [
    'origin_cluster', 'destination_cluster',
    'origin_lat', 'origin_lon', 'destination_lat', 'destination_lon',
    'origin_distance', 'destination_distance', 'max_walk_distance', 'reason',
]

# Centroid coordinates keyed by cluster id, built once instead of filtering
# centroides_new twice per trip. setdefault keeps the first row per id.
centroid_lookup = {}
for c_id, c_lat, c_lon in zip(centroides_new['cluster'], centroides_new['lat'], centroides_new['lon']):
    centroid_lookup.setdefault(c_id, (c_lat, c_lon))

# combined_coords_df stacks all origins first and then all destinations, so
# trip i is the pair (row i, row i + total_trips).
total_points = len(combined_coords_df)
total_trips = total_points // 2

# Check the layout the positional pairing depends on
origin_flags = combined_coords_df['origin'].to_numpy()
assert origin_flags[:total_trips].all() and not origin_flags[total_trips:2 * total_trips].any(), \
    "combined_coords_df must hold all origins in its first half and all destinations in its second half"

lats = combined_coords_df['lat'].tolist()
lons = combined_coords_df['lon'].tolist()
types = combined_coords_df['type'].tolist()
clusters = combined_coords_df['clusterFinal'].tolist()

lost_trips = []

for i in range(total_trips):
    j = i + total_trips  # row of the matching destination

    origin_lat, origin_lon, origin_cluster = lats[i], lons[i], clusters[i]
    dest_lat, dest_lon, dest_cluster = lats[j], lons[j], clusters[j]

    # The walking allowance is taken from the distance category of the origin row
    max_walk_distance = max_distance(wait(types[i]))

    # Distance in metres from each endpoint to the centroid of its cluster
    origin_distance_to_cluster = haversine((origin_lat, origin_lon), centroid_lookup[origin_cluster], unit=Unit.METERS)
    dest_distance_to_cluster = haversine((dest_lat, dest_lon), centroid_lookup[dest_cluster], unit=Unit.METERS)

    if origin_distance_to_cluster > max_walk_distance or dest_distance_to_cluster > max_walk_distance:
        # Either endpoint is out of walking range of its station
        reason = 'Exceeds maximum walkable distance'
    elif origin_cluster == dest_cluster:
        # Origin and destination share a station, so the trip is also counted as lost
        reason = 'Same origin and destination cluster'
    else:
        continue

    # The distances are stored as numbers for both reasons, which keeps the
    # columns numeric instead of mixing floats with 'N/A' strings.
    lost_trips.append({
        'origin_cluster': origin_cluster,
        'destination_cluster': dest_cluster,
        'origin_lat': origin_lat,
        'origin_lon': origin_lon,
        'destination_lat': dest_lat,
        'destination_lon': dest_lon,
        'origin_distance': origin_distance_to_cluster,
        'destination_distance': dest_distance_to_cluster,
        'max_walk_distance': max_walk_distance,
        'reason': reason
    })

# Convert the list of lost trips into a DataFrame
lost_trips_df = pd.DataFrame(lost_trips, columns=LOST_TRIP_COLUMNS)

# Share of lost trips; guarded so an empty input does not divide by zero
total_lost = len(lost_trips_df)
proportion_lost = total_lost / total_trips * 100 if total_trips else 0.0

print(f"Total de viajes perdidos: {total_lost}")
print(f"Proporción de viajes perdidos: {proportion_lost:.2f}%")

# Show the first 5 lost trips
lost_trips_df.head()

In [None]:
# Inspect the point table.
combined_coords_df

In [None]:
# One cluster per distinct coordinate pair. combined_coords_df holds every
# origin and destination point, so the same (lon, lat) can appear several
# times; de-duplicating keeps both merges many-to-one and the row count stable.
cluster_lookup = combined_coords_df[['lon', 'lat', 'clusterFinal']].drop_duplicates(subset=['lon', 'lat'])

# Cluster of the origin (lonO, latO)
df = df.merge(cluster_lookup,
              left_on=['lonO', 'latO'],
              right_on=['lon', 'lat'],
              how='left')
print(f"Número de filas después del primer merge: {len(df)}")

# The cluster obtained from the origin coordinates becomes 'clusterOrigen'
df = df.rename(columns={'clusterFinal': 'clusterOrigen'})

# Cluster of the destination (lonD, latD); it stays under the name 'clusterFinal'.
# 'lon'/'lat' now exist on both sides, so pandas suffixes them with _x and _y.
df = df.merge(cluster_lookup,
              left_on=['lonD', 'latD'],
              right_on=['lon', 'lat'],
              how='left')
print(f"Número de filas después del segundo merge: {len(df)}")

# Show the result
df


In [None]:
# Keep the first row for every distinct (origin, destination) coordinate combination.
df = df.drop_duplicates(subset=['lonO', 'latO', 'lonD', 'latD'])


In [None]:
# Inspect the de-duplicated trips.
df

In [None]:
# Drop the helper coordinate columns lon_x, lat_x, lon_y, lat_y
df = df.drop(columns=['lon_x', 'lat_x', 'lon_y', 'lat_y'])

# Show the result
df


In [None]:
# Inspect the lost trips.
lost_trips_df

In [None]:
# Coordinate columns that identify a lost trip
lost_key_cols = ['origin_lat', 'origin_lon', 'destination_lat', 'destination_lon']

# Distinct lost-trip keys: duplicates would only multiply the matching rows
lost_keys = lost_trips_df[lost_key_cols].drop_duplicates()

# Left merge with an indicator column: '_merge' is 'both' for trips found in
# lost_trips_df and 'left_only' for the trips that are not lost
merged_df = df.merge(lost_keys,
                     left_on=['latO', 'lonO', 'latD', 'lonD'],
                     right_on=lost_key_cols,
                     how='left', indicator=True)

# Keep the trips that are not lost. The indicator and the right-hand key
# columns are dropped: for 'left_only' rows those key columns hold only NaN.
df_final = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'] + lost_key_cols)

# Show the cleaned DataFrame
df_final


In [None]:
# Persist the trips that remain after removing the lost ones.
df_final.to_csv("potenciales_viajes_cubiertos.csv")

In [None]:
# Sum of the 'price' column over the remaining trips (built-in sum, so a NaN value would propagate).
sum(df_final['price'])

In [None]:
# Number of distinct idS values among the remaining trips.
len(df_final['idS'].unique())