<a href="https://colab.research.google.com/github/nuhash-cell/GeoBike-Station-Planner/blob/main/Bike_stations_optimizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Source
The dataset was obtained from Capital Bikeshare's System Data. It includes detailed trip records, such as start and end times, trip durations, and rider demographics, providing a comprehensive view of bike usage in the Washington, D.C. area.

In [None]:
import pandas as pd

# Monthly Capital Bikeshare trip exports, in chronological order (2024-08 through 2024-10)
monthly_csv_paths = (
    '/content/202408-capitalbikeshare-tripdata.csv',
    '/content/202409-capitalbikeshare-tripdata.csv',
    '/content/202410-capitalbikeshare-tripdata.csv',
)

# Read each export into its own DataFrame (all three are loaded before anything is printed)
df_aug, df_sep, df_oct = (pd.read_csv(csv_path) for csv_path in monthly_csv_paths)

# Report the (rows, columns) shape of every month
for month_label, month_df in (
    ("August", df_aug),
    ("September", df_sep),
    ("October", df_oct),
):
    print(f"{month_label} DataFrame: {month_df.shape}")

August DataFrame: (614639, 13)
September DataFrame: (720309, 13)
October DataFrame: (725346, 13)


Data Loading

---


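The three monthly trip files (August–October 2024) were each loaded from CSV into a separate pandas DataFrame using the specified file paths; the next cell combines them into a single DataFrame. This prepares the data for analysis, enabling efficient manipulation and exploration.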

In [None]:
# Stack the August, September and October frames into one trip table.
# ignore_index=True discards the per-month row labels and renumbers rows 0..n-1.
monthly_frames = [df_aug, df_sep, df_oct]
df = pd.concat(monthly_frames, ignore_index=True)

# Filter out rows where start_station_name or end_station_name is missing


In [None]:
# Station name plus coordinates for each end of a trip
start_columns = ['start_station_name', 'start_lat', 'start_lng']
end_columns = ['end_station_name', 'end_lat', 'end_lng']

# Keep only rows where the station name and both coordinates are present
start_stations = df[start_columns].dropna()
end_stations = df[end_columns].dropna()

# Rename columns to make them consistent for combining

In [None]:
# Map the start_/end_ prefixed columns onto one shared schema
# (station_name, latitude, longitude) so the two frames can be stacked.
start_to_common = {
    'start_station_name': 'station_name',
    'start_lat': 'latitude',
    'start_lng': 'longitude',
}
end_to_common = {
    'end_station_name': 'station_name',
    'end_lat': 'latitude',
    'end_lng': 'longitude',
}

# rename() ignores keys that are absent, so re-running this cell is harmless
start_stations = start_stations.rename(columns=start_to_common)
end_stations = end_stations.rename(columns=end_to_common)

# Concatenate start and end stations

In [None]:
# Stack trip-start and trip-end station rows into one table.
# Row labels from the source frames are kept, so index values may repeat here.
station_frames = [start_stations, end_stations]
all_stations = pd.concat(station_frames)

# Drop duplicates to keep unique station names with coordinates


In [None]:
# One row per station name: the first occurrence (and its coordinates) wins,
# then rows are renumbered from 0.
unique_by_name = all_stations.drop_duplicates(subset='station_name', keep='first')
Existing_stations = unique_by_name.reset_index(drop=True)


# Save the stations to a CSV file

In [None]:
# Persist the de-duplicated station table; the row index is not written
output_file_path = '/content/Existing_stations.csv'
Existing_stations.to_csv(path_or_buf=output_file_path, index=False)

Import libraries

In [None]:
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import osmnx as ox
import networkx as nx
from geopy.distance import geodesic

# Extract coordinates (start and end points)

In [None]:
# Trip start points, relabelled to generic latitude/longitude columns
trip_starts = df[['start_lat', 'start_lng']].rename(
    columns={'start_lat': 'latitude', 'start_lng': 'longitude'}
)

# Trip end points, same schema
trip_ends = df[['end_lat', 'end_lng']].rename(
    columns={'end_lat': 'latitude', 'end_lng': 'longitude'}
)

# Stack both ends and discard any point with a missing coordinate
coordinates = pd.concat([trip_starts, trip_ends]).dropna()
print(f"Loaded {len(coordinates)} coordinates.")

Loaded 4119196 coordinates.


# Define the place and download the simplified road network

In [None]:
# Study area, resolved by OSMnx from this place-name query
place_name = "Washington, D.C., USA"

# Download the "bike" street network for that place
G = ox.graph_from_place(
    place_name,
    network_type="bike",
    simplify=True,
    retain_all=False,
)

# Extract nodes with x (longitude) and y (latitude) coordinates


In [None]:
# Node table only; the edge table is not requested
nodes = ox.graph_to_gdfs(G, edges=False, nodes=True)

# Keep just the x/y columns and move the node index into a regular column
node_xy = nodes[['x', 'y']]
nodes_df = node_xy.reset_index()

Create a GeoDataFrame for road network nodes

In [None]:
# Build point geometries from the node x/y columns and tag them with CRS EPSG:4326
node_points = gpd.points_from_xy(nodes_df['x'], nodes_df['y'])
nodes_gdf = gpd.GeoDataFrame(nodes_df, geometry=node_points, crs="EPSG:4326")

Create a geofence using the convex hull of the road network nodes

In [None]:
# Merge every road-network node into a single geometry, then take its convex hull
# as the geofence.
# `unary_union` is deprecated since GeoPandas 1.0 in favour of `union_all()`, so use the
# new method when it exists and fall back to the old attribute on older installations.
node_geometries = nodes_gdf.geometry
if hasattr(node_geometries, "union_all"):
    merged_nodes = node_geometries.union_all()
else:
    merged_nodes = node_geometries.unary_union
geofence_polygon = merged_nodes.convex_hull

Convert the coordinates DataFrame to a GeoDataFrame

In [None]:
# points_from_xy takes x (longitude) first, then y (latitude)
trip_points = gpd.points_from_xy(coordinates['longitude'], coordinates['latitude'])

# Attach the point geometries to the trip coordinates, in the same CRS as the road nodes
coordinates_gdf = gpd.GeoDataFrame(coordinates, geometry=trip_points, crs="EPSG:4326")

Filter the coordinates that fall within the geofence polygon

In [None]:
# Boolean mask: True for trip points lying inside the geofence polygon
inside_geofence = coordinates_gdf.within(geofence_polygon)

# Keep only those points and renumber the rows from 0
filtered_coordinates = coordinates_gdf[inside_geofence].reset_index(drop=True)

Convert the existing stations DataFrame to a GeoDataFrame

In [None]:
# points_from_xy takes x (longitude) first, then y (latitude)
station_points = gpd.points_from_xy(
    Existing_stations['longitude'],
    Existing_stations['latitude'],
)

# Station table with point geometries, in the same CRS as the geofence
stations_gdf = gpd.GeoDataFrame(Existing_stations, geometry=station_points, crs="EPSG:4326")

Filter the existing stations that fall within the geofence polygon

In [None]:
# Boolean mask: True for stations lying inside the geofence polygon
station_inside_geofence = stations_gdf.within(geofence_polygon)

# Keep only those stations and renumber the rows from 0
filtered_stations = stations_gdf[station_inside_geofence].reset_index(drop=True)

Save the filtered stations to a CSV file

In [None]:
filtered_output_path = '/content/filtered_unique_stations_within_geofence.csv'

# Export only the plain columns (the geometry column is left out); no row index is written
station_export = filtered_stations[['station_name', 'latitude', 'longitude']]
station_export.to_csv(filtered_output_path, index=False)

save the filtered coordinates data

In [None]:
filtered_file_path = '/content/filtered_coordinates_within_geofence.csv'

# Write the geofenced trip coordinates before reading them back. Previously this cell only
# read the file, which no earlier cell creates, so a fresh "Restart & Run All" failed with
# FileNotFoundError. Only the plain latitude/longitude columns are exported (no geometry).
filtered_coordinates[['latitude', 'longitude']].to_csv(filtered_file_path, index=False)

# Reload as a plain pandas DataFrame for the grid clustering steps
coordinates_df = pd.read_csv(filtered_file_path)

Define grid size (in degrees; 0.001° is approximately 100 meters)

In [None]:
# Side length of one grid cell in decimal degrees (0.001 degrees, about 100 meters)
grid_size = 1e-3

# Calculate grid cell indices (longitude and latitude)

In [None]:
# Integer grid index per axis: floor-divide the coordinate by the cell size.
# x_cell is derived from longitude, y_cell from latitude.
for cell_column, coord_column in (('x_cell', 'longitude'), ('y_cell', 'latitude')):
    cell_index = coordinates_df[coord_column] // grid_size
    coordinates_df[cell_column] = cell_index.astype(int)

Group by grid cell and compute centroid and density

In [None]:
# All points that share the same (x_cell, y_cell) pair form one cluster
points_by_cell = coordinates_df.groupby(['x_cell', 'y_cell'])

# Per cell: centroid (mean longitude -> x, mean latitude -> y) and number of points (density)
cluster_summary = points_by_cell.agg(
    x=pd.NamedAgg(column='longitude', aggfunc='mean'),
    y=pd.NamedAgg(column='latitude', aggfunc='mean'),
    density=pd.NamedAgg(column='latitude', aggfunc='size'),
).reset_index()


Save the clustered data to a CSV file

In [None]:
output_path = '/content/grid_clustered_coordinates.csv'

# Export centroid and density only; the x_cell/y_cell indices and row index are not written
cluster_export = cluster_summary[['x', 'y', 'density']]
cluster_export.to_csv(output_path, index=False)

Load the clustered coordinates and unique stations data

In [None]:
# Files written earlier in this notebook: grid clusters and the full station list
clustered_file_path = '/content/grid_clustered_coordinates.csv'
stations_file_path = '/content/Existing_stations.csv'

# Read the clusters first, then the stations
clusters_df, stations_df = (
    pd.read_csv(csv_path) for csv_path in (clustered_file_path, stations_file_path)
)

Rank the clusters by density in descending order

In [None]:
# Densest clusters first, then renumber rows so position 0 is the densest cluster
by_density_desc = clusters_df.sort_values(by='density', ascending=False)
clusters_df = by_density_desc.reset_index(drop=True)

Function to check if a cluster is within a 200-meter radius of any station

In [None]:
def is_within_radius(cluster_point, stations, radius=200):
    """Return True if any station lies within ``radius`` meters of the cluster point.

    Args:
        cluster_point: Row-like object indexable by 'y' (latitude) and 'x' (longitude).
        stations: DataFrame with 'latitude' and 'longitude' columns, one row per station.
        radius: Search radius in meters (default 200).

    Returns:
        bool: True as soon as one station is found within range; False if none is
        (including when ``stations`` has no rows).
    """
    # geodesic() is called with (latitude, longitude) tuples
    origin = (cluster_point['y'], cluster_point['x'])

    # any() stops at the first station that is close enough, like an early return
    return any(
        geodesic(origin, (row['latitude'], row['longitude'])).meters <= radius
        for _, row in stations.iterrows()
    )

Identify the top 5 high-density clusters without a nearby station

In [None]:
# Walk the clusters in their current order (sorted densest-first above) and collect the
# first five that have no station within the default radius of is_within_radius.
unserved_clusters = []
for _, cluster in clusters_df.iterrows():
    if is_within_radius(cluster, stations_df):
        continue  # already served by a nearby station
    unserved_clusters.append(cluster)
    if len(unserved_clusters) == 5:
        break  # collected enough; skip the remaining clusters


Create and save a DataFrame for the top 5 unserved clusters

In [None]:
# One row per collected cluster (each list item is a row of clusters_df)
unserved_clusters_df = pd.DataFrame(data=unserved_clusters)

# Write the result without the row index
output_path = '/content/top_5_unserved_clusters.csv'
unserved_clusters_df.to_csv(path_or_buf=output_path, index=False)

print(f"Top 5 unserved clusters saved to {output_path}.")

Top 5 unserved clusters saved to /content/top_5_unserved_clusters.csv.
