In [1]:
# Importing libraries
# 📦 Standard Library
import sys
from pathlib import Path
import json
from collections import Counter

# 📊 Data Analysis & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 🌍 Geospatial Libraries
import geopandas as gpd
import geopy
from shapely.geometry import Point, shape
from shapely.ops import nearest_points
from shapely.wkt import loads 
from geopy import distance

# 🗺️ Folium Mapping
import folium

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import BallTree
from scipy.spatial import cKDTree

# 🛠️ Project-Specific Modules
sys.path.append(str(Path().resolve().parent / "src"))
from paths import DOCS_DIR, INTERIM_DIR, GEO_DIR, PROCESSED_DIR
from helpers_folium import load_geojson_as_gdf, load_bikeshare_data, create_centered_map




In [2]:
# load csv with prince george data

prince_george = load_bikeshare_data(INTERIM_DIR / "prince_georgy_cabi.csv")

In [3]:
# dropping unnecessary columns
prince_george = prince_george.drop(columns=["Unnamed: 0","AREA_COVER", "index_right",'ACREAGE',
       'IMPRT_DATE', 'SHAPE_AREA', 'SHAPE_LEN'])

In [4]:
# keeping only the stations that are in the list of interest
prince_george_fixed = prince_george[prince_george["start_station_name"].isin(['1301 McCormick Dr / Wayne K. Curry Admin Bldg',
 '40th Ave & Bladensburg Rd',
 'Baltimore Ave & Jefferson St',
 'Baltimore Ave & Van Buren St / Riverdale Park Station',
 'Baltimore Avenue and Hotel Drive at UMD',
 'Bladensburg Waterfront Park',
 'Bowdoin Ave & Calvert Rd/ College Park Metro',
 'Bowdoin Ave & Calvert Rd/ College Park Station',
 'Capitol Heights Metro',
 'Chillum Rd & Riggs Rd / Riggs Plaza',
 'Crescent Rd & Ridge Rd',
 'Fleet St & Waterfront St',
 'Greenbelt Station Parkway',
 'Guilford Drive & Rowalt Drive / UMD',
 'Hyattsville Library / Adelphi Rd & Toledo Rd',
 "Largo Rd & Campus Way / Prince Georges's Comm Col",
 'Largo Town Center Metro',
 'National Harbor Carousel',
 'New Hampshire Ave & East-West Hwy',
 'Northwestern High School',
 'Oglethorpe St & 42nd Ave',
 'Oxon Hill Park & Ride',
 'Perry & 35th St',
 "Prince George's Plaza Metro",
 'Queens Chapel & Hamilton St',
 'Rhode Island Ave & 39th St / Brentwood Arts Exchange',
 'Rhode Island Avenue /Charles Armentrout Drive - Melrose Skate Park ',
 'Riggs Rd & East West Hwy',
 'Riverdale Park Town Center',
 'Roosevelt Center & Crescent Rd',
 'Southern Ave Metro',
 'Tanger Outlets',
 'The Mall at Prince Georges',
 'Walker Mill Road/ Walker Mill Regional Park ',
 'West Hyattsville Metro'])|prince_george["start_station_name"].isna()]

In [5]:
# null values
prince_george_fixed.isna().sum()

rideable_type                 0
started_at                    0
ended_at                      0
start_station_name        62863
end_station_name          63953
member_casual                 0
start_lat                     0
start_lng                     0
end_lat                     168
end_lng                     168
trip_duration_minutes     77832
time_of_day               77832
year                          0
geometry                      0
WARD                     130316
NAME_left                130316
COUNTY                        0
area                          0
NAME_right                    0
dtype: int64

In [6]:
# ride type count
prince_george_fixed["rideable_type"].value_counts()

rideable_type
electric_bike    93496
classic_bike     32922
docked_bike       3898
Name: count, dtype: int64

In [7]:
# defining electric bikes as the ones that are not docked
ebikes = prince_george[prince_george["rideable_type"] == "electric_bike"]

In [8]:
# defining docked bikes as the ones that are either classic bikes or docked bikes
docked = prince_george[(prince_george["rideable_type"] == "classic_bike")&(prince_george["rideable_type"] == "docked_bike")]

In [9]:
docked.isna().sum()

rideable_type            0
started_at               0
ended_at                 0
start_station_name       0
end_station_name         0
member_casual            0
start_lat                0
start_lng                0
end_lat                  0
end_lng                  0
trip_duration_minutes    0
time_of_day              0
year                     0
geometry                 0
WARD                     0
NAME_left                0
COUNTY                   0
area                     0
NAME_right               0
dtype: int64

In [10]:
ebikes.isna().sum()

rideable_type                0
started_at                   0
ended_at                     0
start_station_name       62863
end_station_name         63445
member_casual                0
start_lat                    0
start_lng                    0
end_lat                      0
end_lng                      0
trip_duration_minutes    66921
time_of_day              66921
year                         0
geometry                     0
WARD                     93566
NAME_left                93566
COUNTY                       0
area                         0
NAME_right                   0
dtype: int64

all the station missing values correspond to ebikes.

In [11]:
# defining unique stations in Prince George's data
avg_lat_per_station = prince_george_fixed.groupby("start_station_name", as_index=False, observed=False)["start_lat"].mean()
avg_lng_per_station = prince_george_fixed.groupby("start_station_name", as_index=False, observed = False)["start_lng"].mean()

pg_unique_stations= avg_lat_per_station.merge(avg_lng_per_station)
pg_unique_stations = pg_unique_stations.dropna(subset=["start_lat","start_lng"])
print("Unique Caby stations in Prince George: ",len(pg_unique_stations))

Unique Caby stations in Prince George:  35


## Calculating distance to closest stations

In [12]:
# Calculating the average distance to the nearest stations using Nearest Neighbors
# Extract coordinates for unique stations
coords = pg_unique_stations[["start_lat", "start_lng"]].values

# Use Nearest Neighbors to find closest stations
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(coords)
distances, _ = nbrs.kneighbors(coords)

# Exclude distance to itself (first column is 0)
pg_unique_stations["avg_distance_to_nearest_station"] = np.mean(distances[:, 1:], axis=1)

#REVIEW - When using NearestNeighbors from sklearn.neighbors with algorithm='ball_tree' or algorithm='kd_tree', the distance is measured in Euclidean units (i.e., degrees of latitude/longitude). This is not ideal for geographic distances because latitude and longitude are not uniform in scale.

In [13]:
# using Havesian distance instead of Eucledian Units

# Convert degrees to radians (needed for haversine)
pg_unique_stations[["lat_rad", "lon_rad"]] = np.radians(pg_unique_stations[["start_lat", "start_lng"]])

# Fit Nearest Neighbors model using Haversine distance
coords = pg_unique_stations[["lat_rad", "lon_rad"]].values
tree = BallTree(coords, metric="haversine")

# Find nearest stations (excluding itself)
distances, _ = tree.query(coords, k=5)  # 5 nearest neighbors

# Convert distances from radians to kilometers (Earth radius ≈ 6371 km)
pg_unique_stations["avg_distance_nearest_station_km"] = distances[:, 1:].mean(axis=1) * 6371

In [14]:
pg_unique_stations.head()

Unnamed: 0,start_station_name,start_lat,start_lng,avg_distance_to_nearest_station,lat_rad,lon_rad,avg_distance_nearest_station_km
5,1301 McCormick Dr / Wayne K. Curry Admin Bldg,38.908392,-76.843263,0.037088,0.67908,-1.341168,3.552301
21,40th Ave & Bladensburg Rd,38.935389,-76.949285,0.010079,0.679551,-1.343018,0.953318
32,Baltimore Ave & Jefferson St,38.955494,-76.940138,0.010131,0.679902,-1.342859,1.059654
33,Baltimore Ave & Van Buren St / Riverdale Park ...,38.969583,-76.937349,0.010064,0.680148,-1.34281,1.006029
34,Baltimore Avenue and Hotel Drive at UMD,38.986639,-76.936072,0.012159,0.680445,-1.342788,1.283055


##  Calculating distance to metro

### Loading geo data (json files)

In [15]:
#  metro stations
metro_stations_gdf = load_geojson_as_gdf(GEO_DIR / "Maryland_Transit_-_WMATA_Metro_Stops (1).geojson")

# train stations
train_stations_gdf = load_geojson_as_gdf(GEO_DIR / "Maryland_Transit_-_MARC_Trains_Stations.geojson")

# maryland boundaries
maryland_gdf = load_geojson_as_gdf(GEO_DIR / "Maryland_Physical_Boundaries_-_County_Boundaries_(Detailed).geojson")

### Mergin with train gdf to have it all in one df

In [16]:
# Renaming columns
train_stations_gdf = train_stations_gdf[["Name","Line_Name","geometry"]]
train_stations_gdf = train_stations_gdf.rename(columns={"Name":"NAME", "Line_Name":"MetroLine"})

# Converting to DF
metro_stations_df = pd.DataFrame(metro_stations_gdf)
train_stations_df = pd.DataFrame(train_stations_gdf)

# Concatenate metro and train stations dataframes
train_metro_stations_df = pd.concat([metro_stations_df,train_stations_df])
train_metro_stations_df["geometry"] = train_metro_stations_df["geometry"].apply(Point)

# Convert to GeoDataFrame
train_metro_stations_gdf= gpd.GeoDataFrame(train_metro_stations_df, geometry="geometry", crs="EPSG:4326")


In [17]:
# converting pg_unique_stations to GeoDataFrame
geometry = [Point(xy) for xy in zip(pg_unique_stations['start_lng'], pg_unique_stations['start_lat'])]
pg_unique_stations_gdf = gpd.GeoDataFrame(pg_unique_stations, geometry=geometry, crs="EPSG:4326")


In [18]:
# adjusting the CRS to Maryland State Plane
pg_unique_stations_gdf = pg_unique_stations_gdf.to_crs(epsg=26985)
train_metro_stations_gdf = train_metro_stations_gdf.to_crs(epsg=26985)
#NOTE -  EPSG:4326 (Latitude/Longitude) → Degrees (Not Good for Distance)
#NOTE - EPSG:3857 or EPSG:26985 (Projected) → Meters/KM (Best for Distance Calculations)



In [19]:
# Function to find the nearest metro station and return its distance in meters
def find_nearest_metro(bike_station, metro_stations):
    """Find the nearest metro station and return its distance in meters."""
    nearest_metro = nearest_points(bike_station, metro_stations.union_all())[1]
    return bike_station.distance(nearest_metro)  # Output in meters

# Compute nearest metro distance for each bikeshare station
pg_unique_stations_gdf["distance_to_metro_meters"] = pg_unique_stations_gdf["geometry"].apply(
    lambda x: find_nearest_metro(x, train_metro_stations_gdf)
)

# Convert to kilometers for better readability
pg_unique_stations_gdf["distance_to_metro_km"] = pg_unique_stations_gdf["distance_to_metro_meters"] / 1000

# Check results
pg_unique_stations_gdf[["start_station_name", "distance_to_metro_km"]].head()


Unnamed: 0,start_station_name,distance_to_metro_km
5,1301 McCormick Dr / Wayne K. Curry Admin Bldg,0.385641
21,40th Ave & Bladensburg Rd,2.800774
32,Baltimore Ave & Jefferson St,0.946694
33,Baltimore Ave & Van Buren St / Riverdale Park ...,0.760299
34,Baltimore Avenue and Hotel Drive at UMD,1.125836


In [20]:
# Same but with KDTree for efficiency
# Extract coordinates
bike_coords = np.array(list(pg_unique_stations_gdf.geometry.apply(lambda x: (x.x, x.y))))
metro_coords = np.array(list(train_metro_stations_gdf.geometry.apply(lambda x: (x.x, x.y))))

# Create KDTree for fast nearest-neighbor search
metro_tree = cKDTree(metro_coords)

# Find nearest metro station for each bikeshare station
distances, indices = metro_tree.query(bike_coords)

# Store distance in meters (since projected CRS is used)
pg_unique_stations_gdf["distance_to_metro_meters"] = distances
pg_unique_stations_gdf["distance_to_metro_km"] = distances / 1000  # Convert to km

print(pg_unique_stations_gdf[["start_station_name", "distance_to_metro_km"]].head())

                                   start_station_name  distance_to_metro_km
5       1301 McCormick Dr / Wayne K. Curry Admin Bldg              0.385641
21                          40th Ave & Bladensburg Rd              2.800774
32                       Baltimore Ave & Jefferson St              0.946694
33  Baltimore Ave & Van Buren St / Riverdale Park ...              0.760299
34            Baltimore Avenue and Hotel Drive at UMD              1.125836


## Distance to City Center

In [21]:
# City Center Coordinates for Prince George's County (from ChatGPT)

data = {
    "Municipality": [
        "Bowie", "College Park", "Greenbelt", "Hyattsville", "Laurel",
        "Mount Rainier", "New Carrollton", "Seat Pleasant", "Berwyn Heights", "Bladensburg",
        "Brentwood", "Capitol Heights", "Cheverly", "Colmar Manor", "Cottage City",
        "District Heights", "Edmonston", "Fairmount Heights", "Forest Heights", "Glenarden",
        "Landover Hills", "Morningside", "North Brentwood", "Riverdale Park", "University Park",
        "Upper Marlboro"
    ],
    "Latitude": [
        38.9420, 38.9897, 39.0046, 38.9559, 39.0993,
        38.9410, 38.9690, 38.8965, 38.9937, 38.9390,
        38.9426, 38.8757, 38.9282, 38.9339, 38.9376,
        38.8595, 38.9507, 38.8990, 38.8140, 38.9290,
        38.9426, 38.8276, 38.9476, 38.9637, 38.9715,
        38.8151
    ],
    "Longitude": [
        -76.7300, -76.9378, -76.8755, -76.9455, -76.8483,
        -76.9647, -76.8797, -76.8991, -76.9136, -76.9336,
        -76.9564, -76.9158, -76.9155, -76.9470, -76.9466,
        -76.8891, -76.9358, -76.9116, -76.9986, -76.8616,
        -76.8916, -76.8891, -76.9564, -76.9355, -76.9366,
        -76.7491
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Convert to GeoDataFrame
geometry = [Point(xy) for xy in zip(df["Longitude"], df["Latitude"])]
city_center_gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")


In [22]:
def find_nearest_cc(bike_station, cc):
    """Find the nearest POI and return its distance in meters."""
    nearest_cc = nearest_points(bike_station, cc.union_all())[1]
    return bike_station.distance(nearest_cc)  # Output in meters

# Compute nearest poi distance for each bikeshare station
pg_unique_stations_gdf["distance_to_cc_meters"] = pg_unique_stations_gdf["geometry"].apply(
    lambda x: find_nearest_cc(x, city_center_gdf)
)

# Convert to kilometers for better readability
pg_unique_stations_gdf["distance_to_cc_km"] = pg_unique_stations_gdf["distance_to_cc_meters"] / 1000

# Check results
pg_unique_stations_gdf[["start_station_name", "distance_to_cc_km"]].head()

Unnamed: 0,start_station_name,distance_to_cc_km
5,1301 McCormick Dr / Wayne K. Curry Admin Bldg,436.021757
21,40th Ave & Bladensburg Rd,428.27833
32,Baltimore Ave & Jefferson St,429.763793
33,Baltimore Ave & Van Buren St / Riverdale Park ...,430.513516
34,Baltimore Avenue and Hotel Drive at UMD,431.256023


# POI (points of interest)

In [23]:
poi_data = {
    "POI Name": [
        "National Harbor",
        "Six Flags America",
        "MGM National Harbor Resort & Casino",
        "Gaylord National Resort & Convention Center",
        "University of Maryland, College Park",
        "College Park Aviation Museum",
        "Oxon Cove Park and Oxon Hill Farm",
        "Montpelier Mansion",
        "Lake Artemesia",
        "Dinosaur Park"
    ],
    "Latitude": [38.78417, 38.90251, 38.79555, 38.78072, 38.98692, 38.97485, 38.80500, 39.06984, 38.99067, 39.00000],
    "Longitude": [-77.01639, -76.77130, -77.00856, -77.01599, -76.94255, -76.92233, -77.01611, -76.85025, -76.92233, -76.88000]}

In [24]:
geometry = [Point(xy) for xy in zip(poi_data['Longitude'], poi_data['Latitude'])]
poi_gdf = gpd.GeoDataFrame(poi_data, geometry=geometry, crs="EPSG:4326")

In [25]:
poi_gdf

Unnamed: 0,POI Name,Latitude,Longitude,geometry
0,National Harbor,38.78417,-77.01639,POINT (-77.01639 38.78417)
1,Six Flags America,38.90251,-76.7713,POINT (-76.7713 38.90251)
2,MGM National Harbor Resort & Casino,38.79555,-77.00856,POINT (-77.00856 38.79555)
3,Gaylord National Resort & Convention Center,38.78072,-77.01599,POINT (-77.01599 38.78072)
4,"University of Maryland, College Park",38.98692,-76.94255,POINT (-76.94255 38.98692)
5,College Park Aviation Museum,38.97485,-76.92233,POINT (-76.92233 38.97485)
6,Oxon Cove Park and Oxon Hill Farm,38.805,-77.01611,POINT (-77.01611 38.805)
7,Montpelier Mansion,39.06984,-76.85025,POINT (-76.85025 39.06984)
8,Lake Artemesia,38.99067,-76.92233,POINT (-76.92233 38.99067)
9,Dinosaur Park,39.0,-76.88,POINT (-76.88 39)


In [26]:
pg_unique_stations_gdf = pg_unique_stations_gdf.to_crs(epsg=26985)
poi_gdf = poi_gdf.to_crs(epsg=26985)

In [27]:
from shapely.ops import nearest_points

def find_nearest_poi(bike_station, poi):
    """Find the nearest POI and return its distance in meters."""
    nearest_poi = nearest_points(bike_station, poi.union_all())[1]
    return bike_station.distance(nearest_poi)  # Output in meters

# Compute nearest poi distance for each bikeshare station
pg_unique_stations_gdf["distance_to_poi_meters"] = pg_unique_stations_gdf["geometry"].apply(
    lambda x: find_nearest_poi(x, poi_gdf)
)

# Convert to kilometers for better readability
pg_unique_stations_gdf["distance_to_poi_km"] = pg_unique_stations_gdf["distance_to_poi_meters"] / 1000

# Check results
pg_unique_stations_gdf[["start_station_name", "distance_to_poi_km"]].head()

Unnamed: 0,start_station_name,distance_to_poi_km
5,1301 McCormick Dr / Wayne K. Curry Admin Bldg,6.275948
21,40th Ave & Bladensburg Rd,4.964679
32,Baltimore Ave & Jefferson St,2.645513
33,Baltimore Ave & Van Buren St / Riverdale Park ...,1.426825
34,Baltimore Avenue and Hotel Drive at UMD,0.562109


In [28]:
pg_unique_stations_gdf.shape

(35, 14)

In [29]:
station_features = pg_unique_stations_gdf[["start_station_name","avg_distance_nearest_station_km","distance_to_metro_km","distance_to_poi_km","distance_to_cc_km"]]

## Population Density

In [30]:
# Adding population data for municipalities in Prince George's County (extracted from ChatGPT)

pop_data = {
    "Municipality": [
        "National Harbor", "Town of Forest Heights", "Chillum", "Town of Brentwood",
        "City of Hyattsville", "City of Greenbelt", "City of Riverdale Park", "Lake Arbor",
        "Town of Mount Rainier", "Glassmanor", "Hillcrest Heights", "Marlow Heights",
        "Maryland Park", "Suitland - Silver Hill", "City of College Park", "Adelphi",
        "Largo", "Town of Bladensburg", "Town of Capitol Heights", "Oxon Hill",
        "Colmar Manor", "Langley Park", "Town of Cottage City", "Chapel Oaks",
        "Town of Edmonston", "Cedar Heights", "Summerfield", "Town of University Park",
        "Town of Seat Pleasant", "East Riverdale", "Coral Hills", "Town of Cheverly",
        "Kentland", "Fort Washington", "Tuxedo", "College Park",
        "Carmody Hills - Pepper Mill Village", "Walker Mill", "Calverton",
        "Camp Springs", "Hillandale", "Town of District Heights", "Beltsville",
        "Woodlawn", "Kettering", "Konterra", "Palmer Park", "Town of Colmar Manor",
        "Forestville", "Town of Berwyn Heights", "Goddard"
    ],
    "Population": [3500, 2500, 34000, 3500, 18000, 23000, 7000, 10000,
                   8000, 9000, 16000, 6000, 6000, 33000, 32000, 17000,
                   10000, 9000, 4500, 18000, 1300, 20000, 1300, 6000,
                   1500, 8000, 15000, 2500, 4500, 15000, 10000, 6000,
                   5000, 24000, 1000, 32000, 6000, 12000, 17000, 19000,
                   4000, 6000, 17000, 6000, 12000, 500, 5000, 1300,
                   12000, 3000, 1000],
    "Area_sq_km": [3.5, 1.5, 7.8, 1.0, 7.0, 16.5, 4.5, 6.0,
                   2.0, 2.5, 4.0, 2.0, 2.0, 10.0, 14.0, 7.0,
                   10.0, 2.0, 2.0, 6.0, 1.0, 4.0, 1.0, 2.0,
                   1.0, 2.0, 9.0, 2.0, 2.0, 5.0, 3.0, 3.0,
                   2.0, 16.0, 1.0, 14.0, 2.0, 5.0, 10.0, 20.0,
                   10.0, 4.0, 15.0, 10.0, 12.0, 5.0, 2.0, 1.0,
                   10.0, 3.0, 1.0],
                   # Adding approximate lat/lon coordinates
    "Latitude" : [
    38.7858, 38.8140, 38.9630, 38.9426, 38.9559, 39.0046, 38.9637, 38.9204,
    38.9410, 38.8271, 38.8415, 38.8254, 38.8854, 38.8487, 38.9897, 38.9980,
    38.8757, 38.9390, 38.8854, 38.8032, 38.9339, 38.9887, 38.9376, 38.9082,
    38.9507, 38.8726, 38.9204, 38.9715, 38.8965, 38.9637, 38.8687, 38.9282,
    38.9351, 38.7079, 38.9204, 38.9897, 38.8854, 38.8687, 39.0579,
    38.8048, 39.0287, 38.8595, 39.0343, 38.9526, 38.9290, 39.1000, 38.9204, 38.9339,
    38.8454, 38.9937, 38.9900
],

    "Longitude" : [
    -77.0164, -76.9986, -76.9900, -76.9564, -76.9455, -76.8755, -76.9355, -76.8189,
    -76.9647, -76.9958, -76.9597, -76.9447, -76.8897, -76.9230, -76.9378, -76.9719,
    -76.8303, -76.9336, -76.9158, -77.0003, -76.9470, -76.9817, -76.9466, -76.9086,
    -76.9358, -76.8769, -76.8769, -76.9366, -76.8991, -76.9355, -76.9097, -76.9155,
    -76.8897, -77.0231, -76.8769, -76.9378, -76.8897, -76.9097, -76.9355,
    -76.9063, -76.9900, -76.8891, -76.9075, -76.9377, -76.9157, -76.8015, -76.8995, -76.9103,
    -76.9264, -76.9264, -76.8900
]}

pop_df = pd.DataFrame(pop_data)
pop_df["Population_Density"] = pop_df["Population"] / pop_df["Area_sq_km"]


In [31]:
# formatting 
pop_df["Municipality"] = pop_df["Municipality"].str.upper()

# renaming 
pop_df = pop_df.rename(columns={"Municipality":"NAME_right"})

## Final Prince George dataframe

In [32]:
# merge population density data with prince_george df
pg_final = prince_george_fixed.merge(pop_df[["NAME_right", "Population_Density"]], 
                                    on="NAME_right", 
                                    how="left")

# dropping rows with NaN in Population_Density
pg_final = pg_final.dropna(subset="Population_Density")

# casting Population_Density to int
pg_final["Population_Density"]=pg_final["Population_Density"].astype(int)

# prince george final df
pg_final.to_parquet(PROCESSED_DIR / "prince_george.parquet")



## Prince George Station features for ML

In [33]:
# groupping by start station and calculating mean population density
grouped = pg_final.groupby("start_station_name")["Population_Density"].mean().reset_index(name="pop_density")
# grouped["pop_density"] = grouped["pop_density"].astype(int)

# station_features with population density
station_features_2021_to_2024 = station_features.merge(grouped,on ="start_station_name",how="left")

# station_features_2021_to_2024 to parquet for ML
station_features_2021_to_2024.to_parquet(INTERIM_DIR / "station_features_2021_to_2024.parquet")
station_features_2021_to_2024.to_csv(INTERIM_DIR / "station_features_2021_to_2024.csv", index=False)


  grouped = pg_final.groupby("start_station_name")["Population_Density"].mean().reset_index(name="pop_density")


In [45]:
station_features_2021_to_2024.head()

Unnamed: 0,start_station_name,avg_distance_nearest_station_km,distance_to_metro_km,distance_to_poi_km,distance_to_cc_km,pop_density,distance_nearest_station,distance_nearest_metro,distance_nearest_poi
0,1301 McCormick Dr / Wayne K. Curry Admin Bldg,3.552301,0.385641,6.275948,436.021757,1666.0,far,near,far
1,40th Ave & Bladensburg Rd,0.953318,2.800774,4.964679,428.27833,1300.0,near,medium,far
2,Baltimore Ave & Jefferson St,1.059654,0.946694,2.645513,429.763793,2570.914982,near,near,medium
3,Baltimore Ave & Van Buren St / Riverdale Park ...,1.006029,0.760299,1.426825,430.513516,1555.0,near,near,near
4,Baltimore Avenue and Hotel Drive at UMD,1.283055,1.125836,0.562109,431.256023,2285.0,near,near,near


## Mapping to have a visual representation

In [34]:
 # Creating a centered map with the average coordinates of the stations
pg_map = create_centered_map(pg_unique_stations)

# Adding the base layer with Maryland boundaries
l1 = folium.GeoJson(
    pg_unique_stations_gdf,  
    overlay= True, 
    control = True,
    show = True,
    name= "Cabi Stations",
    marker=folium.CircleMarker(radius=3, fill_color="blue", fill_opacity=1, color="black", weight=1),
    tooltip=folium.GeoJsonTooltip(fields=["start_station_name"],
                                  aliases=["Station: "]),
    popup=folium.GeoJsonPopup(fields=["start_station_name"]),
    highlight_function=lambda x: {"fillOpacity": 0.6},
    zoom_on_click=False,
).add_to(pg_map)

l2 = folium.GeoJson(
    metro_stations_gdf,  
    overlay= True, 
    control = True,
    show = True,
    name= "Metro Stations",
    marker=folium.Marker(radius=4,icon= folium.Icon(color="red", icon="train", prefix="fa")),
    tooltip=folium.GeoJsonTooltip(fields=["NAME"],
                                  aliases=["Metro Station: "]),
    popup=folium.GeoJsonPopup(fields=["NAME"]),
    highlight_function=lambda x: {"fillOpacity": 0.8},
    zoom_on_click=False,
).add_to(pg_map)

l3 = folium.GeoJson(
    train_stations_gdf,  
    overlay= True, 
    control = True,
    show = True,
    name= "Train Stations",
    marker=folium.Marker(radius=4,icon= folium.Icon(color="red", icon="train", prefix="fa")),
    tooltip=folium.GeoJsonTooltip(fields=["NAME"],
                                  aliases=["Train Station: "]),
    popup=folium.GeoJsonPopup(fields=["NAME"]),
    highlight_function=lambda x: {"fillOpacity": 0.8},
    zoom_on_click=False,
).add_to(pg_map)

l4 = folium.GeoJson(
    city_center_gdf,  
    overlay= True, 
    control = True,
    show = True,
    name= "City Centers",
    marker=folium.CircleMarker(radius=3, fill_color="green", fill_opacity=1, color="black", weight=1),
    tooltip=folium.GeoJsonTooltip(fields=["Municipality"],
                                  aliases=["Municipality: "]),
    popup=folium.GeoJsonPopup(fields=["Municipality"]),
    highlight_function=lambda x: {"fillOpacity": 0.6},
    zoom_on_click=False,
).add_to(pg_map)

pg_map.add_child(folium.LayerControl())
pg_map


# Categorizing stations

## distance to nearest cabi station

In [35]:
# Define conditions
conditions = [
    station_features_2021_to_2024["avg_distance_nearest_station_km"] < 1.5,  # Near (less than 500m)
    (station_features_2021_to_2024["avg_distance_nearest_station_km"] >= 1.5) & (station_features_2021_to_2024["avg_distance_nearest_station_km"] < 3),  # Medium (0.5-1.5km)
    station_features_2021_to_2024["avg_distance_nearest_station_km"] >= 3  # Far (more than 1.5km)
]

# Define labels
categories = ["near", "medium", "far"]

# Define labels
categories = ["near", "medium", "far"]

# Assign categories
station_features_2021_to_2024["distance_nearest_station"] = np.select(conditions, categories, default="unknown")

# make it categorical and assign it an order
station_features_2021_to_2024['distance_nearest_station'] = pd.Categorical(station_features_2021_to_2024['distance_nearest_station'],
categories=["near", "medium", "far"])



In [36]:
# Count number of stations in each category
station_features_2021_to_2024["distance_nearest_station"].value_counts().reset_index(name="Count of Stations")


Unnamed: 0,distance_nearest_station,Count of Stations
0,near,18
1,medium,9
2,far,8


## distance to train/metro

In [37]:
# Define conditions
conditions = [
    station_features_2021_to_2024["distance_to_metro_km"] < 1.5,  # Near (less than 500m)
    (station_features_2021_to_2024["distance_to_metro_km"] >= 1.5) & (station_features_2021_to_2024["distance_to_metro_km"] < 3),  # Medium (0.5-1.5km)
    station_features_2021_to_2024["distance_to_metro_km"] >= 3  # Far (more than 1.5km)
]

# Define labels
categories = ["near", "medium", "far"]

# Assign categories
station_features_2021_to_2024["distance_nearest_metro"] = np.select(conditions, categories, default="unknown")

# make it categorical and assign it an order
station_features_2021_to_2024['distance_nearest_metro'] = pd.Categorical(station_features_2021_to_2024['distance_nearest_metro'],
categories=["near", "medium", "far"])

station_features_2021_to_2024[["distance_nearest_metro", "distance_to_metro_km"]]

Unnamed: 0,distance_nearest_metro,distance_to_metro_km
0,near,0.385641
1,medium,2.800774
2,near,0.946694
3,near,0.760299
4,near,1.125836
5,medium,2.716629
6,near,0.021853
7,near,0.021964
8,near,0.156233
9,near,1.228416


In [38]:
# Count number of stations in each category
station_features_2021_to_2024["distance_nearest_metro"].value_counts().reset_index(name="Count of Stations")

Unnamed: 0,distance_nearest_metro,Count of Stations
0,near,20
1,medium,10
2,far,5


## Distance to POI

In [39]:
# Define conditions
conditions = [
    station_features_2021_to_2024["distance_to_poi_km"] < 1.5,  # Near (less than 500m)
    (station_features_2021_to_2024["distance_to_poi_km"] >= 1.5) & (station_features_2021_to_2024["distance_to_poi_km"] < 3),  # Medium (0.5-1.5km)
    station_features_2021_to_2024["distance_to_poi_km"] >= 3  # Far (more than 1.5km)
]

# Define labels
categories = ["near", "medium", "far"]

# Assign categories
station_features_2021_to_2024["distance_nearest_poi"] = np.select(conditions, categories, default="unknown")

# make it categorical and assign it an order
station_features_2021_to_2024['distance_nearest_poi'] = pd.Categorical(station_features_2021_to_2024['distance_nearest_poi'],
categories=["near", "medium", "far"])

# make it categorical and assign it an order
station_features_2021_to_2024['distance_nearest_poi'] = pd.Categorical(station_features_2021_to_2024['distance_nearest_poi'],
categories=["near", "medium", "far"])

In [40]:
# Count number of stations in each category
station_features_2021_to_2024["distance_nearest_poi"].value_counts().reset_index(name= "Number of Stations").rename(columns={"index": "Distance Category"})

Unnamed: 0,distance_nearest_poi,Number of Stations
0,far,16
1,near,12
2,medium,7


## Average weekly rides per categorized distance feature

In [41]:
# 
pg_final["started_at"] = pd.to_datetime(pg_final["started_at"],format="ISO8601")
pg_final["year_week"] = pg_final["started_at"].dt.strftime("%Y-%U")

# Group by station and week, counting rides per group
weekly_rides = pg_final.groupby(["start_station_name", "year_week"], observed=False).agg(
    avg_rides=("member_casual", "count")  # Counting total rides per station per week./ Using a random column to check amount of rides
).reset_index()

# Merge station features with grouped weekly rides
pg_station_features_weekly = weekly_rides.merge(station_features_2021_to_2024, on="start_station_name", how="left")

In [42]:
# avg rides by distance to station
pg_station_features_weekly.groupby("distance_nearest_station",observed=False)["avg_rides"].mean().reset_index(name="avg_rides_per_station").sort_values(by="avg_rides_per_station", ascending=False)

Unnamed: 0,distance_nearest_station,avg_rides_per_station
1,medium,21.744235
0,near,11.054158
2,far,3.694969


In [43]:
# avg rides by distance to metro
pg_station_features_weekly.groupby("distance_nearest_metro",observed=False)["avg_rides"].mean().reset_index(name="avg_rides_per_station").sort_values(by="avg_rides_per_station", ascending=False)

Unnamed: 0,distance_nearest_metro,avg_rides_per_station
2,far,17.703145
0,near,14.024214
1,medium,5.52327


In [44]:
# avg rides by distance to POI
pg_station_features_weekly.groupby("distance_nearest_poi",observed=False)["avg_rides"].mean().reset_index(name="avg_rides_per_station").sort_values(by="avg_rides_per_station", ascending=False)

Unnamed: 0,distance_nearest_poi,avg_rides_per_station
1,medium,18.568733
0,near,10.598532
2,far,10.441824
