# CaBi datasets from 2024

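Loads the twelve monthly Capital Bikeshare (CaBi) trip datasets for 2024 and merges them into a single file, `cabi_2024.csv`. Rides that start in Maryland are then extracted and saved to `cabi_maryland_2024.csv`.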

In [None]:
# 📦 Standard Library
import sys
from pathlib import Path
import json

# 📊 Data Analysis & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 🌍 Geospatial Libraries
from shapely.geometry import Point, shape
import geopandas as gpd
from shapely.geometry import Point, shape


# 🛠️ Project-Specific Modules
sys.path.append(str(Path().resolve().parent / "src"))
from paths import RAW_DATA_DIR
from helpers_folium import  load_bikeshare_data, load_geojson_as_gdf

## Load Data

In [None]:
# Load the twelve monthly Capital Bikeshare trip files for 2024, January first.
# NOTE(review): `data_types` is not defined in any cell visible above this one —
# confirm it is set (or imported) earlier so the notebook survives Restart & Run All.
monthly_frames_2024 = [
    load_bikeshare_data(
        f"2024{month:02d}-capitalbikeshare-tripdata.csv",
        data_types=data_types,
    )
    for month in range(1, 13)
]

# Bind each month to its own name so cells that reference a single month keep working.
(
    cabi_01_24, cabi_02_24, cabi_03_24, cabi_04_24,
    cabi_05_24, cabi_06_24, cabi_07_24, cabi_08_24,
    cabi_09_24, cabi_10_24, cabi_11_24, cabi_12_24,
) = monthly_frames_2024

In [None]:
# Stack the monthly frames row-wise into a single 2024 frame.
# ignore_index=True replaces the per-month row labels with a fresh 0..n-1 index.
monthly_frames = [
    cabi_01_24, cabi_02_24, cabi_03_24, cabi_04_24,
    cabi_05_24, cabi_06_24, cabi_07_24, cabi_08_24,
    cabi_09_24, cabi_10_24, cabi_11_24, cabi_12_24,
]
cabi_2024 = pd.concat(monthly_frames, axis=0, ignore_index=True)

In [None]:
# Save the merged 2024 trips to a CSV file in the current working directory.
# index=False: the index is only the positional counter created by the
# ignore_index=True concat, so writing it would add a meaningless unnamed
# column that comes back as "Unnamed: 0" when the file is re-read.
cabi_2024.to_csv("cabi_2024.csv", index=False)

In [None]:
# Load the Maryland county-boundaries GeoJSON via the project helper
# (presumably returns a GeoDataFrame, since the result is passed to gpd.sjoin later).
maryland_geojson_name = "Maryland_Physical_Boundaries_-_County_Boundaries_(Detailed).geojson"
maryland_gdf = load_geojson_as_gdf(maryland_geojson_name)

In [12]:
# Build one point geometry per ride from its start coordinates
# (x = longitude, y = latitude). gpd.points_from_xy is vectorised, so it avoids
# a Python-level Point() construction for every single ride.
geometry = gpd.points_from_xy(cabi_2024["start_lng"], cabi_2024["start_lat"])
bikes_gdf = gpd.GeoDataFrame(cabi_2024, geometry=geometry, crs="EPSG:4326")

# Left spatial join: every ride is kept. Rides whose start point lies within a
# Maryland polygon receive that polygon's attributes; all others get NaN there.
# NOTE(review): assumes maryland_gdf is also in EPSG:4326 — confirm.
bikes_maryland_join = gpd.sjoin(bikes_gdf, maryland_gdf, how="left", predicate="within")

In [13]:
# Columns carried over from the spatial join that are not used afterwards.
unused_join_columns = [
    'index_right',
    'OBJECTID',
    'DISTRICT',
    'COUNTY_FIP',
    'COUNTYNUM',
    'CREATION_D',
    'LAST_UPDAT',
]

# Keep only rides that matched a Maryland county (rows without a COUNTY value
# are discarded), then strip the unused join columns.
rides_maryland = (
    bikes_maryland_join
    .dropna(subset="COUNTY")
    .drop(columns=unused_join_columns)
)

In [14]:
# Derive calendar features from the ride start timestamp.
# Requires 'started_at' to already be a datetime column (the .dt accessor fails otherwise).
start_dt = rides_maryland["started_at"].dt

rides_maryland["year"] = start_dt.year
rides_maryland["dow"] = start_dt.dayofweek        # integer day of week, Monday = 0
rides_maryland["weekday"] = start_dt.day_name()   # day name, e.g. "Monday"

In [15]:
# Number of rides per Maryland county, as a two-column table (COUNTY, ride_count).
county_sizes = rides_maryland.groupby("COUNTY").size()
rides_per_county = county_sizes.rename("ride_count").reset_index()
rides_per_county

Unnamed: 0,COUNTY,ride_count
0,Montgomery,100139
1,Prince George's,78258


**TODO:** Compare the increase in rides between 2023 and 2024 in DC and Maryland.

In [16]:
# Make 'weekday' an ordered categorical so that sorting and grouping follow
# the Monday -> Sunday calendar order instead of alphabetical order.
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]
weekday_dtype = pd.CategoricalDtype(categories=weekday_order, ordered=True)
rides_maryland["weekday"] = rides_maryland["weekday"].astype(weekday_dtype)


In [17]:
# Export the Maryland rides to CSV in the current working directory,
# omitting the DataFrame index from the file.
maryland_output_name = "cabi_maryland_2024.csv"
rides_maryland.to_csv(maryland_output_name, index=False)