In [1]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import statistics as st
import seaborn as sns
import datetime 
from geopy import distance
import folium
from folium.plugins import MarkerCluster
from folium.features import GeoJsonTooltip
from branca.colormap import LinearColormap
from collections import Counter
import json
from shapely.geometry import Point
import geopandas as gpd
from shapely.geometry import shape

In [3]:
data_types = {
    "rideable_type": "category", 
    "start_station_name": "category", 
    "end_station_name": "category", 
    "member_casual":"category",
    # "ride_id":"uint32",
    "time_of_day":"category",
    "trip_type":"category"}

In [7]:
cabi_01_24 = pd.read_csv("202401-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_02_24 = pd.read_csv("202402-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_03_24 = pd.read_csv("202403-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_04_24 = pd.read_csv("202004-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_05_24 = pd.read_csv("202405-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_06_24 = pd.read_csv("202406-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_07_24 = pd.read_csv("202407-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_08_24 = pd.read_csv("202408-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_09_24 = pd.read_csv("202409-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_10_24 = pd.read_csv("202410-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_11_24 = pd.read_csv("202411-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)
cabi_12_24 = pd.read_csv("202412-capitalbikeshare-tripdata.csv", dtype = data_types, parse_dates= ["started_at", "ended_at"], low_memory=False)

In [8]:
cabi_2024 = pd.concat([cabi_01_24, cabi_02_24, cabi_03_24, cabi_04_24, cabi_05_24, cabi_06_24, cabi_07_24, cabi_08_24, cabi_09_24, cabi_10_24, cabi_11_24, cabi_12_24], ignore_index=True,axis=0)

In [10]:
cabi_2024.to_csv("cabi_2024.csv")

In [11]:
with open ("Maryland_Physical_Boundaries_-_County_Boundaries_(Detailed).geojson") as i:
    maryland = json.loads(i.read())

features = maryland["features"]

#GDF
maryland_gdf = gpd.GeoDataFrame(
    pd.DataFrame([feature['properties'] for feature in features]),  # Extract properties as attributes
    geometry=[shape(feature['geometry']) for feature in features],  # Convert geometries
    crs="EPSG:4326")

In [12]:
# # Create a GeoDataFrame for the start stations
geometry = [Point(xy) for xy in zip(cabi_2024['start_lng'], cabi_2024['start_lat'])]
bikes_gdf = gpd.GeoDataFrame(cabi_2024, geometry=geometry, crs="EPSG:4326")

# # Perform spatial join with the maryland gdf
bikes_maryland_join = gpd.sjoin(bikes_gdf, maryland_gdf, how="left", predicate="within")

In [13]:
# drop na to keep only rides in maryland 
rides_maryland = bikes_maryland_join.dropna(subset="COUNTY")
# drop columns that are not being used
rides_maryland = rides_maryland.drop(columns= ['index_right','OBJECTID','DISTRICT', 'COUNTY_FIP', 'COUNTYNUM','CREATION_D', 'LAST_UPDAT'])

In [14]:
rides_maryland['year'] = rides_maryland['started_at'].dt.year
rides_maryland['dow'] = rides_maryland['started_at'].dt.dayofweek
rides_maryland["weekday"] = rides_maryland["started_at"].dt.day_name()

In [15]:
# # Count rides per county
rides_per_county = rides_maryland.groupby("COUNTY").size().reset_index(name="ride_count")  
rides_per_county

Unnamed: 0,COUNTY,ride_count
0,Montgomery,100139
1,Prince George's,78258


#TODO - compare the increase of rides between 2023 and 2024 in dc and maryland

In [16]:
# # Convert 'weekday' to a categorical type with the correct order
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
rides_maryland["weekday"] = pd.Categorical(rides_maryland["weekday"], categories=weekday_order, ordered=True)


In [17]:
rides_maryland.to_csv("cabi_maryland_2024.csv", index=False)