In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell instead of only the
# last one; cells further down list several bare expressions and rely on this
# setting to show each result.
InteractiveShell.ast_node_interactivity = "all"

In [22]:
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

# Monthly extract to inspect.
year = 2024
month = 4

# Monthly files are named rides_<YYYY>_<MM>.parquet under ../data/raw.
path = Path("..", "data", "raw", f"rides_{year}_{month:02}.parquet")

# Read the file with pyarrow, then convert the Arrow table to a pandas frame.
table = pq.read_table(path)
rides = table.to_pandas()
rides.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,399F3B5640FA95C7,classic_bike,2024-04-30 16:56:01,2024-04-30 19:12:48,Everett Square (Broadway at Norwood St),V32003,Chelsea St at Vine St,V32016,42.406721,-71.056438,42.403369,-71.047314,casual
1,2F8686F90772CC28,classic_bike,2024-04-15 17:27:02,2024-04-15 17:38:13,Nashua Street at Red Auerbach Way,A32025,Chinatown T Stop,D32019,42.365673,-71.064263,42.352409,-71.062679,casual
2,A6885D2B2CB78B9E,electric_bike,2024-04-01 18:07:03,2024-04-01 18:19:58,St Mary's,K32007,Chinatown T Stop,D32019,42.346226,-71.107078,42.352409,-71.062679,casual
3,1DC97B8EA498A8BA,electric_bike,2024-04-01 17:09:18,2024-04-01 17:15:54,Nashua Street at Red Auerbach Way,A32025,Chinatown T Stop,D32019,42.365749,-71.06428,42.352409,-71.062679,casual
4,704C39EED089A489,electric_bike,2024-04-12 21:35:23,2024-04-12 21:57:52,Nashua Street at Red Auerbach Way,A32025,Kennedy-Longfellow School 158 Spring St,M32065,42.36569,-71.064094,42.369553,-71.08579,casual


In [23]:
# Work on a copy so `rides` stays untouched: parse both timestamps (unparseable
# values become NaT) and derive the ride duration from them.
rides_cp = rides.assign(
    started_at=lambda df: pd.to_datetime(df["started_at"], errors="coerce"),
    ended_at=lambda df: pd.to_datetime(df["ended_at"], errors="coerce"),
    duration=lambda df: df["ended_at"] - df["started_at"],
)
rides_cp.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
0,399F3B5640FA95C7,classic_bike,2024-04-30 16:56:01,2024-04-30 19:12:48,Everett Square (Broadway at Norwood St),V32003,Chelsea St at Vine St,V32016,42.406721,-71.056438,42.403369,-71.047314,casual,0 days 02:16:47
1,2F8686F90772CC28,classic_bike,2024-04-15 17:27:02,2024-04-15 17:38:13,Nashua Street at Red Auerbach Way,A32025,Chinatown T Stop,D32019,42.365673,-71.064263,42.352409,-71.062679,casual,0 days 00:11:11
2,A6885D2B2CB78B9E,electric_bike,2024-04-01 18:07:03,2024-04-01 18:19:58,St Mary's,K32007,Chinatown T Stop,D32019,42.346226,-71.107078,42.352409,-71.062679,casual,0 days 00:12:55
3,1DC97B8EA498A8BA,electric_bike,2024-04-01 17:09:18,2024-04-01 17:15:54,Nashua Street at Red Auerbach Way,A32025,Chinatown T Stop,D32019,42.365749,-71.06428,42.352409,-71.062679,casual,0 days 00:06:36
4,704C39EED089A489,electric_bike,2024-04-12 21:35:23,2024-04-12 21:57:52,Nashua Street at Red Auerbach Way,A32025,Kennedy-Longfellow School 158 Spring St,M32065,42.36569,-71.064094,42.369553,-71.08579,casual,0 days 00:22:29


In [24]:
# Summary statistics of ride durations. `describe()` on a Series already
# returns a Series, so no transpose is needed.
rides_cp["duration"].describe()

count                       355732
mean     0 days 00:15:26.223499713
std      0 days 00:32:48.106522496
min              -1 days +23:59:59
25%                0 days 00:06:15
50%                0 days 00:10:32
75%                0 days 00:17:34
max                1 days 00:57:49
Name: duration, dtype: object

In [25]:
# Inspect the tails of the duration distribution: the minimum (q=0), the 1st
# percentile, and the 99.5th / 99.9th percentiles. Each bare expression is
# shown as its own output below.
# NOTE(review): presumably these values motivate the duration cutoffs used in
# the next cell - confirm.
rides_cp["duration"].quantile(0)
rides_cp["duration"].quantile(0.01)
rides_cp["duration"].quantile(0.995)
rides_cp["duration"].quantile(0.999)

Timedelta('-1 days +23:59:59')

Timedelta('0 days 00:00:46')

Timedelta('0 days 01:52:18.344999999')

Timedelta('0 days 05:31:16.356000003')

In [26]:
# Keep rides whose duration is strictly positive and at most five hours.
duration_filter = (rides_cp["duration"] > pd.Timedelta(0)) & (
    rides_cp["duration"] <= pd.Timedelta(hours=5)
)
# Number of rides the filter rejects. Series.sum() is vectorised, whereas the
# builtin sum() iterates the Series element by element in Python; int() keeps
# the displayed value a plain Python integer.
int((~duration_filter).sum())

472

In [39]:
# Directory holding the monthly ride parquet files, relative to this notebook.
raw_dir = Path("..", "data", "raw")

In [46]:
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Optional, Tuple, Union



def filter_blue_bike_data(rides: pd.DataFrame, year: int, month: int) -> pd.DataFrame:
    """
    Keep only plausible rides that started in the given year and month.

    A ride is kept when all of the following hold:
      * ``started_at`` and ``ended_at`` both parse as datetimes,
      * its duration (``ended_at - started_at``) is strictly positive and at
        most 5 hours,
      * ``started_at`` falls in ``year`` / ``month``.

    The input frame is not modified. A short summary of how many records were
    kept and dropped is printed.

    Args:
        rides (pd.DataFrame): Ride data with at least the columns
            ``started_at``, ``ended_at`` and ``start_station_id``.
        year (int): Year to filter for.
        month (int): Month to filter for (1-12).

    Returns:
        pd.DataFrame: Two columns, ``started_at`` (parsed datetimes) and
        ``pickup_location_id`` (the original ``start_station_id``), for the
        rows that passed every filter. Original index labels are preserved.

    Raises:
        ValueError: If ``year``/``month`` are not integers, if ``month`` is
            outside 1-12, or if no ride survives the filtering.
    """
    # Check types first: comparing a non-numeric month against 1 and 12 would
    # otherwise raise TypeError instead of the documented ValueError.
    if not isinstance(year, int) or not isinstance(month, int):
        raise ValueError("Year and month must be integers.")
    if not (1 <= month <= 12):
        raise ValueError("Month must be between 1 and 12.")

    # Work on standalone Series so the caller's DataFrame is left untouched.
    # Unparseable timestamps become NaT and fail every comparison below.
    started_at = pd.to_datetime(rides["started_at"], errors="coerce")
    ended_at = pd.to_datetime(rides["ended_at"], errors="coerce")
    duration = ended_at - started_at

    # Drop zero/negative durations and anything longer than five hours.
    duration_filter = (duration > pd.Timedelta(0)) & (
        duration <= pd.Timedelta(hours=5)
    )

    # Restrict to rides that started in the requested month. The .dt accessors
    # are used (rather than Timestamp bounds) so the check behaves the same for
    # timezone-naive and timezone-aware columns.
    date_range_filter = (started_at.dt.year == year) & (started_at.dt.month == month)

    final_filter = duration_filter & date_range_filter

    # Report how much data the filters removed.
    total_records = len(rides)
    valid_records = int(final_filter.sum())
    records_dropped = total_records - valid_records
    # Guard against an empty input frame (no division by zero).
    percent_dropped = (records_dropped / total_records) * 100 if total_records else 0.0

    print(f"Total records: {total_records:,}")
    print(f"Valid records: {valid_records:,}")
    print(f"Records dropped: {records_dropped:,} ({percent_dropped:.2f}%)")

    # Build the result as a fresh frame (no in-place rename on a slice).
    validated_rides = pd.DataFrame(
        {
            "started_at": started_at[final_filter],
            "pickup_location_id": rides.loc[final_filter, "start_station_id"],
        }
    )

    if validated_rides.empty:
        raise ValueError(f"No valid rides found for {year}-{month:02} after filtering.")

    return validated_rides


def load_and_process_bike_data(
    year: int,
    months: Optional[List[int]] = None,
    data_dir: Optional[Path] = None,
) -> pd.DataFrame:
    """
    Load the monthly ride files for one year, filter each, and concatenate them.

    For every requested month the file ``rides_<year>_<MM>.parquet`` is read
    from ``data_dir``, passed through ``filter_blue_bike_data`` and collected.
    Months whose file is missing, or whose processing fails, are reported on
    stdout and skipped so that the remaining months are still loaded.

    Args:
        year (int): Year to load data for.
        months (Optional[List[int]]): Months to load. If None, loads all
            months (1-12).
        data_dir (Optional[Path]): Directory containing the monthly parquet
            files. If None, the notebook-level ``raw_dir`` is used, which
            matches the previous behaviour.

    Returns:
        pd.DataFrame: The filtered rides of all successfully processed months,
        concatenated with a fresh 0..n-1 index.

    Raises:
        RuntimeError: If no month could be loaded and processed. RuntimeError
            is a subclass of Exception, so callers catching Exception are
            unaffected.
    """
    # Use all months if none are specified.
    if months is None:
        months = list(range(1, 13))

    # Resolve the source directory once; only fall back to the global when the
    # caller did not supply one.
    source_dir = raw_dir if data_dir is None else Path(data_dir)

    # One filtered DataFrame per successfully processed month.
    monthly_rides = []

    for month in months:
        file_path = source_dir / f"rides_{year}_{month:02}.parquet"

        try:
            print(f"Loading data for {year}-{month:02}...")
            month_rides = pd.read_parquet(file_path, engine="pyarrow")

            month_rides = filter_blue_bike_data(month_rides, year, month)
            print(f"Successfully processed data for {year}-{month:02}.")

            monthly_rides.append(month_rides)

        except FileNotFoundError:
            print(f"File not found for {year}-{month:02}. Skipping...")
        except Exception as e:
            # Deliberately best-effort: one bad month must not abort the rest.
            print(f"Error processing data for {year}-{month:02}: {str(e)}")
            continue

    if not monthly_rides:
        raise RuntimeError(
            f"No data could be loaded for the year {year} and specified months: {months}"
        )

    print("Combining all monthly data...")
    combined_rides = pd.concat(monthly_rides, ignore_index=True)
    print("Data loading and processing complete!")

    return combined_rides


In [47]:
rides_2025 = load_and_process_bike_data(2025, months=[1, 2, 3, 4, 5, 6, 7])
# Persist the combined frame built above. The global `rides` still holds the
# single raw month loaded at the top of the notebook, so writing it here would
# save the wrong data.
rides_2025.to_parquet(raw_dir / "rides_2025.parquet", index=False)
print("Converted to parquet!")

Loading data for 2025-01...
Total records: 161,926
Valid records: 161,769
Records dropped: 157 (0.10%)
Successfully processed data for 2025-01.
Loading data for 2025-02...
Total records: 165,742
Valid records: 165,608
Records dropped: 134 (0.08%)
Successfully processed data for 2025-02.
Loading data for 2025-03...
Total records: 271,605
Valid records: 271,385
Records dropped: 220 (0.08%)
Successfully processed data for 2025-03.
Loading data for 2025-04...
Total records: 373,257
Valid records: 372,958
Records dropped: 299 (0.08%)
Successfully processed data for 2025-04.
Loading data for 2025-05...
Total records: 429,077
Valid records: 428,680
Records dropped: 397 (0.09%)
Successfully processed data for 2025-05.
Loading data for 2025-06...
Total records: 487,146
Valid records: 486,592
Records dropped: 554 (0.11%)
Successfully processed data for 2025-06.
Loading data for 2025-07...
Total records: 538,372
Valid records: 537,755
Records dropped: 617 (0.11%)
Successfully processed data for 2

In [48]:
rides_2024 = load_and_process_bike_data(2024)
# Persist the combined frame built above. The global `rides` still holds the
# single raw month loaded at the top of the notebook, so writing it here would
# save the wrong data.
rides_2024.to_parquet(raw_dir / "rides_2024.parquet", index=False)
print("Converted to parquet!")


Loading data for 2024-01...
Total records: 166,200
Valid records: 165,838
Records dropped: 362 (0.22%)
Successfully processed data for 2024-01.
Loading data for 2024-02...
Total records: 231,163
Valid records: 230,805
Records dropped: 358 (0.15%)
Successfully processed data for 2024-02.
Loading data for 2024-03...
Total records: 261,187
Valid records: 260,857
Records dropped: 330 (0.13%)
Successfully processed data for 2024-03.
Loading data for 2024-04...
Total records: 355,732
Valid records: 355,260
Records dropped: 472 (0.13%)
Successfully processed data for 2024-04.
Loading data for 2024-05...
Total records: 439,836
Valid records: 439,164
Records dropped: 672 (0.15%)
Successfully processed data for 2024-05.
Loading data for 2024-06...
Total records: 477,935
Valid records: 477,308
Records dropped: 627 (0.13%)
Successfully processed data for 2024-06.
Loading data for 2024-07...
Total records: 541,530
Valid records: 540,848
Records dropped: 682 (0.13%)
Successfully processed data for 2