In [1]:
import pandas as pd

rides_df = pd.read_parquet('../data/processed/rides_2023_01.parquet')
rides_df.head(20)

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:32:10,161
1,2023-01-01 00:55:08,43
2,2023-01-01 00:25:04,48
3,2023-01-01 00:03:48,138
4,2023-01-01 00:10:29,107
5,2023-01-01 00:50:34,161
6,2023-01-01 00:09:22,239
7,2023-01-01 00:27:12,142
8,2023-01-01 00:21:44,164
9,2023-01-01 00:39:42,141


In [2]:
rides_df['pickup_hour'] = rides_df['pickup_datetime'].dt.floor('H')
rides_df

  rides_df['pickup_hour'] = rides_df['pickup_datetime'].dt.floor('H')


Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2023-01-01 00:32:10,161,2023-01-01 00:00:00
1,2023-01-01 00:55:08,43,2023-01-01 00:00:00
2,2023-01-01 00:25:04,48,2023-01-01 00:00:00
3,2023-01-01 00:03:48,138,2023-01-01 00:00:00
4,2023-01-01 00:10:29,107,2023-01-01 00:00:00
...,...,...,...
3066761,2023-01-31 23:58:34,107,2023-01-31 23:00:00
3066762,2023-01-31 23:31:09,112,2023-01-31 23:00:00
3066763,2023-01-31 23:01:05,114,2023-01-31 23:00:00
3066764,2023-01-31 23:40:00,230,2023-01-31 23:00:00


In [3]:
aggregated_rides_df = rides_df.groupby(['pickup_hour', 'pickup_location_id']).size().reset_index()
aggregated_rides_df.rename(columns={0: 'ride_count'}, inplace=True)
aggregated_rides_df.head()

Unnamed: 0,pickup_hour,pickup_location_id,ride_count
0,2023-01-01,4,19
1,2023-01-01,7,3
2,2023-01-01,12,1
3,2023-01-01,13,14
4,2023-01-01,24,20


In [4]:
from tqdm import tqdm

def add_missing(aggregated_rides_df: pd.DataFrame) -> pd.DataFrame:
    location_ids = aggregated_rides_df['pickup_location_id'].unique()
    full_range = pd.date_range(aggregated_rides_df['pickup_hour'].min(), aggregated_rides_df['pickup_hour'].max(), freq='H')
    output = pd.DataFrame()

    for location_id in tqdm(location_ids):
        # select rides for a particular location
        aggregated_rides_iter = aggregated_rides_df.loc[aggregated_rides_df.pickup_location_id == location_id, ['pickup_hour', 'ride_count']]

        # adding missing dates with 0 count
        aggregated_rides_iter.set_index('pickup_hour', inplace=True)
        aggregated_rides_iter.index = pd.DatetimeIndex(aggregated_rides_iter.index)
        aggregated_rides_iter = aggregated_rides_iter.reindex(full_range, fill_value=0)

        # add location_id cols
        aggregated_rides_iter['pickup_location_id'] = location_id
        output = pd.concat([output, aggregated_rides_iter])

    # set ride day to be a column instead of index
    output = output.reset_index().rename(columns={'index': 'pickup_hour'})
    return output

In [5]:
full_rides_df = add_missing(aggregated_rides_df)

  full_range = pd.date_range(aggregated_rides_df['pickup_hour'].min(), aggregated_rides_df['pickup_hour'].max(), freq='H')
100%|██████████| 257/257 [00:00<00:00, 1029.27it/s]


In [11]:
from typing import Optional, List
import plotly.express as px

def plot_rides(
        rides_df: pd.DataFrame,
        locations: Optional[List[int]] = None
):
    """
    Plot time-series of rides for all or selected locations
    """
    rides_to_plot = rides_df[rides_df.pickup_location_id.isin(locations)] if locations else rides_df
    fig = px.line(
        rides_to_plot,
        x='pickup_hour',
        y='ride_count',
        color='pickup_location_id',
        title='Number of rides over time',
        template='plotly_dark'
    )

    fig.show()

In [12]:
plot_rides(aggregated_rides_df, locations=[43])

In [13]:
aggregated_rides_df.to_parquet('../data/processed/ts_data_2023_01.parquet')