In [None]:
from typing import Optional, List
import plotly .express as px

def plot_rides(
        rides: "pd.DataFrame",
        locations: Optional[List[int]] = None
    ) -> None:
    """
    Draw an interactive line chart of rides over time, one line per location.

    Parameters
    ----------
    rides : pd.DataFrame
        Rides dataframe. The code reads the columns ``pickup_datetime``
        (x axis), ``rides`` (y axis) and ``pickup_location`` (line colour and
        filter key).
    locations : Optional[List[int]], optional
        Values of ``pickup_location`` to keep. ``None`` or an empty list
        plots every location. The default is None.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``; nothing is returned.
    """
    # The `rides` annotation is quoted so that defining this function does not
    # require the name `pd` to be bound when the cell runs.

    # Filter on the same column that drives `color` below, so the filter and
    # the legend always refer to the same identifiers.
    if locations:
        rides_to_plot = rides[rides["pickup_location"].isin(locations)]
    else:
        rides_to_plot = rides

    fig = px.line(
        rides_to_plot,
        x="pickup_datetime",
        y="rides",
        color="pickup_location",
        template='none'
    )

    fig.show()

    return None

In [None]:
from pathlib import Path
import requests

def download_file(year: int , month: int) -> Path:
    """
    Download one month of `yellow_tripdata` and store it as a local parquet file.

    Parameters
    ----------
    year : int
        Year used in the remote and local file names.
    month : int
        Month used in the file names; it is zero-padded to two digits.

    Returns
    -------
    Path
        Location of the written file,
        ``../data/raw/rides_{year}-{month:02d}.parquet`` (relative to the
        current working directory).

    Raises
    ------
    Exception
        If the server answers with any status code other than 200.
    """
    URL = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet'
    response = requests.get(URL)

    if response.status_code != 200:
        raise Exception(f"Failed to download {URL}")

    path = Path(f"../data/raw/rides_{year}-{month:02d}.parquet")
    # Create the target folder on first use instead of failing on a fresh checkout.
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_bytes opens, writes and closes the file, so no handle is leaked.
    path.write_bytes(response.content)
    return path

In [2]:
import pandas as pd
from tqdm import tqdm

def add_missing_slots(agg_rides: pd.DataFrame) -> pd.DataFrame:
    '''
    Give every pickup location a row for every hour in the observed range.

    The hourly range runs from the smallest to the largest ``pickup_hour`` in
    ``agg_rides``. For each distinct ``pickup_location`` the ``rides`` values
    are reindexed onto that range, and hours with no row get ``rides = 0``.

    Parameters
    ----------
    agg_rides : pd.DataFrame
        Frame with the columns ``pickup_hour``, ``rides`` and
        ``pickup_location``.

    Returns
    -------
    pd.DataFrame
        Columns ``pickup_hour``, ``rides``, ``pickup_location`` with a fresh
        0..n-1 index; the rows of each location are contiguous.

    Reference:
    https://stackoverflow.com/questions/19324453/add-missing-dates-to-pandas-dataframe/19324591#19324591
    '''
    location_ids = agg_rides['pickup_location'].unique()
    # A Timedelta is used as the frequency so the call does not depend on the
    # spelling of the hourly alias ('H' vs 'h') across pandas versions.
    full_range = pd.date_range(
        agg_rides['pickup_hour'].min(),
        agg_rides['pickup_hour'].max(),
        freq=pd.Timedelta(hours=1),
    )

    # Collect the per-location frames and concatenate once at the end;
    # concatenating inside the loop re-copies the accumulated rows each time.
    per_location = []

    for location_id in tqdm(location_ids):

        rides_i = agg_rides.loc[
            agg_rides.pickup_location == location_id, ['pickup_hour', 'rides']
        ].set_index('pickup_hour')
        rides_i.index = pd.DatetimeIndex(rides_i.index)

        # Hours absent for this location are created with rides = 0.
        rides_i = rides_i.reindex(full_range, fill_value=0)
        rides_i['pickup_location'] = location_id

        per_location.append(rides_i)

    # Name the hourly index explicitly before turning it back into a column.
    output = pd.concat(per_location).rename_axis('pickup_hour').reset_index()

    return output


In [14]:
# time series slicing function

# rides = pd.read_parquet("../data/raw/rides_2022-01.parquet")

def create_index_tuples(data, step_size: int, number_of_features: int) -> list:
    """
    Create a list of tuples that are being used to index a time series dataframe.

    Each tuple is ``(start, start + number_of_features, target)``. The first
    tuple has ``start = 0`` and ``target = number_of_features + step_size``;
    both values then advance by ``step_size``. Tuples are produced while
    ``target < len(data) - 1``.

    Parameters
    ----------
    data
        Any object supporting ``len()``; only its length is used.
    step_size : int
        Increment between consecutive tuples. Must be positive.
    number_of_features : int
        Distance between the first and second element of each tuple.

    Returns
    -------
    list
        The index tuples, empty when ``data`` is too short for a single one.

    Raises
    ------
    ValueError
        If ``step_size`` is not positive, because the loop would never advance.
    """
    # NOTE(review): the target offset includes `step_size`, and the bound is a
    # strict `<` against `len(data) - 1` -- confirm both are intended.
    if step_size <= 0:
        raise ValueError(f"step_size must be a positive integer, got {step_size!r}")

    output_list = []

    stop_index = len(data) - 1
    starting_index = 0
    targets = starting_index + number_of_features + step_size

    while targets < stop_index:
        output_list.append((starting_index, starting_index + number_of_features, targets))
        targets += step_size
        starting_index += step_size

    return output_list

# create_index_tuples(rides, step_size=1,number_of_features=13)

[(0, 13, 14),
 (1, 14, 15),
 (2, 15, 16),
 (3, 16, 17),
 (4, 17, 18),
 (5, 18, 19),
 (6, 19, 20),
 (7, 20, 21),
 (8, 21, 22),
 (9, 22, 23),
 (10, 23, 24),
 (11, 24, 25),
 (12, 25, 26),
 (13, 26, 27),
 (14, 27, 28),
 (15, 28, 29),
 (16, 29, 30),
 (17, 30, 31),
 (18, 31, 32),
 (19, 32, 33),
 (20, 33, 34),
 (21, 34, 35),
 (22, 35, 36),
 (23, 36, 37),
 (24, 37, 38),
 (25, 38, 39),
 (26, 39, 40),
 (27, 40, 41),
 (28, 41, 42),
 (29, 42, 43),
 (30, 43, 44),
 (31, 44, 45),
 (32, 45, 46),
 (33, 46, 47),
 (34, 47, 48),
 (35, 48, 49),
 (36, 49, 50),
 (37, 50, 51),
 (38, 51, 52),
 (39, 52, 53),
 (40, 53, 54),
 (41, 54, 55),
 (42, 55, 56),
 (43, 56, 57),
 (44, 57, 58),
 (45, 58, 59),
 (46, 59, 60),
 (47, 60, 61),
 (48, 61, 62),
 (49, 62, 63),
 (50, 63, 64),
 (51, 64, 65),
 (52, 65, 66),
 (53, 66, 67),
 (54, 67, 68),
 (55, 68, 69),
 (56, 69, 70),
 (57, 70, 71),
 (58, 71, 72),
 (59, 72, 73),
 (60, 73, 74),
 (61, 74, 75),
 (62, 75, 76),
 (63, 76, 77),
 (64, 77, 78),
 (65, 78, 79),
 (66, 79, 80),
 (67,