In [1]:
from typing import Optional, List
import plotly .express as px
import pandas as pd

def plot_rides(
        rides: pd.DataFrame,
        locations: Optional[List[int]] = None
    ):
    """
    Show a line chart of rides over time, one line per pickup location.

    Parameters
    ----------
    rides : pd.DataFrame
        Rides dataframe. The chart reads the columns `pickup_datetime` (x),
        `rides` (y) and `pickup_location` (line color).
    locations : Optional[List[int]], optional
        Locations to keep. When None or empty, every row of `rides` is plotted.
        Rows are matched on `pickup_location_id` if that column exists,
        otherwise on `pickup_location`.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`; nothing is returned.
    """

    if locations:
        # The chart colors by `pickup_location`; fall back to that column when
        # the frame carries no separate `pickup_location_id`, so filtering
        # works on the same frames that plot without a filter.
        if "pickup_location_id" in rides.columns:
            location_column = "pickup_location_id"
        else:
            location_column = "pickup_location"
        rides_to_plot = rides[rides[location_column].isin(locations)]
    else:
        rides_to_plot = rides

    fig = px.line(
        rides_to_plot,
        x="pickup_datetime",
        y="rides",
        color="pickup_location",
        template='none'
    )

    fig.show()

    return None

In [2]:
from pathlib import Path
import requests

def download_file(year: int , month: int) -> Path:
    """
    Download one month of yellow-taxi trip data and save it as a parquet file.

    Parameters
    ----------
    year : int
        Year of the file to fetch.
    month : int
        Month of the file to fetch; zero-padded to two digits in the URL and
        in the local file name.

    Returns
    -------
    Path
        `../data/raw/rides_{year}-{month:02d}.parquet`, relative to the current
        working directory. The directory is not created here and must exist.

    Raises
    ------
    Exception
        If the server responds with a status code other than 200.
    """
    URL = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet'
    response = requests.get(URL)

    if response.status_code != 200:
        raise Exception(f"Failed to download {URL}")

    path = Path(f"../data/raw/rides_{year}-{month:02d}.parquet")
    # write_bytes opens, writes and closes the file, so no handle is leaked.
    path.write_bytes(response.content)
    return path

In [3]:
def validate_raw_data(
        rides: pd.DataFrame,
        month: int,
        year: int) -> pd.DataFrame:
    '''
    Keep only the rows whose `pickup_time` lies inside the given month.

    The window is half-open: from the first day of `month`/`year` (inclusive)
    up to the first day of the following month (exclusive). For month 12 the
    upper bound is January 1st of the next year.
    '''

    # Work out which calendar month closes the window.
    if month < 12:
        upper_year, upper_month = year, month + 1
    else:
        upper_year, upper_month = year + 1, 1

    lower_bound = f'{year}-{month:02d}-01'
    upper_bound = f'{upper_year}-{upper_month:02d}-01'

    pickup = rides.pickup_time
    inside_month = (pickup >= lower_bound) & (pickup < upper_bound)

    return rides[inside_month]

In [4]:
import pandas as pd
from tqdm import tqdm

def add_missing_slots(agg_rides: pd.DataFrame) -> pd.DataFrame:
    '''
    Fill in the hours that have no row, per pickup location, with zero rides.

    For every distinct `pickup_location`, the `rides` values are reindexed onto
    one shared hourly range running from the smallest to the largest
    `pickup_hour` in `agg_rides`; hours missing for a location get `rides = 0`.

    Approach taken from:
    https://stackoverflow.com/questions/19324453/add-missing-dates-to-pandas-dataframe/19324591#19324591

    Parameters
    ----------
    agg_rides : pd.DataFrame
        Frame with the columns `pickup_hour`, `pickup_location` and `rides`.

    Returns
    -------
    pd.DataFrame
        Columns `pickup_hour`, `rides`, `pickup_location` with a fresh integer
        index; locations appear in order of first occurrence. An empty input
        yields an empty frame with those three columns.
    '''
    if agg_rides.empty:
        # min()/max() of an empty column cannot define a date range.
        return pd.DataFrame(columns=['pickup_hour', 'rides', 'pickup_location'])

    location_ids = agg_rides['pickup_location'].unique()
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated since pandas 2.2.
    full_range = pd.date_range(
        agg_rides['pickup_hour'].min(), agg_rides['pickup_hour'].max(), freq='h')

    per_location = []

    for location_id in tqdm(location_ids):

        agg_rides_i = agg_rides.loc[
            agg_rides['pickup_location'] == location_id, ['pickup_hour', 'rides']
        ].set_index('pickup_hour')

        agg_rides_i.index = pd.DatetimeIndex(agg_rides_i.index)
        agg_rides_i = agg_rides_i.reindex(full_range, fill_value=0)

        agg_rides_i['pickup_location'] = location_id

        per_location.append(agg_rides_i)

    # Concatenate once: repeated concat inside the loop copies all previous
    # rows on every iteration and seeds the result with an empty frame.
    output = pd.concat(per_location)
    output = output.reset_index().rename(columns={"index": 'pickup_hour'})

    return output


In [5]:
# time series slicing function

# rides = pd.read_parquet("../data/raw/rides_2022-01.parquet")

def create_index_tuples(data, step_size, number_of_features) -> list:
    """
    Build the index triples used to cut a time series into training windows.

    Each triple is `(start, start + number_of_features, target)`. The first
    window starts at 0 with `target = number_of_features + step_size`; both
    `start` and `target` then advance by `step_size` while `target` stays
    strictly below `len(data) - 1`.
    """
    # NOTE(review): the target offset includes step_size, so the gap between the
    # feature window and the target grows with the stride — confirm intended.
    last_position = len(data) - 1
    window_start = 0
    target_position = number_of_features + step_size

    index_tuples = []
    while target_position < last_position:
        feature_end = window_start + number_of_features
        index_tuples.append((window_start, feature_end, target_position))
        window_start += step_size
        target_position += step_size

    return index_tuples

# create_index_tuples(rides, step_size=1,number_of_features=13)

In [6]:
from typing import Optional, List
import plotly.express as px

def plot_rides(
    rides: pd.DataFrame,
    locations: Optional[List[int]] = None
    ):
    """
    Display a line chart of rides per hour, one line per pickup location.

    Parameters
    ----------
    rides : pd.DataFrame
        Rides dataframe. The chart reads the columns `pickup_hour` (x),
        `rides` (y) and `pickup_location` (line color).
    locations : Optional[List[int]], optional
        Values of `pickup_location` to keep. When None or empty, all rows of
        `rides` are plotted.

    Returns
    -------
    None
        The figure is shown with `fig.show()`; nothing is returned.
    """

    # A missing or empty selection means "plot everything".
    if locations:
        keep = rides["pickup_location"].isin(locations)
        selected = rides.loc[keep]
    else:
        selected = rides

    line_options = {
        "x": "pickup_hour",
        "y": "rides",
        "color": "pickup_location",
        "template": 'none',
    }
    px.line(selected, **line_options).show()

    return None