In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Make the parent of the current working directory importable, so packages
# that live one level above this notebook can be imported.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
import pandas as pd

from src.config import TRANSFORMED_DATA_DIR

# Read the combined features-and-target table from the transformed-data directory.
top50_parquet_path = TRANSFORMED_DATA_DIR / "transformed_features_and_target_top50.parquet"
features_and_targets = pd.read_parquet(top50_parquet_path)

In [4]:
# Split the table into the label and the model inputs. Only the "target"
# column is removed from `features`; every other column is kept as-is.
targets = features_and_targets["target"]
features = features_and_targets.drop("target", axis="columns")

In [6]:
# Preview the first rows. Only the final expression of a cell is rendered,
# so this preview is evaluated but not shown.
features.head()
# Select every feature row whose station id is "M32006".
features.loc[features["start_station_id"] == "M32006"]

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,temp_t-2,temp_t-1,wind_speed_t-3,wind_speed_t-2,wind_speed_t-1,precipitation_t-3,precipitation_t-2,precipitation_t-1,start_station_id,hour
258933,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,10.0,...,,,,,,,,,M32006,2024-10-28 22:00:00
258934,0.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,10.0,18.0,...,,,,,,,,,M32006,2024-10-28 23:00:00
258935,0.0,0.0,1.0,1.0,0.0,0.0,4.0,10.0,18.0,12.0,...,,,,,,,,,M32006,2024-10-29 00:00:00
258936,0.0,1.0,1.0,0.0,0.0,4.0,10.0,18.0,12.0,14.0,...,,,,,,,,,M32006,2024-10-29 01:00:00
258937,1.0,1.0,0.0,0.0,4.0,10.0,18.0,12.0,14.0,11.0,...,,,,,,,,,M32006,2024-10-29 02:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267021,25.0,19.0,17.0,7.0,5.0,2.0,4.0,0.0,0.0,0.0,...,,,,,,,,,M32006,2025-09-30 19:00:00
267022,19.0,17.0,7.0,5.0,2.0,4.0,0.0,0.0,0.0,0.0,...,,,,,,,,,M32006,2025-09-30 20:00:00
267023,17.0,7.0,5.0,2.0,4.0,0.0,0.0,0.0,0.0,2.0,...,,,,,,,,,M32006,2025-09-30 21:00:00
267024,7.0,5.0,2.0,4.0,0.0,0.0,0.0,0.0,2.0,12.0,...,,,,,,,,,M32006,2025-09-30 22:00:00


In [7]:
from datetime import timedelta
from typing import Optional

import pandas as pd
import plotly.express as px



def plot_aggregated_time_series(
    features: pd.DataFrame,
    targets: pd.Series,
    row_id: int,
    predictions: Optional[pd.Series] = None,
    step_hours: int = 1,
):
    """
    Plot one row's lagged ride counts, its actual target and, optionally, a prediction.

    The values of the row's ``rides_t-*`` columns (in column order) are drawn
    as a line, followed by the row's target as the final point. The final point
    is placed at the row's ``hour`` timestamp and earlier points are spaced
    ``step_hours`` apart going backwards in time.

    Args:
        features (pd.DataFrame): Feature rows. Must contain the ``rides_t-*``
            lag columns plus ``hour`` and ``start_station_id``.
        targets (pd.Series): Target values, positionally aligned with ``features``.
        row_id (int): Zero-based *position* (not index label) of the row to plot.
        predictions (Optional[pd.Series]): Predicted values, positionally aligned
            with ``features``. When given, the value at ``row_id`` is drawn as a
            red "x" at the final timestamp.
        step_hours (int): Number of hours between consecutive lag columns.
            Defaults to 1, i.e. one lag column per hour.

    Returns:
        The figure created by ``plotly.express.line`` with the extra marker
        trace(s) added.

    Raises:
        ValueError: If ``step_hours`` is not a positive number.
        IndexError: If ``row_id`` is out of bounds for ``features`` or ``targets``.
    """
    if step_hours <= 0:
        raise ValueError(f"step_hours must be positive, got {step_hours}")

    # Positional lookup keeps features and targets aligned regardless of index labels.
    location_features = features.iloc[row_id]
    actual_target = targets.iloc[row_id]

    # Lag columns hold the historical ride counts; the target is the newest point.
    time_series_columns = [
        col for col in features.columns if col.startswith("rides_t-")
    ]
    time_series_values = [location_features[col] for col in time_series_columns]
    time_series_values.append(actual_target)

    # Anchor the axis at the row's own hour and walk backwards, asking for
    # exactly one timestamp per plotted value so x and y always match in length.
    time_series_dates = pd.date_range(
        end=location_features["hour"],
        periods=len(time_series_values),
        freq=f"{step_hours}h",
    )

    # Create the plot title with relevant metadata
    title = f"Pickup Hour: {location_features['hour']}, Location ID: {location_features['start_station_id']}"

    # Base line plot of history + target
    fig = px.line(
        x=time_series_dates,
        y=time_series_values,
        template="plotly_white",
        markers=True,
        title=title,
        labels={"x": "Time", "y": "Ride Counts"},
    )

    # Highlight the actual target value as a green marker at the last timestamp
    fig.add_scatter(
        x=time_series_dates[-1:],
        y=[actual_target],
        line_color="green",
        mode="markers",
        marker_size=10,
        name="Actual Value",
    )

    # Optionally add the prediction as a red "x" at the last timestamp
    if predictions is not None:
        # Use positional access like features/targets; plain sequences and
        # arrays (no .iloc) fall back to ordinary integer indexing.
        if hasattr(predictions, "iloc"):
            predicted_value = predictions.iloc[row_id]
        else:
            predicted_value = predictions[row_id]
        fig.add_scatter(
            x=time_series_dates[-1:],
            y=[predicted_value],
            line_color="red",
            mode="markers",
            marker_symbol="x",
            marker_size=15,
            name="Prediction",
        )

    return fig


In [8]:
# Render the ride-count history and actual target for the row at position 1.
plot_aggregated_time_series(
    features=features,
    targets=targets,
    row_id=1,
)