In [None]:
%%bash
# print the directory tree rooted at the notebook's parent directory
tree ..

#### **`Dependencies`**

In [1]:
# enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import warnings

import pandas as pd
import plotly.express as px

from dotenv import load_dotenv
from plotly.graph_objects import Figure

# taxi-demand-forecasting modules
from src.feature_store_api import get_feature_group
from src.inference import generate_forecast
from src.paths import PathConfig
from src.transform import fetch_and_transform
from src.utils import plot_record

# silence all Python warnings for the remainder of the kernel session
warnings.filterwarnings("ignore")
# load the variables defined in the '.env' file located in the project directory;
# the call is the cell's last expression, so its return value is displayed
load_dotenv(PathConfig.PROJECT_DIR / ".env")

True

In [3]:
# configure how Pandas renders DataFrames and Series in cell outputs
display_options: dict[str, object] = {
    "display.max_rows": 100,       # show at most 100 rows
    "display.max_columns": None,   # no limit on the number of columns shown
    "display.max_colwidth": None,  # no limit on the width of a column's contents
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

#### **`Data ingestion`**

In [None]:
%%bash
# Push the latest validated and pre-processed data to Hopsworks by running the
# 'features' Makefile target from the notebook's parent directory.
# Abort if the directory change fails, so 'make' never runs in the wrong place.
cd .. || exit 1
make features

In [None]:
# read the latest validated and pre-processed data from Hopsworks, convert the
# millisecond Unix timestamps to datetimes exposed as 'pickup_time', and order
# the rows by location and then chronologically with a fresh 0..n-1 index
df: pd.DataFrame = (
    get_feature_group()
    .read()
    .assign(unix_time_ms=lambda frame: pd.to_datetime(frame["unix_time_ms"], unit="ms"))
    .rename(columns={"unix_time_ms": "pickup_time"})
    .sort_values(["location_id", "pickup_time"], ignore_index=True)
)

# sanity checks: no null values anywhere, and at most one row per
# (location, pickup hour) pair
assert not df.isna().any().any()
assert not df.duplicated(subset=["location_id", "pickup_time"]).any()

In [None]:
# the location IDs selected for plotting
location_ids: list[int] = [43, 90, 107]

# human-readable names for the plotted columns
axis_labels: dict[str, str] = {
    "pickup_time": "Datetime",
    "n_rides": "Number of taxi rides",
    "location_id": "Location ID",
}

# draw one line of hourly taxi rides per selected location ID
fig: Figure = px.line(
    data_frame=df.loc[df["location_id"].isin(location_ids)],
    x="pickup_time",
    y="n_rides",
    color="location_id",
    labels=axis_labels,
    title="NYC Hourly Taxi Rides",
    template="plotly_dark",
)
fig.show()

#### **`Data transformation`**

In [None]:
# fetch the latest validated and pre-processed data from Hopsworks, and
# transform it into machine learning-ready features and labels
# NOTE(review): the call is the cell's last expression, so whatever it returns
# is displayed and then discarded — confirm that this is intended
fetch_and_transform()

#### **`Model training and evaluation`**

In [None]:
%%bash
# Evaluate the current model on the latest data and replace/update it if
# necessary, by running the 'training' Makefile target from the notebook's
# parent directory.
# Abort if the directory change fails, so 'make' never runs in the wrong place.
cd .. || exit 1
make training

#### **`Inference`**

In [4]:
# produce each location's one-step forecast (its predicted taxi demand for the
# upcoming hour), then keep the 10 locations with the highest forecasted demand,
# ranked from busiest to least busy with a fresh 0..9 index
df: pd.DataFrame = (
    generate_forecast(fetch_and_transform())
    .sort_values(by="forecast", ascending=False)
    .head(10)
    .reset_index(drop=True)
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.31s) 
2024-10-03 08:15:58,397 INFO: Transforming the NYC taxi demand data into features and labels.


100%|██████████| 253/253 [00:06<00:00, 38.77it/s]

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.






Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Downloading model artifact (0 dirs, 3 files)... DONE

100%|██████████| 250/250 [00:00<00:00, 400.62it/s]


In [5]:
# show one line plot per location among the 10 busiest, based on forecasted
# taxi demand; a plain loop is used because the figures are shown purely for
# their side effect, and binding '_' would shadow IPython's last-output variable
for location_id in df["location_id"]:
    plot_record(df, location_id).show()