In [1]:
%%bash
# print the directory tree of the parent directory
tree ..
# disabled clean-up commands (kept for manual use): they would delete
# 'catboost_info' in the current directory, 'logs' one level up, and the
# '__pycache__' folders inside 'src' and 'src/pipelines'
# rm -r catboost_info
# cd ..
# rm -r logs
# cd src
# rm -r __pycache__
# cd pipelines
# rm -r __pycache__

[01;34m..[0m
├── [00mLICENSE[0m
├── [00mMakefile[0m
├── [00mREADME.md[0m
├── [01;34martifacts[0m
│   └── [00mmodel.pkl[0m
├── [00mconfig.yaml[0m
├── [01;34mdata[0m
├── [01;34mnotebooks[0m
│   └── [00mtaxi-demand-forecasting.ipynb[0m
├── [00mpoetry.lock[0m
├── [00mpyproject.toml[0m
└── [01;34msrc[0m
    ├── [00m__init__.py[0m
    ├── [00mfeature_store_api.py[0m
    ├── [00minference.py[0m
    ├── [00mingest.py[0m
    ├── [00mlogger.py[0m
    ├── [00mpaths.py[0m
    ├── [01;34mpipelines[0m
    │   ├── [00mfeature_pipeline.py[0m
    │   └── [00mtraining_pipeline.py[0m
    ├── [00mtrain.py[0m
    └── [00mtransform.py[0m

6 directories, 18 files


#### **`Dependencies`**

In [2]:
# enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to the project's modules are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
import warnings

import pandas as pd
import plotly.express as px

from dotenv import load_dotenv
from plotly.graph_objects import Figure

# taxi-demand-forecasting modules
from src.feature_store_api import get_feature_group
from src.inference import generate_forecast
from src.paths import PathConfig
from src.transform import fetch_and_transform

# silence all Python warnings from this point on
warnings.filterwarnings("ignore")
# load the environment variables defined in the '.env' file located in
# PathConfig.PROJECT_DIR; the call's return value is shown as the cell output
load_dotenv(PathConfig.PROJECT_DIR / ".env")

True

In [4]:
# configure how Pandas renders DataFrames and Series in cell outputs:
# show at most 100 rows, and never truncate the columns or the cell contents
display_options = {
    "display.max_rows": 100,
    "display.max_columns": None,
    "display.max_colwidth": None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

#### **`Data ingestion`**

In [5]:
%%bash
# push the latest validated and pre-processed data to Hopsworks by running the
# 'features' Makefile target from the parent directory; '&&' ensures that make
# only runs if the directory change succeeded, instead of silently running in
# the wrong directory
cd .. && make features

poetry run python src/pipelines/feature_pipeline.py
2024-10-02 12:31:24,773 INFO: Downloading, validating, and pre-processing https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-09.parquet.


100%|██████████| 258/258 [00:05<00:00, 49.07it/s]


2024-10-02 12:31:38,376 INFO: Downloading, validating, and pre-processing https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-10.parquet.


100%|██████████| 256/256 [00:05<00:00, 42.94it/s]


2024-10-02 12:31:51,305 INFO: Uploading the latest NYC taxi demand data to Hopsworks, Project Name: taxi_demand_forecasting, Feature Group: hourly_taxi_rides
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.


Uploading Dataframe: 100.00% |██████████| Rows 78458/78458 | Elapsed Time: 00:09 | Remaining Time: 00:00


Launching job: hourly_taxi_rides_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/903316/jobs/named/hourly_taxi_rides_1_offline_fg_materialization/executions


In [6]:
# read the latest validated and pre-processed data from the Hopsworks feature group
rides_raw: pd.DataFrame = get_feature_group().read()

# convert the millisecond Unix timestamps to datetimes, expose them under the
# name 'pickup_time', and order the rows by location ID and pickup time
df: pd.DataFrame = (
    rides_raw
    .assign(unix_time_ms=lambda frame: pd.to_datetime(frame["unix_time_ms"], unit="ms"))
    .rename(columns={"unix_time_ms": "pickup_time"})
    .sort_values(by=["location_id", "pickup_time"])
    .reset_index(drop=True)
)

# sanity check: the 'df' pd.DataFrame must not contain any null values ...
n_missing_values = df.isna().sum().sum()
assert n_missing_values == 0

# ... nor more than one row per (location ID, pickup time) pair
n_duplicate_rows = df.duplicated(subset=["location_id", "pickup_time"]).sum()
assert n_duplicate_rows == 0

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.21s) 


In [7]:
# the location IDs whose ride counts are plotted
location_ids: list[int] = [43, 90, 107]

# keep only the rows that belong to one of the selected location IDs
selected_rides = df.loc[df["location_id"].isin(location_ids)]

# human-readable axis and legend titles for the plotted columns
axis_labels = {
    "pickup_time": "Datetime",
    "n_rides": "Number of taxi rides",
    "location_id": "Location ID",
}

# draw one line of hourly taxi rides per selected location ID
fig: Figure = px.line(
    selected_rides,
    x="pickup_time",
    y="n_rides",
    color="location_id",
    labels=axis_labels,
    title="NYC Hourly Taxi Rides",
    template="plotly_dark",
)
fig.show()

#### **`Data transformation`**

In [8]:
# fetch the latest validated and pre-processed data from Hopsworks, and
# transform it into machine learning-ready features and labels; the result is
# not assigned to a variable, it is only rendered as this cell's output
fetch_and_transform()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.21s) 
2024-10-02 12:37:45,820 INFO: Transforming the NYC taxi demand data into features and labels.


100%|██████████| 253/253 [00:06<00:00, 39.10it/s]


Unnamed: 0,location_id,pickup_time,day_of_week,hour,avg_24_lags,avg_20_lags,avg_16_lags,avg_12_lags,avg_8_lags,avg_4_lags,lag_24,lag_23,lag_22,lag_21,lag_20,lag_19,lag_18,lag_17,lag_16,lag_15,lag_14,lag_13,lag_12,lag_11,lag_10,lag_9,lag_8,lag_7,lag_6,lag_5,lag_4,lag_3,lag_2,lag_1,target
0,1,2024-09-17 15:00:00,1,15,0.666667,0.65,0.6875,0.916667,1.250,1.50,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1
1,1,2024-09-17 16:00:00,1,16,0.666667,0.70,0.7500,0.916667,1.375,1.75,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1.0,2
2,1,2024-09-17 17:00:00,1,17,0.750000,0.75,0.8750,1.083333,1.375,2.25,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1.0,2.0,1
3,1,2024-09-17 18:00:00,1,18,0.750000,0.75,0.9375,1.166667,1.500,1.75,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1.0,2.0,1.0,0
4,1,2024-09-17 19:00:00,1,19,0.708333,0.75,0.9375,1.166667,1.250,1.00,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1.0,2.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83936,265,2024-10-02 15:00:00,2,15,4.041667,3.55,2.8125,1.500000,1.625,2.00,6.0,4.0,7.0,9.0,5.0,6.0,6.0,9.0,10.0,13.0,2.0,2.0,2.0,1.0,0.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0,6
83937,265,2024-10-02 16:00:00,2,16,4.041667,3.60,2.5625,1.833333,2.250,3.00,4.0,7.0,9.0,5.0,6.0,6.0,9.0,10.0,13.0,2.0,2.0,2.0,1.0,0.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0,6.0,3
83938,265,2024-10-02 17:00:00,2,17,4.000000,3.45,1.9375,2.000000,2.375,3.25,7.0,9.0,5.0,6.0,6.0,9.0,10.0,13.0,2.0,2.0,2.0,1.0,0.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0,6.0,3.0,7
83939,265,2024-10-02 18:00:00,2,18,4.000000,3.50,2.2500,2.583333,3.250,4.50,9.0,5.0,6.0,6.0,9.0,10.0,13.0,2.0,2.0,2.0,1.0,0.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0,6.0,3.0,7.0,6


#### **`Model training and evaluation`**

In [9]:
%%bash
# evaluate the current model on the latest data and replace/update it if
# necessary by running the 'training' Makefile target from the parent
# directory; '&&' ensures that make only runs if the directory change
# succeeded, instead of silently running in the wrong directory
cd .. && make training

poetry run python src/pipelines/training_pipeline.py
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.36s) 
2024-10-02 12:38:34,042 INFO: Transforming the NYC taxi demand data into features and labels.


100%|██████████| 253/253 [00:06<00:00, 40.42it/s]


Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.


100%|██████████| 249/249 [00:00<00:00, 416.75it/s]


2024-10-02 12:38:44,603 INFO: The current forecasting model is fine.


#### **`Inference`**

In [10]:
# fetch and transform the latest data, then produce the one-step forecast of
# every location ID, i.e. its predicted taxi demand for the upcoming hour
generate_forecast(fetch_and_transform())

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.19s) 
2024-10-02 12:39:14,725 INFO: Transforming the NYC taxi demand data into features and labels.


100%|██████████| 253/253 [00:06<00:00, 39.05it/s]

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.






Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Downloading model artifact (0 dirs, 3 files)... DONE

100%|██████████| 249/249 [00:00<00:00, 409.26it/s]


Unnamed: 0,location_id,pickup_time,day_of_week,hour,avg_24_lags,avg_20_lags,avg_16_lags,avg_12_lags,avg_8_lags,avg_4_lags,lag_24,lag_23,lag_22,lag_21,lag_20,lag_19,lag_18,lag_17,lag_16,lag_15,lag_14,lag_13,lag_12,lag_11,lag_10,lag_9,lag_8,lag_7,lag_6,lag_5,lag_4,lag_3,lag_2,lag_1,forecast
0,1,2024-10-02 20:00:00,2,20,0.291667,0.30,0.3750,0.416667,0.500,0.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0
1,2,2024-09-26 04:00:00,3,4,0.041667,0.05,0.0625,0.083333,0.125,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,3,2024-10-02 20:00:00,2,20,0.166667,0.20,0.2500,0.250000,0.375,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0
3,4,2024-10-02 20:00:00,2,20,2.583333,2.20,2.5000,2.416667,2.500,2.00,3.0,4.0,8.0,3.0,2.0,1.0,0.0,1.0,2.0,0.0,3.0,6.0,6.0,3.0,0.0,0.0,2.0,4.0,1.0,5.0,1.0,1.0,5.0,1.0,1
4,6,2024-10-02 20:00:00,2,20,0.125000,0.00,0.0000,0.000000,0.000,0.00,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,261,2024-10-02 20:00:00,2,20,20.666667,21.95,26.8125,34.583333,42.375,45.00,20.0,18.0,15.0,4.0,6.0,2.0,1.0,1.0,3.0,2.0,2.0,7.0,13.0,17.0,19.0,27.0,31.0,38.0,39.0,51.0,49.0,49.0,55.0,27.0,23
245,262,2024-10-02 20:00:00,2,20,58.125000,66.45,82.3125,93.750000,84.625,83.50,25.0,19.0,14.0,8.0,4.0,2.0,3.0,3.0,5.0,14.0,57.0,116.0,127.0,123.0,100.0,98.0,90.0,78.0,68.0,107.0,94.0,112.0,86.0,42.0,29
246,263,2024-10-02 20:00:00,2,20,67.916667,69.90,84.4375,96.416667,97.750,99.25,88.0,66.0,46.0,32.0,21.0,10.0,7.0,9.0,12.0,17.0,51.0,114.0,113.0,103.0,75.0,84.0,94.0,91.0,96.0,104.0,90.0,108.0,99.0,100.0,82
247,264,2024-10-02 20:00:00,2,20,32.875000,32.25,38.5625,47.333333,53.375,56.25,46.0,35.0,35.0,28.0,13.0,8.0,6.0,1.0,3.0,6.0,12.0,28.0,36.0,27.0,39.0,39.0,52.0,55.0,46.0,49.0,45.0,56.0,68.0,56.0,48
