In [1]:
%%bash
# print the project layout from the parent directory; transient 'logs' and
# '__pycache__' directories are excluded with -I, so the listing stays the same
# without having to delete them by hand beforehand
tree -I '__pycache__|logs' ..

[01;34m..[0m
├── [00mLICENSE[0m
├── [00mMakefile[0m
├── [00mREADME.md[0m
├── [01;34martifacts[0m
├── [00mconfig.yaml[0m
├── [01;34mdata[0m
├── [01;34mnotebooks[0m
│   └── [00mtaxi-demand-forecasting.ipynb[0m
├── [00mpoetry.lock[0m
├── [00mpyproject.toml[0m
└── [01;34msrc[0m
    ├── [00m__init__.py[0m
    ├── [00mfeature_store_api.py[0m
    ├── [00mingest.py[0m
    ├── [00mlogger.py[0m
    ├── [00mpaths.py[0m
    ├── [01;34mpipelines[0m
    │   └── [00mfeature_pipeline.py[0m
    ├── [00mtrain.py[0m
    └── [00mtransform.py[0m

6 directories, 15 files


#### **`Dependencies`**

In [2]:
# enable the autoreload extension in mode 2, so that edits to imported modules
# (e.g., the project's 'src' package) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
import warnings

import pandas as pd
import plotly.express as px

from IPython.display import Markdown
from catboost import CatBoostRegressor
from dotenv import load_dotenv
from lightgbm import LGBMRegressor
from plotly.graph_objects import Figure
from xgboost import XGBRegressor

# taxi-demand-forecasting modules
from src.feature_store_api import get_feature_group
from src.paths import PathConfig
from src.train import NaiveForecast, compute_metrics, get_time_series_splits, train_model
from src.transform import fetch_and_transform

# suppress every warning for the rest of the notebook session
warnings.filterwarnings(action="ignore")

# load the variables defined in the project's '.env' file; the call is the cell's
# trailing expression, so its boolean result is shown as the cell output
load_dotenv(dotenv_path=PathConfig.PROJECT_DIR / ".env")

True

In [4]:
# set the Pandas DataFrame and Series display options
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

#### **`Data ingestion`**

In [5]:
%%bash
# push the latest validated and pre-processed data to Hopsworks; the Makefile
# lives in the parent directory, and '&&' ensures that 'make' is never launched
# from the wrong directory if the 'cd' fails
cd .. && make features

poetry run python src/pipelines/feature_pipeline.py
2024-10-01 13:22:12,643 INFO: Downloading, validating, and pre-processing https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-09.parquet.


100%|██████████| 258/258 [00:05<00:00, 48.53it/s]


2024-10-01 13:22:24,348 INFO: Downloading, validating, and pre-processing https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-10.parquet.


100%|██████████| 256/256 [00:06<00:00, 42.61it/s]


2024-10-01 13:22:35,140 INFO: Uploading the latest NYC taxi demand data to Hopsworks, Project Name: taxi_demand_forecasting, Feature Group: hourly_taxi_rides
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.


Uploading Dataframe: 100.00% |██████████| Rows 79036/79036 | Elapsed Time: 00:09 | Remaining Time: 00:00


Launching job: hourly_taxi_rides_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/903316/jobs/named/hourly_taxi_rides_1_offline_fg_materialization/executions


In [6]:
# read the latest validated and pre-processed data from Hopsworks, convert the
# 'unix_time_ms' epoch milliseconds into datetimes under the name 'pickup_time',
# and order the rows by location ID and pickup time
df: pd.DataFrame = (
    get_feature_group()
    .read()
    .assign(unix_time_ms=lambda rides: pd.to_datetime(rides["unix_time_ms"], unit="ms"))
    .rename(columns={"unix_time_ms": "pickup_time"})
    .sort_values(by=["location_id", "pickup_time"])
    .reset_index(drop=True)
)

# sanity checks: no cell may be null, and each (location ID, pickup time) pair
# may appear only once
assert not df.isna().any().any()
assert not df.duplicated(subset=["location_id", "pickup_time"]).any()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.18s) 


In [7]:
# a list of select location IDs
location_ids: list[int] = [43, 90, 107]

# plot the hourly taxi rides for each location ID in the 'location_ids' list; the
# '@' prefix lets the query read the list directly from the notebook namespace
# instead of interpolating its repr into the query string
fig: Figure = px.line(
    df.query("location_id in @location_ids"),
    x="pickup_time",
    y="n_rides",
    color="location_id",
    labels={
        "pickup_time": "Datetime",
        "n_rides": "Number of taxi rides",
        "location_id": "Location ID"
    },
    title="NYC Hourly Taxi Rides",
    template="plotly_dark"
)
fig.show()

#### **`Data transformation`**

In [8]:
# fetch the latest validated and pre-processed data from Hopsworks and transform it
# into machine learning-ready features and labels; the result is not assigned to a
# name here, so the bare call only renders the returned pd.DataFrame as cell output
fetch_and_transform()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.10s) 
2024-10-01 13:25:27,785 INFO: Transforming the NYC taxi demand data into features and labels.


100%|██████████| 253/253 [00:06<00:00, 38.20it/s]


Unnamed: 0,location_id,pickup_time,day_of_week,hour,avg_24_lags,avg_20_lags,avg_16_lags,avg_12_lags,avg_8_lags,avg_4_lags,lag_24,lag_23,lag_22,lag_21,lag_20,lag_19,lag_18,lag_17,lag_16,lag_15,lag_14,lag_13,lag_12,lag_11,lag_10,lag_9,lag_8,lag_7,lag_6,lag_5,lag_4,lag_3,lag_2,lag_1,target
0,1,2024-09-17 15:00:00,1,15,0.666667,0.65,0.6875,0.916667,1.250,1.50,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1
1,1,2024-09-17 16:00:00,1,16,0.666667,0.70,0.7500,0.916667,1.375,1.75,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1.0,2
2,1,2024-09-17 17:00:00,1,17,0.750000,0.75,0.8750,1.083333,1.375,2.25,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1.0,2.0,1
3,1,2024-09-17 18:00:00,1,18,0.750000,0.75,0.9375,1.166667,1.500,1.75,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1.0,2.0,1.0,0
4,1,2024-09-17 19:00:00,1,19,0.708333,0.75,0.9375,1.166667,1.250,1.00,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,3.0,3.0,1.0,2.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78842,265,2024-10-01 16:00:00,1,16,3.375000,3.65,4.0000,3.666667,4.625,6.00,3.0,3.0,2.0,0.0,2.0,4.0,1.0,2.0,7.0,3.0,2.0,8.0,3.0,2.0,0.0,2.0,2.0,4.0,3.0,4.0,6.0,5.0,7.0,6.0,4
78843,265,2024-10-01 17:00:00,1,17,3.416667,3.75,3.8125,3.750000,4.875,5.50,3.0,2.0,0.0,2.0,4.0,1.0,2.0,7.0,3.0,2.0,8.0,3.0,2.0,0.0,2.0,2.0,4.0,3.0,4.0,6.0,5.0,7.0,6.0,4.0,7
78844,265,2024-10-01 18:00:00,1,18,3.583333,3.90,4.0625,4.166667,5.250,6.00,2.0,0.0,2.0,4.0,1.0,2.0,7.0,3.0,2.0,8.0,3.0,2.0,0.0,2.0,2.0,4.0,3.0,4.0,6.0,5.0,7.0,6.0,4.0,7.0,9
78845,265,2024-10-01 19:00:00,1,19,3.875000,4.30,4.5000,4.916667,6.000,6.50,0.0,2.0,4.0,1.0,2.0,7.0,3.0,2.0,8.0,3.0,2.0,0.0,2.0,2.0,4.0,3.0,4.0,6.0,5.0,7.0,6.0,4.0,7.0,9.0,5


#### **`Model training`**

In [9]:
# fetch the latest validated and pre-processed data from Hopsworks and transform it
# into machine learning-ready features and labels for model training
# NOTE(review): this rebinds 'df', which held the hourly rides data in the 'Data
# ingestion' section (the plot there reads its 'n_rides' column), and it repeats
# the fetch-and-transform call made in the 'Data transformation' section
df: pd.DataFrame = fetch_and_transform()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/903316
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.11s) 
2024-10-01 13:25:54,970 INFO: Transforming the NYC taxi demand data into features and labels.


100%|██████████| 253/253 [00:06<00:00, 36.78it/s]


In [10]:
# train select models on the 'df' pd.DataFrame and extract the 'best' one; the
# annotation lists the three regressor types that may come back
# NOTE(review): per the training log, 'best' appears to mean the lowest average
# validation set RMSE - confirm against src/train.py
model: CatBoostRegressor | LGBMRegressor | XGBRegressor = train_model(df)
# bare trailing expression, so the notebook displays the selected model
model

  0%|          | 0/3 [00:00<?, ?it/s]

2024-10-01 13:26:07,802 INFO: Training initiated for the CatBoostRegressor.


 33%|███▎      | 1/3 [00:06<00:13,  6.90s/it]

2024-10-01 13:26:14,705 INFO: Training initiated for the LGBMRegressor.


 67%|██████▋   | 2/3 [00:10<00:04,  4.94s/it]

2024-10-01 13:26:18,278 INFO: Training initiated for the XGBRegressor.


100%|██████████| 3/3 [00:12<00:00,  4.28s/it]

2024-10-01 13:26:20,649 INFO: Training complete, the LGBMRegressor produced the lowest average validation set RMSE.





In [11]:
# extract a subset from the 'df' pd.DataFrame as the test set; '@split' hands the
# timestamp to the query as-is instead of formatting it into the query string
# NOTE(review): assumes get_time_series_splits(df)[0][-1] is the cutoff timestamp
# and that rows after it were not used to fit 'model' - confirm in src/train.py
split: pd.Timestamp = get_time_series_splits(df)[0][-1]
df_test: pd.DataFrame = df.query("pickup_time > @split")

# get the 'baseline' test set metrics
baseline_metrics: dict[str, float] = compute_metrics(
    df_test["target"], NaiveForecast().predict(df_test)
)

# get the model's test set metrics; the identifier and label columns are dropped
# before the remaining columns are passed to the model
model_metrics: dict[str, float] = compute_metrics(
    df_test["target"],
    model.predict(df_test.drop(columns=["location_id", "pickup_time", "target"]))
)

# compare the baseline and model metrics; indexing by key (rather than '.get')
# raises a KeyError if a metric is missing instead of silently rendering 'None'
comparison: list[str] = [
    f"Baseline RMSE: {baseline_metrics['rmse']}; Model RMSE: {model_metrics['rmse']}",
    f"Baseline R²: {baseline_metrics['r_squared']}; Model R²: {model_metrics['r_squared']}",
]

# a blank line between the two entries makes Markdown render them as separate
# paragraphs
Markdown("\n\n".join(comparison))

Baseline RMSE: 16.2972; Model RMSE: 6.337

Baseline R²: 0.9172; Model R²: 0.9875