In [1]:
%%bash 
# Move up one directory and print the directory tree from there
# (the output lists the project files and the `notebooks` folder).
cd ..
tree
# Optional cleanup, intentionally left disabled. Uncommenting these lines would
# delete the `logs` directory and then `src/__pycache__`, relative to the
# directory entered above.
# rm -r logs
# cd src
# rm -r __pycache__

[01;34m.[0m
├── [00mLICENSE[0m
├── [00mMakefile[0m
├── [00mREADME.md[0m
├── [01;34martifacts[0m
├── [00mconfig.yaml[0m
├── [01;34mdata[0m
├── [01;34mnotebooks[0m
│   └── [00mtaxi-demand-forecasting.ipynb[0m
├── [00mpoetry.lock[0m
├── [00mpyproject.toml[0m
└── [01;34msrc[0m
    ├── [00m__init__.py[0m
    ├── [00mingest.py[0m
    ├── [00mlogger.py[0m
    ├── [00mpaths.py[0m
    └── [01;34mpipelines[0m
        └── [00mfeature_pipeline.py[0m

6 directories, 12 files


#### **`Dependencies`**

In [2]:
import pandas as pd
import plotly.express as px

from plotly.graph_objects import Figure

# taxi-demand-forecasting modules
from src.ingest import download_data

In [3]:
# pandas display options applied for this notebook: show up to 100 rows, and
# place no limit on the number of columns or on the width of a column's contents
display_options = {
    "display.max_rows": 100,
    "display.max_columns": None,
    "display.max_colwidth": None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

#### **`Data ingestion`**

In [4]:
# load the taxi demand data through the project's ingest helper
df: pd.DataFrame = download_data()

# sanity checks on the loaded frame: no missing values in any column, and at
# most one row per (location_id, pickup_time) pair
has_missing_values = df.isna().any().any()
assert not has_missing_values

key_columns = ["location_id", "pickup_time"]
has_duplicate_keys = df.duplicated(subset=key_columns).any()
assert not has_duplicate_keys

100%|██████████| 252/252 [00:03<00:00, 83.20it/s]


In [5]:
# location IDs whose ride counts are plotted
location_ids: list[int] = [43, 90, 107]

# Select the rows for the chosen locations with a boolean mask instead of
# formatting the list into a `DataFrame.query` expression string. The mask uses
# the list's values directly, so the filter does not depend on the elements'
# repr being parseable inside a query expression (NumPy scalar reprs such as
# `np.int64(43)` are not).
selected_rides: pd.DataFrame = df.loc[df["location_id"].isin(location_ids)]

# draw one line per location ID: number of rides against pickup time
fig: Figure = px.line(
    selected_rides,
    x="pickup_time",
    y="n_rides",
    color="location_id",
    labels={
        "pickup_time": "Datetime",
        "n_rides": "Number of taxi rides",
        "location_id": "Location ID"
    },
    title="NYC Hourly Taxi Rides",
    template="plotly_dark"
)
fig.show()