In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell execution, so edits to project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Make the project root importable so that the `from src...` imports resolve.
# The location can be overridden through the TAXI_DEMAND_PREDICTOR_ROOT
# environment variable; the previously hard-coded path remains the fallback so
# existing setups keep working unchanged.
PROJECT_ROOT = (
    os.environ.get("TAXI_DEMAND_PREDICTOR_ROOT")
    or "/home/mcamara/taxi_demand_predictor"
)

# Guard against piling up duplicate entries when this cell is re-run.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
import zipfile
from datetime import datetime

import requests
import numpy as np
import pandas as pd

# plotting libraries
import streamlit as st
import geopandas as gpd
import pydeck as pdk

from src.inference import load_predictions_from_store
from src.paths import DATA_DIR
from src.plot import plot_one_sample

In [4]:
# Current time in UTC, truncated to the start of the hour and kept
# timezone-naive (same value the previous `datetime.utcnow()` version produced).
# `datetime.utcnow()` is deprecated since Python 3.12 and the upper-case 'H'
# frequency alias since pandas 2.2, so start from a tz-aware UTC timestamp,
# drop the tz info, and floor with the lower-case 'h' alias.
current_date = pd.Timestamp.now(tz='UTC').tz_localize(None).floor('h')
# current_date = pd.Timestamp('2023-03-01 11:00:00')
current_date

Timestamp('2023-03-29 09:00:00')

In [5]:
# fetch file with shape data
from geopandas.geodataframe import GeoDataFrame

def load_shape_data_file(timeout: float = 30.0) -> GeoDataFrame:
    """Fetches the remote file with shape data, that we later use to plot the
    different pickup_location_ids on the map of NYC.

    The zip archive is downloaded into DATA_DIR, extracted into
    DATA_DIR/taxi_zones, and the contained shapefile is loaded and
    re-projected to EPSG:4326.

    Args:
        timeout: seconds to wait for the remote server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Raises:
        Exception: raised when the server answers with a status code other
        than 200. Connection errors and timeouts surface as the exceptions
        raised by `requests` itself.

    Returns:
        GeoDataFrame: columns -> (OBJECTID, Shape_Leng, Shape_Area, zone,
        LocationID, borough, geometry)
    """
    # download file
    URL = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'
    response = requests.get(URL, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'{URL} is not available')

    # persist the archive; the context manager guarantees the handle is
    # flushed and closed before the file is re-opened for extraction
    path = DATA_DIR / 'taxi_zones.zip'
    with open(path, "wb") as zip_file:
        zip_file.write(response.content)

    # unzip file
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR / 'taxi_zones')

    # load and return shape file
    return gpd.read_file(DATA_DIR / 'taxi_zones/taxi_zones.shp').to_crs('epsg:4326')




In [6]:
# Download, extract and load the taxi-zone shapes (performs a network request
# and writes files under DATA_DIR).
geo_df = load_shape_data_file()

In [9]:
from datetime import timedelta
from src.inference import load_predictions_from_store

# Window of pickup hours to fetch: from one hour before `current_date` up to
# `current_date` itself.
window_end = current_date
window_start = window_end - timedelta(hours=1)

predictions_df = load_predictions_from_store(
    from_pickup_hour=window_start,
    to_pickup_hour=window_end,
)


Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/24729
Connected. Call `.close()` to terminate connection gracefully.
Feature view model_predictions_feature_view               already existed. Skipped creation.
Fetching predictions for `pickup_hours` between 2023-03-29 08:00:00  and 2023-03-29 09:00:00


2023-03-29 15:14:37.706 INFO    pyhive.hive: USE `taxi_demand_mc_featurestore`
2023-03-29 15:14:38.180 INFO    pyhive.hive: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_mc_featurestore`.`time_series_hourly_feature_group_1` `fg0`
WHERE `fg0`.`pickup_hour` >= TIMESTAMP '2023-03-28 08:00:00.000' AND `fg0`.`pickup_hour` <= TIMESTAMP '2023-03-30 09:00:00.000'


In [10]:
# Show the fetched predictions (last expression -> rich DataFrame display).
predictions_df

Unnamed: 0,pickup_hour,rides,pickup_location_id
9008,2023-03-29 08:00:00,2,1
12521,2023-03-29 08:00:00,0,2
11353,2023-03-29 08:00:00,2,3
2354,2023-03-29 08:00:00,10,4
6003,2023-03-29 08:00:00,0,5
...,...,...,...
6231,2023-03-29 09:00:00,38,261
6808,2023-03-29 09:00:00,256,262
11431,2023-03-29 09:00:00,290,263
12808,2023-03-29 09:00:00,84,264


In [11]:
# The predictions are considered ready when at least one row of
# `predictions_df` has `pickup_hour` equal to `current_date`.
# `.empty` is already a bool, so it is simply negated.
next_hour_predictions_ready = \
    not predictions_df[predictions_df.pickup_hour == current_date].empty

next_hour_predictions_ready

True

In [12]:
from src.inference import load_batch_of_features_from_store

# Load the batch of model input features for `current_date`.
# NOTE(review): per the logged output of this cell, the call connects to
# Hopsworks and queries the hourly time-series feature group — confirm in
# `src.inference`.
features = load_batch_of_features_from_store(current_date)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/24729
Connected. Call `.close()` to terminate connection gracefully.
Fetching data from 2023-03-01 09:00:00 to 2023-03-29 08:00:00


2023-03-29 15:24:09.811 INFO    pyhive.hive: USE `taxi_demand_mc_featurestore`
2023-03-29 15:24:10.279 INFO    pyhive.hive: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_mc_featurestore`.`time_series_hourly_feature_group_1` `fg0`
WHERE `fg0`.`pickup_hour` >= TIMESTAMP '2023-02-28 09:00:00.000' AND `fg0`.`pickup_hour` <= TIMESTAMP '2023-03-30 08:00:00.000'


In [13]:


# Show the loaded feature batch (last expression -> rich DataFrame display).
features



Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
50,2.0,1.0,0.0,4.0,4.0,2.0,8.0,4.0,0.0,2.0,...,0.0,0.0,0.0,2.0,0.0,4.0,0.0,2.0,2023-03-29 09:00:00,1
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-03-29 09:00:00,2
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2023-03-29 09:00:00,3
202,7.0,5.0,1.0,6.0,2.0,14.0,2.0,2.0,12.0,8.0,...,2.0,2.0,0.0,0.0,0.0,0.0,6.0,10.0,2023-03-29 09:00:00,4
256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-03-29 09:00:00,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,17.0,24.0,23.0,40.0,66.0,88.0,62.0,46.0,62.0,76.0,...,6.0,0.0,0.0,0.0,4.0,8.0,28.0,36.0,2023-03-29 09:00:00,261
69,130.0,123.0,116.0,200.0,204.0,218.0,234.0,266.0,256.0,222.0,...,0.0,4.0,0.0,4.0,30.0,118.0,252.0,216.0,2023-03-29 09:00:00,262
49,145.0,156.0,99.0,254.0,248.0,278.0,260.0,294.0,296.0,376.0,...,44.0,20.0,18.0,14.0,44.0,150.0,238.0,282.0,2023-03-29 09:00:00,263
241,50.0,61.0,63.0,118.0,126.0,122.0,138.0,106.0,150.0,170.0,...,14.0,4.0,2.0,4.0,4.0,14.0,86.0,86.0,2023-03-29 09:00:00,264


In [None]:
# NOTE(review): leftover scratch cell that was never executed; it is not part
# of the analysis and can be deleted before sharing the notebook.
print("Hello")