In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# local project modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the project's `src` package importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook has to point it at their own checkout of the project.
PROJECT_ROOT = "/home/mcamara/taxi_demand_predictor"

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
import zipfile
from datetime import datetime

import requests
import numpy as np
import pandas as pd

# plotting libraries
import streamlit as st
import geopandas as gpd
import pydeck as pdk

from src.inference import load_predictions_from_store
from src.paths import DATA_DIR
from src.plot import plot_one_sample

In [4]:
# Current UTC time truncated to the start of the hour, kept tz-naive so it has
# the same form as the value the previous `datetime.utcnow()` call produced.
# `datetime.utcnow()` is deprecated since Python 3.12 and the 'H' frequency
# alias is deprecated since pandas 2.2, hence `Timestamp.now(tz=...)` and 'h'.
current_date = pd.Timestamp.now(tz='UTC').tz_localize(None).floor('h')
# For a reproducible run, pin the timestamp instead:
# current_date = pd.Timestamp('2023-03-01 11:00:00')
current_date

Timestamp('2023-03-30 09:00:00')

In [5]:
# fetch file with shape data
from geopandas.geodataframe import GeoDataFrame

def load_shape_data_file() -> GeoDataFrame:
    """Download the NYC taxi-zone shapefile and load it as a GeoDataFrame.

    The zip archive is fetched from the remote server, saved under
    `DATA_DIR`, extracted into `DATA_DIR / 'taxi_zones'`, and the shapefile
    it contains is read and re-projected to EPSG:4326. The result is later
    used to plot the different pickup_location_ids on the map of NYC.

    Raises:
        Exception: raised when the server answers with a status code other
        than 200. Connection failures and timeouts surface as the exceptions
        raised by `requests.get` itself.

    Returns:
        GeoDataFrame: columns -> (OBJECTID, Shape_Leng, Shape_Area, zone,
        LocationID, borough, geometry)
    """
    URL = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'
    zip_path = DATA_DIR / 'taxi_zones.zip'

    # download file; without a timeout a stalled connection would block the
    # notebook indefinitely
    response = requests.get(URL, timeout=60)
    if response.status_code != 200:
        raise Exception(f'{URL} is not available')

    # the context manager guarantees the archive is flushed and closed before
    # it is opened again for extraction
    with open(zip_path, "wb") as zip_file:
        zip_file.write(response.content)

    # unzip file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR / 'taxi_zones')

    # load shape file and re-project it to latitude/longitude (EPSG:4326)
    return gpd.read_file(DATA_DIR / 'taxi_zones/taxi_zones.shp').to_crs('epsg:4326')




In [6]:
# Download the taxi-zone shapefile and load it as a GeoDataFrame in EPSG:4326.
geo_df = load_shape_data_file()

In [7]:
from datetime import timedelta
from src.inference import load_predictions_from_store

# Request the stored predictions for the window that starts one hour before
# `current_date` and ends at `current_date`.
previous_hour = current_date - timedelta(hours=1)
predictions_df = load_predictions_from_store(
    from_pickup_hour=previous_hour,
    to_pickup_hour=current_date,
)


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/24729
Connected. Call `.close()` to terminate connection gracefully.
Feature view model_predictions_feature_view               already existed. Skipped creation.
Fetching predictions for `pickup_hours` between 2023-03-30 08:00:00  and 2023-03-30 09:00:00


2023-03-30 11:42:35.756 INFO    pyhive.hive: USE `taxi_demand_mc_featurestore`
2023-03-30 11:42:36.285 INFO    pyhive.hive: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_mc_featurestore`.`time_series_hourly_feature_group_1` `fg0`
WHERE `fg0`.`pickup_hour` >= TIMESTAMP '2023-03-29 08:00:00.000' AND `fg0`.`pickup_hour` <= TIMESTAMP '2023-03-31 09:00:00.000'


In [8]:
# Display the predictions fetched above (the rendered table is truncated in the middle).
predictions_df

Unnamed: 0,pickup_hour,rides,pickup_location_id
8960,2023-03-30 08:00:00,2,1
10310,2023-03-30 08:00:00,0,2
536,2023-03-30 08:00:00,0,3
3608,2023-03-30 08:00:00,18,4
5046,2023-03-30 08:00:00,0,5
...,...,...,...
8665,2023-03-30 09:00:00,18,261
1012,2023-03-30 09:00:00,143,262
1578,2023-03-30 09:00:00,131,263
3355,2023-03-30 09:00:00,42,264


In [9]:
# True when at least one fetched prediction row has `pickup_hour` equal to
# `current_date`, i.e. predictions for the current hour are already available.
next_hour_predictions_ready = \
    not predictions_df[predictions_df.pickup_hour == current_date].empty

next_hour_predictions_ready

True

In [10]:
from src.inference import load_batch_of_features_from_store

# Load the batch of features for `current_date`. Per the logged output below,
# this covers the 672 hourly values (28 days) that precede `current_date`.
features = load_batch_of_features_from_store(current_date)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/24729
Connected. Call `.close()` to terminate connection gracefully.
Fetching data from 2023-03-02 09:00:00 to 2023-03-30 08:00:00


2023-03-30 11:42:51.234 INFO    pyhive.hive: USE `taxi_demand_mc_featurestore`
2023-03-30 11:42:51.743 INFO    pyhive.hive: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_mc_featurestore`.`time_series_hourly_feature_group_1` `fg0`
WHERE `fg0`.`pickup_hour` >= TIMESTAMP '2023-03-01 09:00:00.000' AND `fg0`.`pickup_hour` <= TIMESTAMP '2023-03-31 08:00:00.000'


In [11]:


# Display the feature batch: lagged `rides_previous_N_hour` columns plus the
# `pickup_hour` and `pickup_location_id` of each row.
features



Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
46,0.0,6.0,2.0,2.0,8.0,4.0,2.0,2.0,6.0,8.0,...,0.0,0.0,0.0,2.0,2.0,2.0,0.0,2.0,2023-03-30 09:00:00,1
74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-03-30 09:00:00,2
94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-03-30 09:00:00,3
197,18.0,6.0,2.0,6.0,4.0,4.0,8.0,16.0,12.0,22.0,...,2.0,0.0,0.0,0.0,0.0,6.0,16.0,18.0,2023-03-30 09:00:00,4
253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-03-30 09:00:00,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,22.0,48.0,56.0,64.0,64.0,58.0,110.0,66.0,72.0,70.0,...,4.0,4.0,0.0,8.0,4.0,4.0,24.0,22.0,2023-03-30 09:00:00,261
65,256.0,262.0,262.0,250.0,202.0,234.0,276.0,244.0,276.0,264.0,...,14.0,4.0,2.0,8.0,42.0,126.0,260.0,206.0,2023-03-30 09:00:00,262
140,320.0,276.0,252.0,278.0,256.0,346.0,308.0,364.0,334.0,434.0,...,44.0,36.0,10.0,22.0,54.0,152.0,236.0,268.0,2023-03-30 09:00:00,263
236,116.0,156.0,168.0,122.0,126.0,146.0,162.0,162.0,224.0,210.0,...,8.0,6.0,14.0,8.0,8.0,30.0,62.0,72.0,2023-03-30 09:00:00,264
