In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [12]:
from datetime import datetime, timedelta
import pytz
import pandas as pd

current_date = pd.to_datetime(datetime.now()).ceil("h").tz_localize("US/Eastern")
current_date

Timestamp('2025-05-04 14:00:00-0400', tz='US/Eastern')

In [13]:
import hopsworks
import pandas as pd
from datetime import timedelta
from src.inference import get_feature_store, fetch_predictions

def fetch_hourly_rides(hours):
    current_hour = (pd.Timestamp.now("US/Eastern") - timedelta(hours=hours)).floor('h')

    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1
    )

    query = fg.select_all()
    query = query.filter(fg.start_hour >= current_hour)

    return query.read()

In [14]:
pd.Timestamp.now(tz="US/Eastern").floor('h')

Timestamp('2025-05-04 13:00:00-0400', tz='US/Eastern')

In [15]:
df = fetch_hourly_rides(1)

2025-05-04 13:56:57,309 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-04 13:56:57,315 INFO: Initializing external client
2025-05-04 13:56:57,317 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-04 13:56:58,117 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212635
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.12s) 


In [16]:
df = df.sort_values(['start_hour']).reset_index(drop=True)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype                  
---  ------            --------------  -----                  
 0   start_hour        15 non-null     datetime64[us, Etc/UTC]
 1   start_station_id  15 non-null     float32                
 2   rides             15 non-null     int32                  
dtypes: datetime64[us, Etc/UTC](1), float32(1), int32(1)
memory usage: 372.0 bytes


In [18]:
df["start_hour"] = df["start_hour"].dt.tz_convert("US/Eastern")
df

Unnamed: 0,start_hour,start_station_id,rides
0,2025-05-04 08:00:00-04:00,6140.049805,9
1,2025-05-04 08:00:00-04:00,5905.140137,6
2,2025-05-04 08:00:00-04:00,6822.089844,12
3,2025-05-04 09:00:00-04:00,5905.140137,16
4,2025-05-04 09:00:00-04:00,6140.049805,11
5,2025-05-04 09:00:00-04:00,6822.089844,7
6,2025-05-04 10:00:00-04:00,5905.140137,21
7,2025-05-04 10:00:00-04:00,6822.089844,8
8,2025-05-04 10:00:00-04:00,6140.049805,27
9,2025-05-04 11:00:00-04:00,5905.140137,25


In [19]:
df_pred = fetch_predictions(1)

2025-05-04 13:57:51,529 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-04 13:57:51,537 INFO: Initializing external client
2025-05-04 13:57:51,538 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-04 13:57:52,343 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212635
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.38s) 


In [20]:
df_pred

Unnamed: 0,start_station_id,predicted_demand,start_hour
0,6822.089844,18.0,2025-05-04 12:00:00-04:00
1,5905.140137,21.0,2025-05-04 12:00:00-04:00
2,6140.049805,21.0,2025-05-04 12:00:00-04:00
3,6822.089844,9.0,2025-05-04 14:00:00-04:00
4,5905.140137,23.0,2025-05-04 14:00:00-04:00
5,6140.049805,14.0,2025-05-04 14:00:00-04:00


In [21]:
merged_df = pd.merge(df, df_pred, on=['start_station_id', 'start_hour'])

In [22]:
merged_df

Unnamed: 0,start_hour,start_station_id,rides,predicted_demand
0,2025-05-04 12:00:00-04:00,5905.140137,21,21.0
1,2025-05-04 12:00:00-04:00,6822.089844,15,18.0
2,2025-05-04 12:00:00-04:00,6140.049805,20,21.0


In [23]:
merged_df['difference'] = merged_df['predicted_demand'] - merged_df['rides']

In [24]:
merged_df.sort_values(["start_station_id", "start_hour"])

Unnamed: 0,start_hour,start_station_id,rides,predicted_demand,difference
0,2025-05-04 12:00:00-04:00,5905.140137,21,21.0,0.0
2,2025-05-04 12:00:00-04:00,6140.049805,20,21.0,1.0
1,2025-05-04 12:00:00-04:00,6822.089844,15,18.0,3.0


In [25]:
import pandas as pd
import plotly.express as px
df1 = df
df2 = df_pred

# Merge the DataFrames on 'pickup_location_id' and 'pickup_hour'
merged_df = pd.merge(df1, df2, on=['start_station_id', 'start_hour'])

# Calculate the absolute error
merged_df['absolute_error'] = abs(merged_df['predicted_demand'] - merged_df['rides'])

# Group by 'pickup_hour' and calculate the mean absolute error (MAE)
mae_by_hour = merged_df.groupby('start_hour')['absolute_error'].mean().reset_index()
mae_by_hour.rename(columns={'absolute_error': 'MAE'}, inplace=True)

# Create a Plotly plot
fig = px.line(
    mae_by_hour,
    x='start_hour',
    y='MAE',
    title='Mean Absolute Error (MAE) by Pickup Hour',
    labels={'start_hour': 'Start Hour', 'MAE': 'Mean Absolute Error'},
    markers=True
)

# Show the plot
fig.show()


In [26]:
mae_by_hour["MAE"].mean()

1.3333333333333333