In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [4]:
import hopsworks
import pandas as pd
from datetime import timedelta
from src.inference import get_feature_store, fetch_predictions

def fetch_hourly_rides(hours):
    current_hour = (pd.Timestamp.now(tz="Etc/UTC") - timedelta(hours=hours)).floor('h')

    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1
    )

    query = fg.select_all()
    query = query.filter(fg.pickup_hour >= current_hour)

    return query.read()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
pd.Timestamp.now(tz="Etc/UTC").floor('h')

Timestamp('2025-02-24 20:00:00+0000', tz='Etc/UTC')

In [6]:
df = fetch_hourly_rides(12)

2025-02-24 15:37:07,150 INFO: Initializing external client
2025-02-24 15:37:07,152 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-24 15:37:09,314 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212635
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (7.85s) 


In [7]:
df = df.sort_values(['pickup_hour']).reset_index(drop=True)

In [8]:
df

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-02-24 03:00:00+00:00,162,1
1,2025-02-24 03:00:00+00:00,55,0
2,2025-02-24 03:00:00+00:00,191,0
3,2025-02-24 03:00:00+00:00,151,0
4,2025-02-24 03:00:00+00:00,79,12
...,...,...,...
4513,2025-02-24 20:00:00+00:00,233,50
4514,2025-02-24 20:00:00+00:00,256,2
4515,2025-02-24 20:00:00+00:00,122,0
4516,2025-02-24 20:00:00+00:00,17,0


In [54]:
df_pred = fetch_predictions(12)

2025-02-23 10:24:18,099 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-02-23 10:24:18,111 INFO: Initializing external client
2025-02-23 10:24:18,111 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-23 10:24:19,001 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212635
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.58s) 


In [55]:
df_pred

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour
0,193,2.0,2025-02-22 23:00:00+00:00
1,125,27.0,2025-02-22 23:00:00+00:00
2,72,0.0,2025-02-22 23:00:00+00:00
3,20,0.0,2025-02-22 23:00:00+00:00
4,216,1.0,2025-02-22 23:00:00+00:00
...,...,...,...
4107,125,20.0,2025-02-23 15:00:00+00:00
4108,24,10.0,2025-02-23 15:00:00+00:00
4109,81,0.0,2025-02-23 15:00:00+00:00
4110,19,0.0,2025-02-23 15:00:00+00:00


In [56]:
merged_df = pd.merge(df, df_pred, on=['pickup_location_id', 'pickup_hour'])

In [57]:
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand
0,2025-02-22 23:00:00+00:00,96,0,0.0
1,2025-02-22 23:00:00+00:00,178,0,0.0
2,2025-02-22 23:00:00+00:00,88,47,14.0
3,2025-02-22 23:00:00+00:00,236,112,270.0
4,2025-02-22 23:00:00+00:00,9,0,0.0
...,...,...,...,...
3760,2025-02-23 14:00:00+00:00,30,0,0.0
3761,2025-02-23 14:00:00+00:00,62,0,1.0
3762,2025-02-23 14:00:00+00:00,238,126,94.0
3763,2025-02-23 14:00:00+00:00,142,176,143.0


In [58]:
merged_df['difference'] = merged_df['predicted_demand'] - merged_df['rides']

In [59]:
merged_df.sort_values(["pickup_location_id", "pickup_hour"])

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
216,2025-02-22 23:00:00+00:00,2,0,0.0,0.0
381,2025-02-23 00:00:00+00:00,2,0,0.0,0.0
725,2025-02-23 02:00:00+00:00,2,0,0.0,0.0
784,2025-02-23 03:00:00+00:00,2,0,0.0,0.0
1139,2025-02-23 04:00:00+00:00,2,0,0.0,0.0
...,...,...,...,...,...
2729,2025-02-23 10:00:00+00:00,263,110,48.0,-62.0
2848,2025-02-23 11:00:00+00:00,263,130,62.0,-68.0
3244,2025-02-23 12:00:00+00:00,263,134,119.0,-15.0
3295,2025-02-23 13:00:00+00:00,263,154,133.0,-21.0


In [None]:
import pandas as pd
import plotly.express as px
df1 = df
df2 = df_pred

# Merge the DataFrames on 'pickup_location_id' and 'pickup_hour'
merged_df = pd.merge(df1, df2, on=['pickup_location_id', 'pickup_hour'])

# Calculate the absolute error
merged_df['absolute_error'] = abs(merged_df['predicted_demand'] - merged_df['rides'])

# Group by 'pickup_hour' and calculate the mean absolute error (MAE)
mae_by_hour = merged_df.groupby('pickup_hour')['absolute_error'].mean().reset_index()
mae_by_hour.rename(columns={'absolute_error': 'MAE'}, inplace=True)

# Create a Plotly plot
fig = px.line(
    mae_by_hour,
    x='pickup_hour',
    y='MAE',
    title='Mean Absolute Error (MAE) by Pickup Hour',
    labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},
    markers=True
)

# Show the plot
fig.show()

In [61]:
mae_by_hour["MAE"].mean()

13.010889774236384