In [12]:
import joblib
import sys
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px

# Add 'src' to path to import utils
sys.path.append('..')
from src.utils import plot_forecast_vs_actual, plot_feature_importance

# --- Load Data and Model ---
# Both artifacts are joblib pickles stored under ../models (relative to this
# notebook). joblib.load unpickles its input, so only point these paths at
# files you trust.
MODEL_PATH = "../models/lgbm_model.pkl"
VALIDATION_DATA_PATH = "../models/validation_data.pkl"

# Trained model artifact.
model = joblib.load(MODEL_PATH)

# The validation pickle is unpacked into exactly three objects:
# features, target and the matching dates.
validation_bundle = joblib.load(VALIDATION_DATA_PATH)
X_val, y_val, val_dates = validation_bundle

print("Model and validation data loaded.")

Model and validation data loaded.


In [14]:
# Show the first rows of each validation object with a short explanation.
# Each entry is (header line, explanatory line, object to preview).
preview_sections = (
    (
        "--- Validation Features (X_val) Sample (First 5 Rows) ---",
        "This is the data (features) from 2017 that the model will predict on.",
        X_val,
    ),
    (
        "\n--- Validation Target (y_val) Sample (First 5 Sales) ---",
        "These are the actual sales values we will check against.",
        y_val,
    ),
    (
        "\n--- Validation Dates (val_dates) Sample (First 5 Dates) ---",
        "These are the corresponding dates for our plot.",
        val_dates,
    ),
)

for header, note, sample_source in preview_sections:
    print(header)
    print(note)
    # .head() is taken inside the loop so output order matches section order.
    display(sample_source.head())

--- Validation Features (X_val) Sample (First 5 Rows) ---
This is the data (features) from 2017 that the model will predict on.


Unnamed: 0,store,item,month,year,dayofweek,dayofmonth,dayofyear,lag_7,lag_28,rolling_mean_7,rolling_mean_28
1461,1,1,1,2017,6,1,1,20.0,24.0,17.285714,16.178571
1462,1,1,1,2017,0,2,2,16.0,13.0,17.142857,16.0
1463,1,1,1,2017,1,3,3,10.0,10.0,17.0,16.071429
1464,1,1,1,2017,2,4,4,16.0,17.0,17.0,16.071429
1465,1,1,1,2017,3,5,5,21.0,15.0,17.0,16.035714



--- Validation Target (y_val) Sample (First 5 Sales) ---
These are the actual sales values we will check against.


1461    19
1462    15
1463    10
1464    16
1465    14
Name: sales, dtype: int64


--- Validation Dates (val_dates) Sample (First 5 Dates) ---
These are the corresponding dates for our plot.


1461   2017-01-01
1462   2017-01-02
1463   2017-01-03
1464   2017-01-04
1465   2017-01-05
Name: date, dtype: datetime64[ns]

In [2]:
# --- Model Evaluation ---
print("Generating predictions on validation set...")
y_pred = model.predict(X_val)

# Score the full validation set. RMSE is the square root of the MSE, so it
# is expressed in the same unit as the target.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

print("\n--- Regression Metrics ---")
print(f"R² (R-squared): {r2:.4f}")
print(f"RMSE (Root Mean Squared Error): {rmse:.2f} items")

# --- Plot results ---
# Plotting every series at once would be too heavy for the browser, so a
# single store/item pair is shown as an example.
STORE_ID = 1
ITEM_ID = 1

print(f"\nDisplaying plot for Store {STORE_ID}, Item {ITEM_ID}...")

# Boolean row masks selecting the example series
mask_store = X_val['store'].eq(STORE_ID)
mask_item = X_val['item'].eq(ITEM_ID)
mask_combined = mask_store & mask_item

# Re-index actuals and predictions by date so both share the same x-axis.
dates_filtered = val_dates[mask_combined]
y_val_filtered = y_val[mask_combined].set_axis(dates_filtered)
y_pred_filtered = pd.Series(y_pred[mask_combined], index=dates_filtered)

# Plot
plot_title = f"Forecast vs. Actuals (Store {STORE_ID}, Item {ITEM_ID})"
plot_forecast_vs_actual(y_val_filtered, y_pred_filtered, title=plot_title)

Generating predictions on validation set...

--- Regression Metrics ---
R² (R-squared): 0.9371
RMSE (Root Mean Squared Error): 7.91 items

Displaying plot for Store 1, Item 1...


In [3]:
# --- Feature Importance Analysis ---
# Visualises which input features the model relies on.

# The column labels of X_val are passed to the plotting helper as the
# feature names.
feature_names = X_val.columns.to_list()

print("Plotting feature importance...")
plot_feature_importance(model, feature_names)

Plotting feature importance...


In [8]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Create unique, sorted lists of the stores and items present in X_val
all_stores = sorted(X_val['store'].unique())
all_items = sorted(X_val['item'].unique())

# --- Create Widgets ---
# Default each dropdown to its first available option instead of a
# hard-coded 1: a Dropdown rejects a value that is not among its options,
# so the literal would fail whenever id 1 is missing from the validation set.
store_dropdown = widgets.Dropdown(
    options=all_stores,
    value=all_stores[0],
    description='Store:',
)
item_dropdown = widgets.Dropdown(
    options=all_items,
    value=all_items[0],
    description='Item:',
)
plot_output = widgets.Output()  # Empty "canvas" that the plot is rendered into

# --- Define the function that will draw the plot ---
def update_plot(change):
    """Redraw the forecast-vs-actual chart for the selected store and item.

    Registered as the observer callback of both dropdowns. The ``change``
    argument is not used: the current selection is read directly from
    ``store_dropdown`` and ``item_dropdown``. Everything is rendered inside
    ``plot_output``, replacing its previous content.
    """
    with plot_output:
        # wait=True defers clearing until new output arrives
        clear_output(wait=True)

        store_id, item_id = store_dropdown.value, item_dropdown.value

        # Rows of the validation set that belong to the chosen series
        selection = (X_val['store'] == store_id) & (X_val['item'] == item_id)
        selected_dates = val_dates[selection]

        y_val_filtered = y_val[selection].set_axis(selected_dates)

        # Nothing to draw if this store/item pair has no validation rows
        if y_val_filtered.empty:
            print(f"No validation data for Store {store_id}, Item {item_id}.")
            return

        y_pred_filtered = pd.Series(y_pred[selection], index=selected_dates)

        plot_title = f"Forecast vs. Actuals (Store {store_id}, Item {item_id})"
        plot_forecast_vs_actual(y_val_filtered, y_pred_filtered, title=plot_title)

# --- Link widgets to the function ---
# Re-run update_plot whenever the selected value of either dropdown changes.
for dropdown in (store_dropdown, item_dropdown):
    dropdown.observe(update_plot, names='value')

# --- Display everything ---
print("Select Store and Item to see the forecast:")
selector_row = widgets.HBox([store_dropdown, item_dropdown])
display(selector_row)
display(plot_output)

# Draw the initial plot; the callback ignores its argument, so None is fine.
update_plot(None)

Select Store and Item to see the forecast:


HBox(children=(Dropdown(description='Store:', options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), value=1), Dropdown(desc…

Output()

In [9]:
print("--- Analysis: Where does the model err the most? ---")

# 1. Collect features, actuals, predictions and dates in one DataFrame.
#    assign() returns a new frame, so X_val itself is left untouched.
df_results = X_val.assign(
    y_true=y_val,
    y_pred=y_pred,
    date=val_dates,
)

# 2. Group the rows per store-item combination; the per-group error (RMSE)
#    is computed from these groups.
g = df_results.groupby(['store', 'item'])

# Define a function to apply to each group
def calculate_rmse(group):
    """Return the RMSE between the ``y_true`` and ``y_pred`` columns of ``group``."""
    mse = mean_squared_error(group['y_true'], group['y_pred'])
    return np.sqrt(mse)

# Apply the function to all groups.
# Only the two columns the metric reads are selected before apply(): letting
# apply() operate on the grouping columns as well is deprecated in recent
# pandas releases, and the explicit selection behaves the same on older ones.
group_errors = (
    g[['y_true', 'y_pred']]
    .apply(calculate_rmse)
    .to_frame(name='rmse')
)

# 3. Find the 5 worst forecasts
worst_performers = group_errors.sort_values(by='rmse', ascending=False).head(5)

print("TOP 5 WORST FORECASTS (HIGHEST RMSE):")
display(worst_performers)

# 4. Plot the absolute worst performer
# The index is the (store, item) pair produced by the groupby.
worst_store, worst_item = worst_performers.index[0]
# Look the value up by column name instead of by position.
worst_rmse = worst_performers['rmse'].iloc[0]
print(f"\n--- Plot for the worst case: Store {worst_store}, Item {worst_item} ---")

# Filter data for this one case
mask_store = (X_val['store'] == worst_store)
mask_item = (X_val['item'] == worst_item)
mask_combined = mask_store & mask_item

# Index both series by date so they line up on the plot's x-axis.
worst_dates = val_dates[mask_combined]
y_val_filtered = y_val[mask_combined].set_axis(worst_dates)
y_pred_filtered = pd.Series(y_pred[mask_combined], index=worst_dates)

plot_forecast_vs_actual(
    y_val_filtered,
    y_pred_filtered,
    title=f"WORST FORECAST: Store {worst_store}, Item {worst_item} (RMSE: {worst_rmse:.2f})"
)

--- Analysis: Where does the model err the most? ---
TOP 5 WORST FORECASTS (HIGHEST RMSE):






Unnamed: 0_level_0,Unnamed: 1_level_0,rmse
store,item,Unnamed: 2_level_1
2,18,12.977025
2,15,12.365633
8,15,12.152961
2,28,12.026433
2,38,11.801277



--- Plot for the worst case: Store 2, Item 18 ---


In [13]:
print("--- Analysis: Does the model understand seasonality? ---")

# Derive the calendar fields from the 'date' column on a *separate* frame.
# Writing them back into df_results would overwrite the 'dayofweek' and
# 'month' feature columns that df_results carries over from X_val.
seasonality_df = df_results.assign(
    dayofweek=df_results['date'].dt.dayofweek,
    month=df_results['date'].dt.month,
)

# pandas numbers weekdays Monday=0 ... Sunday=6.
DAY_LABELS = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}

# --- 1. Analysis by Day of Week ---
print("\n--- Aggregation by Day of Week ---")
# Sum actual and predicted sales per weekday. Labels are applied through a
# mapping rather than by assigning a fixed 7-element index, so the cell still
# works (and stays correctly labelled) if some weekday is absent from the data.
agg_dow = (
    seasonality_df
    .groupby('dayofweek')[['y_true', 'y_pred']]
    .sum()
    .rename(index=DAY_LABELS)
    .rename_axis('Day of Week')
    .rename(columns={'y_true': 'Actual', 'y_pred': 'Forecast'})
)

# Plot the bar chart
fig_dow = px.bar(agg_dow, barmode='group', title="Total Sales: Actual vs. Forecast (by Day of Week)")
fig_dow.show()

# --- 2. Analysis by Month ---
print("\n--- Aggregation by Month ---")
agg_month = (
    seasonality_df
    .groupby('month')[['y_true', 'y_pred']]
    .sum()
    .rename_axis('Month')
    .rename(columns={'y_true': 'Actual', 'y_pred': 'Forecast'})
)

# Plot the bar chart
fig_month = px.bar(agg_month, barmode='group', title="Total Sales: Actual vs. Forecast (by Month)")
fig_month.show()

--- Analysis: Does the model understand seasonality? ---

--- Aggregation by Day of Week ---



--- Aggregation by Month ---
