# Demand Forecasting Model Training Pipeline

This notebook trains demand forecasting models for each item-location combination using the feature store data.

In [None]:
# Import necessary libraries
import hopsworks
import joblib
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from dotenv import load_dotenv

# Silence every warning for the rest of the session so that library notices
# do not clutter the cell output.
import warnings
warnings.simplefilter('ignore')

## Configuration Parameters

Set the parameters for the training pipeline.

In [None]:
# Load variables from a local .env file into the process environment so the
# os.getenv lookups in the connection cell can see them
load_dotenv()

# Hopsworks project to log in to and the feature group that holds the demand data
project_name = 'models1000'
feature_group_name = 'demand_features'
# version = 1  # Version can be incremented automatically in some cases
# Base name for registered models; the training loop appends an
# "_item<item>_loc<loc>" suffix for every item-location pair
model_name = 'demand_forecaster'
# model_version = 1  # Let Hopsworks handle versioning automatically
# Value passed as test_size to train_test_split for every item-location pair
test_size = 0.2
# None trains every location found in the sampled data; a specific ID
# restricts training to that single location
location_id = None  # Set to specific location ID to filter for a single location

## Connect to Hopsworks

Establish connection to the Hopsworks Feature Store.

In [None]:
# Log in to the Hopsworks project named in the configuration cell and open its
# feature store. The API key and endpoint come from environment variables.
print("Connecting to Hopsworks")
api_key = os.getenv("HOPSWORKS_API_KEY")
host = os.getenv("HOST")
port = os.getenv("PORT")

# os.getenv returns a string, or None when the variable is unset. Forward only
# the endpoint settings that were actually provided, so hopsworks.login keeps
# its own defaults for the rest instead of receiving an explicit None, and
# hand the port over as an integer rather than the raw environment string.
login_kwargs = {"api_key_value": api_key, "project": project_name}
if host:
    login_kwargs["host"] = host
if port:
    port = int(port)
    login_kwargs["port"] = port

project = hopsworks.login(**login_kwargs)
fs = project.get_feature_store()


## Retrieve Feature Group

Get the feature group containing the demand data.

In [None]:
# Look up the feature group by name only; no version is pinned in this call
print(f"Retrieving Feature Group: {feature_group_name}")
demand_fg = fs.get_feature_group(name=feature_group_name)

# Quick look at the feature group contents (5 rows requested)
print("Feature Group Investigation")
demand_fg.show(5)

## Feature Selection and Query

Select features and prepare the query for training data.

In [None]:
print("Feature Selection")
# Build a query that selects all features of the feature group. The same query
# backs the feature view and is also read directly when the item and location
# IDs are sampled before training.
query = demand_fg.select_all()

## Setup Transformation Functions

Define transformation functions for feature engineering.

In [None]:
print("Setting up transformation functions")
# label_encoder comes from the built-in transformations of the Hopsworks client
from hopsworks.hsfs.builtin_transformations import label_encoder

print("Applying label encoding to location ID")
# Bind the encoder to the loc_id column; the resulting list is handed to the
# feature view when it is created
loc_id_encoder = label_encoder("loc_id")
transformation_functions = [loc_id_encoder]

print("Created transformation function for loc_id using label_encoder")

## Create Feature View

Create a feature view for the demand data.

In [None]:
print(f"Getting or creating feature view: {feature_group_name}_view")

# Collect the feature view definition first, then fetch the view if it already
# exists or create it from the query. No version is pinned here.
feature_view_args = {
    "name": f"{feature_group_name}_view",
    "description": "Feature view for demand forecasting",
    # Column the models learn to predict
    "labels": ["repetitive_demand_quantity"],
    "query": query,
    "transformation_functions": transformation_functions,
}
feature_view = fs.get_or_create_feature_view(**feature_view_args)

print(f"Successfully got or created feature view: {feature_group_name}_view")

## Identify Training Scope

Determine the number of models to train.

In [None]:
# Handle to the project's model registry; the training loop uses it to
# register every trained model
mr = project.get_model_registry()

# Read the sample once and derive both ID lists from the same rows instead of
# querying the feature store a second time for the locations.
# NOTE(review): only items and locations present in this limited sample get a
# model - confirm the limit is large enough to cover the full dataset.
sample_df = query.read(limit=1000)
items = sample_df['sp_id'].unique()
# A configured location_id overrides the sampled locations
locations = sample_df['loc_id'].unique() if location_id is None else [location_id]

# One model is trained per item-location pair
total_models = len(items) * len(locations)

print(f"Training {total_models} models (items: {len(items)} × locations: {len(locations)})")

print("\nData Overview:")
print(f"Unique items: {len(items)}")
print(f"Unique locations: {len(locations)}")

## Model Training Loop

Train models for each item-location combination.

In [None]:
# Train, evaluate, and register one model per item-location pair.
#
# For every pair this cell fits a RandomForest and an XGBoost regressor, keeps
# whichever has the lower test RMSE, records its metrics in all_model_metrics,
# and uploads it to the model registry. A failure for one pair is reported and
# the loop moves on to the next pair.

# Imported here because the per-model cleanup below runs before the final
# cleanup cell (which also imports shutil) has been executed.
import shutil

# Best model type and metrics per pair, keyed "item_<item>_loc_<loc>"
all_model_metrics = {}

# Counter for progress tracking
model_counter = 0

# Loop through each item-location combination
for item in items:
    for loc in locations:
        model_counter += 1

        # Only the first model and every fifth one are logged in detail
        show_progress = model_counter % 5 == 0 or model_counter == 1
        if show_progress:
            print(f"Training model {model_counter}/{total_models} (Item: {item}, Location: {loc})")

        # Set once the local artifact directory has been created, so the
        # `finally` clause knows whether there is anything to remove
        model_dir = None

        try:
            # Restrict the data to this specific item-location combination.
            # NOTE(review): confirm that the feature view object exposes
            # filter()/get_feature() in the installed Hopsworks version; an
            # extra_filter argument on train_test_split may be the supported
            # way to express this - verify against the client documentation.
            item_loc_fv = feature_view.filter(
                (feature_view.get_feature('sp_id') == item) &
                (feature_view.get_feature('loc_id') == loc)
            )

            # Split the data for this item-location
            X_train, X_test, y_train, y_test = item_loc_fv.train_test_split(test_size=test_size)

            # Skip if we don't have enough data for this combination
            if len(X_train) < 10 or len(X_test) < 5:
                print(f"Skipping Item: {item}, Location: {loc} due to insufficient data (train: {len(X_train)}, test: {len(X_test)})")
                continue

            # The rows are already limited to one item/location, so these
            # columns are dropped from the model inputs; errors='ignore'
            # tolerates columns that are not present
            id_columns = ['sp_id', 'loc_id', 'datetime']
            X_train = X_train.drop(id_columns, axis=1, errors='ignore')
            X_test = X_test.drop(id_columns, axis=1, errors='ignore')

            # Model name for this item-location
            model_prefix = f"{model_name}_item{item}_loc{loc}"

            # Train RandomForest
            rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            rf_model.fit(X_train, y_train)

            # Train XGBoost
            xgb_model = XGBRegressor(n_estimators=100, random_state=42)
            xgb_model.fit(X_train, y_train)

            # Candidates to compare on the held-out test split
            models = {
                "RandomForest": rf_model,
                "XGBoost": xgb_model
            }

            # Reset the winner for every pair so that nothing carries over
            # from the previous iteration
            best_model = None
            best_model_type = None
            best_rmse = float('inf')
            best_metrics = {}

            for model_type, model in models.items():
                # Make predictions
                y_pred = model.predict(X_test)

                # Calculate metrics
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                rmse = np.sqrt(mse)
                r2 = r2_score(y_test, y_pred)

                metrics = {
                    "mae": mae,
                    "rmse": rmse,
                    "r2": r2
                }

                if show_progress:
                    print(f"  {model_type} Metrics - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}")

                # Track best model (lowest RMSE wins)
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_model = model
                    best_model_type = model_type
                    best_metrics = metrics

            # A NaN RMSE never compares below infinity, so it is possible for
            # no candidate to be selected; there is nothing to save in that case
            if best_model is None:
                print(f"Skipping Item: {item}, Location: {loc} because no model produced a valid RMSE")
                continue

            # Store metrics for this item-location combination
            all_model_metrics[f"item_{item}_loc_{loc}"] = {
                "model_type": best_model_type,
                "metrics": best_metrics
            }

            # Create the local directory that holds the artifact for upload
            model_dir = model_prefix
            os.makedirs(model_dir, exist_ok=True)

            # Save model in the format that matches its type
            if best_model_type == "RandomForest":
                joblib.dump(best_model, os.path.join(model_dir, "model.joblib"))
            else:  # XGBoost
                best_model.save_model(os.path.join(model_dir, "model.json"))

            # Register model in Hopsworks
            model_api = mr.python.create_model(
                name=model_prefix,
                metrics=best_metrics,
                description=f"Demand forecasting model for item {item}, location {loc} using {best_model_type}",
                input_example=X_train.iloc[0].to_dict() if not X_train.empty else None,
                feature_view=feature_view
            )

            # Upload the model and artifacts
            model_api.save(model_dir)

            if show_progress:
                print(f"  Saved model for item {item}, location {loc} using {best_model_type}")

        except Exception as e:
            # Best effort: report the failure and carry on with the next pair
            print(f"Error training model for Item: {item}, Location: {loc}: {str(e)}")
        finally:
            # Remove the local artifact directory whether or not the upload
            # succeeded, so failed registrations do not leave files behind
            if model_dir is not None:
                shutil.rmtree(model_dir, ignore_errors=True)

In [None]:
# Sweep up any model directories that are still present in the working
# directory after training. Errors are reported as a warning, not raised.
try:
    print("Performing final cleanup...")
    import shutil
    import glob

    # Same naming scheme the training loop uses for its local model directories
    leftover_pattern = f"{model_name}_item*_loc*"
    model_dirs = glob.glob(leftover_pattern)
    for dir_path in model_dirs:
        # ignore_errors=True: a path that cannot be removed is skipped silently
        shutil.rmtree(dir_path, ignore_errors=True)

    print(f"Removed {len(model_dirs)} leftover model directories")
except Exception as final_clean_error:
    print(f"Warning: Error during final cleanup: {str(final_clean_error)}")