Colab Prerequisites

In [25]:
### Install pyro-ppl, required by the Bayesian neural network (BNN) model -- uncomment when running on Colab
# !pip install pyro-ppl

#from google.colab import drive
#drive.mount('/content/drive')

### Import Libraries



In [1]:
import pandas as pd
import numpy as np
import math
#import torch
#import torch.nn as nn
# import pyro
# import pyro.distributions as dist
# from pyro.nn import PyroModule, PyroSample
# from pyro.infer import SVI, Trace_ELBO
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from scipy.special import gammaln
import scipy.stats
import scipy.special
#from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import copy

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#torch.set_default_tensor_type('torch.cuda.FloatTensor' if torch.cuda.is_available() else 'torch.FloatTensor')
###pyro.set_rng_seed(0)

### Set Input Parameters

In [24]:
# Study area: every input/output path below is derived from `directory`,
# so switching city only requires editing `city`.
city = 'orléans'
scenario_name = 'DRT'
directory = f"study areas/{city}/"  # note the trailing slash

# OSRM server URLs
# The first URL is used for walking distances, the second for car driving distances.
osrm_url_walk = "http://localhost:5000"  # Change if your OSRM server is at a different URL
osrm_url_car = "http://localhost:5001"   # Change if your OSRM server is at a different URL

# Study area boundaries (EPSG:4326)
# presumably ordered [min_lon, min_lat, max_lon, max_lat] -- TODO confirm against the consumer
bbox = [1.7676734, 47.7601594, 2.1089651, 48.0134634]  # orleans metropole area

# File paths (all relative to the study-area directory)
# Processed DRT trips used to train the models
trips_input_filename = f"{directory}trips_DRT_processed.csv"

# Path to the CSV of generated virtual DRT trips
generated_virtual_drt_trips = f"{directory}all_synthetic_trips.csv"
# Optional output path: generated trips with ML-predicted travel times
trips_output_path_generated_drt_trips = f"{directory}virtual_trips_with_predicted_thresholds.CSV"
# Path to the public-transport GTFS archive.
# Built from `directory` (instead of repeating the city name) so it follows `city`.
gtfs_path = f"{directory}gtfs/gtfs_PT.zip"

# Input features and target columns read from the trips file
input_features = [
    'origin_lat', 'origin_lon', 'destination_lat', 'destination_lon',
    'departure_time_hour', 'departure_time_minute',
    'departure_time_day_of_week', 'departure_time_day_of_month',
    'departure_time_month', 'departure_time_hour_sin',
    'departure_time_hour_cos', 'departure_time_day_of_week_sin',
    'departure_time_day_of_week_cos', 'departure_time_month_sin',
    'departure_time_month_cos', 'distance',
]

target_features = ['travel_time']  # , 'waiting_time'

# Departure times: keep identical to the ones used for trip generation (previous notebook)
day = '2025-04-07'
# GTFS uses the compact YYYYMMDD form; derive it from `day` so the two cannot drift apart.
day_gtfs = day.replace('-', '')
hours = [8, 12, 16, 20]
departure_times = [f"{day} {hour:02d}:00:00" for hour in hours]




# ''' only when running in colab
# import locale
# # Set the locale to UTF-8
# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
# # Define the fixed part of the path
# base_path = '/content/drive/MyDrive/ColabNotebooks/AccessibilityDRTcityChrone_CL/'
# # Now, run your command:
# !ls '/content/drive/MyDrive/ColabNotebooks/AccessibilityDRTcityChrone_CL/trips/Caltanissetta'
# legs_input_path = base_path + 'trips/trips_DRT_pr.CSV'
# '''

In [26]:


import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pyro
import pyro.distributions as dist
from pyro.nn import PyroModule, PyroSample
from pyro.infer import SVI, Trace_ELBO
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import RandomForestRegressor
import scipy.special
from typing import List, Dict, Union, Tuple, Optional
import warnings

# Silence every Python warning for the rest of the session so cell output stays short.
# NOTE(review): this also hides numerical RuntimeWarnings (divide-by-zero, invalid value).
warnings.filterwarnings('ignore')

# Global Configuration
# Distribution families accepted by the models defined below.
SUPPORTED_DISTRIBUTIONS = ['normal', 'lognormal', 'gamma']
# Compute device: CUDA when torch reports a usable GPU, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Tensors created without an explicit dtype default to 32-bit floats.
torch.set_default_dtype(torch.float32)


def prepare_data(df: pd.DataFrame, 
                input_features: List[str], 
                target_features: List[str]) -> Tuple[np.ndarray, Union[np.ndarray, Dict]]:
    """
    Split a trips table into a feature matrix and floored target values.

    Features: NaN, +inf and -inf entries are all replaced by 0.
    Targets: every value is floored at 1e-6 via ``np.maximum`` so that
    strictly-positive likelihoods (Gamma, Log-Normal) can be used. Note that
    ``np.maximum`` propagates NaN, so missing targets stay NaN.

    Args:
        df: Table holding both the feature and the target columns.
        input_features: Names of the columns that form the feature matrix.
        target_features: Names of the target columns.

    Returns:
        X: Array built from ``df[input_features]`` with non-finite entries zeroed.
        y: A single array when exactly one target is requested, otherwise a
           dict mapping each target name to its floored array.

    Example:
        >>> df = pd.DataFrame({'distance': [10, 20, np.nan],
        ...                    'travel_time': [15, 30, 25]})
        >>> X, y = prepare_data(df, ['distance'], ['travel_time'])
        >>> X.tolist()
        [[10.0], [20.0], [0.0]]
    """
    def _floored(column: str) -> np.ndarray:
        # Clip from below at 1e-6 to keep targets strictly positive.
        return np.maximum(df[column].values, 1e-6)

    # Feature matrix with NaN / +-inf mapped to zero.
    feature_matrix = np.nan_to_num(
        df[input_features].values, nan=0.0, posinf=0.0, neginf=0.0
    )

    # One target -> bare array; anything else -> dict keyed by column name.
    if len(target_features) == 1:
        return feature_matrix, _floored(target_features[0])

    return feature_matrix, {name: _floored(name) for name in target_features}


def evaluate_predictions(y_true: np.ndarray, 
                        predictions: Dict[str, np.ndarray], 
                        distribution: str = 'gamma') -> Dict[str, float]:
    """
    Compute MAE, RMSE and a distribution-specific mean negative log-likelihood.

    The predictive distribution of each observation is reconstructed from the
    predicted mean and standard deviation:

    - 'normal':    N(mean, std^2)
    - 'gamma':     moment-matched, alpha = (mean/std)^2, beta = mean/std^2
    - 'lognormal': first-order approximation, mu = log(mean), sigma = std/mean
      (NOTE(review): this is not exact moment matching; kept as-is so that
      reported metrics do not change)

    Args:
        y_true: True target values. Any array-like; flattened to 1-D.
        predictions: Mapping with 'mean' and 'std' entries (array-like,
            flattened to 1-D). Length-1 entries broadcast against the rest.
        distribution: 'normal', 'lognormal' or 'gamma'.

    Returns:
        Dictionary with:
        - 'mae': Mean Absolute Error
        - 'rmse': Root Mean Square Error
        - 'nll': mean Negative Log-Likelihood under the chosen distribution

    Raises:
        ValueError: If the inputs have incompatible lengths or the distribution
            is not supported.

    Example:
        >>> y_true = np.array([2.0, 3.0, 4.0])
        >>> predictions = {'mean': np.array([2.1, 2.9, 4.1]),
        ...                'std': np.array([0.2, 0.2, 0.2])}
        >>> metrics = evaluate_predictions(y_true, predictions, 'gamma')
        >>> print(f"MAE: {metrics['mae']:.3f}")
    """
    epsilon = 1e-8

    # Flatten everything to 1-D float arrays. Without this, a column vector
    # y_true of shape (n, 1) would silently broadcast against (n,) predictions
    # into an (n, n) matrix and produce meaningless metrics.
    y_true = np.asarray(y_true, dtype=float).reshape(-1)
    y_pred = np.asarray(predictions['mean'], dtype=float).reshape(-1)
    predicted_std = np.asarray(predictions['std'], dtype=float).reshape(-1) + epsilon

    # Length-1 inputs are allowed to broadcast; everything else must agree.
    lengths = {arr.size for arr in (y_true, y_pred, predicted_std) if arr.size != 1}
    if len(lengths) > 1:
        raise ValueError(
            "y_true, predictions['mean'] and predictions['std'] must have the "
            f"same number of elements, got {y_true.size}, {y_pred.size} "
            f"and {predicted_std.size}"
        )

    # Basic point prediction metrics
    mae = np.mean(np.abs(y_true - y_pred))
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))

    # Distribution-specific Negative Log-Likelihood calculation
    if distribution == 'normal':
        # -log N(y | mean, std^2)
        nll = (0.5 * np.mean((y_true - y_pred)**2 / predicted_std**2) + 
               0.5 * np.mean(np.log(2 * np.pi * predicted_std**2)))

    elif distribution == 'lognormal':
        # -log LogN(y | mu, sigma^2); values are floored so the logs are defined
        y_true = np.maximum(y_true, epsilon)
        y_pred = np.maximum(y_pred, epsilon)

        log_mean = np.log(y_pred)
        log_std = np.maximum(predicted_std / y_pred, epsilon)

        nll = (0.5 * np.mean((np.log(y_true) - log_mean)**2 / log_std**2) + 
               0.5 * np.mean(np.log(2 * np.pi * log_std**2)) + 
               np.mean(np.log(y_true)))

    elif distribution == 'gamma':
        # -log Gamma(y | alpha, beta) with shape/rate matched to mean and std
        alpha = np.maximum((y_pred / predicted_std)**2, epsilon)
        beta = np.maximum(y_pred / (predicted_std**2), epsilon)
        y_true = np.maximum(y_true, epsilon)

        gamma_loglik = (alpha * np.log(beta) - 
                       scipy.special.gammaln(alpha) + 
                       (alpha - 1) * np.log(y_true) - 
                       beta * y_true)
        nll = -np.mean(gamma_loglik)
    else:
        raise ValueError(f"Unsupported distribution: {distribution}. "
                        f"Choose from: {SUPPORTED_DISTRIBUTIONS}")

    return {'mae': mae, 'rmse': rmse, 'nll': nll}


class BNNTravelTimeModel(PyroModule):
    """
    Single-hidden-layer Bayesian neural network with a selectable likelihood.

    Architecture:
        Input -> Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, 2)

    All weights and biases of both linear layers are random variables drawn
    from one shared prior family. The two output channels are turned into
    likelihood parameters as follows (``s = softplus(channel_1) + 1e-6``):

    - 'normal':    loc = channel_0, std = clamp(s, 1e-6, 10)
    - 'lognormal': loc = channel_0, scale = clamp(s, 1e-6, 5) (log-space parameters)
    - 'gamma':     mean = exp(channel_0) + 1e-6, variance = exp(s) + 1e-6,
                   then rate = clamp(mean / variance, 1e-3, 100) and
                   shape = clamp(mean * rate, 1e-3, 100)

    Args:
        input_dim: Number of input features
        hidden_dim: Width of the hidden layer (default: 16)
        prior_dist: Prior family for weights ('normal', 'lognormal', 'gamma')
        likelihood_dist: Observation likelihood ('normal', 'lognormal', 'gamma')

    Raises:
        ValueError: If either distribution name is not in SUPPORTED_DISTRIBUTIONS
    """

    def __init__(self, input_dim: int, hidden_dim: int = 16, 
                 prior_dist: str = 'normal', 
                 likelihood_dist: str = 'gamma'):
        super().__init__()

        # Fail early on unknown distribution names.
        if prior_dist not in SUPPORTED_DISTRIBUTIONS:
            raise ValueError(f"Unsupported prior: {prior_dist}")
        if likelihood_dist not in SUPPORTED_DISTRIBUTIONS:
            raise ValueError(f"Unsupported likelihood: {likelihood_dist}")

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.prior_dist = prior_dist
        self.likelihood_dist = likelihood_dist

        # One scalar prior, expanded to each parameter's shape below.
        weight_prior = self._create_prior_distribution(prior_dist)

        # Hidden layer with random weights and bias.
        self.fc1 = PyroModule[nn.Linear](input_dim, hidden_dim)
        self._attach_priors(self.fc1, weight_prior, hidden_dim, input_dim)

        # Output layer: two channels that parameterise the likelihood.
        self.fc2 = PyroModule[nn.Linear](hidden_dim, 2)
        self._attach_priors(self.fc2, weight_prior, 2, hidden_dim)

        # Activation functions
        self.relu = nn.ReLU()
        self.softplus = nn.Softplus()

    @staticmethod
    def _attach_priors(layer, prior, out_features: int, in_features: int) -> None:
        """Replace a linear layer's weight and bias with samples from `prior`."""
        layer.weight = PyroSample(
            prior.expand([out_features, in_features]).to_event(2)
        )
        layer.bias = PyroSample(
            prior.expand([out_features]).to_event(1)
        )

    def _create_prior_distribution(self, prior_dist: str):
        """
        Build the scalar prior used for every weight and bias.

        Args:
            prior_dist: 'normal' -> Normal(0, 0.1), 'lognormal' -> LogNormal(0, 0.1),
                'gamma' -> Gamma(concentration=1.1, rate=10.0)

        Returns:
            The Pyro distribution, with parameters on DEVICE; None for any
            other name (the constructor validates names before calling this).
        """
        def on_device(value: float) -> torch.Tensor:
            return torch.tensor(value, dtype=torch.float32, device=DEVICE)

        if prior_dist == 'normal':
            return dist.Normal(on_device(0.0), on_device(0.1))
        if prior_dist == 'lognormal':
            return dist.LogNormal(on_device(0.0), on_device(0.1))
        if prior_dist == 'gamma':
            return dist.Gamma(on_device(1.1), on_device(10.0))
        return None

    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Run the network and register the "obs" sample site.

        Args:
            x: Input features tensor
            y: Observed targets; None when sampling predictions

        Returns:
            The value returned by the likelihood branch (see _apply_likelihood)
        """
        # Move inputs to the configured device as float tensors.
        x = x.to(DEVICE).float()
        if y is not None:
            y = y.to(DEVICE).float()

        activations = self.relu(self.fc1(x))
        raw_params = self.fc2(activations)

        return self._apply_likelihood(raw_params, x, y)

    def _apply_likelihood(self, network_output: torch.Tensor, 
                         x: torch.Tensor, y: Optional[torch.Tensor]) -> torch.Tensor:
        """
        Map the two output channels to likelihood parameters and sample "obs".

        Args:
            network_output: Raw network outputs; the last axis holds 2 channels
            x: Input features (only its first dimension is used, as plate size)
            y: Observed targets, or None to draw from the likelihood

        Returns:
            'normal': channel 0; 'lognormal': exp(channel 0);
            'gamma': exp(channel 0) + 1e-6
        """
        n_rows = x.shape[0]
        location = network_output[..., 0]
        # Every branch starts from the same strictly positive second channel.
        positive = self.softplus(network_output[..., 1]) + 1e-6

        if self.likelihood_dist == 'normal':
            std = torch.clamp(positive, min=1e-6, max=10.0)
            with pyro.plate("data", n_rows):
                pyro.sample("obs", dist.Normal(location, std), obs=y)
            return location

        if self.likelihood_dist == 'lognormal':
            log_scale = torch.clamp(positive, min=1e-6, max=5.0)
            with pyro.plate("data", n_rows):
                pyro.sample("obs", dist.LogNormal(location, log_scale), obs=y)
            return torch.exp(location)

        # Gamma: exponentiate to mean / variance, then convert to shape / rate.
        mean = torch.exp(location) + 1e-6
        variance = torch.exp(positive) + 1e-6

        rate = torch.clamp(mean / variance, min=1e-3, max=100.0)
        shape = torch.clamp(mean * rate, min=1e-3, max=100.0)

        with pyro.plate("data", n_rows):
            pyro.sample("obs", dist.Gamma(shape, rate), obs=y)
        return mean


class BNNPredictor(BaseEstimator, RegressorMixin):
    """
    Bayesian Neural Network regressor with predictive uncertainty.

    Wraps BNNTravelTimeModel with an AutoNormal variational guide and trains
    it with stochastic variational inference (SVI). Features are standardised
    with statistics computed on the training data.

    Features:
    - Automatic feature scaling
    - Early stopping on the training ELBO loss
    - Runs on DEVICE (GPU when available)
    - Posterior-predictive sampling for uncertainty estimates
    - Selectable prior and likelihood families

    Args:
        input_dim: Number of input features (must match len(input_features))
        input_features: List of input feature column names
        target_features: List of target feature column names
        hidden_dim: Size of hidden layer (default: 16)
        prior_dist: Prior distribution for weights (default: 'normal')
        likelihood_dist: Likelihood distribution for outputs (default: 'gamma')

    Raises:
        ValueError: If input_dim doesn't match number of input_features

    Example:
        >>> bnn = BNNPredictor(
        ...     input_dim=3,
        ...     input_features=['distance', 'time', 'traffic'],
        ...     target_features=['travel_time'],
        ...     hidden_dim=32
        ... )
        >>> bnn.fit(X_train, y_train)
        >>> predictions = bnn.predict_distribution(X_test)
        >>> print(f"Mean: {predictions['mean']}")
        >>> print(f"Uncertainty: {predictions['std']}")
    """

    def __init__(self, input_dim: int, input_features: List[str], 
                 target_features: List[str], hidden_dim: int = 16, 
                 prior_dist: str = 'normal', likelihood_dist: str = 'gamma'):

        # Validate input dimensions
        if input_dim != len(input_features):
            raise ValueError(f"input_dim ({input_dim}) must match number of "
                           f"features ({len(input_features)})")

        # Store configuration. scikit-learn's get_params()/clone()/repr() read
        # every constructor argument back from an attribute of the same name,
        # so `input_features` must be stored as-is; `feature_names` is kept as
        # an alias for existing callers.
        self.input_dim = input_dim
        self.input_features = input_features
        self.feature_names = input_features
        self.target_features = target_features
        self.hidden_dim = hidden_dim
        self.prior_dist = prior_dist
        self.likelihood_dist = likelihood_dist

        # Initialize Pyro components
        self._build_model()

        # Training components (initialized during fit)
        self.svi = None
        self.feature_means = None
        self.feature_stds = None

    def _build_model(self) -> None:
        """Clear Pyro's global parameter store and create a fresh model and guide."""
        pyro.clear_param_store()

        self.model = BNNTravelTimeModel(self.input_dim, self.hidden_dim,
                                        self.prior_dist, self.likelihood_dist)
        self.model.to(DEVICE)

        # Automatic variational guide
        self.guide = pyro.infer.autoguide.AutoNormal(self.model)
        self.guide.to(DEVICE)

    def _prepare_data(self, X: Union[np.ndarray, pd.DataFrame], 
                     y: Optional[Union[np.ndarray, dict]] = None) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Clean, standardise and tensorise features (and targets when given).

        Args:
            X: Input features (numpy array or pandas DataFrame). Arrays are
                assumed to have their columns in `feature_names` order.
            y: Target values (training only). A dict is indexed with the
                first entry of `target_features`.

        Returns:
            Scaled feature tensor, plus a 1-D target tensor when `y` is given.
            Targets are floored at 1e-6 and stay on their original scale.

        Raises:
            ValueError: If no scaling statistics exist yet (model not fitted),
                or if features and targets have different lengths.
        """
        # Convert to DataFrame if needed
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_names)

        # Extract and clean features
        features = X[self.feature_names].values
        features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)

        # Initialize feature scaling during training
        if y is not None and self.feature_means is None:
            self.feature_means = np.mean(features, axis=0)
            self.feature_stds = np.std(features, axis=0)
            # Prevent division by zero for constant features
            self.feature_stds[self.feature_stds == 0] = 1.0

        # Check if model has been fitted
        if self.feature_means is None:
            raise ValueError("Model must be fitted before making predictions")

        # Apply feature scaling
        features = (features - self.feature_means) / self.feature_stds
        features_tensor = torch.tensor(features, dtype=torch.float32).to(DEVICE)

        # Prepare targets for training
        if y is not None:
            raw_target = y[self.target_features[0]] if isinstance(y, dict) else y
            # Flatten so a (n, 1) column or a pandas Series becomes a plain
            # 1-D array matching the model's per-row likelihood.
            target_values = np.maximum(
                np.asarray(raw_target, dtype=float).reshape(-1), 1e-6
            )
            if len(target_values) != len(features):
                raise ValueError(
                    f"X has {len(features)} rows but y has {len(target_values)} values"
                )

            # No log-transform here: the 'lognormal' branch of the model already
            # places a LogNormal likelihood on the raw target. Transforming the
            # target as well would fit log(y) with a LogNormal and leave
            # predictions on the log scale.
            y_tensor = torch.tensor(target_values, dtype=torch.float32).to(DEVICE)
            return features_tensor, y_tensor

        return features_tensor

    def fit(self, X: Union[np.ndarray, pd.DataFrame], 
            y: Union[np.ndarray, dict]) -> 'BNNPredictor':
        """
        Train the Bayesian Neural Network using variational inference.

        Every call starts from scratch: scaling statistics are recomputed and
        the model and guide are rebuilt from the current hyperparameters, so
        refitting on new data or after `set_params` behaves as expected.
        Training uses SVI with Adam on shuffled mini-batches (the incomplete
        trailing batch of each epoch is dropped) and stops early when the mean
        epoch loss has not improved for `patience` epochs.

        Args:
            X: Input features (n_samples, n_features)
            y: Target values (n_samples,) or dict for multiple targets

        Returns:
            self: Fitted estimator

        Raises:
            ValueError: If the dataset is empty or NaN values are found in
                the prepared data

        Example:
            >>> bnn = BNNPredictor(input_dim=2, input_features=['x1', 'x2'], 
            ...                   target_features=['y'])
            >>> bnn.fit(X_train, y_train)
            >>> # Model is now ready for predictions
        """
        # Drop statistics from any previous fit so they are recomputed below.
        self.feature_means = None
        self.feature_stds = None

        # Prepare and validate data
        features, target = self._prepare_data(X, y)
        n_samples = len(features)

        if n_samples == 0:
            raise ValueError("Cannot fit BNN on an empty dataset")
        if torch.isnan(features).any() or torch.isnan(target).any():
            raise ValueError("NaN values found in prepared data")

        # Initialize training components (also clears the Pyro param store)
        self._build_model()
        optimizer = pyro.optim.Adam({"lr": 0.0001})  # Conservative learning rate
        self.svi = SVI(self.model, self.guide, optimizer, loss=Trace_ELBO())

        # Training hyperparameters
        max_epochs = 200
        patience = 15
        # At least 1: with fewer than 8 samples `n // 8` would be 0 and the
        # batch-count division below would fail.
        batch_size = max(1, min(256, n_samples // 8))
        n_batches = n_samples // batch_size

        # Training state
        best_loss = float('inf')
        no_improvement = 0

        print(f"Training BNN with {n_samples} samples on {DEVICE}...")

        # Training loop with early stopping
        for epoch in range(max_epochs):
            epoch_losses = []
            indices = torch.randperm(n_samples)

            # Process mini-batches
            for i in range(n_batches):
                batch_indices = indices[i * batch_size:(i + 1) * batch_size]

                batch_X = features[batch_indices]
                batch_y = target[batch_indices]

                try:
                    loss = self.svi.step(batch_X, batch_y)
                    # The ELBO loss may legitimately be negative (densities can
                    # exceed 1), so only non-finite values are discarded.
                    if np.isfinite(loss):
                        epoch_losses.append(loss)
                except Exception as e:
                    # Best effort: report the failing batch and keep training.
                    print(f"Batch error at epoch {epoch}: {str(e)}")
                    continue

            # Check for improvement and early stopping
            if epoch_losses:
                avg_loss = np.mean(epoch_losses)

                if avg_loss < best_loss:
                    best_loss = avg_loss
                    no_improvement = 0
                else:
                    no_improvement += 1

                # Progress reporting
                if epoch % 20 == 0:
                    print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")

                # Early stopping
                if no_improvement >= patience:
                    print(f"Early stopping at epoch {epoch}")
                    break
            else:
                no_improvement += 1
                if no_improvement >= 5:
                    print("No valid losses, stopping training")
                    break

        print(f"Training completed. Best loss: {best_loss:.4f}")
        return self

    def predict_distribution(self, X: Union[np.ndarray, pd.DataFrame], 
                           n_samples: int = 1000) -> Dict[str, np.ndarray]:
        """
        Draw posterior-predictive samples and summarise them per input row.

        Args:
            X: Input features for prediction
            n_samples: Number of posterior samples to draw (default: 1000)

        Returns:
            Dictionary containing:
            - 'predictions': samples of the "obs" site, one draw per entry
              along axis 0 (n_samples, n_test)
            - 'mean': mean over the draws (n_test,)
            - 'std': standard deviation over the draws (n_test,)

        Raises:
            ValueError: If the model has not been fitted
            Exception: Any error raised while sampling is printed and re-raised
        """
        features = self._prepare_data(X)

        try:
            # Generate posterior samples
            predictive = pyro.infer.Predictive(self.model, guide=self.guide, num_samples=n_samples)
            with torch.no_grad():
                predictions = predictive(features)

            # Extract samples and compute statistics
            samples = predictions["obs"].cpu().detach().numpy()

            return {
                'predictions': samples,
                'mean': np.mean(samples, axis=0),
                'std': np.std(samples, axis=0)
            }
        except Exception as e:
            print(f"Prediction error: {str(e)}")
            raise

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Make point predictions (mean of predictive distribution).

        Args:
            X: Input features for prediction

        Returns:
            Point predictions as numpy array
        """
        return self.predict_distribution(X)['mean']

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame], 
                y: Union[np.ndarray, dict]) -> Dict:
        """
        Evaluate the model with MAE, RMSE and the likelihood-specific NLL.

        Args:
            X: Input features
            y: True target values; a dict is indexed with the first entry
                of `target_features`

        Returns:
            Dictionary containing 'metrics' and 'predictions'
        """
        predictions = self.predict_distribution(X)

        # Extract target values
        if isinstance(y, dict):
            y_values = y[self.target_features[0]]
        else:
            y_values = y

        metrics = evaluate_predictions(y_values, predictions, self.likelihood_dist)
        return {'metrics': metrics, 'predictions': predictions}


class BaseModel(BaseEstimator, RegressorMixin):
    """
    Shared base for the distribution-aware baseline models.

    Stores the chosen output distribution and provides a common `evaluate`
    that scores whatever `predict_distribution` returns. Subclasses are
    expected to supply `predict_distribution`; it is not defined here.

    Args:
        distribution: Output distribution type (default: 'gamma')

    Raises:
        ValueError: If the distribution is not in SUPPORTED_DISTRIBUTIONS
    """

    def __init__(self, distribution: str = 'gamma'):
        is_supported = distribution in SUPPORTED_DISTRIBUTIONS
        if not is_supported:
            raise ValueError(f"Unsupported distribution: {distribution}. "
                           f"Choose from: {SUPPORTED_DISTRIBUTIONS}")
        self.distribution = distribution

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame], 
                y: Union[np.ndarray, dict]) -> Dict:
        """
        Predict on X and score the result against y.

        Args:
            X: Input features
            y: True target values; for a dict, the first value is used

        Returns:
            Dictionary containing 'predictions' and 'metrics'
        """
        predicted = self.predict_distribution(X)

        # A dict of targets is reduced to its first entry.
        observed = list(y.values())[0] if isinstance(y, dict) else y

        scores = evaluate_predictions(observed, predicted, self.distribution)
        return {'predictions': predicted, 'metrics': scores}


class LinearRegressionGamma(BaseModel):
    """
    One-feature linear baseline with a sampled predictive distribution.

    Fits a line through the origin on the LAST feature column only
    (presumably distance -- confirm against `input_features`) and uses a
    single dispersion parameter to draw samples from the chosen distribution.

    Model: mean = θ * x_last, with dispersion `scale` estimated as
    var(residuals) / mean(predictions), floored at 0.1 * std(y).

    Args:
        distribution: Output distribution type (default: 'gamma')

    Attributes:
        theta: Fitted slope (absolute value of the least-squares coefficient)
        scale: Dispersion parameter used when sampling
        y_mean: Mean of the (floored) training targets
        y_std: Standard deviation of the (floored) training targets

    Example:
        >>> model = LinearRegressionGamma(distribution='normal')
        >>> model.fit(X_train, y_train)
        >>> predictions = model.predict_distribution(X_test)
    """

    def __init__(self, distribution: str = 'gamma'):
        super().__init__(distribution)
        self.theta = None
        self.scale = None
        self.y_mean = None
        self.y_std = None

    @staticmethod
    def _last_feature(X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Return the last column of X (or X itself when 1-D) as a float array."""
        # np.asarray makes DataFrames and lists work; `X[:, -1]` on a
        # DataFrame would raise.
        values = np.asarray(X, dtype=float)
        return values[:, -1] if values.ndim > 1 else values

    def fit(self, X: Union[np.ndarray, pd.DataFrame], 
            y: Union[np.ndarray, dict]) -> 'LinearRegressionGamma':
        """
        Fit the slope and the dispersion parameter.

        Args:
            X: Input features; only the last column is used
            y: Target values; for a dict, the first value is used

        Returns:
            self: Fitted estimator

        Raises:
            ValueError: If X and y have different lengths, or the predictor
                column carries no usable signal (all zeros or non-finite)
        """
        # Extract target values
        if isinstance(y, dict):
            y = list(y.values())[0]

        # Ensure positive values
        y = np.maximum(np.asarray(y, dtype=float).reshape(-1), 1e-6)
        self.y_mean = np.mean(y)
        self.y_std = np.std(y)

        # Use last feature as primary predictor
        x = self._last_feature(X)
        if x.shape != y.shape:
            raise ValueError(
                f"X has {x.shape[0] if x.ndim else 1} rows but y has {y.shape[0]} values"
            )

        # Least-squares slope through the origin. A zero or non-finite
        # denominator would give a NaN/inf slope, so reject it explicitly.
        denominator = np.sum(x * x)
        if not np.isfinite(denominator) or denominator <= 0:
            raise ValueError(
                "Cannot fit: the predictor (last feature) is all zeros or non-finite"
            )
        self.theta = abs(np.sum(x * y) / denominator)

        # Calculate scale parameter from residuals
        mean_pred = self.theta * x
        residuals = y - mean_pred

        # Dispersion with a minimum threshold; a non-positive mean prediction
        # falls back to the threshold instead of dividing by zero.
        mean_level = np.mean(mean_pred)
        raw_scale = np.var(residuals) / mean_level if mean_level > 0 else 0.0
        min_scale = 0.1 * self.y_std
        self.scale = max(raw_scale, min_scale)

        return self

    def predict_distribution(self, X: Union[np.ndarray, pd.DataFrame], 
                           n_samples: int = 4000) -> Dict[str, np.ndarray]:
        """
        Draw samples from the fitted distribution for every input row.

        Args:
            X: Input features; only the last column is used
            n_samples: Number of samples drawn per row

        Returns:
            Dictionary with:
            - 'predictions': samples, shape (n_rows, n_samples)
            - 'mean': θ * x_last floored at 1e-6, shape (n_rows,)
            - 'std': per-row standard deviation of the samples

        Raises:
            ValueError: If the model has not been fitted
        """
        if self.theta is None or self.scale is None:
            raise ValueError("Model must be fitted before making predictions")

        # Use last feature as predictor
        x = self._last_feature(X)

        # Calculate mean predictions
        mean = np.maximum(self.theta * x, 1e-6)
        sample_shape = (len(mean), n_samples)
        row_mean = mean[:, None]  # column vector: one distribution per row

        # Draw all rows at once; parameters broadcast across the sample axis.
        if self.distribution == 'normal':
            # Normal with variance `scale`, floored to stay positive
            samples = np.maximum(
                np.random.normal(row_mean, np.sqrt(self.scale), size=sample_shape),
                1e-6
            )
        elif self.distribution == 'lognormal':
            # Log-space parameters: mu = log(mean), sigma = sqrt(scale / mean)
            samples = np.random.lognormal(
                np.log(row_mean), np.sqrt(self.scale / row_mean), size=sample_shape
            )
        else:  # gamma
            # Shape is floored at 1. The per-row scale is then set to
            # mean / shape so the sampled distribution keeps expectation
            # `mean` even when the floor is active (shape * scale == mean).
            shape = np.maximum(row_mean / (self.scale + 1e-6), 1.0)
            samples = np.random.gamma(shape, row_mean / shape, size=sample_shape)

        return {
            'predictions': samples,
            'mean': mean,
            'std': np.std(samples, axis=1)
        }

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Make point predictions (θ * x_last, floored at 1e-6)."""
        return self.predict_distribution(X)['mean']


class RandomForestRegressorGamma(BaseModel):
    """
    Random Forest with configurable probability distributions for uncertainty.
    
    Uses ensemble variance from individual trees to estimate prediction uncertainty,
    then samples from the specified distribution using tree-based statistics.
    
    The spread of the per-tree predictions is used as the standard deviation,
    floored at 10% of the ensemble mean so the sampling distribution never
    degenerates to a point mass.
    
    Args:
        n_estimators: Number of trees in the forest (default: 100)
        distribution: Output distribution type (default: 'gamma')
        
    Attributes:
        model: Underlying RandomForestRegressor from scikit-learn
        
    Example:
        >>> rf = RandomForestRegressorGamma(n_estimators=200, distribution='lognormal')
        >>> rf.fit(X_train, y_train)
        >>> predictions = rf.predict_distribution(X_test)
        >>> print(f"Uncertainty: {predictions['std']}")
    """
    
    def __init__(self, n_estimators: int = 100, distribution: str = 'gamma'):
        super().__init__(distribution)
        self.model = RandomForestRegressor(
            n_estimators=n_estimators,
            min_samples_leaf=5,      # Prevent overfitting
            max_features='sqrt',     # Feature subsampling
            bootstrap=True,          # Bootstrap sampling
            random_state=42,         # Reproducibility
            n_jobs=-1               # Use all available cores
        )
    
    def fit(self, X: Union[np.ndarray, pd.DataFrame], 
            y: Union[np.ndarray, dict]) -> 'RandomForestRegressorGamma':
        """
        Fit Random Forest model.
        
        Args:
            X: Input features
            y: Target values, or a dict whose first value holds the targets
            
        Returns:
            self: Fitted estimator
        """
        # Extract target values
        if isinstance(y, dict):
            y = next(iter(y.values()))
        
        # Ensure positive values
        y = np.maximum(np.asarray(y, dtype=float), 1e-6)
        
        # A single-column 2-D target is flattened so the forest is fitted as
        # a plain single-output regressor without a column-vector warning.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()
        
        self.model.fit(X, y)
        return self
    
    def predict_distribution(self, X: Union[np.ndarray, pd.DataFrame], 
                           n_samples: int = 4000) -> Dict[str, np.ndarray]:
        """
        Generate probabilistic predictions using tree ensemble variance.
        
        Uses predictions from individual trees to estimate uncertainty,
        then generates samples from the specified distribution.
        
        Args:
            X: Input features (ndarray or DataFrame)
            n_samples: Number of samples to generate per prediction
            
        Returns:
            Dictionary with 'predictions' (n_rows x n_samples draws), 'mean'
            (average of the per-tree predictions) and 'std' (floored spread of
            the per-tree predictions, not the spread of the draws)
        """
        # The individual trees are queried directly, so hand them a plain
        # array: a DataFrame here makes every sub-estimator emit a
        # feature-name mismatch warning.
        features = np.asarray(X)
        
        # Get predictions from all individual trees
        tree_predictions = np.array(
            [tree.predict(features) for tree in self.model.estimators_]
        )
        
        # Calculate ensemble statistics
        mean = np.mean(tree_predictions, axis=0)
        std = np.std(tree_predictions, axis=0)
        
        # Ensure non-zero standard deviation (minimum 10% of mean)
        std = np.maximum(std, 0.1 * mean)
        
        # Generate samples based on distribution type
        n_rows = len(features)
        samples = np.zeros((n_rows, n_samples))
        
        for i in range(n_rows):
            if self.distribution == 'normal':
                # Normal distribution: N(mean, std^2), truncated at 1e-6
                samples[i] = np.maximum(
                    np.random.normal(mean[i], std[i], n_samples), 1e-6
                )
            elif self.distribution == 'lognormal':
                # Log-Normal distribution with log-scale location log(mean)
                log_mean = np.log(mean[i])
                log_std = std[i] / mean[i]  # Coefficient of variation approximation
                samples[i] = np.random.lognormal(log_mean, log_std, n_samples)
            else:  # gamma
                # Gamma distribution: Gamma(alpha, beta)
                # Method of moments parameter estimation
                alpha = (mean[i] / std[i]) ** 2
                if alpha >= 1.0:
                    beta = max(mean[i] / (std[i] ** 2), 1e-6)
                    scale = 1 / beta
                else:
                    # The shape is kept at >= 1. When that floor is hit the
                    # scale must be mean / alpha, otherwise the draws are
                    # centred on std^2 / mean instead of the reported mean.
                    alpha = 1.0
                    scale = mean[i]
                samples[i] = np.random.gamma(alpha, scale, n_samples)
        
        return {
            'predictions': samples,
            'mean': mean,
            'std': std
        }

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Make point predictions (the ensemble mean)."""
        return self.predict_distribution(X)['mean']


def train_and_evaluate(model: BaseEstimator, 
                      train_df: pd.DataFrame, 
                      test_df: pd.DataFrame, 
                      input_features: List[str], 
                      target_features: List[str],
                      model_name: str = "Model") -> Tuple[BaseEstimator, Optional[Dict]]:
    """
    Fit ``model`` on the training frame, score it on the test frame, and report.

    Both frames are converted with ``prepare_data``. Any exception raised while
    fitting or evaluating is caught, reported on stdout, and signalled to the
    caller by returning ``None`` in place of the evaluation results.

    Args:
        model: Model instance to train (must implement fit and evaluate methods)
        train_df: Training dataset as pandas DataFrame
        test_df: Testing dataset as pandas DataFrame
        input_features: List of input feature column names
        target_features: List of target feature column names
        model_name: Display name for progress reporting (default: "Model")

    Returns:
        Tuple of (model, evaluation_results):
        - model: The same instance that was passed in (fitted when training succeeded)
        - evaluation_results: Whatever ``model.evaluate`` returned (its 'metrics'
          entry must provide 'mae', 'rmse' and 'nll'), or None if either
          phase failed

    Example:
        >>> trained_model, results = train_and_evaluate(
        ...     model, train_df, test_df, ['a', 'b', 'c'], ['y'], "BNN"
        ... )
        >>> if results:
        ...     print(f"MAE: {results['metrics']['mae']:.3f}")
    """
    # Turn both frames into model inputs and targets
    train_inputs, train_targets = prepare_data(train_df, input_features, target_features)
    test_inputs, test_targets = prepare_data(test_df, input_features, target_features)

    print(f"\nTraining {model_name}...")
    print(f"  Training samples: {len(train_inputs)}")
    print(f"  Test samples: {len(test_inputs)}")
    print(f"  Features: {len(input_features)}")

    # Phase 1: fitting. A failure here ends the run for this model.
    try:
        model.fit(train_inputs, train_targets)
        print("  ✓ Training completed successfully")
    except Exception as fit_error:
        print(f"  ✗ Training failed: {fit_error!s}")
        return model, None

    # Phase 2: evaluation. Reporting is part of the guarded region, so a
    # missing metric key is reported as an evaluation failure as well.
    outcome = None
    try:
        report = model.evaluate(test_inputs, test_targets)
        scores = report['metrics']

        print(f"\n{model_name} Performance:")
        for label, key, precision in (("MAE: ", 'mae', 3),
                                      ("RMSE:", 'rmse', 3),
                                      ("NLL: ", 'nll', 4)):
            print(f"  {label} {scores[key]:.{precision}f}")

        outcome = report
    except Exception as eval_error:
        print(f"  ✗ Evaluation failed: {eval_error!s}")

    return model, outcome


def run_all_models(train_df: pd.DataFrame, 
                  test_df: pd.DataFrame,
                  input_features: List[str],
                  target_features: List[str], 
                  distributions: Optional[Dict[str, str]] = None,
                  bnn_prior: str = 'normal', 
                  bnn_likelihood: str = 'gamma') -> Tuple[Dict, Dict]:
    """
    Train and evaluate all available models with configurable distributions.
    
    Provides a comprehensive comparison of Linear Regression, Random Forest,
    and Bayesian Neural Network models with different distribution configurations.
    Each model is constructed, trained and evaluated inside its own guarded
    step, so a model that cannot even be instantiated is reported and skipped
    instead of aborting the whole comparison.
    
    Args:
        train_df: Training dataset as pandas DataFrame
        test_df: Testing dataset as pandas DataFrame
        input_features: List of input feature column names
        target_features: List of target feature column names
        distributions: Distribution config for each model type. If None, defaults
                      to {'linear': 'gamma', 'rf': 'gamma', 'bnn': 'gamma'}.
                      Only the 'linear' and 'rf' entries select a model
                      distribution; the BNN uses bnn_prior / bnn_likelihood.
        bnn_prior: Prior distribution for BNN weights (default: 'normal')
        bnn_likelihood: Likelihood distribution for BNN outputs (default: 'gamma')
        
    Returns:
        Tuple of (trained_models_dict, results_dict):
        - trained_models_dict: Successfully trained model instances
        - results_dict: Evaluation results for each successful model
        
    Raises:
        ValueError: If invalid distribution types are specified
        
    Example:
        >>> # Standard comparison with default gamma distributions
        >>> models, results = run_all_models(train_df, test_df, 
        ...                                 ['dist', 'time'], ['travel_time'])
        >>> 
        >>> # Custom distribution configuration
        >>> custom_dists = {'linear': 'normal', 'rf': 'lognormal', 'bnn': 'gamma'}
        >>> models, results = run_all_models(train_df, test_df, 
        ...                                 ['dist', 'time'], ['travel_time'],
        ...                                 distributions=custom_dists,
        ...                                 bnn_prior='gamma', bnn_likelihood='normal')
    """
    # Set default distributions if not provided
    if distributions is None:
        distributions = {'linear': 'gamma', 'rf': 'gamma', 'bnn': 'gamma'}
    
    # Validate all distribution parameters
    for model_type, dist in distributions.items():
        if dist not in SUPPORTED_DISTRIBUTIONS:
            raise ValueError(f"Invalid distribution '{dist}' for {model_type}. "
                           f"Choose from: {SUPPORTED_DISTRIBUTIONS}")
    
    if bnn_prior not in SUPPORTED_DISTRIBUTIONS:
        raise ValueError(f"Invalid BNN prior '{bnn_prior}'. "
                        f"Choose from: {SUPPORTED_DISTRIBUTIONS}")
    if bnn_likelihood not in SUPPORTED_DISTRIBUTIONS:
        raise ValueError(f"Invalid BNN likelihood '{bnn_likelihood}'. "
                        f"Choose from: {SUPPORTED_DISTRIBUTIONS}")
    
    # Model constructors are registered as zero-argument factories and only
    # called inside the per-model try block below. Building all instances up
    # front would let one failing constructor (for example a model whose
    # backend is unavailable) take down every other model with it.
    model_factories = {
        'Linear Regression': lambda: LinearRegressionGamma(
            distribution=distributions.get('linear', 'gamma')
        ),
        'Random Forest': lambda: RandomForestRegressorGamma(
            n_estimators=100, 
            distribution=distributions.get('rf', 'gamma')
        ),
        'Bayesian Neural Network': lambda: BNNPredictor(
            input_dim=len(input_features),
            input_features=input_features,
            target_features=target_features,
            hidden_dim=32,
            prior_dist=bnn_prior,
            likelihood_dist=bnn_likelihood
        )
    }

    # Storage for successful results
    trained_models = {}
    evaluation_results = {}

    # Header for model comparison
    print(f"{'='*60}")
    print("COMPREHENSIVE MODEL COMPARISON")
    print(f"{'='*60}")
    print(f"Dataset: {len(train_df)} train, {len(test_df)} test samples")
    print(f"Features: {input_features}")
    print(f"Target: {target_features}")
    print(f"Distributions: {distributions}")
    if 'Bayesian Neural Network' in model_factories:
        print(f"BNN Config: Prior={bnn_prior}, Likelihood={bnn_likelihood}")

    # Process each model with individual error handling
    for model_name, build_model in model_factories.items():
        print(f"\n{'-'*40}")
        print(f"Processing {model_name}...")

        try:
            # Instantiate lazily so construction errors are contained here
            model_instance = build_model()

            # Display model-specific configuration
            if model_name == 'Bayesian Neural Network':
                print(f"  Prior: {bnn_prior}")
                print(f"  Likelihood: {bnn_likelihood}")
                print(f"  Hidden units: {model_instance.hidden_dim}")
            else:
                dist_key = 'linear' if 'Linear' in model_name else 'rf'
                print(f"  Distribution: {distributions.get(dist_key, 'gamma')}")

            # Train and evaluate model
            trained_model, results = train_and_evaluate(
                model_instance, train_df, test_df, 
                input_features, target_features, model_name
            )

            # Store successful results
            if results is not None:
                trained_models[model_name] = trained_model
                evaluation_results[model_name] = results
                print(f"  ✓ {model_name} completed successfully")
            else:
                # train_and_evaluate returns None for a failure in either phase
                print(f"  ✗ {model_name} failed during training or evaluation")
                
        except Exception as e:
            print(f"  ✗ {model_name} failed with error: {str(e)}")
            print(f"    Continuing with remaining models...")
            continue

    # Generate comparison summary
    if evaluation_results:
        print(f"\n{'='*60}")
        print("MODEL PERFORMANCE SUMMARY")
        print(f"{'='*60}")

        # Create formatted results table
        print(f"{'Model':<25} {'MAE':<8} {'RMSE':<8} {'NLL':<10} {'Distribution'}")
        print(f"{'-'*65}")

        # Sort by NLL (lower is better) for ranking
        sorted_results = sorted(evaluation_results.items(), 
                              key=lambda x: x[1]['metrics']['nll'])

        for model_name, result in sorted_results:
            metrics = result['metrics']
            
            # Get distribution info
            if 'Linear' in model_name:
                dist_info = distributions.get('linear', 'gamma')
            elif 'Forest' in model_name:
                dist_info = distributions.get('rf', 'gamma')
            else:  # BNN
                dist_info = bnn_likelihood
            
            print(f"{model_name:<25} {metrics['mae']:<8.3f} {metrics['rmse']:<8.3f} "
                  f"{metrics['nll']:<10.4f} {dist_info}")

        # Highlight best performing model
        best_model_name = sorted_results[0][0]
        best_nll = sorted_results[0][1]['metrics']['nll']
        print(f"\n🏆 Best Model: {best_model_name} (NLL: {best_nll:.4f})")
        
        print(f"{'='*60}")
    else:
        print(f"\n⚠️  No models completed successfully!")

    return trained_models, evaluation_results


def compare_distributions(train_df: pd.DataFrame, 
                         test_df: pd.DataFrame,
                         input_features: List[str],
                         target_features: List[str], 
                         model_type: str = 'linear') -> Dict[str, Dict]:
    """
    Evaluate one model family under every supported distribution.

    For each entry of ``SUPPORTED_DISTRIBUTIONS`` a fresh model of the
    requested type is built, trained and evaluated through
    ``train_and_evaluate``. Failures are reported and skipped. A ranking by
    NLL (lowest first) is printed at the end.

    Args:
        train_df: Training dataset
        test_df: Testing dataset
        input_features: List of input feature column names
        target_features: List of target feature column names
        model_type: Model type to test ('linear', 'rf', or 'bnn'). For 'bnn'
            the same distribution is used as both prior and likelihood.

    Returns:
        Dictionary mapping distribution names to their evaluation metrics
        (only distributions whose run succeeded are present)

    Raises:
        ValueError: If unknown model_type is specified

    Example:
        >>> rf_results = compare_distributions(train_df, test_df,
        ...                                   ['distance', 'time'], ['travel_time'],
        ...                                   model_type='rf')
        >>> best_dist = min(rf_results.items(), key=lambda x: x[1]['nll'])
        >>> print(f"Best distribution for RF: {best_dist[0]}")
    """
    if model_type not in ['linear', 'rf', 'bnn']:
        raise ValueError(f"Unknown model type: {model_type}. "
                        f"Choose from: 'linear', 'rf', 'bnn'")

    # Display-name prefix for each model family
    name_prefix = {'linear': 'Linear', 'rf': 'RandomForest', 'bnn': 'BNN'}[model_type]

    def _instantiate(dist_name):
        """Build a fresh, unfitted model of the requested family."""
        if model_type == 'linear':
            return LinearRegressionGamma(distribution=dist_name)
        if model_type == 'rf':
            return RandomForestRegressorGamma(n_estimators=100, distribution=dist_name)
        return BNNPredictor(
            input_dim=len(input_features),
            input_features=input_features,
            target_features=target_features,
            hidden_dim=32,
            prior_dist=dist_name,
            likelihood_dist=dist_name
        )

    metrics_by_distribution = {}
    banner = '=' * 60

    print(f"\n{banner}")
    print(f"DISTRIBUTION COMPARISON FOR {model_type.upper()} MODEL")
    print(f"{banner}")
    print(f"Testing all distributions: {SUPPORTED_DISTRIBUTIONS}")

    # One guarded build/train/evaluate cycle per distribution
    for dist_name in SUPPORTED_DISTRIBUTIONS:
        print(f"\n📊 Testing {dist_name.upper()} distribution...")

        try:
            candidate = _instantiate(dist_name)
            display_name = f"{name_prefix}-{dist_name.capitalize()}"

            _, evaluation = train_and_evaluate(
                candidate, train_df, test_df, input_features, target_features, display_name
            )

            if evaluation is None:
                print(f"  ✗ {dist_name.capitalize()} evaluation failed")
            else:
                metrics_by_distribution[dist_name] = evaluation['metrics']
                print(f"  ✓ {dist_name.capitalize()} completed successfully")

        except Exception as error:
            print(f"  ✗ {dist_name.capitalize()} failed: {error!s}")
            continue

    # Nothing succeeded: warn and hand back the empty mapping
    if not metrics_by_distribution:
        print(f"\n⚠️  No distributions completed successfully for {model_type}!")
        return metrics_by_distribution

    rule = '-' * 50
    print(f"\n{model_type.upper()} DISTRIBUTION COMPARISON RESULTS:")
    print(f"{rule}")
    print(f"{'Distribution':<12} {'MAE':<8} {'RMSE':<8} {'NLL':<10}")
    print(f"{rule}")

    # Rank by NLL, lowest first
    ranking = sorted(metrics_by_distribution.items(), key=lambda item: item[1]['nll'])

    for ranked_name, ranked_metrics in ranking:
        print(f"{ranked_name:<12} {ranked_metrics['mae']:<8.3f} {ranked_metrics['rmse']:<8.3f} "
              f"{ranked_metrics['nll']:<10.4f}")

    # Highlight best distribution
    best_dist, best_metrics = ranking[0]
    best_nll = best_metrics['nll']
    print(f"\n🏆 Best distribution for {model_type}: {best_dist} (NLL: {best_nll:.4f})")

    return metrics_by_distribution


def create_example_data(n_samples: int = 1000, 
                       random_seed: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build a synthetic travel-time dataset and split it 80/20 in row order.

    NumPy's global random generator is seeded with ``random_seed``, so the
    call is reproducible but also resets the global random state.

    Generated columns:
    - distance: uniform on [1, 50)
    - time_of_day: uniform on [0, 24)
    - traffic_density: uniform on [0.1, 1.0)
    - weather_impact: uniform on [0.8, 1.2)
    - travel_time: (2 * distance + N(0, 2) noise) multiplied by a rush-hour
      factor (1.3 when time_of_day is within 7-9 or 17-19, else 1.0), by
      1 + 0.5 * traffic_density and by weather_impact, then floored at 1.0

    Args:
        n_samples: Total number of samples to generate (default: 1000)
        random_seed: Random seed for reproducibility (default: 42)

    Returns:
        Tuple of (train_df, test_df): the first ``int(0.8 * n_samples)`` rows
        and the remaining rows, each with a fresh 0-based index

    Example:
        >>> train_df, test_df = create_example_data(n_samples=500)
        >>> print(f"Training samples: {len(train_df)}")
        >>> print(f"Features: {list(train_df.columns)}")
    """
    np.random.seed(random_seed)

    print("Generating synthetic travel time dataset...")
    print(f"  Samples: {n_samples}")
    print(f"  Random seed: {random_seed}")

    # Feature draws, in the fixed order the random stream is consumed
    feature_bounds = (
        ('distance', 1, 50),            # Trip distance (km)
        ('time_of_day', 0, 24),         # Hour of day
        ('traffic_density', 0.1, 1.0),  # Traffic factor
        ('weather_impact', 0.8, 1.2),   # Weather factor
    )
    features = {
        column: np.random.uniform(low, high, n_samples)
        for column, low, high in feature_bounds
    }

    # Base time proportional to distance, plus Gaussian noise
    noise = np.random.normal(0, 2, n_samples)
    base_time = features['distance'] * 2 + noise

    # Rush hour effects (7-9 AM and 5-7 PM): 30% increase
    hour = features['time_of_day']
    morning_peak = (hour >= 7) & (hour <= 9)
    evening_peak = (hour >= 17) & (hour <= 19)
    rush_hour_multiplier = np.where(morning_peak | evening_peak, 1.3, 1.0)

    # Traffic density effect: up to 50% increase
    traffic_multiplier = 1 + features['traffic_density'] * 0.5

    # Combine the effects and keep every travel time at or above 1.0
    travel_time = np.clip(
        base_time * rush_hour_multiplier * traffic_multiplier * features['weather_impact'],
        1.0,
        None,
    )

    dataset = pd.DataFrame({**features, 'travel_time': travel_time})

    # Sequential 80/20 split
    cutoff = int(0.8 * n_samples)
    train_df = dataset.iloc[:cutoff].reset_index(drop=True)
    test_df = dataset.iloc[cutoff:].reset_index(drop=True)

    print(f"  ✓ Generated {len(train_df)} training and {len(test_df)} test samples")
    print(f"  ✓ Travel time range: {travel_time.min():.1f} - {travel_time.max():.1f} minutes")

    return train_df, test_df


def main_example():
    """
    Run the full model comparison on the processed trips file.

    Steps:
    - Read the CSV at ``trips_input_filename`` and split it 80/20
      (``random_state=42``) into train and test frames
    - Print a short dataset summary
    - Call ``run_all_models`` with the gamma distribution for every model type

    Relies on the module-level names ``trips_input_filename``,
    ``input_features`` and ``target_features`` being defined before the call.

    Returns:
        Tuple of (models, results) exactly as returned by ``run_all_models``
    """
    separator = "="*70

    print("🚗 Bayesian Travel Time Prediction Library - Comprehensive Example")
    print(separator)

    # Step 1: load the trips and hold out 20% for testing
    print("\n📊 STEP 1: Read Data")
    trips_df = pd.read_csv(trips_input_filename)
    train_df, test_df = train_test_split(trips_df, test_size=0.2, random_state=42)

    print("\nDataset Summary:")
    print(f"  Input features: {input_features}")
    print(f"  Target feature: {target_features[0]}")
    print(f"  Training samples: {len(train_df)}")
    print(f"  Test samples: {len(test_df)}")

    # Step 2: compare every model, each configured with the gamma distribution
    print("\n🤖 STEP 2: Model Comparison")
    gamma_everywhere = dict.fromkeys(('linear', 'rf', 'bnn'), 'gamma')
    models, results = run_all_models(
        train_df, test_df, input_features, target_features,
        distributions=gamma_everywhere
    )

    print("\n✅ completed successfully!")
    print(separator)

    return models, results

# Run the end-to-end comparison only when this code is executed as the main
# program; the trained models and their evaluation results are bound to the
# module-level names `models` and `results`.
if __name__ == "__main__":
    models, results =main_example()

🚗 Bayesian Travel Time Prediction Library - Comprehensive Example

📊 STEP 1: Read Data

Dataset Summary:
  Input features: ['origin_lat', 'origin_lon', 'destination_lat', 'destination_lon', 'departure_time_hour', 'departure_time_minute', 'departure_time_day_of_week', 'departure_time_day_of_month', 'departure_time_month', 'departure_time_hour_sin', 'departure_time_hour_cos', 'departure_time_day_of_week_sin', 'departure_time_day_of_week_cos', 'departure_time_month_sin', 'departure_time_month_cos', 'distance']
  Target feature: travel_time
  Training samples: 12
  Test samples: 4

🤖 STEP 2: Model Comparison
COMPREHENSIVE MODEL COMPARISON
Dataset: 12 train, 4 test samples
Features: ['origin_lat', 'origin_lon', 'destination_lat', 'destination_lon', 'departure_time_hour', 'departure_time_minute', 'departure_time_day_of_week', 'departure_time_day_of_month', 'departure_time_month', 'departure_time_hour_sin', 'departure_time_hour_cos', 'departure_time_day_of_week_sin', 'departure_time_day_of_we

In [34]:
def compute_normal_percentiles(mean, std, percentiles):
    """
    Compute percentiles directly from normal distribution parameters.

    The quantile function is evaluated in one vectorised call, so ``mean``
    and ``std`` may be scalars, sequences, or arrays of any mutually
    broadcastable shapes (for example an array of means with one shared std).

    Args:
        mean: Mean parameter(s) of normal distribution
        std: Standard deviation parameter(s) of normal distribution
        percentiles: List of percentiles to compute (0-100)

    Returns:
        Dictionary keyed by ``f'p{percentile}'`` (e.g. 'p97.5'). Each value is
        a scalar when both parameters are scalars, otherwise an array with
        the broadcast shape of the parameters.
    """
    # Converting up front makes lists, pandas objects and 0-d arrays behave
    # the same as ndarrays and removes the per-element Python loop.
    loc = np.asarray(mean, dtype=float)
    scale = np.asarray(std, dtype=float)

    return {
        f'p{p}': stats.norm.ppf(p / 100, loc=loc, scale=scale)
        for p in percentiles
    }

def compute_lognormal_percentiles(log_mean, log_std, percentiles):
    """
    Compute percentiles directly from log-normal distribution parameters.

    The quantile function is evaluated in one vectorised call, so the
    parameters may be scalars, sequences, or arrays of any mutually
    broadcastable shapes.

    Args:
        log_mean: Log-scale mean parameter(s) of log-normal distribution
        log_std: Log-scale standard deviation parameter(s) of log-normal distribution
        percentiles: List of percentiles to compute (0-100)

    Returns:
        Dictionary keyed by ``f'p{percentile}'``. Each value is a scalar when
        both parameters are scalars, otherwise an array with the broadcast
        shape of the parameters.
    """
    # scipy parameterises the log-normal by shape s = sigma and
    # scale = exp(mu), hence the exponentiation of the log-scale mean.
    sigma = np.asarray(log_std, dtype=float)
    median = np.exp(np.asarray(log_mean, dtype=float))

    return {
        f'p{p}': stats.lognorm.ppf(p / 100, s=sigma, scale=median)
        for p in percentiles
    }

def compute_gamma_percentiles(alpha, beta, percentiles):
    """
    Compute percentiles directly from gamma distribution parameters.

    The quantile function is evaluated in one vectorised call, so the
    parameters may be scalars, sequences, or arrays of any mutually
    broadcastable shapes.

    Args:
        alpha: Shape parameter(s) of gamma distribution
        beta: Rate parameter(s) of gamma distribution
        percentiles: List of percentiles to compute (0-100)

    Returns:
        Dictionary keyed by ``f'p{percentile}'``. Each value is a scalar when
        both parameters are scalars, otherwise an array with the broadcast
        shape of the parameters.
    """
    shape = np.asarray(alpha, dtype=float)
    # scipy expects a scale parameter, which is the reciprocal of the rate
    scale = 1.0 / np.asarray(beta, dtype=float)

    return {
        f'p{p}': stats.gamma.ppf(p / 100, a=shape, scale=scale)
        for p in percentiles
    }

def predict_thresholds_distribution_based(new_data_np, model, confidence_levels=[0.95, 0.50]):
    """
    Derive travel-time thresholds analytically from a model's predicted mean and std.

    Each confidence level ``c`` is mapped to the percentile ``100 * (1 + c) / 2``
    (so 0.95 -> 97.5, the upper edge of the central 95% interval). The
    percentile is then read off the distribution family named by
    ``model.distribution``, falling back to ``model.likelihood_dist`` and
    finally to 'gamma'.

    Args:
        new_data_np: Numpy array of features
        model: Trained model object exposing ``predict_distribution``
        confidence_levels: List of confidence levels (0-1)

    Returns:
        Dictionary keyed by confidence level. Each value is a dict with
        'threshold' (percentile values), 'mean' and 'std' (as returned by the model)

    Raises:
        ValueError: If the resolved distribution is not 'normal',
            'lognormal' or 'gamma'
    """
    # Only the forecast call is shielded from warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        forecast = model.predict_distribution(new_data_np, n_samples=1000)

    mean_pred, std_pred = forecast['mean'], forecast['std']

    # Upper edge of the central interval for each confidence level
    percentiles = [100 * (1 + level) / 2 for level in confidence_levels]

    # Resolve which family describes the model's predictive distribution
    if hasattr(model, 'distribution'):
        family = model.distribution
    else:
        family = getattr(model, 'likelihood_dist', 'gamma')

    if family == 'gamma':
        # Method of moments: shape = (mean/std)^2, rate = mean/std^2
        shape = np.maximum((mean_pred / std_pred) ** 2, 1e-6)
        rate = np.maximum(mean_pred / (std_pred ** 2), 1e-6)
        percentile_results = compute_gamma_percentiles(shape, rate, percentiles)
    elif family == 'lognormal':
        # Original-scale moments turned into log-scale parameters, with the
        # coefficient of variation standing in for the log-scale std
        mu_log = np.log(mean_pred)
        sigma_log = std_pred / mean_pred
        percentile_results = compute_lognormal_percentiles(mu_log, sigma_log, percentiles)
    elif family == 'normal':
        percentile_results = compute_normal_percentiles(mean_pred, std_pred, percentiles)
    else:
        raise ValueError(f"Unsupported distribution: {family}")

    # Re-key the percentile values by the confidence level they came from
    return {
        level: {
            'threshold': percentile_results[f'p{pct}'],
            'mean': mean_pred,
            'std': std_pred
        }
        for level, pct in zip(confidence_levels, percentiles)
    }

def predict_thresholds_and_save_distribution_based(input_df, output_csv_path, model, chunk_size=1000, confidence_levels=[0.95, 0.75, 0.50]):
    """
    Predict travel time thresholds for each row in the input DataFrame using direct distribution
    calculations, and save the results along with the original data to a CSV file.

    The input is processed in chunks of ``chunk_size`` rows. Each chunk is handed to
    ``predict_thresholds_distribution_based`` and then appended to the output file, so only
    one chunk of predictions is held at a time.

    Args:
        input_df: Pandas DataFrame containing the input data. It must provide the 16 feature
            columns listed in ``feature_cols``; every column of the frame is written to the CSV.
        output_csv_path: Path to save the output CSV file with predictions. The parent
            directory is created when missing and an existing file is overwritten.
        model: Trained model object, forwarded to ``predict_thresholds_distribution_based``
        chunk_size: Number of rows to process in each chunk
        confidence_levels: List of confidence levels (0-1). Each level adds one column named
            ``threshold_<pct>`` where ``<pct>`` is the level as a rounded percentage
            (0.95 -> ``threshold_95``). The default list is never mutated here.

    Returns:
        None. Results are written to ``output_csv_path``.
    """
    # Check if output directory exists, create if not
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Feature columns fed to the model, in this exact order (loop-invariant, so built once).
    feature_cols = ['origin_lat', 'origin_lon', 'destination_lat', 'destination_lon',
                    'departure_time_hour', 'departure_time_minute',
                    'departure_time_day_of_week', 'departure_time_day_of_month',
                    'departure_time_month', 'departure_time_hour_sin',
                    'departure_time_hour_cos', 'departure_time_day_of_week_sin',
                    'departure_time_day_of_week_cos', 'departure_time_month_sin',
                    'departure_time_month_cos', 'distance']

    # round() instead of bare int(): 0.29 * 100 evaluates to 28.999999999999996, which
    # int() would truncate to 28 and mislabel the column.
    threshold_columns = [(conf, f'threshold_{int(round(conf * 100))}') for conf in confidence_levels]

    n_rows = len(input_df)
    if n_rows == 0:
        # Still (over)write the file so a stale CSV from an earlier run is never mistaken
        # for the result of this call.
        empty_df = input_df.copy()
        for _, column_name in threshold_columns:
            empty_df[column_name] = pd.Series(dtype=float)
        empty_df.to_csv(output_csv_path, index=False, mode='w')
        print(f"No rows to process. Header-only file saved to {output_csv_path}")
        return

    # Calculate number of chunks
    n_chunks = (n_rows + chunk_size - 1) // chunk_size  # Ceiling division

    # Process each chunk
    for i in range(n_chunks):
        # Extract chunk
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, n_rows)
        chunk_df = input_df.iloc[start_idx:end_idx].copy()

        print(f"Processing chunk {i+1}/{n_chunks}, rows: {start_idx} to {end_idx-1}")

        # Convert features to numpy array for prediction
        features_np = chunk_df[feature_cols].values

        # Get predictions for this chunk using distribution-based method
        thresholds = predict_thresholds_distribution_based(features_np, model, confidence_levels)

        # Add one threshold column per confidence level
        for conf, column_name in threshold_columns:
            chunk_df[column_name] = thresholds[conf]['threshold']

        # The first chunk creates the file and writes the header; later chunks append rows only
        is_first_chunk = (i == 0)
        chunk_df.to_csv(output_csv_path, index=False,
                        mode='w' if is_first_chunk else 'a',
                        header=is_first_chunk)

    print(f"Processing complete. Results saved to {output_csv_path}")


# Load the generated virtual DRT trips (path set in the input-parameters cell).
df = pd.read_csv(generated_virtual_drt_trips)

# Departure-time features already carry the names the prediction step expects,
# so they are mapped onto themselves below.
_time_feature_columns = [
    'departure_time_hour',
    'departure_time_minute',
    'departure_time_day_of_week',
    'departure_time_day_of_month',
    'departure_time_month',
    'departure_time_hour_sin',
    'departure_time_hour_cos',
    'departure_time_day_of_week_sin',
    'departure_time_day_of_week_cos',
    'departure_time_month_sin',
    'departure_time_month_cos',
]

# Create a mapping dictionary for renaming
# NOTE(review): start_x/end_x are mapped to latitudes and start_y/end_y to longitudes --
# confirm this matches the axis convention of the synthetic-trips CSV.
column_mapping = {
    'start_x': 'origin_lat',
    'start_y': 'origin_lon',
    'end_x': 'destination_lat',
    'end_y': 'destination_lon',
    'road_distance': 'distance',
}
column_mapping.update({name: name for name in _time_feature_columns})

df_renamed = df.rename(columns=column_mapping)

# Keep only the model features: coordinates, time features, then distance.
final_columns = (
    ['origin_lat', 'origin_lon', 'destination_lat', 'destination_lon']
    + _time_feature_columns
    + ['distance']
)
df_final = df_renamed[final_columns]


# Call the function to predict and save
# NOTE(review): this calls predict_thresholds_and_save, not the *_distribution_based
# variant defined just above -- confirm that is the intended predictor.
predict_thresholds_and_save(df_final, trips_output_path_generated_drt_trips, models["Random Forest"])

Processing chunk 1/42, rows: 0 to 999
Processing chunk 2/42, rows: 1000 to 1999
Processing chunk 3/42, rows: 2000 to 2999
Processing chunk 4/42, rows: 3000 to 3999
Processing chunk 5/42, rows: 4000 to 4999
Processing chunk 6/42, rows: 5000 to 5999
Processing chunk 7/42, rows: 6000 to 6999
Processing chunk 8/42, rows: 7000 to 7999
Processing chunk 9/42, rows: 8000 to 8999
Processing chunk 10/42, rows: 9000 to 9999
Processing chunk 11/42, rows: 10000 to 10999
Processing chunk 12/42, rows: 11000 to 11999
Processing chunk 13/42, rows: 12000 to 12999
Processing chunk 14/42, rows: 13000 to 13999
Processing chunk 15/42, rows: 14000 to 14999
Processing chunk 16/42, rows: 15000 to 15999
Processing chunk 17/42, rows: 16000 to 16999
Processing chunk 18/42, rows: 17000 to 17999
Processing chunk 19/42, rows: 18000 to 18999
Processing chunk 20/42, rows: 19000 to 19999
Processing chunk 21/42, rows: 20000 to 20999
Processing chunk 22/42, rows: 21000 to 21999
Processing chunk 23/42, rows: 22000 to 2299

##  Create GTFS files

In [40]:
'''import sys
sys.path.append('/content/drive/MyDrive/ColabNotebooks/AccessibilityDRTcityChrone_CL/library/')  # Path to the folder containing the .py file
from create_GTFS_from_Simulated_Trips import *  # Import all functions from the .py file'
'''
from datetime import datetime, timedelta
import zipfile
import os
#from tqdm import tqdm  # Using regular tqdm instead of tqdm.notebook

### input variable to create GTFS ###
# Percentile of the predicted travel time used as the trip duration; it selects the
# `threshold_<percentile>` column of the predictions CSV through `config` below.
percentile_travel_time = 95



# *** Define configuration with your column names ***
config = {
    'region': 'orleans',  # replace with your region
    # The key must be 'travel_time_threshold': that is the name create_gtfs_from_trips() reads.
    # Under any other key the value is ignored and the 'threshold_95' fallback is used.
    'travel_time_threshold': f"threshold_{percentile_travel_time}",  # or whichever travel time column you want to use
    'coord_columns': {
        'origin_lat': 'origin_lat',
        'origin_lon': 'origin_lon',
        'destination_lat': 'destination_lat',
        'destination_lon': 'destination_lon'
    },
    'time_columns': {
        'departure_hour': 'departure_time_hour',
        'departure_min': 'departure_time_minute'
    },
    'start_date': day_gtfs,
    'end_date': day_gtfs,
    'service_days': {
        'monday': 1, 'tuesday': 1, 'wednesday': 1, 'thursday': 1, 'friday': 1,
        'saturday': 0, 'sunday': 0
    },
    'agency_info': {
        'agency_id': scenario_name,
        # f-string so the scenario name is interpolated instead of emitted literally
        'agency_name': f'{scenario_name} Service',
        'agency_url': 'https://example.com',
        # Same timezone as the fallback in create_gtfs_from_trips().
        # NOTE(review): GTFS requires one timezone per feed -- confirm before merging feeds.
        'agency_timezone': 'Europe/Paris'
    },
    # replace with your desired output path
    "scenario_name": scenario_name,  # replace with your scenario name
    "output_path": directory_GTFS
}
# *** adjust the columns above if they have different names ***

def create_calendar(start_date, end_date, service_days):
    """Create calendar.txt with service dates based on user parameters

    Parameters:
    - start_date: value written to the start_date column
    - end_date: value written to the end_date column
    - service_days: Dict mapping lowercase weekday names to a flag; missing days count as 0

    Returns:
    - One-row DataFrame for the single service 'WEEKDAY'
    """
    weekday_names = ('monday', 'tuesday', 'wednesday', 'thursday',
                     'friday', 'saturday', 'sunday')

    # Assemble the single record in calendar.txt column order.
    record = {'service_id': 'WEEKDAY'}
    for day in weekday_names:
        record[day] = service_days.get(day, 0)
    record['start_date'] = start_date
    record['end_date'] = end_date

    # Wrap every value in a one-element list to get a one-row table.
    return pd.DataFrame({column: [value] for column, value in record.items()})

def create_agency(agency_info):
    """Create agency.txt with user-defined parameters

    Parameters:
    - agency_info: Dict that may provide 'agency_id', 'agency_name', 'agency_url' and
      'agency_timezone'; missing keys fall back to the defaults below

    Returns:
    - One-row DataFrame with the four agency columns

    The id and name defaults come from the notebook-level ``scenario_name``, which is only
    read when one of those two keys is actually missing.
    """
    defaults = {
        'agency_url': 'https://example.com',
        # Same fallback timezone as create_gtfs_from_trips(), so both defaults agree.
        'agency_timezone': 'Europe/Paris',
    }
    if 'agency_id' not in agency_info or 'agency_name' not in agency_info:
        defaults['agency_id'] = scenario_name
        # f-string: without the prefix the literal text "{scenario_name} Service" is emitted
        defaults['agency_name'] = f'{scenario_name} Service'

    columns = ('agency_id', 'agency_name', 'agency_url', 'agency_timezone')
    return pd.DataFrame({
        column: [agency_info.get(column, defaults.get(column))] for column in columns
    })

def create_stops(trips_df, region, coord_columns):
    """
    Create stops.txt with unique points and return a mapping dict for coordinates to stop_ids

    Each row is scanned origin first, then destination; a (lat, lon) pair gets the next
    sequential stop id the first time it is seen.

    Parameters:
    - trips_df: DataFrame with trip data
    - region: Region name, lower-cased into the 'city' and 'file' columns
    - coord_columns: Dict mapping coordinate column names

    Returns:
    - stops_df: DataFrame with unique stops
    - coords_to_stop_id: Dictionary mapping (lat, lon) tuples to stop_ids
    """
    # (lat column, lon column) for each trip end, in processing order.
    endpoint_columns = (
        (coord_columns.get('origin_lat', 'origin_lat'),
         coord_columns.get('origin_lon', 'origin_lon')),
        (coord_columns.get('destination_lat', 'destination_lat'),
         coord_columns.get('destination_lon', 'destination_lon')),
    )

    print("Creating unified stops...")

    coords_to_stop_id = {}
    stop_records = []

    for _, trip in trips_df.iterrows():
        for lat_col, lon_col in endpoint_columns:
            point = (trip[lat_col], trip[lon_col])
            if point in coords_to_stop_id:
                continue

            # Next free index == number of stops registered so far.
            position = len(stop_records)
            new_stop_id = f"stop_{position}"
            coords_to_stop_id[point] = new_stop_id
            stop_records.append({
                'stop_id': new_stop_id,
                'stop_name': f'Stop_{position}',
                'stop_lat': point[0],
                'stop_lon': point[1],
                'city': region.lower(),
                'file': region.lower(),
                'pos': position
            })

    stops_df = pd.DataFrame(stop_records)

    print(f"Created {len(stops_df)} unique stops")

    return stops_df, coords_to_stop_id

def _format_gtfs_time(total_seconds):
    """Format seconds since midnight as a GTFS HH:MM:SS string; hours may exceed 23."""
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def create_stop_times(trips_df, travel_time_threshold, time_columns, coord_columns, coords_to_stop_id):
    """
    Create stop_times.txt using a pre-existing coordinate to stop_id mapping

    Every trip produces two rows: the origin stop (sequence 1) at the departure time and
    the destination stop (sequence 2) at departure time plus the travel time. Arrivals
    that fall after midnight are written as 24:xx:xx or later, as GTFS requires, instead
    of wrapping around to 00:xx:xx (which would place the arrival before the departure).

    Parameters:
    - trips_df: DataFrame with trip data
    - travel_time_threshold: Column name with travel time data (seconds; fractions are
      truncated; values are expected to be non-negative)
    - time_columns: Dict mapping time column names
    - coord_columns: Dict mapping coordinate column names
    - coords_to_stop_id: Dictionary mapping (lat, lon) tuples to stop_ids

    Returns:
    - DataFrame with columns trip_id, arrival_time, departure_time, stop_id, stop_sequence

    Raises:
    - ValueError: if a departure hour is negative or a departure minute is outside 0-59
    - KeyError: if a trip's coordinates are missing from coords_to_stop_id
    """
    trip_ids = []
    arrival_times = []
    departure_times_list = []
    stop_ids = []
    stop_sequences = []

    # Extract column names
    origin_lat = coord_columns.get('origin_lat', 'origin_lat')
    origin_lon = coord_columns.get('origin_lon', 'origin_lon')
    destination_lat = coord_columns.get('destination_lat', 'destination_lat')
    destination_lon = coord_columns.get('destination_lon', 'destination_lon')
    departure_hour = time_columns.get('departure_hour', 'departure_hour')
    departure_min = time_columns.get('departure_min', 'departure_min')

    print(f"Creating stop times for {len(trips_df)} trips...")
    for idx, row in trips_df.iterrows():
        # Departure as seconds since midnight
        dep_hour = int(row[departure_hour])
        dep_minute = int(row[departure_min])
        if dep_hour < 0 or not 0 <= dep_minute <= 59:
            raise ValueError(f"Invalid departure time {dep_hour}:{dep_minute} for trip index {idx}")
        dep_seconds = dep_hour * 3600 + dep_minute * 60
        dep_time = _format_gtfs_time(dep_seconds)

        # Arrival = departure + travel time, kept on the same service day (no wrap at 24h)
        travel_seconds = int(float(row[travel_time_threshold]))
        arr_time = _format_gtfs_time(dep_seconds + travel_seconds)
        trip_id = f'trip_{idx}'

        # Get stop IDs from the mapping
        dep_stop_id = coords_to_stop_id[(row[origin_lat], row[origin_lon])]
        arr_stop_id = coords_to_stop_id[(row[destination_lat], row[destination_lon])]

        # Two stop_times rows per trip; arrival and departure coincide at each stop
        trip_ids.extend([trip_id, trip_id])
        arrival_times.extend([dep_time, arr_time])
        departure_times_list.extend([dep_time, arr_time])
        stop_ids.extend([dep_stop_id, arr_stop_id])
        stop_sequences.extend([1, 2])

    stop_times = pd.DataFrame({
        'trip_id': trip_ids,
        'arrival_time': arrival_times,
        'departure_time': departure_times_list,
        'stop_id': stop_ids,
        'stop_sequence': stop_sequences
    })

    return stop_times

def create_trips(stop_times, region):
    """Create trips.txt with updated trip format

    One row is produced per distinct trip_id found in ``stop_times``; each trip gets its
    own route id ('route_<trip_id>') and the fixed service id 'WEEKDAY'.
    """
    unique_trips = stop_times['trip_id'].unique()
    print(f"Creating trips for {len(unique_trips)} unique trips...")

    route_ids = []
    headsigns = []
    region_tags = []
    for trip_id in unique_trips:
        route_ids.append(f'route_{trip_id}')
        headsigns.append(f'DRT Service {trip_id}')
        region_tags.append(region.lower())

    return pd.DataFrame({
        'route_id': route_ids,
        'service_id': ['WEEKDAY'] * len(route_ids),
        'trip_id': unique_trips,
        'trip_headsign': headsigns,
        'city': region_tags,
        'file': list(region_tags)
    })

def create_routes(trips, region):
    """Create routes.txt with updated format

    One row is produced per distinct route_id in ``trips``. Routes are numbered in order
    of first appearance for their short/long names, use route_type 3 and the fixed
    agency id 'DRT_1'.
    """
    unique_routes = trips['route_id'].unique()
    print(f"Creating routes for {len(unique_routes)} unique routes...")

    short_names = []
    long_names = []
    region_tags = []
    for number, _ in enumerate(unique_routes):
        short_names.append(f'DRT_{number}')
        long_names.append(f'DRT Service Route {number}')
        region_tags.append(region.lower())

    route_count = len(short_names)
    return pd.DataFrame({
        'route_id': unique_routes,
        'route_type': [3] * route_count,
        'route_short_name': short_names,
        'agency_id': ['DRT_1'] * route_count,
        'route_long_name': long_names,
        'city': region_tags,
        'file': list(region_tags)
    })

def create_gtfs_from_trips(trips_df, config):
    """Convert trips dataframe to GTFS format with user-defined parameters

    Parameters:
    - trips_df: DataFrame with one row per trip (coordinates, departure time, travel time)
    - config: Dict of settings. Keys read here: 'region', 'travel_time_threshold',
      'coord_columns', 'time_columns', 'service_days', 'start_date', 'end_date' and
      'agency_info'; each one has a fallback when missing.

    Returns:
    - Dict mapping GTFS table names ('stops', 'stop_times', 'trips', 'routes', 'calendar',
      'agency') to DataFrames, in that insertion order
    """
    gtfs = {}

    region = config.get('region', 'orleans')
    # 'threshold_threshold' is a misspelling of this key used by configs in this notebook;
    # honour it as a fallback so the requested column is not silently replaced by the default.
    travel_time_threshold = config.get(
        'travel_time_threshold',
        config.get('threshold_threshold', 'threshold_95')
    )
    coord_columns = config.get('coord_columns', {
        'origin_lat': 'origin_lat',
        'origin_lon': 'origin_lon',
        'destination_lat': 'destination_lat',
        'destination_lon': 'destination_lon'
    })
    time_columns = config.get('time_columns', {
        'departure_hour': 'departure_hour',
        'departure_min': 'departure_min'
    })

    print("Creating GTFS files...")

    print("Creating stops.txt...")
    stops, coords_to_stop_id = create_stops(trips_df, region, coord_columns)
    gtfs['stops'] = stops

    print("Creating stop_times.txt...")
    stop_times = create_stop_times(trips_df, travel_time_threshold, time_columns, coord_columns, coords_to_stop_id)
    gtfs['stop_times'] = stop_times

    print("Creating trips.txt...")
    trips = create_trips(stop_times, region)
    gtfs['trips'] = trips

    print("Creating routes.txt...")
    routes = create_routes(trips, region)
    gtfs['routes'] = routes

    print("Creating calendar.txt...")
    service_days = config.get('service_days', {
        'monday': 1, 'tuesday': 1, 'wednesday': 1, 'thursday': 1, 'friday': 1,
        'saturday': 0, 'sunday': 0
    })
    calendar = create_calendar(
        config.get('start_date', '20250401'),
        config.get('end_date', '20250402'),
        service_days
    )
    gtfs['calendar'] = calendar

    print("Creating agency.txt...")
    agency_info = config.get('agency_info', {
        'agency_id': 'DRT_1',
        'agency_name': 'DRT Service',
        'agency_url': 'https://example.com',
        'agency_timezone': 'Europe/Paris'
    })
    agency = create_agency(agency_info)
    gtfs['agency'] = agency

    # routes.agency_id is a foreign key into agency.txt. create_routes() writes a fixed
    # placeholder id, so align it with the agency actually emitted above.
    routes['agency_id'] = agency['agency_id'].iloc[0]

    return gtfs

def save_gtfs_files(gtfs, output_path, scenario_name):
    """Save GTFS files into a zip archive

    Parameters:
    - gtfs: Dict mapping table names (e.g. 'stops') to DataFrames; each becomes '<name>.txt'
    - output_path: Directory that receives the archive; created if it does not exist
    - scenario_name: Suffix of the archive name, 'gtfs_<scenario_name>.zip'

    Every table is serialised first and only then is the archive opened, so a failing
    table does not leave a partial zip behind. Tables are written straight into the
    archive: no scratch directory is used, so unrelated files lying in the output folder
    can neither leak into the feed nor be deleted.
    """
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    zip_path = os.path.join(output_path, f'gtfs_{scenario_name}.zip')

    print("Creating GTFS text files...")
    members = {}
    for filename, df in gtfs.items():
        members[f"{filename}.txt"] = df.to_csv(index=False)
        print(f"- Created {filename}.txt with {len(df)} rows")

    print("Creating zip archive...")
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for member_name, csv_text in members.items():
            # writestr() encodes str data as UTF-8
            zipf.writestr(member_name, csv_text)

    print(f"GTFS feed successfully saved to: {zip_path}")

# Example: Creating a small sample dataset for testing
# Replace this with your actual data loading
def create_sample_data(num_trips=10):
    """Create a small sample dataset for testing"""
    np.random.seed(42)
    
    # Create random coordinates
    origins = [(47.90 + np.random.random()*0.1, 1.90 + np.random.random()*0.1) for _ in range(5)]
    destinations = [(47.95 + np.random.random()*0.1, 1.95 + np.random.random()*0.1) for _ in range(5)]
    
    data = []
    for i in range(num_trips):
        origin = origins[np.random.randint(0, len(origins))]
        destination = destinations[np.random.randint(0, len(destinations))]
        
        trip = {
            'origin_lat': origin[0],
            'origin_lon': origin[1],
            'destination_lat': destination[0],
            'destination_lon': destination[1],
            'departure_hour': np.random.randint(7, 20),
            'departure_min': np.random.randint(0, 60),
            'travel_time_25': np.random.randint(500, 1500),
            'travel_time_50': np.random.randint(600, 1800),
            'travel_time_70': np.random.randint(700, 2100),
            'travel_time_95': np.random.randint(800, 2400)
        }
        data.append(trip)
    
    return pd.DataFrame(data)

# ------------------------------------------------------------------
# Run the GTFS generation process
# ------------------------------------------------------------------
# To run again with different settings, edit the `config` dictionary
# defined above (or override individual keys, as in the commented
# examples after this block) and re-run this cell.
# Build the GTFS feed from the predictions CSV and write it as a zip archive.
if __name__ == "__main__":
    # Load your trips DataFrame
    # (the CSV at trips_output_path_generated_drt_trips; `config` names the columns read from it)
    trips_df = pd.read_csv(trips_output_path_generated_drt_trips)
    
    # Create GTFS files
    # Returns a dict of DataFrames, one per GTFS table.
    gtfs = create_gtfs_from_trips(trips_df, config)
    
    # Save GTFS files
    # The destination folder and archive suffix are passed explicitly here; the
    # 'output_path' and 'scenario_name' entries of `config` are not consulted by this call.
    save_gtfs_files(gtfs, directory_GTFS, f"{scenario_name}_{percentile_travel_time}p")

# # Update configuration as needed
# config['region'] = 'paris'
# config['travel_time_threshold'] = 'travel_time_50'
# config['scenario_name'] = 'paris_scenario'
# config['service_days']['saturday'] = 1  # Add Saturday service

Creating GTFS files...
Creating stops.txt...
Creating unified stops...
Created 11 unique stops
Creating stop_times.txt...
Creating stop times for 41230 trips...
Creating trips.txt...
Creating trips for 41230 unique trips...
Creating routes.txt...
Creating routes for 41230 unique routes...
Creating calendar.txt...
Creating agency.txt...
Creating GTFS text files...
- Created stops.txt with 11 rows
- Created stop_times.txt with 82460 rows
- Created trips.txt with 41230 rows
- Created routes.txt with 41230 rows
- Created calendar.txt with 1 rows
- Created agency.txt with 1 rows
Creating zip archive...
GTFS feed successfully saved to: ./GTFS/orleans/DRT_SF/gtfs_DRT_95p.zip
Cleaning up temporary files...


### Merge GTFS

In [None]:
# Disabled usage examples, kept as an unevaluated string literal (nothing below is executed).
# NOTE(review): the calls do not match the signatures defined in this notebook --
# create_gtfs_from_trips takes (trips_df, config) and save_gtfs_files takes
# (gtfs, output_path, scenario_name). `gtfs_data_95`, `joinGTFS` and `exportGTFS` are not
# defined in the visible part of the notebook; confirm or update before re-enabling.
'''
# Using default threshold (travel_time_25)
gtfs_data = create_gtfs_from_trips(data,"travel_time_25")

# Or specifying a different threshold
gtfs_data_25 = create_gtfs_from_trips(data, travel_time_threshold='travel_time_25')

# Save with default threshold
save_gtfs_files(gtfs_data, "path/to/output")

# Or save with specific threshold
save_gtfs_files(gtfs_data_95, "path/to/output", travel_time_threshold='travel_time_25')

# For joining and exporting GTFS files:
merged_gtfs = joinGTFS(gtfs_dataset1, gtfs_dataset2)
exportGTFS("path/to/output", merged_gtfs)  # These functions don't need the threshold parameter'
'''