In [1]:
#!/usr/bin/env python
# coding: utf-8

import sys
import os
import platform
import logging
import trino
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
from scipy import stats

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.table import Table



# Root logger: INFO level, timestamped "<time> - <level> - <message>" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Show which interpreter executed the notebook.
print(f"Using Python version: {platform.python_version()}")

Using Python version: 3.11.13


In [2]:
def get_vehicle_data(start_time=None, end_time=None, vehicle_ids=None):
    """
    Fetch CAN data for the requested vehicles from Trino and impute odometer gaps.

    One query is run against ``facts_prod.can_parsed_output_100``. The rows are
    loaded into a DataFrame, sorted by ``id`` and ``IST``, re-indexed from zero
    and passed through ``impute_odometer_readings``.

    Args:
        start_time: Start time in 'YYYY-MM-DD HH:MM:SS' format (optional).
        end_time: End time in 'YYYY-MM-DD HH:MM:SS' format (optional).
            Unless BOTH bounds are given, the built-in default window
            ('2025-10-07' up to, but excluding, '2025-10-09') is used.
        vehicle_ids: Vehicle IDs to filter on (optional). A single string is
            treated as one ID; ``None`` means no vehicle filter is applied.

    Returns:
        DataFrame with the selected CAN columns plus ``IST`` and ``ts_mins``,
        after odometer imputation.

    Raises:
        ValueError: If ``start_time`` / ``end_time`` do not match the
            'YYYY-MM-DD HH:MM:SS' format.
    """
    time_format = '%Y-%m-%d %H:%M:%S'

    # Build the vehicle filter. Values are embedded as SQL string literals, so
    # single quotes are doubled to keep a stray quote from breaking the query.
    if vehicle_ids is None:
        id_filter = "TRUE"
    else:
        if isinstance(vehicle_ids, str):
            vehicle_ids = [vehicle_ids]
        escaped_ids = [str(vehicle_id).replace("'", "''") for vehicle_id in vehicle_ids]
        vehicle_ids_str = "', '".join(escaped_ids)
        id_filter = f"id IN ('{vehicle_ids_str}')"

    # Build the time filter. Parsing validates the caller's strings (raising
    # ValueError on a bad format) and re-formatting guarantees that only a
    # well-formed timestamp literal reaches the SQL text.
    if start_time and end_time:
        start_literal = datetime.strptime(start_time, time_format).strftime(time_format)
        end_literal = datetime.strptime(end_time, time_format).strftime(time_format)
    else:
        if start_time or end_time:
            logging.warning(
                "Only one of start_time/end_time was provided; "
                "falling back to the default time range."
            )
        # Default time range if not provided
        start_literal = '2025-10-07'
        end_literal = '2025-10-09'

    # NOTE(review): the bounds are cast, moved to Asia/Kolkata and shifted back by
    # 5h30m - presumably to express IST wall-clock bounds against the stored
    # timestamps; confirm against the column's actual type/time zone.
    time_filter = f"""
        AND timestamp >= CAST('{start_literal}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('{end_literal}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """

    # Query: Fetch data from can_parsed_output_100
    query_cpo100 = f"""
    SELECT 
        id, timestamp, dt, 
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS IST,
        date_trunc('minute', timestamp) as ts_mins,  -- Truncate to minutes
        total_battery_current, bat_voltage, gear_position, odometerreading, 
        round(vehicle_speed_vcu,2) as vehicle_speed_vcu,
        vehiclereadycondition, ignitionstatus, gun_connection_status, brakepedalpos
    FROM 
        facts_prod.can_parsed_output_100
    WHERE 
        {id_filter}
        {time_filter}
    """

    # Connect to Trino; the cursor and connection are always released, even
    # when the query fails.
    conn = trino.dbapi.connect(
        host="trino",
        port=8080,
        user="admin",
        catalog="adhoc",
        schema="default"
    )
    try:
        cur = conn.cursor()
        try:
            cur.execute(query_cpo100)
            columns_cpo100 = [desc[0] for desc in cur.description]
            rows_cpo100 = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    df_cpo100 = pd.DataFrame(rows_cpo100, columns=columns_cpo100)

    # Order rows per vehicle chronologically and give them a clean 0..n-1 index
    final_df = df_cpo100.sort_values(['id', 'IST']).reset_index(drop=True)

    return impute_odometer_readings(final_df)

In [3]:
def _fill_vehicle_odometer(vehicle_df):
    """Fill missing odometer readings for ONE vehicle whose rows are sorted by IST."""
    readings = vehicle_df['odometerreading']
    timestamps = pd.to_datetime(vehicle_df['IST'])

    has_time = timestamps.notna().to_numpy()
    missing = readings.isna().to_numpy()
    known = ~missing & has_time
    to_fill = missing & has_time

    # Nothing to anchor on, or nothing to fill: leave the rows untouched.
    if not known.any() or not to_fill.any():
        return vehicle_df

    # Seconds relative to the first known reading (keeps the floats small/precise).
    origin = timestamps.iloc[np.flatnonzero(known)[0]]
    elapsed = (timestamps - origin).dt.total_seconds().to_numpy()

    known_seconds = elapsed[known]
    known_values = readings.to_numpy()[known].astype(float)

    # np.interp expects increasing x-coordinates; when several known readings
    # share one timestamp only the last of them is kept as an anchor.
    last_of_run = np.append(np.diff(known_seconds) > 0, True)

    # np.interp interpolates linearly between anchors and returns the first/last
    # anchor value outside their range, which gives the required edge fill.
    filled = np.interp(elapsed[to_fill], known_seconds[last_of_run], known_values[last_of_run])

    # Positional write: only rows that were missing are modified.
    column_pos = vehicle_df.columns.get_loc('odometerreading')
    vehicle_df.iloc[np.flatnonzero(to_fill), column_pos] = filled
    return vehicle_df


def impute_odometer_readings(df):
    """
    Fill missing ``odometerreading`` values by time-based linear interpolation.

    Each vehicle (rows grouped by ``id``) is handled independently, ordered by ``IST``:

    1. A gap between two known readings is filled proportionally to the time
       elapsed between those two readings.
    2. Rows before the first known reading take the first known value.
    3. Rows after the last known reading take the last known value.

    Vehicle speed is not used. A vehicle without any known reading is left as is.

    Args:
        df (pd.DataFrame): DataFrame containing vehicle data with columns:
            - id: Vehicle identifier
            - IST: Timestamp
            - odometerreading: Odometer reading (may contain missing values)

    Returns:
        pd.DataFrame: New DataFrame (the input is not modified) with the rows of
        each vehicle sorted by ``IST`` and the odometer gaps filled. Original
        index labels are preserved.
    """
    logging.info("⚙️ Starting odometer reading imputation...")

    # Check if DataFrame is empty
    if df.empty:
        logging.warning("Input DataFrame is empty. Returning empty DataFrame.")
        return df.copy()

    # sort_values returns a new frame, so the caller's data is never written to.
    imputed_dfs = [
        _fill_vehicle_odometer(vehicle_df.sort_values('IST'))
        for _, vehicle_df in df.groupby('id')
    ]

    # groupby drops rows whose id is missing, so the list can be empty
    if not imputed_dfs:
        logging.warning("No vehicle data to process. Returning empty DataFrame.")
        return pd.DataFrame(columns=df.columns)

    # Combine all the imputed DataFrames
    result_df = pd.concat(imputed_dfs)

    logging.info(f"✅ Odometer imputation completed. {result_df['odometerreading'].isna().sum()} missing values remaining.")

    return result_df

In [4]:
def create_max_bpp_buckets(max_bpp_values, top_speed):
    """
    Count max brake-pedal-position (BPP) values in 5-wide buckets and find the mode.

    NOTE: a second definition of this function appears later in the notebook and
    overrides this one at run time; keep the two in sync or remove one of them.

    Args:
        max_bpp_values (list): List of max_bpp values from braking events.
        top_speed (float): The top speed threshold used for filtering. It is
            only echoed back in the result; it does not affect the buckets.

    Returns:
        dict: ``{}`` for empty input, otherwise a dictionary with
            - 'buckets': ``{"lo-hi": count}`` for every 5-wide range
              (``lo <= value < hi``) from the one holding the smallest value
              to the one holding the largest value.
            - 'mode_bucket': label of the fullest bucket (first one on ties).
            - 'mode_value': most frequent raw value, or ``None`` when it
              cannot be computed.
            - 'top_speed': the ``top_speed`` argument.
    """
    if max_bpp_values is None or len(max_bpp_values) == 0:
        return {}

    bucket_width = 5

    # Lower edges of the first and last bucket, aligned to multiples of 5
    first_lower = int(min(max_bpp_values) // bucket_width) * bucket_width
    last_lower = int(max(max_bpp_values) // bucket_width) * bucket_width

    # One entry per bucket between the extremes (no empty bucket past the maximum)
    buckets = {
        f"{lower}-{lower + bucket_width}": 0
        for lower in range(first_lower, last_lower + bucket_width, bucket_width)
    }

    # Assign values to buckets
    for value in max_bpp_values:
        lower = int(value // bucket_width) * bucket_width
        buckets[f"{lower}-{lower + bucket_width}"] += 1

    # Calculate mode. Depending on the SciPy version ``.mode`` is either a
    # scalar or a length-1 array; np.atleast_1d handles both.
    try:
        mode_value = np.atleast_1d(stats.mode(max_bpp_values).mode)[0]
    except (TypeError, ValueError, IndexError) as e:
        logging.warning(f"Error calculating mode: {e}")
        mode_value = None

    # Find the bucket with the most events (mode bucket)
    mode_bucket = max(buckets, key=buckets.get)

    return {
        'buckets': buckets,
        'mode_bucket': mode_bucket,
        'mode_value': mode_value,
        'top_speed': top_speed
    }

In [5]:
def analyze_filtered_braking_events(df, top_speed=30.0, search_window_seconds=10.0):
    """
    Find braking events that end in a full stop.

    A "hard stop" is a row whose speed is 0 while the previous row (in time
    order) had a speed above 0. For every hard stop the function looks back
    ``search_window_seconds`` and takes the first row with ``brakepedalpos > 0``
    as the start of the event. The event (start row through stop row) is kept
    only if its highest speed is at least ``top_speed``.

    The input frame is not modified.

    Args:
        df (pd.DataFrame): DataFrame with vehicle data. Must contain 'timestamp',
                           'brakepedalpos', and 'vehicle_speed_vcu' columns.
                           'timestamp' may hold datetimes (naive values are
                           taken as UTC) or milliseconds since epoch.
        top_speed (float): The minimum peak speed (km/h) for an event to be considered.
        search_window_seconds (float): The time window before a hard stop to search
                                      for the initial brake press.

    Returns:
        tuple: A tuple containing:
            - list of pd.DataFrame: One DataFrame per braking event with the
                                    columns 'IST', 'brakepedalpos',
                                    'vehicle_speed_vcu', 'IST_formatted_string'.
            - list of float: A list of the durations (in seconds) for each event.
            - dict: Result of ``create_max_bpp_buckets`` for the events' max
                    brake pedal positions.
        All three are empty when no event qualifies.
    """
    if df.empty:
        logging.warning("Input DataFrame is empty. No events to analyze.")
        return [], [], {}

    # Convert the timestamps to tz-aware IST without touching the caller's frame.
    raw_timestamps = df['timestamp']
    is_datetime_like = (
        pd.api.types.is_datetime64_any_dtype(raw_timestamps)
        or pd.api.types.infer_dtype(raw_timestamps, skipna=True) in ('datetime', 'datetime64')
    )
    if is_datetime_like:
        # utc=True localizes naive values as UTC and converts tz-aware ones
        utc_times = pd.to_datetime(raw_timestamps, utc=True)
    else:
        # Anything else is treated as milliseconds since epoch
        utc_times = pd.to_datetime(raw_timestamps, unit='ms', utc=True)

    samples = pd.DataFrame({
        'IST': utc_times.dt.tz_convert('Asia/Kolkata'),
        'brakepedalpos': df['brakepedalpos'],
        'vehicle_speed_vcu': df['vehicle_speed_vcu'],
    })
    # Rows without a time, speed or pedal value cannot take part in an event.
    samples = samples.dropna(subset=['IST', 'vehicle_speed_vcu', 'brakepedalpos'])
    # Stable sort keeps the incoming order of rows that share a timestamp.
    samples = samples.sort_values(by='IST', kind='stable')
    samples['IST_formatted_string'] = samples['IST'].dt.strftime('%H:%M:%S')

    # Identify hard stop events (speed becomes 0 from a non-zero value)
    speed = samples['vehicle_speed_vcu']
    hard_stop_mask = (speed == 0.0) & (speed.shift(1) > 0.0)
    hard_stop_times = samples.loc[hard_stop_mask, 'IST']

    if hard_stop_times.empty:
        logging.info("No hard stop events found in the DataFrame.")
        return [], [], {}

    all_event_data = []
    event_durations = []
    max_bpp_values = []  # Collect max_bpp values for bucketing

    # 'IST' is sorted, so each time window maps to a contiguous positional
    # slice found by binary search instead of a scan over the whole frame.
    ist_sorted = samples['IST']
    search_window = pd.Timedelta(seconds=search_window_seconds)

    for end_time in hard_stop_times:
        window_start = ist_sorted.searchsorted(end_time - search_window, side='left')
        window_stop = ist_sorted.searchsorted(end_time, side='right')
        search_segment = samples.iloc[window_start:window_stop]

        # First row in the window where the brake pedal is pressed
        pressed = search_segment[search_segment['brakepedalpos'] > 0.0]
        if pressed.empty:
            continue

        # The event runs from that first press (inclusive) to the stop (inclusive)
        start_time = pressed['IST'].iloc[0]
        event_start = ist_sorted.searchsorted(start_time, side='left')
        event_segment = samples.iloc[event_start:window_stop].copy()

        # Keep the event only if it reached the required top speed
        if event_segment.empty or not (event_segment['vehicle_speed_vcu'].max() >= top_speed):
            continue

        all_event_data.append(event_segment)
        event_durations.append((end_time - start_time).total_seconds())
        max_bpp_values.append(event_segment['brakepedalpos'].max())

    if not all_event_data:
        logging.info(f"No events found that reached a top speed of {top_speed} km/h or greater and had a brake press within {search_window_seconds} seconds of the stop.")
        return [], [], {}

    # Create max_bpp buckets
    bucket_info = create_max_bpp_buckets(max_bpp_values, top_speed)

    return all_event_data, event_durations, bucket_info

In [6]:
def create_max_bpp_buckets(max_bpp_values, top_speed):
    """
    Count max brake-pedal-position (BPP) values in 5-wide buckets and find the mode.

    NOTE: this function is also defined earlier in the notebook; this later
    definition is the one in effect at run time. Keep the two in sync or
    remove one of them.

    Args:
        max_bpp_values (list): List of max_bpp values from braking events.
        top_speed (float): The top speed threshold used for filtering. It is
            only echoed back in the result; it does not affect the buckets.

    Returns:
        dict: ``{}`` for empty input, otherwise a dictionary with
            - 'buckets': ``{"lo-hi": count}`` for every 5-wide range
              (``lo <= value < hi``) from the one holding the smallest value
              to the one holding the largest value.
            - 'mode_bucket': label of the fullest bucket (first one on ties).
            - 'mode_value': most frequent raw value, or ``None`` when it
              cannot be computed.
            - 'top_speed': the ``top_speed`` argument.
    """
    if max_bpp_values is None or len(max_bpp_values) == 0:
        return {}

    bucket_width = 5

    # Lower edges of the first and last bucket, aligned to multiples of 5
    first_lower = int(min(max_bpp_values) // bucket_width) * bucket_width
    last_lower = int(max(max_bpp_values) // bucket_width) * bucket_width

    # One entry per bucket between the extremes (no empty bucket past the maximum)
    buckets = {
        f"{lower}-{lower + bucket_width}": 0
        for lower in range(first_lower, last_lower + bucket_width, bucket_width)
    }

    # Assign values to buckets
    for value in max_bpp_values:
        lower = int(value // bucket_width) * bucket_width
        buckets[f"{lower}-{lower + bucket_width}"] += 1

    # Calculate mode. Depending on the SciPy version ``.mode`` is either a
    # scalar or a length-1 array; np.atleast_1d handles both.
    try:
        mode_value = np.atleast_1d(stats.mode(max_bpp_values).mode)[0]
    except (TypeError, ValueError, IndexError) as e:
        logging.warning(f"Error calculating mode: {e}")
        mode_value = None

    # Find the bucket with the most events (mode bucket)
    mode_bucket = max(buckets, key=buckets.get)

    return {
        'buckets': buckets,
        'mode_bucket': mode_bucket,
        'mode_value': mode_value,
        'top_speed': top_speed
    }

In [7]:
def compute_event_metrics(event_df, duration):
    """
    Compute metrics for a single braking event.

    Args:
        event_df (pd.DataFrame): DataFrame for a single braking event, ordered in
            time, with 'vehicle_speed_vcu' (km/h) and 'IST' (timestamps) columns.
            It is not modified.
        duration (float): Duration of the event in seconds.

    Returns:
        dict: Dictionary containing computed metrics:
            - 'start_velocity': speed of the first row (km/h).
            - 'total_time': the ``duration`` argument.
            - 'total_distance': distance covered in meters, i.e. speed
              integrated over time with the trapezoidal rule.
            - 'deceleration': start velocity (m/s) divided by ``duration``;
              0 when ``duration`` is not positive.
    """
    kmh_to_mps = 5 / 18  # 1 km/h = 1000 m / 3600 s

    # Start velocity (km/h) - first row
    start_velocity = event_df['vehicle_speed_vcu'].iloc[0]

    speed_mps = event_df['vehicle_speed_vcu'].astype(float) * kmh_to_mps
    # Seconds between each row and the previous one (NaN for the first row)
    step_seconds = event_df['IST'].diff().dt.total_seconds()

    # Trapezoidal rule: each interval contributes the mean of the speeds at its
    # two ends. Using only the end-of-interval speed would ignore the initial
    # (highest) speed and underestimate the braking distance.
    mean_step_speed = (speed_mps + speed_mps.shift(1)) / 2
    total_distance = (mean_step_speed * step_seconds).sum()  # first-row NaN is skipped

    # Deceleration: (start_velocity in m/s) / total_time
    start_velocity_mps = float(start_velocity) * kmh_to_mps
    deceleration = start_velocity_mps / duration if duration > 0 else 0

    return {
        'start_velocity': start_velocity,
        'total_time': duration,
        'total_distance': total_distance,
        'deceleration': deceleration
    }

In [8]:
def generate_bucket_statistics(braking_events, event_durations, bucket_info):
    """
    Summarise braking events per max-brake-pedal-position (BPP) bucket.

    Every event is assigned to a 5-wide bucket by its highest 'brakepedalpos'.
    For each bucket the event count and the median, 95th percentile and 99th
    percentile of start velocity, total time, total distance and deceleration
    are computed (metrics come from ``compute_event_metrics``).

    Args:
        braking_events (list): List of DataFrames, each representing a braking event.
        event_durations (list): List of durations for each event (same length
            and order as ``braking_events``).
        bucket_info (dict): Dictionary containing bucket information. Accepted
            for interface compatibility; it is not used by the computation.

    Returns:
        tuple: Three DataFrames (median, 95th percentile, 99th percentile), each
        with the columns 'bucket', 'total_events' and one '<prefix>_<metric>'
        column per metric, rounded to 2 decimals and ordered by the numeric
        lower edge of the bucket. All three are empty when there are no events.

    Raises:
        ValueError: If ``braking_events`` and ``event_durations`` differ in length.
    """
    metric_names = ['start_velocity', 'total_time', 'total_distance', 'deceleration']
    summaries = [
        ('median', 'median'),
        ('p95', lambda x: np.percentile(x, 95)),
        ('p99', lambda x: np.percentile(x, 99)),
    ]

    if len(braking_events) != len(event_durations):
        raise ValueError("braking_events and event_durations must have the same length")

    # No events: return correctly shaped empty tables instead of failing in groupby
    if len(braking_events) == 0:
        return tuple(
            pd.DataFrame(columns=['bucket', 'total_events'] + [f"{prefix}_{m}" for m in metric_names])
            for prefix, _ in summaries
        )

    event_data = []
    for event_df, duration in zip(braking_events, event_durations):
        # Bucket by the event's highest brake pedal position
        max_bpp = event_df['brakepedalpos'].max()
        metrics = compute_event_metrics(event_df, duration)
        event_data.append({
            'bucket_start': int(max_bpp // 5) * 5,
            'start_velocity': metrics['start_velocity'],
            'total_time': metrics['total_time'],
            'total_distance': metrics['total_distance'],
            'deceleration': metrics['deceleration']
        })

    # Group on the numeric lower edge so buckets come out in numeric order
    # (string labels would sort "10-15" before "5-10").
    grouped = pd.DataFrame(event_data).groupby('bucket_start', sort=True)

    def summarise(prefix, reducer):
        """Aggregate every metric with ``reducer`` and label the buckets."""
        named_aggs = {'total_events': ('start_velocity', 'count')}
        for metric in metric_names:
            named_aggs[f"{prefix}_{metric}"] = (metric, reducer)
        table = grouped.agg(**named_aggs).reset_index()
        table.insert(0, 'bucket', [f"{lower}-{lower + 5}" for lower in table['bucket_start']])
        return table.drop(columns='bucket_start').round(2)

    df_median, df_95, df_99 = (summarise(prefix, reducer) for prefix, reducer in summaries)
    return df_median, df_95, df_99

In [9]:
def generate_start_velocity_bucket_statistics(braking_events, event_durations):
    """
    Summarise braking events per start-velocity bucket (5 km/h wide).

    Every event is assigned to a bucket by the speed of its first row. For each
    bucket between the lowest and highest start velocity (empty buckets
    included) the event count and the median, 95th percentile and 99th
    percentile of max brake pedal position, total time, total distance and
    deceleration are computed (metrics come from ``compute_event_metrics``).

    Args:
        braking_events (list): List of DataFrames, each representing a braking event.
        event_durations (list): List of durations for each event (same length
            and order as ``braking_events``).

    Returns:
        tuple: Three DataFrames (median, 95th percentile, 99th percentile), each
        with the columns 'bucket', 'total_events' and one '<prefix>_<metric>'
        column per metric, rounded to 2 decimals. All three are empty when
        there is no event with a usable start velocity.

    Raises:
        ValueError: If ``braking_events`` and ``event_durations`` differ in length.
    """
    metric_names = ['max_bpp', 'total_time', 'total_distance', 'deceleration']

    def percentile_of(p):
        """Percentile reducer that returns NaN for a bucket without events."""
        return lambda x: np.percentile(x, p) if len(x) > 0 else np.nan

    summaries = [('median', 'median'), ('p95', percentile_of(95)), ('p99', percentile_of(99))]

    def empty_tables():
        return tuple(
            pd.DataFrame(columns=['bucket', 'total_events'] + [f"{prefix}_{m}" for m in metric_names])
            for prefix, _ in summaries
        )

    if len(braking_events) != len(event_durations):
        raise ValueError("braking_events and event_durations must have the same length")
    if len(braking_events) == 0:
        return empty_tables()

    event_data = []
    for event_df, duration in zip(braking_events, event_durations):
        metrics = compute_event_metrics(event_df, duration)
        event_data.append({
            'start_velocity': metrics['start_velocity'],
            'max_bpp': event_df['brakepedalpos'].max(),
            'total_time': metrics['total_time'],
            'total_distance': metrics['total_distance'],
            'deceleration': metrics['deceleration']
        })

    # Events without a start velocity cannot be bucketed
    df_events = pd.DataFrame(event_data).dropna(subset=['start_velocity'])
    if df_events.empty:
        return empty_tables()

    # Bucket edges: multiples of 5 enclosing the observed start velocities
    bucket_start = int(df_events['start_velocity'].min() // 5) * 5
    bucket_end = int(df_events['start_velocity'].max() // 5) * 5 + 5
    lower_edges = list(range(bucket_start, bucket_end, 5))
    bucket_labels = [f"{lower}-{lower + 5}" for lower in lower_edges]

    # Half-open bins [lower, lower + 5); the result is categorical so buckets
    # without events still show up in the tables.
    df_events = df_events.assign(
        bucket=pd.cut(
            df_events['start_velocity'],
            bins=lower_edges + [bucket_end],
            labels=bucket_labels,
            right=False
        )
    ).dropna(subset=['bucket'])

    grouped = df_events.groupby('bucket', observed=False)

    def summarise(prefix, reducer):
        """Aggregate every metric with ``reducer`` for each bucket."""
        named_aggs = {'total_events': ('start_velocity', 'count')}
        for metric in metric_names:
            named_aggs[f"{prefix}_{metric}"] = (metric, reducer)
        return grouped.agg(**named_aggs).reset_index().round(2)

    df_median, df_95, df_99 = (summarise(prefix, reducer) for prefix, reducer in summaries)
    return df_median, df_95, df_99

In [10]:
def generate_bucket_pdf_report(bucket_type, df_median, df_95, df_99, output_filename,
                               start_time=None, end_time=None):
    """
    Generate a PDF report with bucket statistics tables.

    The PDF has a summary page followed by one page per statistics table
    (median, 95th percentile, 99th percentile).

    Args:
        bucket_type (str): Type of bucket (e.g., "Max Brake Pedal Position" or "Start Velocity")
        df_median (pd.DataFrame): DataFrame with median statistics
        df_95 (pd.DataFrame): DataFrame with 95th percentile statistics
        df_99 (pd.DataFrame): DataFrame with 99th percentile statistics
        output_filename (str): Path to save the PDF file
        start_time (str, optional): Start of the analysed period; only the part
            before the first space is printed. When omitted, the notebook-level
            variable ``start_time`` is used if it exists, otherwise "N/A".
        end_time (str, optional): End of the analysed period, handled like
            ``start_time``.
    """
    # Prefer explicit arguments; fall back to the notebook globals that earlier
    # versions of this function read directly.
    if start_time is None:
        start_time = globals().get('start_time')
    if end_time is None:
        end_time = globals().get('end_time')

    def date_only(value):
        """Date part of a 'YYYY-MM-DD HH:MM:SS' value, or "N/A" if unknown."""
        return str(value).split()[0] if value else "N/A"

    # Create output directory if it doesn't exist (a bare file name has no
    # directory part, and os.makedirs('') would raise)
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Describe the metrics that are actually in the tables, so the summary is
    # correct for both bucket types.
    metric_labels = {
        'max_bpp': "Max Brake Pedal Position (BPP)",
        'start_velocity': "Start Velocity (km/h)",
        'total_time': "Total Time (seconds)",
        'total_distance': "Total Distance (meters)",
        'deceleration': "Deceleration (m/s²)",
    }
    metric_lines = []
    for column in df_median.columns:
        column = str(column)
        if column in ('bucket', 'total_events'):
            continue
        metric = column[len('median_'):] if column.startswith('median_') else column
        metric_lines.append(f"          * {metric_labels.get(metric, metric)}")
    metrics_block = "\n".join(metric_lines)

    with PdfPages(output_filename) as pdf:
        # Create a summary page
        fig_summary = plt.figure(figsize=(12, 8))
        try:
            ax_summary = fig_summary.add_subplot(111)
            ax_summary.axis('off')

            # Add title
            fig_summary.suptitle(f"{bucket_type} Bucket Analysis Report", fontsize=18, y=0.95)

            # Create summary text
            summary_text = f"""
        Report Summary:
        - Bucket Type: {bucket_type}
        - Total Buckets: {len(df_median)}
        - Date Range: {date_only(start_time)} to {date_only(end_time)}
        
        This report contains three tables showing:
        1. Median statistics for each bucket
        2. 95th percentile statistics for each bucket
        3. 99th percentile statistics for each bucket
        
        Each table shows:
        - Bucket range
        - Total events in the bucket
        - Median/95th/99th percentile values for:
{metrics_block}
        """

            ax_summary.text(0.5, 0.5, summary_text,
                            ha='center', va='center',
                            fontsize=12,
                            bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="black", lw=1))

            pdf.savefig(fig_summary, bbox_inches='tight')
        finally:
            # Always release the figure, even if rendering/saving failed
            plt.close(fig_summary)

        # Create pages for each statistics table
        dataframes = [
            (df_median, "Median Statistics"),
            (df_95, "95th Percentile Statistics"),
            (df_99, "99th Percentile Statistics")
        ]

        for df, title in dataframes:
            fig_table = plt.figure(figsize=(16, 10))
            try:
                ax_table = fig_table.add_subplot(111)
                ax_table.axis('off')

                if df.empty:
                    # A frame without rows cannot be drawn as a table
                    ax_table.text(0.5, 0.5, "No data available",
                                  ha='center', va='center', fontsize=12)
                else:
                    # Create table
                    table = ax_table.table(
                        cellText=df.values,
                        colLabels=df.columns,
                        cellLoc='center',
                        loc='center',
                        colColours=['#f3f3f3']*len(df.columns)
                    )

                    # Style the table
                    table.auto_set_font_size(False)
                    table.set_fontsize(10)
                    table.scale(1, 1.5)

                    # Highlight header row (row 0 holds the column labels)
                    for i in range(len(df.columns)):
                        table[(0, i)].set_facecolor('#40466e')
                        table[(0, i)].set_text_props(weight='bold', color='white')

                # Add title
                ax_table.set_title(f"{bucket_type} - {title}", fontsize=16, pad=20)

                # Adjust layout
                fig_table.tight_layout()

                # Save to PDF
                pdf.savefig(fig_table, bbox_inches='tight')
            finally:
                plt.close(fig_table)

    print(f"PDF report saved as {output_filename}")

In [11]:
# Run configuration: analysis window and vehicles to process.
# Times use the 'YYYY-MM-DD HH:MM:SS' format.
# (A shorter window ending '2025-08-04 00:00:00' and the vehicle subset
#  ['11', '13'] were used as alternatives.)
start_time = '2025-08-01 00:00:00'
end_time = '2025-10-15 23:59:59'

# Add all vehicle IDs you want to analyze
vehicle_ids = [
    '9',
    '7',
    '11',
    '13',
    '14',
    '15',
]

In [None]:
# Max-BPP bucket analysis: fetch, analyse and report one vehicle at a time
for vehicle_id in vehicle_ids:
    print(f"\nProcessing vehicle {vehicle_id}...")

    # Step 1 - load this vehicle's data; nothing to do when the query is empty
    vehicle_data_df = get_vehicle_data(start_time, end_time, [vehicle_id])
    if vehicle_data_df.empty:
        print(f"No data found for vehicle {vehicle_id}. Skipping.")
        continue

    # Step 2 - extract braking events (>= 25 km/h, brake press within 15 s of the stop)
    braking_events, event_durations, bucket_info = analyze_filtered_braking_events(
        df=vehicle_data_df,
        top_speed=25.0,
        search_window_seconds=15.0,
    )
    if not braking_events:
        print(f"No braking events found for vehicle {vehicle_id} matching the criteria.")
        continue

    # Step 3 - per-bucket median / p95 / p99 tables
    df_median, df_95, df_99 = generate_bucket_statistics(braking_events, event_durations, bucket_info)

    # Step 4 - console summary
    print(f"\nResults for Vehicle {vehicle_id}:")
    print(f"Total braking events found: {len(braking_events)}")
    print(f"Date Range: {start_time.split()[0]} to {end_time.split()[0]}")

    # Bucket counts and mode, when available
    if bucket_info:
        print("\nMax Brake Pedal Position (BPP) Buckets:")
        for bucket, count in bucket_info['buckets'].items():
            print(f"  {bucket}%: {count} events")

        if bucket_info['mode_bucket']:
            print(f"\nMode Bucket: {bucket_info['mode_bucket']}%")
        if bucket_info['mode_value']:
            print(f"Mode Value: {bucket_info['mode_value']:.2f}%")

    # The three statistics tables, each under its own heading
    print("\nMedian Statistics by Bucket:")
    display(df_median)
    print("\n95th Percentile Statistics by Bucket:")
    display(df_95)
    print("\n99th Percentile Statistics by Bucket:")
    display(df_99)

    # Step 5 - PDF report for the max_bpp buckets
    generate_bucket_pdf_report(
        bucket_type="Max Brake Pedal Position",
        df_median=df_median,
        df_95=df_95,
        df_99=df_99,
        output_filename=f"reports/max_bpp_bucket/vehicle_{vehicle_id}_max_bpp_bucket_report.pdf",
    )


Processing vehicle 9...


2025-10-22 17:53:02 - INFO - ⚙️ Starting odometer reading imputation...


In [None]:
logging.info("Execute Velocity bucketing review")

# Start-velocity bucket analysis: fetch, analyse and report one vehicle at a time
for vehicle_id in vehicle_ids:
    print(f"\nProcessing vehicle {vehicle_id}...")

    # Step 1 - load this vehicle's data; nothing to do when the query is empty
    vehicle_data_df = get_vehicle_data(start_time, end_time, [vehicle_id])
    if vehicle_data_df.empty:
        print(f"No data found for vehicle {vehicle_id}. Skipping.")
        continue

    # Step 2 - extract braking events (>= 25 km/h, brake press within 15 s of the stop)
    braking_events, event_durations, bucket_info = analyze_filtered_braking_events(
        df=vehicle_data_df,
        top_speed=25.0,
        search_window_seconds=15.0,
    )
    if not braking_events:
        print(f"No braking events found for vehicle {vehicle_id} matching the criteria.")
        continue

    # Step 3 - per-start-velocity-bucket median / p95 / p99 tables
    df_median, df_95, df_99 = generate_start_velocity_bucket_statistics(braking_events, event_durations)

    # Step 4 - console summary
    print(f"\nResults for Vehicle {vehicle_id}:")
    print(f"Total braking events found: {len(braking_events)}")
    print(f"Date Range: {start_time.split()[0]} to {end_time.split()[0]}")

    # Bucket counts and mode, when available
    if bucket_info:
        print("\nMax Brake Pedal Position (BPP) Buckets:")
        for bucket, count in bucket_info['buckets'].items():
            print(f"  {bucket}%: {count} events")

        if bucket_info['mode_bucket']:
            print(f"\nMode Bucket: {bucket_info['mode_bucket']}%")
        if bucket_info['mode_value']:
            print(f"Mode Value: {bucket_info['mode_value']:.2f}%")

    # The three statistics tables, each under its own heading
    print("\nMedian Statistics by Start Velocity Bucket:")
    display(df_median)
    print("\n95th Percentile Statistics by Start Velocity Bucket:")
    display(df_95)
    print("\n99th Percentile Statistics by Start Velocity Bucket:")
    display(df_99)

    # Step 5 - PDF report for the start-velocity buckets
    generate_bucket_pdf_report(
        bucket_type="Initial Velocity",
        df_median=df_median,
        df_95=df_95,
        df_99=df_99,
        output_filename=f"reports/init_velocity_bucket/vehicle_{vehicle_id}_initial_velocity_bucket_report.pdf",
    )