In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
# !{sys.executable} -m pip install matplotlib fpdf2 scipy pandas numpy trino
import os
import platform
import logging
import trino

import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from IPython.display import display, HTML
from fpdf import FPDF
import logging
from scipy import stats

from math import radians, degrees, sin, cos, atan2, asin, sqrt # Corrected imports
sys.path.append('..')
from common import db_operations
from common.db_operations import connect_to_trino, fetch_data_for_day, write_df_to_iceberg

In [2]:
# Configure basic logging for the business logic file
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# Print the Python version being used
print(f"Using Python version: {platform.python_version()}")

Using Python version: 3.11.13


In [3]:
def get_vehicle_gps_data(start_time=None, end_time=None, vehicle_ids=None):
    """
    Fetch vehicle and GPS data from Trino with flexible time filtering.
    Data is fetched from individual tables and then joined using pandas.
    
    Args:
        start_time: Start time in 'YYYY-MM-DD HH:MM:SS' format (optional)
        end_time: End time in 'YYYY-MM-DD HH:MM:SS' format (optional)
        vehicle_ids: List of vehicle IDs to filter (optional)
    
    Returns:
        DataFrame with vehicle and GPS data from all three tables
    """
    # Set default vehicle IDs if not provided
    if vehicle_ids is None:
        vehicle_ids = ['18','19']
    
    # Format vehicle IDs for the query
    vehicle_ids_str = "', '".join(vehicle_ids)
    
    # Build time filter conditions
    time_filter = ""
    
    if start_time and end_time:
        # Convert string times to datetime objects
        start_dt = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
        end_dt = datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
        
        # Format for the query
        time_filter = f"""
        AND timestamp >= CAST('{start_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('{end_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
    else:
        # Default time range if not provided
        time_filter = """
        AND timestamp >= CAST('2025-10-07' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('2025-10-09' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
    
    # Connect to Trino
    conn = trino.dbapi.connect(
        host="trino",
        port=8080,
        user="admin",
        catalog="adhoc",
        schema="default"
    )
    
    # Query 1: Fetch data from can_parsed_output_100
    query_cpo100 = f"""
    SELECT 
        id, timestamp, dt, 
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS IST,
        date_trunc('minute', timestamp) as ts_mins,  -- Truncate to minutes
        total_battery_current, bat_voltage, gear_position, odometerreading, 
        round(vehicle_speed_vcu,2) as vehicle_speed_vcu,
        vehiclereadycondition, ignitionstatus, gun_connection_status, brakepedalpos
    FROM 
        facts_prod.can_parsed_output_100
    WHERE 
        id IN ('{vehicle_ids_str}')
        {time_filter}
    """
    
    # Query 2: Fetch data from c2c_gps
    query_gps = f"""
    SELECT 
        id, timestamp,
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS ts_ist,
        date_trunc('minute', timestamp) as ts_mins, date, latitude, longitude, altitude, 
        ground_speed_kmph
    FROM 
        facts_prod.c2c_gps
    WHERE 
        id IN ('{vehicle_ids_str}')
        {time_filter}
    """
    
    # Query 3: Fetch data from can_output_ac
    query_ac = f"""
    SELECT 
        id, timestamp,
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS ac_ist,
        date_trunc('minute', timestamp) as ts_mins,
        ac_voltage, ac_current, ac_power, ac_frequency, ac_energy
    FROM 
        facts_prod.can_output_ac
    WHERE 
        id IN ('{vehicle_ids_str}')
        {time_filter}
    """
    
    # Execute queries and fetch data into dataframes
    cur = conn.cursor()
    
    # Fetch can_parsed_output_100 data
    cur.execute(query_cpo100)
    columns_cpo100 = [desc[0] for desc in cur.description]
    rows_cpo100 = cur.fetchall()
    df_cpo100 = pd.DataFrame(rows_cpo100, columns=columns_cpo100)
    
    # # Add row number for joining
    # df_cpo100['cpo_rn'] = df_cpo100.groupby(['id', 'ts_mins']).cumcount() + 1
    
    # # Fetch c2c_gps data
    # cur.execute(query_gps)
    # columns_gps = [desc[0] for desc in cur.description]
    # rows_gps = cur.fetchall()
    # df_gps = pd.DataFrame(rows_gps, columns=columns_gps)
    
    # # Add row number for joining
    # df_gps['cg_rn'] = df_gps.groupby(['id', 'ts_mins']).cumcount() + 1
    
    # # Fetch can_output_ac data
    # cur.execute(query_ac)
    # columns_ac = [desc[0] for desc in cur.description]
    # rows_ac = cur.fetchall()
    # df_ac = pd.DataFrame(rows_ac, columns=columns_ac)
    
    # # Add row number for joining
    # df_ac['ac_rn'] = df_ac.groupby(['id', 'ts_mins']).cumcount() + 1
    
    # Close connections
    cur.close()
    conn.close()
    
    # # Perform joins using pandas
    # # First join: can_parsed_output_100 left join c2c_gps
    # can_gps_join = pd.merge(
    #     df_cpo100, 
    #     df_gps, 
    #     how='left', 
    #     left_on=['id', 'ts_mins', 'cpo_rn'], 
    #     right_on=['id', 'ts_mins', 'cg_rn'],
    #     suffixes=('', '_gps')
    # )
    
    # # Second join: can_gps_join left join can_output_ac
    # final_df = pd.merge(
    #     can_gps_join, 
    #     df_ac, 
    #     how='left', 
    #     left_on=['id', 'ts_mins', 'cpo_rn'], 
    #     right_on=['id', 'ts_mins', 'ac_rn'],
    #     suffixes=('', '_ac')
    # )
    
    # # Drop the redundant row number columns
    # final_df = final_df.drop(['cpo_rn', 'cg_rn', 'ac_rn'], axis=1)
    
    # Sort the final dataframe
    final_df = df_cpo100.sort_values(['id', 'IST'])
    
    # Reset index
    final_df = final_df.reset_index(drop=True)
    
    return final_df

In [4]:
# Ensure you have the necessary libraries installed:
# pip install pandas numpy matplotlib fpdf

def analyze_filtered_braking_events(df, top_speed=30.0, search_window_seconds=10.0):
    """
    Analyzes a DataFrame to find and extract braking events.

    This function identifies hard stops (speed dropping to 0) and then works
    backwards within a specified time window to find the exact moment the
    brake pedal was pressed. It filters events based on a minimum top speed
    reached during the braking maneuver.

    Args:
        df (pd.DataFrame): DataFrame with vehicle data. Must contain 'timestamp',
                           'brakepedalpos', and 'vehicle_speed_vcu' columns.
                           The 'timestamp' should be in a format that can be
                           converted to a datetime object (e.g., milliseconds since epoch).
        top_speed (float): The minimum peak speed (km/h) for an event to be considered.
        search_window_seconds (float): The time window before a hard stop to search
                                      for the initial brake press.

    Returns:
        tuple: A tuple containing:
            - list of pd.DataFrame: A list where each element is a DataFrame
                                    representing a single braking event.
            - list of float: A list of the durations (in seconds) for each event.
            - dict: A dictionary containing bucketed max_bpp data and mode information.
    """
    if df.empty:
        logging.warning("Input DataFrame is empty. No events to analyze.")
        return [], [], {}

    # Convert timestamp to human-readable IST and clean the data
    # Assuming timestamp is in milliseconds, which is common
    df['IST'] = pd.to_datetime(df['timestamp'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Kolkata')
    df = df[["IST", "brakepedalpos", "vehicle_speed_vcu"]].copy()
    df.dropna(subset=["vehicle_speed_vcu", "brakepedalpos"], inplace=True)
    df.sort_values(by='IST', inplace=True)
    df.loc[:, 'IST_formatted_string'] = df['IST'].dt.strftime('%H:%M:%S')

    # Identify hard stop events (speed becomes 0 from a non-zero value)
    hard_stop_mask = (df['vehicle_speed_vcu'] == 0.0) & (df['vehicle_speed_vcu'].shift(1) > 0.0)
    hard_stop_events = df[hard_stop_mask].copy()

    if hard_stop_events.empty:
        logging.info("No hard stop events found in the DataFrame.")
        return [], [], {}

    all_event_data = []
    event_durations = []
    max_bpp_values = []  # Collect max_bpp values for bucketing

    # Extract and aggregate the data for each filtered event
    for _, event_row in hard_stop_events.iterrows():
        end_time = event_row['IST']
        # Define a search window before the stop
        search_start_time = end_time - pd.Timedelta(seconds=search_window_seconds)

        # Look for the first instance of brake pedal application within this window
        search_segment = df[(df['IST'] >= search_start_time) & (df['IST'] <= end_time)].copy()

        # Find the first row where brakepedalpos is greater than 0
        first_brake_press = search_segment[search_segment['brakepedalpos'] > 0.0].head(1)

        # Check if a brake press was found in the search window
        if not first_brake_press.empty:
            # Get the exact start time of the braking event
            start_time = first_brake_press.iloc[0]['IST']

            # Filter the event segment from the exact start of braking to the stop
            event_segment = df[(df['IST'] >= start_time) & (df['IST'] <= end_time)].copy()

            # Check if the top speed in this dynamic segment meets the filter criteria
            if not event_segment.empty and event_segment['vehicle_speed_vcu'].max() >= top_speed:
                # Calculate the dynamic time taken to come to a full stop
                time_to_stop_seconds = (end_time - start_time).total_seconds()

                # Append the filtered event data and its duration
                all_event_data.append(event_segment)
                event_durations.append(time_to_stop_seconds)
                
                # Collect max_bpp value for this event
                max_bpp = event_segment['brakepedalpos'].max()
                max_bpp_values.append(max_bpp)

    if not all_event_data:
        logging.info(f"No events found that reached a top speed of {top_speed} km/h or greater and had a brake press within {search_window_seconds} seconds of the stop.")
        return [], [], {}

    # Create max_bpp buckets
    bucket_info = create_max_bpp_buckets(max_bpp_values, top_speed)
    
    return all_event_data, event_durations, bucket_info

In [5]:
def create_max_bpp_buckets(max_bpp_values, top_speed):
    """
    Create buckets for max_bpp values and calculate mode.
    
    Args:
        max_bpp_values (list): List of max_bpp values from braking events.
        top_speed (float): The top speed threshold used for filtering.
        
    Returns:
        dict: Dictionary containing bucket information and mode.
    """
    if not max_bpp_values:
        return {}
    
    # Create buckets starting from top_speed in increments of 5
    min_bpp = min(max_bpp_values)
    max_bpp = max(max_bpp_values)
    
    # Adjust bucket start to be a multiple of 5
    bucket_start = int(min_bpp // 5) * 5
    bucket_end = int(max_bpp // 5) * 5 + 5
    
    # Create bucket ranges
    buckets = {}
    for i in range(bucket_start, bucket_end + 1, 5):
        bucket_key = f"{i}-{i+5}"
        buckets[bucket_key] = 0
    
    # Assign values to buckets
    for value in max_bpp_values:
        bucket_index = int(value // 5) * 5
        bucket_key = f"{bucket_index}-{bucket_index+5}"
        if bucket_key in buckets:
            buckets[bucket_key] += 1
    
    # Calculate mode - Fixed version
    try:
        mode_result = stats.mode(max_bpp_values)
        # The result of stats.mode is a ModeResult object with attributes 'mode' and 'count'
        # We need to access the first element of the 'mode' array
        mode_value = mode_result.mode[0] if len(mode_result.mode) > 0 else None
    except Exception as e:
        logging.warning(f"Error calculating mode: {e}")
        mode_value = None
    
    # Find the bucket with the most events (mode bucket)
    mode_bucket = max(buckets, key=buckets.get) if buckets else None
    
    return {
        'buckets': buckets,
        'mode_bucket': mode_bucket,
        'mode_value': mode_value,
        'top_speed': top_speed
    }

In [6]:
def generate_combined_report(vehicle_results):
    """
    Generate a combined report comparing all vehicles.
    
    Args:
        vehicle_results (list): List of dictionaries containing results for each vehicle.
    """
    # Filter out vehicles with no events
    valid_results = [r for r in vehicle_results if r and r['events']]
    
    if not valid_results:
        print("No valid results to generate combined report.")
        return
    
    # Create a combined summary DataFrame
    combined_summary = pd.DataFrame()
    
    for result in valid_results:
        if result['summary_df'] is not None and not result['summary_df'].empty:
            # Add vehicle_id column
            summary = result['summary_df'].copy()
            summary['vehicle_id'] = result['vehicle_id']
            combined_summary = pd.concat([combined_summary, summary], ignore_index=True)
    
    if combined_summary.empty:
        print("No summary data available for combined report.")
        return
    
    # Generate the combined report
    generate_final_report(combined_summary, "reports/combined_vehicle_comparison")
    
    # Create a comparison chart for bucket distributions
    plt.figure(figsize=(12, 8))
    
    for result in valid_results:
        if result['bucket_info'] and result['bucket_info'].get('buckets'):
            bucket_df = pd.DataFrame(list(result['bucket_info']['buckets'].items()), 
                                    columns=['Bucket', 'Count'])
            bucket_df['vehicle_id'] = result['vehicle_id']
            plt.plot(bucket_df['Bucket'], bucket_df['Count'], 
                    marker='o', linestyle='-', label=f"Vehicle {result['vehicle_id']}")
    
    plt.title('Max Brake Pedal Position Bucket Distribution by Vehicle')
    plt.xlabel('BPP Bucket (%)')
    plt.ylabel('Event Count')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend()
    plt.tight_layout()
    
    # Save the chart
    plt.savefig("reports/combined_bpp_distribution.png")
    plt.close()
    
    print("Combined report generated successfully.")


In [7]:
def generate_report_pdf(events, durations, output_filename):
    """
    Generates a multi-page PDF report with a summary page and individual plots.
    This version uses the dynamic braking duration for each plot's title.
    """
    # Define constants for the kgf calculation
    BUS_MASS_KG = 13500  # 13.5 tonnes * 1000 kg/tonne
    G_ACCELERATION = 9.80665 # Standard acceleration due to gravity

    with PdfPages(output_filename) as pdf:
        all_peak_speeds = []
        all_distances = []
        all_max_bpps = [] # List to store maximum bpp for each event
        all_avg_bpps = [] # List to store average bpp for each event
        
        for i, event_group in enumerate(events):
            start_time = event_group['IST'].iloc[0]
            end_time = event_group['IST'].iloc[-1]
            peak_velocity = event_group['vehicle_speed_vcu'].max()
            
            event_group.loc[:, 'speed_mps'] = event_group['vehicle_speed_vcu'] * (1000 / 3600)
            time_diffs_sec = event_group['IST'].diff().dt.total_seconds().fillna(0)
            distance_covered_m = (event_group['speed_mps'] * time_diffs_sec).sum()
            
            # Calculate and store BPP values for the summary
            max_bpp = event_group['brakepedalpos'].max()
            avg_bpp = event_group['brakepedalpos'].mean()
            
            all_peak_speeds.append(peak_velocity)
            all_distances.append(distance_covered_m)
            all_max_bpps.append(max_bpp)
            all_avg_bpps.append(avg_bpp)

        # Create and save the summary page
        fig_summary = plt.figure(figsize=(11, 8.5))
        ax_summary = fig_summary.add_subplot(111)
        ax_summary.axis('off')
        
        # Create a table for the summary data
        summary_data = [
            ['Total events found:', f"{len(events)}"],
            ['Max speed across all events:', f"{max(all_peak_speeds):.2f} km/h"],
            ['Average speed across all events:', f"{sum(all_peak_speeds)/len(all_peak_speeds):.2f} km/h"],
            
            ['Maximum distance covered:', f"{max(all_distances):.1f} m"],
            ['Minimum distance covered:', f"{max(all_distances):.1f} m"],
            ['Average distance covered:', f"{sum(all_distances)/len(all_distances):.1f} m"],
            
            ['Maximum duration:', f"{max(durations):.1f} s"],
            ['Minimum duration:', f"{min(durations):.1f} s"],
            ['Average duration:', f"{sum(durations)/len(durations):.1f} s"],
            
            ['Maximum BPP:', f"{max(all_max_bpps):.1f}"],
            ['Average BPP:', f"{sum(all_avg_bpps)/len(all_avg_bpps):.1f}"]
        ]
        
        # Define a title for the table
        plt.suptitle("Braking Analysis Report", fontsize=18, y=0.95)
        
        # Create the table
        summary_table = ax_summary.table(
            cellText=summary_data,
            loc='center',
            cellLoc='left',
            colWidths=[0.5, 0.5]
        )
        
        summary_table.auto_set_font_size(False)
        summary_table.set_fontsize(10)
        summary_table.scale(1.2, 1.5)
        
        pdf.savefig(fig_summary)
        plt.close(fig_summary)
        
        # --- Paginate the detailed event table ---
        
        table_data = []
        for i, event_group in enumerate(events):
            start_time = event_group['IST'].iloc[0]
            end_time = event_group['IST'].iloc[-1]
            max_bpp = event_group['brakepedalpos'].max()
            avg_bpp = event_group['brakepedalpos'].mean()
            dist_m = (event_group['vehicle_speed_vcu'] * (1000/3600) * event_group['IST'].diff().dt.total_seconds().fillna(0)).sum()
            start_vel = event_group['vehicle_speed_vcu'].iloc[0]
            peak_vel = event_group['vehicle_speed_vcu'].max()
            total_time_s = (end_time - start_time).total_seconds()
            avg_decel = (peak_vel * 1000/3600) / total_time_s if total_time_s > 0 else 0
            
            braking_force_kgf = (BUS_MASS_KG * avg_decel) / G_ACCELERATION

            table_data.append([
                i + 1,
                start_time.strftime('%d/%m/%y %H:%M:%S'),
                end_time.strftime('%d/%m/%y %H:%M:%S'),
                f"{durations[i]:.2f}",
                f"{max_bpp:.2f}",
                f"{avg_bpp:.2f}",
                f"{dist_m:.2f}",
                f"{start_vel:.2f}",
                f"{avg_decel:.2f}",
                f"{braking_force_kgf:.2f}"
            ])

        # Define the number of rows per page
        ROWS_PER_PAGE = 23
        
        # Split the data into chunks for pagination
        chunks = [table_data[i:i + ROWS_PER_PAGE] for i in range(0, len(table_data), ROWS_PER_PAGE)]
        
        # Define columns for the table
        columns = [
            'idx', 'start', 'end', 'duration_s', 'max_bpp', 'avg_bpp', 
            'ttl_dist_m', 'start_vel', 'avg_decel_mps2', 'braking_force_kgf'
        ]

        # Loop through each chunk of data and create a new page
        for page_num, chunk in enumerate(chunks):
            fig_table = plt.figure(figsize=(11, 8.5))
            ax_table = fig_table.add_subplot(111)
            ax_table.axis('off')
            
            # Set relative column widths
            col_widths = [0.05, 0.15, 0.15, 0.08, 0.08, 0.08, 0.08, 0.08, 0.1, 0.15]
            
            table = ax_table.table(cellText=chunk, colLabels=columns, loc='center', cellLoc='center', colWidths=col_widths)
            table.auto_set_font_size(False)
            table.set_fontsize(8)
            table.scale(1, 1.5)
            
            plt.title(f"DETAILED BRAKING EVENT TABLE (Page {page_num + 1})", y=0.95)
            plt.tight_layout(rect=[0, 0, 1, 0.95])
            pdf.savefig(fig_table)
            plt.close(fig_table)
            
        # Now, plot and save each individual graph on its own page
        for i, event_group in enumerate(events):
            fig, ax = plt.subplots(figsize=(15, 6))
            
            event_group.loc[:, 'IST_formatted_string'] = event_group['IST'].dt.strftime('%H:%M:%S')

            distance_covered_m = (event_group['vehicle_speed_vcu'] * (1000 / 3600) * event_group['IST'].diff().dt.total_seconds().fillna(0)).sum()
            total_distance_ft = distance_covered_m * 3.28084
            distance_label = (
                f'Distance Covered:\n'
                f'{distance_covered_m:.2f} m\n'
                f'{total_distance_ft:.2f} ft'
            )

            ax.text(
                0.95, 0.95,
                distance_label,
                transform=ax.transAxes,
                ha='right',
                va='top',
                fontsize=12,
                bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="black", lw=1)
            )

            ax.plot(
                event_group['IST_formatted_string'],
                event_group['vehicle_speed_vcu'],
                label='Vehicle Speed (km/h)',
                color='blue'
            )
            ax.plot(
                event_group['IST_formatted_string'],
                event_group['brakepedalpos'],
                label='Brake Pedal Position',
                color='red'
            )
            
            start_time_str = event_group['IST'].iloc[0].strftime('%d/%m/%y %H:%M:%S')
            end_time_str = event_group['IST'].iloc[-1].strftime('%d/%m/%y %H:%M:%S')
            ax.set_title(f"Event: {start_time_str} to {end_time_str}")
            ax.set_xlabel('Time (hh:mm:ss)')
            ax.set_ylabel('Value')
            ax.grid(True)
            ax.legend()
            
            ax.tick_params(axis='x', rotation=45)
            
            # Use the duration from the list to create the dynamic title
            plt.suptitle(f"Analysis of Braking Events ({durations[i]:.2f}s to stop)", fontsize=18)
            plt.tight_layout(rect=[0, 0, 1, 0.96])
            
            pdf.savefig(fig)
            plt.close(fig)
            
        print(f"\nCombined PDF report saved as '{output_filename}'.")

In [8]:
def generate_final_report(df,file_name):
    """
    Analyzes and compares braking performance data from a CSV file
    for two distinct time periods. It generates a summary report and a
    visual comparison chart, then combines them into a single PDF document.
    """
    # # Check if the file exists in the current directory
    # if not os.path.exists(file_name):
    #     print(f"Error: The file '{file_name}' was not found in the current directory.")
    #     print("Please ensure the CSV file is saved in the same folder as this script.")
    #     return

    # try:
    #     # Read the CSV file directly from the local file path
    #     print(f"Reading data from '{file_name}'...")
    #     df = pd.read_csv(file_name)
    #     print("File read successfully.")

    # except Exception as e:
    #     print(f"An error occurred while reading the file: {e}")
    #     return

    # --- Data Cleaning and Preprocessing ---
    # Clean column names to handle any leading/trailing spaces or newlines
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    
    # Now, check the cleaned column names to ensure they exist before proceeding
    required_cols = ['avg_decel_mps2', 'avg_bpp', 'ttl_dist_m', 'start_vel', 'peak_vel', 'start', 'max_bpp']
    
    # Check if all required columns are present after cleaning
    if not all(col in df.columns for col in required_cols):
        missing_cols = [col for col in required_cols if col not in df.columns]
        print("Error: The CSV file is missing one or more required columns after cleaning.")
        print(f"Missing columns: {missing_cols}")
        print("\nAvailable columns are:")
        print(df.columns)
        return

    # Convert the 'start' column to datetime objects
    try:
        df['start_datetime'] = pd.to_datetime(df['start'], format='%d/%m/%y %H:%M:%S')
    except Exception as e:
        print(f"Error converting dates: {e}. Please check the date format in your CSV file.")
        return

    # Convert the key metrics columns to float
    numeric_cols = ['avg_decel_mps2', 'avg_bpp', 'ttl_dist_m', 'start_vel', 'peak_vel', 'max_bpp']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Drop any rows with NaN values in critical columns
    df.dropna(subset=numeric_cols, inplace=True)

    # Define the cutoff date to split the data
    cutoff_date = pd.to_datetime('17/08/25', format='%d/%m/%y')

    # Partition the data into 'before' and 'after' the cutoff date
    df_then = df[df['start_datetime'] < cutoff_date]
    df_now = df[df['start_datetime'] >= cutoff_date]

    # Define the key metrics for analysis
    metrics_avg = {
        'avg_decel_mps2': 'Avg Deceleration ($m/s^2$)',
        'avg_bpp': 'Avg Brake Pedal Position (%)',
        'ttl_dist_m': 'Avg Distance (m)',
        'start_vel': 'Avg Start Velocity (km/h)', # Renamed to reflect the start of the event
        'peak_vel': 'Avg Peak Velocity (km/h)'
    }
    
    # Calculate summary statistics for both periods
    summary_then_avg = df_then[metrics_avg.keys()].mean()
    summary_now_avg = df_now[metrics_avg.keys()].mean()

    # Create a DataFrame for the average metrics comparison
    comparison_df_avg = pd.DataFrame({
        'Then (Aug 1 - Aug 16)': summary_then_avg,
        'Now (Aug 17 - Aug 25)': summary_now_avg
    })
    
    # Rename the index to the more descriptive names
    comparison_df_avg = comparison_df_avg.rename(index=metrics_avg)

    # Calculate min distance and max brake pedal position
    min_dist_then = df_then['ttl_dist_m'].min()
    min_dist_now = df_now['ttl_dist_m'].min()
    
    max_bpp_then = df_then['max_bpp'].max()
    max_bpp_now = df_now['max_bpp'].max()
    
    # Create a new DataFrame for these specific metrics and concatenate
    specific_metrics_df = pd.DataFrame({
        'Then (Aug 1 - Aug 16)': [min_dist_then, max_bpp_then],
        'Now (Aug 17 - Aug 25)': [min_dist_now, max_bpp_now]
    }, index=['Min Distance (m)', 'Max Brake Pedal Position (%)'])

    # Combine the average and specific metrics DataFrames and round the results
    comparison_df = pd.concat([comparison_df_avg, specific_metrics_df])

    # Re-order the rows as requested
    new_order = [
        'Avg Peak Velocity (km/h)',
        'Avg Distance (m)',
        # 'Min Distance (m)',
        'Avg Brake Pedal Position (%)',
        'Max Brake Pedal Position (%)',
        'Avg Deceleration ($m/s^2$)'
    ]
    comparison_df = comparison_df.reindex(new_order).round(2)
    
    # Add a row for total events
    comparison_df.loc['Total Events'] = [len(df_then), len(df_now)]
    
    # --- Generate the comparison chart (PNG) ---
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Exclude 'Total Events' from the chart
    plot_df = comparison_df.drop('Total Events')

    plot_df.T.plot(kind='bar', ax=ax, width=0.8, rot=0)

    # Add labels and title
    ax.set_title('Braking Performance: Before vs. After August 17, 2025', fontsize=16, fontweight='bold')
    ax.set_ylabel('Value', fontsize=12)
    ax.set_xlabel('Metric', fontsize=12)
    ax.legend(title='Period', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Customize x-axis labels
    plt.xticks(ha='center')

    # Add value labels on top of the bars
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', label_type='edge', fontsize=10)

    plt.tight_layout(rect=[0, 0, 0.85, 1])

    chart_filename = file_name.split('_')[0] + "_braking_comparison_chart.png"
    plt.savefig(chart_filename)
    plt.close(fig)
    print(f"\nChart successfully saved as '{chart_filename}'.")

    # --- Generate the PDF Report ---
    pdf = FPDF('P', 'mm', 'A4')
    pdf.add_page()

    # Add a title
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(200, 10, "Braking Performance Comparison Report", 0, 1, 'C')

    # Add the summary table
    pdf.set_font("Arial", '', 12)
    pdf.multi_cell(0, 10, "Summary of Braking Metrics:", 0, 1)

    # Convert the DataFrame to a string with a fixed width for the table
    table_str = comparison_df.to_string()
    pdf.set_font("Courier", '', 10) # Using a monospace font for table formatting
    pdf.multi_cell(0, 5, table_str, 0, 1)
    
    # Add a title for the chart
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(200, 10, "Visual Comparison", 0, 1, 'C')
    
    # Add the generated chart image
    # Note: Adjust the x, y, width, and height as needed to fit the page.
    pdf.image(chart_filename, x=15, y=pdf.get_y() + 5, w=180)

    pdf_filename = file_name.split('_')[0] + "_Braking_Analysis_Report.pdf"    
    pdf.output(pdf_filename)
    
    print(f"\nPDF report successfully generated as '{pdf_filename}'.")

In [9]:
def process_vehicle_data(vehicle_id, start_time, end_time, top_speed=25.0, search_window_seconds=15.0):
    """
    Process data for a single vehicle and generate reports.
    
    Args:
        vehicle_id (str): The ID of the vehicle to process.
        start_time (str): Start time in 'YYYY-MM-DD HH:MM:SS' format.
        end_time (str): End time in 'YYYY-MM-DD HH:MM:SS' format.
        top_speed (float): The minimum peak speed for an event to be considered.
        search_window_seconds (float): The time window before a hard stop to search for brake press.
    
    Returns:
        dict: A dictionary containing the results for the vehicle.
    """
    print(f"Processing vehicle {vehicle_id}...")
    
    # 1. Fetch the data for this vehicle
    vehicle_data_df = get_vehicle_gps_data(start_time, end_time, [vehicle_id])
    
    if vehicle_data_df.empty:
        print(f"No data found for vehicle {vehicle_id}. Skipping.")
        return None
    
    # 2. Analyze the DataFrame to find braking events
    braking_events, event_durations, bucket_info = analyze_filtered_braking_events(
        df=vehicle_data_df,
        top_speed=top_speed,
        search_window_seconds=search_window_seconds
    )
    
    if not braking_events:
        print(f"No braking events found for vehicle {vehicle_id} matching the criteria.")
        return {
            'vehicle_id': vehicle_id,
            'events': [],
            'bucket_info': None,
            'summary_df': None
        }
    
    # 3. Create output directory for this vehicle
    base_output_path = f"reports/vehicle_{vehicle_id}_braking_analysis"
    os.makedirs(os.path.dirname(base_output_path), exist_ok=True)
    
    # 4. Generate the detailed CSV and PDF reports
    summary_df = generate_report_csv(braking_events, f"{base_output_path}_combined_report.csv")
    generate_report_pdf(braking_events, event_durations, f"{base_output_path}_combined_report.pdf")
    
    # 5. Generate the final comparison report
    generate_final_report(summary_df, f"{base_output_path}_final_comparison")
    
    # 6. Return the results for further processing
    return {
        'vehicle_id': vehicle_id,
        'events': braking_events,
        'durations': event_durations,
        'bucket_info': bucket_info,
        'summary_df': summary_df
    }

In [10]:
def generate_bucket_focus_report(vehicle_results, output_filename="reports/bucket_focus_report"):
    """
    Generate a report focusing on the bucket with the highest occurrences for each vehicle.
    
    Args:
        vehicle_results (list): List of dictionaries containing results for each vehicle.
        output_filename (str): The base name for the output files.
    """
    # Prepare data for the report
    vehicle_ids = []
    mode_buckets = []
    mode_counts = []
    mode_values = []
    
    for result in vehicle_results:
        if not result or not result.get('bucket_info') or not result['bucket_info'].get('buckets'):
            continue
            
        bucket_info = result['bucket_info']
        vehicle_id = result['vehicle_id']
        
        # Find the bucket with the highest count
        buckets = bucket_info['buckets']
        if not buckets:
            continue
            
        mode_bucket = max(buckets, key=buckets.get)
        mode_count = buckets[mode_bucket]
        mode_value = bucket_info.get('mode_value')
        
        vehicle_ids.append(vehicle_id)
        mode_buckets.append(mode_bucket)
        mode_counts.append(mode_count)
        mode_values.append(mode_value)
    
    if not vehicle_ids:
        print("No valid bucket data found for any vehicle.")
        return
    
    # Create a DataFrame for the report
    bucket_df = pd.DataFrame({
        'Vehicle ID': vehicle_ids,
        'Mode Bucket': mode_buckets,
        'Count in Mode Bucket': mode_counts,
        'Mode Value': mode_values
    })
    
    # Sort by count in descending order
    bucket_df = bucket_df.sort_values('Count in Mode Bucket', ascending=False)
    
    # Display the table
    print("### Bucket Focus Report")
    display(HTML(bucket_df.to_html(index=False)))
    
    # Create a bar chart
    plt.figure(figsize=(12, 6))
    bars = plt.bar(vehicle_ids, mode_counts, color='skyblue')
    
    # Add labels on top of the bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                 f'{int(height)}',
                 ha='center', va='bottom')
    
    plt.title('Count of Events in Mode Bucket by Vehicle')
    plt.xlabel('Vehicle ID')
    plt.ylabel('Event Count')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    
    # Save the chart
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    chart_filename = f"{output_filename}_bucket_focus_chart.png"
    plt.savefig(chart_filename)
    plt.close()
    
    # Generate a PDF report using matplotlib
    fig = plt.figure(figsize=(8.27, 11.69))  # A4 size in inches
    gs = fig.add_gridspec(3, 1, height_ratios=[0.1, 0.4, 0.5])
    
    # Title section
    ax_title = fig.add_subplot(gs[0])
    ax_title.axis('off')
    ax_title.text(0.5, 0.5, "Braking Bucket Focus Report", 
                 ha='center', va='center', fontsize=16, fontweight='bold')
    
    # Table section
    ax_table = fig.add_subplot(gs[1])
    ax_table.axis('off')
    
    # Create the table
    table_data = bucket_df.values.tolist()
    table = ax_table.table(cellText=table_data,
                          colLabels=bucket_df.columns.tolist(),
                          cellLoc='center',
                          loc='center',
                          bbox=[0, 0, 1, 1])
    
    # Style the table
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 1.5)
    
    # Highlight the header row
    for i in range(len(bucket_df.columns)):
        table[(0, i)].set_facecolor('#40466e')
        table[(0, i)].set_text_props(weight='bold', color='white')
    
    # Chart section
    ax_chart = fig.add_subplot(gs[2])
    ax_chart.axis('off')
    
    # Add the chart image
    if os.path.exists(chart_filename):
        img = plt.imread(chart_filename)
        ax_chart.imshow(img, extent=[0.1, 0.9, 0.1, 0.9], aspect='auto')
    
    # Add a title for the chart section
    ax_chart.text(0.5, 0.95, "Visual Comparison", 
                 ha='center', va='top', fontsize=14, fontweight='bold',
                 transform=ax_chart.transAxes)
    
    # Adjust layout
    plt.tight_layout()
    
    # Save the PDF
    pdf_output_filename = f"{output_filename}_bucket_focus_report.pdf"
    plt.savefig(pdf_output_filename, format='pdf', bbox_inches='tight')
    plt.close(fig)
    
    print(f"Bucket focus report saved as {pdf_output_filename}")

In [11]:
def generate_detailed_bucket_report(vehicle_results, output_filename="reports/detailed_bucket_report"):
    """
    Generate a detailed report showing all buckets for each vehicle, but highlighting the mode bucket.
    
    Args:
        vehicle_results (list): List of dictionaries containing results for each vehicle.
        output_filename (str): The base name for the output files.
    """
    # Prepare data for the report
    all_data = []
    
    for result in vehicle_results:
        if not result or not result.get('bucket_info') or not result['bucket_info'].get('buckets'):
            continue
            
        bucket_info = result['bucket_info']
        vehicle_id = result['vehicle_id']
        buckets = bucket_info['buckets']
        mode_bucket = bucket_info.get('mode_bucket')
        
        for bucket, count in buckets.items():
            is_mode = bucket == mode_bucket
            all_data.append({
                'Vehicle ID': vehicle_id,
                'Bucket': bucket,
                'Count': count,
                'Is Mode': is_mode
            })
    
    if not all_data:
        print("No valid bucket data found for any vehicle.")
        return
    
    # Create a DataFrame for the report
    detailed_df = pd.DataFrame(all_data)
    
    # Create a pivot table for better visualization
    pivot_df = detailed_df.pivot_table(
        index='Vehicle ID', 
        columns='Bucket', 
        values='Count', 
        fill_value=0
    )
    
    # Display the table
    print("### Detailed Bucket Report")
    display(HTML(detailed_df.to_html(index=False)))
    
    # Create a grouped bar chart
    plt.figure(figsize=(14, 8))
    
    # Get unique vehicles and buckets
    vehicles = detailed_df['Vehicle ID'].unique()
    buckets = detailed_df['Bucket'].unique()
    
    # Set up the bar positions
    x = np.arange(len(buckets))
    width = 0.8 / len(vehicles)
    
    # Plot each vehicle's data
    for i, vehicle in enumerate(vehicles):
        vehicle_data = detailed_df[detailed_df['Vehicle ID'] == vehicle]
        # Create a dictionary for quick lookup
        bucket_counts = {row['Bucket']: row['Count'] for _, row in vehicle_data.iterrows()}
        counts = [bucket_counts.get(bucket, 0) for bucket in buckets]
        
        # Highlight the mode bucket
        colors = ['red' if bucket_counts.get(bucket, 0) == max(bucket_counts.values()) else 'skyblue' 
                 for bucket in buckets]
        
        plt.bar(x + i * width, counts, width, label=f'Vehicle {vehicle}', color=colors)
    
    plt.title('Detailed Bucket Distribution by Vehicle')
    plt.xlabel('Bucket')
    plt.ylabel('Event Count')
    plt.xticks(x + width * (len(vehicles) - 1) / 2, buckets)
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    
    # Save the chart
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    chart_filename = f"{output_filename}_detailed_bucket_chart.png"
    plt.savefig(chart_filename)
    plt.close()
    
    # Generate a PDF report using matplotlib
    fig = plt.figure(figsize=(8.27, 11.69))  # A4 size in inches
    gs = fig.add_gridspec(3, 1, height_ratios=[0.1, 0.4, 0.5])
    
    # Title section
    ax_title = fig.add_subplot(gs[0])
    ax_title.axis('off')
    ax_title.text(0.5, 0.5, "Detailed Braking Bucket Report", 
                 ha='center', va='center', fontsize=16, fontweight='bold')
    
    # Table section
    ax_table = fig.add_subplot(gs[1])
    ax_table.axis('off')
    
    # Create the table using the pivot table
    table_data = [pivot_df.columns.tolist()] + pivot_df.reset_index().values.tolist()
    table = ax_table.table(cellText=table_data,
                          cellLoc='center',
                          loc='center',
                          bbox=[0, 0, 1, 1])
    
    # Style the table
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.scale(1, 1.5)
    
    # Highlight the header row
    for i in range(len(pivot_df.columns) + 1):
        table[(0, i)].set_facecolor('#40466e')
        table[(0, i)].set_text_props(weight='bold', color='white')
    
    # Chart section
    ax_chart = fig.add_subplot(gs[2])
    ax_chart.axis('off')
    
    # Add the chart image
    if os.path.exists(chart_filename):
        img = plt.imread(chart_filename)
        ax_chart.imshow(img, extent=[0.1, 0.9, 0.1, 0.9], aspect='auto')
    
    # Add a title for the chart section
    ax_chart.text(0.5, 0.95, "Visual Comparison", 
                 ha='center', va='top', fontsize=14, fontweight='bold',
                 transform=ax_chart.transAxes)
    
    # Adjust layout
    plt.tight_layout()
    
    # Save the PDF
    pdf_output_filename = f"{output_filename}_detailed_bucket_report.pdf"
    plt.savefig(pdf_output_filename, format='pdf', bbox_inches='tight')
    plt.close(fig)
    
    print(f"Detailed bucket report saved as {pdf_output_filename}")

In [12]:
def generate_report_csv(events, output_filename):
    """
    Generates a CSV report with a summary of braking events.

    Args:
        events (list of pd.DataFrame): A list where each DataFrame is a single braking event.
        output_filename (str): The path to save the output CSV file.
    """
    # Use the core logic function to create the summary DataFrame
    results_df = create_summary_dataframe(events)

    if results_df.empty:
        logging.warning("No events to save to CSV.")
        return

    # Create directory if it doesn't exist
    import os
    directory = os.path.dirname(output_filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    results_df.to_csv(output_filename, index=False)
    logging.info(f"Combined CSV report saved as '{output_filename}'.")
    return results_df

In [13]:
def create_summary_dataframe(events):
    """
    Creates a summary DataFrame from a list of braking event DataFrames.

    Args:
        events (list of pd.DataFrame): A list where each DataFrame is a single braking event.

    Returns:
        pd.DataFrame: A summary DataFrame with calculated metrics for each event.
    """
    # Define constants for the kgf calculation
    BUS_MASS_KG = 13500
    G_ACCELERATION = 9.80665

    table_data = []

    for i, event_group in enumerate(events):
        start_time = event_group['IST'].iloc[0]
        end_time = event_group['IST'].iloc[-1]
        start_velocity = event_group['vehicle_speed_vcu'].iloc[0]
        peak_velocity = event_group['vehicle_speed_vcu'].max()
        max_brake_pedal_pos = event_group['brakepedalpos'].max()
        avg_brake_pedal_pos = event_group['brakepedalpos'].mean()

        event_group.loc[:, 'speed_mps'] = event_group['vehicle_speed_vcu'] * (1000 / 3600)
        time_diffs_sec = event_group['IST'].diff().dt.total_seconds().fillna(0)
        distance_covered_m = (event_group['speed_mps'] * time_diffs_sec).sum()
        total_time_s = (end_time - start_time).total_seconds()

        if total_time_s > 0:
            avg_deceleration = (peak_velocity * 1000/3600) / total_time_s
        else:
            avg_deceleration = 0

        braking_force_kgf = (BUS_MASS_KG * avg_deceleration) / G_ACCELERATION

        # Calculate max_bpp bucket
        max_bpp_bucket = f"{int(max_brake_pedal_pos // 5) * 5}-{int(max_brake_pedal_pos // 5) * 5 + 5}"

        table_data.append({
            'idx': i + 1,
            'start': start_time.strftime('%d/%m/%y %H:%M:%S'),
            'end': end_time.strftime('%d/%m/%y %H:%M:%S'),
            'duration_s': f"{total_time_s:.2f}",
            'max_bpp': f"{max_brake_pedal_pos:.2f}",
            'max_bpp_bucket': max_bpp_bucket,
            'avg_bpp': f"{avg_brake_pedal_pos:.2f}",
            'ttl_dist_m': f"{distance_covered_m:.2f}",
            'start_vel': f"{start_velocity:.2f}",
            'peak_vel': f"{peak_velocity:.2f}",
            'avg_decel_mps2': f"{avg_deceleration:.2f}",
            'braking_force_kgf': f"{braking_force_kgf:.2f}"
        })

    return pd.DataFrame(table_data)

In [14]:
# Main execution
if __name__ == "__main__":
    # Define time range and vehicle IDs
    start_time = '2025-09-22 00:00:00'
    end_time = '2025-09-24 00:00:00'
    vehicle_ids = ['18', '19']  # Add all vehicle IDs you want to analyze
    
    # Process each vehicle separately
    vehicle_results = []
    for vehicle_id in vehicle_ids:
        result = process_vehicle_data(
            vehicle_id=vehicle_id,
            start_time=start_time,
            end_time=end_time,
            top_speed=25.0,
            search_window_seconds=15.0
        )
        if result:
            vehicle_results.append(result)
    
    # Display individual vehicle reports in the notebook
    for result in vehicle_results:
        if result and result['events']:
            print(f"\n### Report for Vehicle {result['vehicle_id']} ###")
            display_report_in_notebook(
                events=result['events'],
                durations=result['durations'],
                bucket_info=result['bucket_info']
            )
    
    # # Generate a combined report comparing all vehicles (if needed)
    # generate_combined_report(vehicle_results)
    
    # # Generate the bucket focus report
    # generate_bucket_focus_report(vehicle_results)
    
    # # Generate the detailed bucket report
    # generate_detailed_bucket_report(vehicle_results)

Processing vehicle 18...


2025-10-22 13:35:11 - INFO - Combined CSV report saved as 'reports/vehicle_18_braking_analysis_combined_report.csv'.
2025-10-22 13:35:12 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-10-22 13:35:12 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-10-22 13:35:12 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-10-22 13:35:12 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-10


Combined PDF report saved as 'reports/vehicle_18_braking_analysis_combined_report.pdf'.

Chart successfully saved as 'reports/vehicle_braking_comparison_chart.png'.


  pdf.set_font("Arial", 'B', 16)
  pdf.cell(200, 10, "Braking Performance Comparison Report", 0, 1, 'C')
  pdf.set_font("Arial", '', 12)


TypeError: 1 cannot be converted to a Align