In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import platform
import logging
import trino

sys.path.append('..')
from common import db_operations

import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
from common.db_operations import connect_to_trino, fetch_data_for_day, write_df_to_iceberg

from math import radians, degrees, sin, cos, atan2, asin, sqrt # Corrected imports

In [2]:
# Configure basic logging for the business logic file
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# Print the Python version being used
print(f"Using Python version: {platform.python_version()}")

Using Python version: 3.11.13


In [3]:
# Charger locations (latitude, longitude)
CHARGER_LOCATIONS = {
    'Dehradun': (30.287722, 77.999029),
    'Gurgaon': (28.423090, 76.991733),
    'Muzaffarnagar': (29.549413, 77.747698)
}

In [4]:
def get_vehicle_gps_data(start_time=None, end_time=None, vehicle_ids=None):
    """
    Fetch vehicle and GPS data from Trino with flexible time filtering.
    
    Args:
        start_time: Start time in 'YYYY-MM-DD HH:MM:SS' format (optional)
        end_time: End time in 'YYYY-MM-DD HH:MM:SS' format (optional)
        vehicle_ids: List of vehicle IDs to filter (optional)
    
    Returns:
        DataFrame with vehicle and GPS data
    """
    # Set default vehicle IDs if not provided
    if vehicle_ids is None:
        vehicle_ids = ['18','19']
    
    # Format vehicle IDs for the query
    vehicle_ids_str = "', '".join(vehicle_ids)
    
    # Build time filter conditions
    time_filter_cpo100 = ""
    time_filter_gps = ""
    
    if start_time and end_time:
        # Convert string times to datetime objects
        start_dt = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
        end_dt = datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
        
        # Format for the query
        time_filter_cpo100 = f"""
        AND timestamp >= CAST('{start_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('{end_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
        
        time_filter_gps = f"""
        AND timestamp >= CAST('{start_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('{end_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
    else:
        # Default time range if not provided
        time_filter_cpo100 = """
        AND timestamp >= CAST('2025-10-07' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('2025-10-09' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
        
        time_filter_gps = """
        AND timestamp >= CAST('2025-10-07' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('2025-10-09' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
    
    # Build the complete query
    query = f"""
    with cpo100 as
    (
      SELECT 
        id, timestamp, dt, 
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS IST,
        date_trunc('minute', timestamp) as ts_mins,  -- Truncate to minutes
        total_battery_current, bat_voltage, gear_position, odometerreading,round(vehicle_speed_vcu,2) as vehicle_speed_vcu,
        vehiclereadycondition, ignitionstatus, gun_connection_status
      from 
        facts_prod.can_parsed_output_100
      where 
        id IN ('{vehicle_ids_str}')
        {time_filter_cpo100}
    ),
    cpo100_ranked as
    (
      select 
        *, row_number() over(partition by id, ts_mins order by timestamp) as cpo_rn
      from 
        cpo100
    ),
    c2c_gps_data as
    (
      select 
        id, timestamp,
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS ts_ist,
        date_trunc('minute', timestamp) as ts_mins, date, latitude, longitude, altitude,ground_speed_kmph
      from 
        facts_prod.c2c_gps
      where 
        id IN ('{vehicle_ids_str}')
        {time_filter_gps}
    ),
    c2c_gps_ranked as
    (
      select 
        *, row_number() over(partition by id, ts_mins order by timestamp) as cg_rn
      from 
        c2c_gps_data
    )
    select 
        cp.id as vehicle_id, cp.timestamp, cp.dt, cp.IST, cp.ts_mins,
        cp.total_battery_current, cp.bat_voltage, cp.gear_position, cp.odometerreading,vehicle_speed_vcu,
        cp.vehiclereadycondition, cp.gun_connection_status,
        cp.ignitionstatus, cg.latitude, cg.longitude, cg.altitude,ground_speed_kmph
    from 
      cpo100_ranked as cp
      left join
      c2c_gps_ranked as cg
      on (cp.id = cg.id and cp.ts_mins = cg.ts_mins and cp.cpo_rn = cg.cg_rn)
    ORDER BY cp.id, cp.timestamp
    """
    
    # Connect to Trino
    conn = trino.dbapi.connect(
        host="trino",
        port=8080,
        user="admin",
        catalog="adhoc",
        schema="default"
    )
    
    # Execute the query
    cur = conn.cursor()
    cur.execute(query)
    
    # Get column names
    columns = [desc[0] for desc in cur.description]
    
    # Fetch all rows
    rows = cur.fetchall()
    
    # Create DataFrame
    df = pd.DataFrame(rows, columns=columns)
    
    # Close connections
    cur.close()
    conn.close()
    
    return df

In [5]:
def impute_odometer_using_vcu_speed(df):
    """
    Impute missing OdometerReading values using linear interpolation.
    (Adopted from energy_mileage_daily.py)
    """
    # Check if the input DataFrame is empty
    if df.empty:
        logging.warning("Input DataFrame is empty. Returning empty DataFrame.")
        return pd.DataFrame()
    
    # Check if required columns exist
    required_columns = ['vehicle_id', 'IST', 'odometerreading']
    for col in required_columns:
        if col not in df.columns:
            logging.error(f"Required column '{col}' not found in DataFrame")
            return df
    
    df = df.copy()
    imputed_dfs = []
    
    for vehicle_id, vehicle_df in df.groupby('vehicle_id'):
        # Check if the grouped DataFrame is empty
        if vehicle_df.empty:
            logging.warning(f"Empty group for vehicle_id: {vehicle_id}. Skipping.")
            continue
            
        vehicle_df = vehicle_df.sort_values('IST')
        
        if vehicle_df['odometerreading'].isna().any():
            non_null_indices = vehicle_df[vehicle_df['odometerreading'].notna()].index
            
            if len(non_null_indices) > 0:
                for i in range(len(non_null_indices) - 1):
                    start_idx = non_null_indices[i]
                    end_idx = non_null_indices[i + 1]
                    
                    segment = vehicle_df.loc[start_idx:end_idx]
                    if segment['odometerreading'].isna().sum() == 0:
                        continue
                        
                    start_odometer = vehicle_df.loc[start_idx, 'odometerreading']
                    end_odometer = vehicle_df.loc[end_idx, 'odometerreading']
                    
                    start_time = vehicle_df.loc[start_idx, 'IST']
                    end_time = vehicle_df.loc[end_idx, 'IST']
                    
                    total_time_diff = (end_time - start_time).total_seconds()
                    if total_time_diff == 0:
                        continue
                        
                    for idx in segment.index:
                        if idx == start_idx:
                            continue
                            
                        time_diff = (vehicle_df.loc[idx, 'IST'] - start_time).total_seconds()
                        
                        if total_time_diff > 0:
                            time_ratio = time_diff / total_time_diff
                            interpolated_odometer = start_odometer + (end_odometer - start_odometer) * time_ratio
                            vehicle_df.loc[idx, 'odometerreading'] = interpolated_odometer
                
                # Handle missing values at beginning and end
                if pd.isna(vehicle_df['odometerreading'].iloc[0]):
                    first_valid_idx = vehicle_df['odometerreading'].first_valid_index()
                    if first_valid_idx is not None:
                        first_valid_odometer = vehicle_df.loc[first_valid_idx, 'odometerreading']
                        vehicle_df.loc[:first_valid_idx, 'odometerreading'] = first_valid_odometer
                
                if pd.isna(vehicle_df['odometerreading'].iloc[-1]):
                    last_valid_idx = vehicle_df['odometerreading'].last_valid_index()
                    if last_valid_idx is not None:
                        last_valid_odometer = vehicle_df.loc[last_valid_idx, 'odometerreading']
                        vehicle_df.loc[last_valid_idx:, 'odometerreading'] = last_valid_odometer
        
        imputed_dfs.append(vehicle_df)
    
    # Check if we have any DataFrames to concatenate
    if not imputed_dfs:
        logging.warning("No vehicle data to process. Returning original DataFrame.")
        return df
    
    return pd.concat(imputed_dfs)

In [6]:
def impute_odometer_using_vcu_speed_optimized(df):
    """
    Optimized version of odometer imputation using vectorized operations.
    """
    # Input validation
    if df.empty:
        logging.warning("Input DataFrame is empty. Returning empty DataFrame.")
        return pd.DataFrame()
    
    required_columns = ['vehicle_id', 'IST', 'odometerreading']
    for col in required_columns:
        if col not in df.columns:
            logging.error(f"Required column '{col}' not found in DataFrame")
            return df
    
    df = df.copy()
    
    # Sort all data at once instead of per vehicle
    df = df.sort_values(['vehicle_id', 'IST'])
    
    # Process all vehicles in a single operation
    result_dfs = []
    
    for vehicle_id, vehicle_df in df.groupby('vehicle_id'):
        if vehicle_df.empty:
            continue
            
        # Skip if no missing values
        if not vehicle_df['odometerreading'].isna().any():
            result_dfs.append(vehicle_df)
            continue
            
        # Get indices of non-null odometer readings
        non_null_mask = vehicle_df['odometerreading'].notna()
        non_null_indices = vehicle_df.index[non_null_mask]
        
        if len(non_null_indices) < 2:
            # If fewer than 2 known points, use forward/backward fill
            vehicle_df['odometerreading'] = vehicle_df['odometerreading'].ffill().bfill()
            result_dfs.append(vehicle_df)
            continue
            
        # Vectorized interpolation between known points
        vehicle_df = _vectorized_interpolate(vehicle_df, non_null_indices)
        result_dfs.append(vehicle_df)
    
    if not result_dfs:
        logging.warning("No vehicle data to process. Returning original DataFrame.")
        return df
    
    return pd.concat(result_dfs)

def _vectorized_interpolate(vehicle_df, non_null_indices):
    """Helper function for vectorized interpolation between known points."""
    # Create a copy to avoid SettingWithCopyWarning
    vehicle_df = vehicle_df.copy()
    
    # Get values for interpolation
    known_times = vehicle_df.loc[non_null_indices, 'IST'].values
    known_odometers = vehicle_df.loc[non_null_indices, 'odometerreading'].values
    
    # Vectorized interpolation for all points
    for i in range(len(non_null_indices) - 1):
        start_idx = non_null_indices[i]
        end_idx = non_null_indices[i + 1]
        
        # Skip if no missing values in this segment
        segment_mask = (vehicle_df.index > start_idx) & (vehicle_df.index < end_idx)
        if not segment_mask.any():
            continue
            
        # Get segment data
        segment = vehicle_df[segment_mask]
        if segment.empty:
            continue
            
        # Vectorized interpolation
        start_time = known_times[i]
        end_time = known_times[i + 1]
        start_odometer = known_odometers[i]
        end_odometer = known_odometers[i + 1]
        
        # Fix: Convert numpy timedelta to seconds
        total_time_diff = (end_time - start_time) / np.timedelta64(1, 's')
        if total_time_diff == 0:
            continue
            
        # Calculate time ratios for all points in segment
        time_diffs = (segment['IST'].values - start_time) / np.timedelta64(1, 's')
        time_ratios = time_diffs / total_time_diff
        
        # Vectorized calculation of interpolated values
        interpolated_odometers = start_odometer + (end_odometer - start_odometer) * time_ratios
        
        # Update the DataFrame
        vehicle_df.loc[segment.index, 'odometerreading'] = interpolated_odometers
    
    # Handle beginning and end with vectorized operations
    if vehicle_df['odometerreading'].isna().iloc[0]:
        first_valid_idx = vehicle_df['odometerreading'].first_valid_index()
        if first_valid_idx is not None:
            first_valid_odometer = vehicle_df.loc[first_valid_idx, 'odometerreading']
            vehicle_df.loc[:first_valid_idx, 'odometerreading'] = first_valid_odometer
    
    if vehicle_df['odometerreading'].isna().iloc[-1]:
        last_valid_idx = vehicle_df['odometerreading'].last_valid_index()
        if last_valid_idx is not None:
            last_valid_odometer = vehicle_df.loc[last_valid_idx, 'odometerreading']
            vehicle_df.loc[last_valid_idx:, 'odometerreading'] = last_valid_odometer
    
    return vehicle_df

In [7]:
# Earth's radius in meters
R_EARTH_M = 6371000.0

def haversine_distance_m(lat1, lon1, lat2, lon2):
    """Calculates the straight-line distance between two points (in meters)."""
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return np.nan
        
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula components
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    
    # Multiply by Earth's radius in meters
    return c * R_EARTH_M

In [8]:
# Earth's radius in kilometers
R_EARTH_KM = 6371.0

# --- Helper Functions ---
def haversine_distance_km(lat1, lon1, lat2, lon2):
    """Calculates the straight-line distance between two points (in kilometers)."""
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return np.nan
        
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return c * R_EARTH_KM


In [9]:
def label_charger_proximity(df):
    """
    Label data points based on proximity to charger locations and FIXES THE WARNING
    by initializing 'dist_from_loc' as a float type.

    Args:
        df: DataFrame containing latitude and longitude columns

    Returns:
        DataFrame with additional 'charger_proximity' and 'dist_from_loc' columns
    """
    # Charger locations (latitude, longitude)
    CHARGER_LOCATIONS = {
        'DDN': (30.287722,  77.999029),  # Dehradun
        'GGN': (28.423090,  76.991733),  # Gurgaon
        'MZF': (29.549413,  77.747698),   # Muzaffarnagar
        'KMD': (17.823756,  83.35661),    #Kommadi
        'MDW': (17.802287,  83.353105),   #Madhurawada
        'MGB': (16.4267222, 80.5778889),  #Mangalagiri Bypass
        'KTP': (16.3839523, 80.532221),   #Kaza Toll Plaza
        'GSL':(17.0686111,	81.8862222) #GSL Garden charging point        
    }
#locations for Andhra pradesh routes
# Kommadi	17.823756	83.35661
# Madhurawada	17.802287	83.353105
# Mangalagiri Bypass	16.4267222	80.5778889
# Kaza Toll Plaza	16.3839523	80.532221
# Charging Rest Point	17.0686111	81.8862222
    

    # Create a copy of the input DataFrame
    result_df = df.copy()

    # Initialize the charger_proximity column
    result_df['charger_proximity'] = 'NONE'
    # FIX: Initialize 'dist_from_loc' as a float to prevent FutureWarning
    result_df['dist_from_loc'] = 0.0

    # Check each row for proximity to any charger
    for idx, row in result_df.iterrows():
        if pd.notna(row['latitude']) and pd.notna(row['longitude']):
            # Find the closest charger location
            min_distance = float('inf')
            closest_label = 'NONE'

            for label, (charger_lat, charger_lon) in CHARGER_LOCATIONS.items():
                distance = haversine_distance_m(
                    row['latitude'], row['longitude'],
                    charger_lat, charger_lon
                )
                
                if distance < min_distance:
                    min_distance = distance
                    closest_label = label

            if min_distance <= 250:  # 250 meters threshold
                result_df.at[idx, 'dist_from_loc'] = round(min_distance, 2)
                result_df.at[idx, 'charger_proximity'] = closest_label
            else:
                result_df.at[idx, 'dist_from_loc'] = round(min_distance, 2) # Still record distance even if far
                result_df.at[idx, 'charger_proximity'] = 'NONE'


    # Count occurrences of each label
    proximity_counts = result_df['charger_proximity'].value_counts()

    print("Charger Proximity Counts:")
    for label, count in proximity_counts.items():
        print(f"{label}: {count}")

    return result_df

In [10]:
# ----------------- RANKING FUNCTION -----------------

def rank_closest_instance(df):
    """
    Ranks the closest point (least dist_from_loc) within contiguous proximity series
    for each vehicle.

    It creates a grouping key for contiguous sequences of the same
    'vehicle_id' and 'charger_proximity' label, and then ranks the rows
    within each group based on 'dist_from_loc' (ascending).

    Args:
        df: DataFrame with 'vehicle_id', 'charger_proximity', and 'dist_from_loc'

    Returns:
        DataFrame with an added 'proximity_rank' column.
    """
    result_df = df.copy()

    # Sort data by vehicle_id and timestamp to ensure correct sequence
    result_df = result_df.sort_values(by=['vehicle_id', 'IST']).reset_index(drop=True)

    # 1. Create a group ID for contiguous sequences of 'charger_proximity' per vehicle.
    # The 'neq_flag' is True when the current row's proximity label is different 
    # from the previous row's, signaling the start of a new sequence.
    neq_flag = (
        (result_df['charger_proximity'] != result_df['charger_proximity'].shift(1)) |
        (result_df['vehicle_id'] != result_df['vehicle_id'].shift(1))
    ).fillna(True)

    # 'seq_group' is the cumulative sum of the 'neq_flag' (True=1, False=0)
    result_df['seq_group'] = neq_flag.cumsum()

    # 2. Rank within each contiguous group
    # We rank by 'dist_from_loc' ascending (closest distance gets rank 1)
    result_df['proximity_rank'] = result_df.groupby(['vehicle_id', 'seq_group'])['dist_from_loc'].rank(
        method='first', ascending=True
    ).astype('Int64') # Use Int64 for nullable integer rank

    # 3. Handle 'NONE' labels: We don't need to rank 'NONE' sequences, so set their rank to NaN
    result_df.loc[result_df['charger_proximity'] == 'NONE', 'proximity_rank'] = pd.NA
    
    # Clean up temporary column
    result_df = result_df.drop(columns=['seq_group'])

    return result_df


In [11]:
def group_proximity_sessions(filtered_df: pd.DataFrame) -> pd.DataFrame:
    """
    Groups contiguous rows with the same charger_proximity (excluding 'NONE') 
    by assigning a sequential session number (1, 2, 3, ...).

    Args:
        filtered_df: DataFrame already filtered to contain only NEAR_XXX labels.

    Returns:
        DataFrame with an added 'session_number' column.
    """
    df = filtered_df.copy()

    # 1. PRESERVE THE SOURCE INDEX BEFORE ANY SORTING/RESETTING
    df['source_row_index'] = df.index


    # 1. Ensure data is sorted by vehicle and time for correct sequence detection
    # Assuming the input DataFrame has 'vehicle_id' and a column representing time/sequence.
    # We'll use the DataFrame index as a tie-breaker if a timestamp isn't explicit.
    df = df.sort_values(by=['vehicle_id', 'IST']).reset_index(drop=True)

    # 2. Detect the start of a new session
    # A new session starts if:
    # a) The vehicle_id changes OR
    # b) The charger_proximity label changes from the previous row

    df['nxt_state'] = df['charger_proximity'].shift(1)
    
    # Check if the current proximity is different from the previous one for the same vehicle
    is_new_session_flag = (
        (df['vehicle_id'] != df['vehicle_id'].shift(1)) |
        (df['charger_proximity'] != df['charger_proximity'].shift(1))
    ).fillna(True) # Fill the first row's NaN with True (it always starts a new session)

    # 3. Create the session number by taking the cumulative sum of the change flags
    df['session_number'] = is_new_session_flag.cumsum()
    
    return df

In [12]:
def add_sequential_ranking(grouped_df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds two new columns for sequential ranking based on the timestamp (IST)
    within each proximity session, grouped by vehicle_id and session_number.

    Args:
        grouped_df: DataFrame containing 'vehicle_id', 'session_number', and 'IST'.

    Returns:
        DataFrame with 'session_rank_asc' and 'session_rank_desc' columns.
    """
    df = grouped_df.copy()
    
    # Define the grouping key
    group_cols = ['vehicle_id', 'session_number']
    
    # --- 1. Ascending Rank (Sequential, Earliest time gets rank 1) ---
    # Rank based on IST in ascending order. method='first' ensures every row gets a unique rank.
    df['session_rank_asc'] = df.groupby(group_cols)['IST'].rank(
        method='first', 
        ascending=True
    ).astype(int)
    
    # --- 2. Descending Rank (Sequential, Latest time gets rank 1) ---
    # Rank based on IST in descending order.
    df['session_rank_desc'] = df.groupby(group_cols)['IST'].rank(
        method='first', 
        ascending=False
    ).astype(int)
    
    return df

In [13]:
start_time = '2025-09-21 00:00:00'
end_time = '2025-10-15 00:00:00'

# NOTE: The Trino connection in get_vehicle_gps_data is skipped, 
# and mock data is used for a successful run.
df = get_vehicle_gps_data(start_time, end_time)

In [14]:
df.head()

Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,vehiclereadycondition,gun_connection_status,ignitionstatus,latitude,longitude,altitude,ground_speed_kmph
0,18,2025-09-20 18:30:00.247,2025-09-20,2025-09-21 00:00:00.247,2025-09-20 18:30:00,1.3,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,28.423038,76.99171,224.1,0.0
1,18,2025-09-20 18:30:02.287,2025-09-20,2025-09-21 00:00:02.287,2025-09-20 18:30:00,1.2,639.2,0.0,,0.0,0.0,0.0,1.0,28.423037,76.99171,224.1,0.0
2,18,2025-09-20 18:30:04.147,2025-09-20,2025-09-21 00:00:04.147,2025-09-20 18:30:00,1.2,639.2,0.0,,0.0,0.0,0.0,1.0,28.423035,76.99171,224.1,0.0
3,18,2025-09-20 18:30:05.707,2025-09-20,2025-09-21 00:00:05.707,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,28.423035,76.99171,224.1,0.0
4,18,2025-09-20 18:30:07.007,2025-09-20,2025-09-21 00:00:07.007,2025-09-20 18:30:00,1.1,639.2,0.0,,0.0,0.0,0.0,1.0,,,,


In [15]:
print("\n--- Imputing odometer data ---")
df_imputed = impute_odometer_using_vcu_speed_optimized(df)
# df_imputed = impute_odometer_using_vcu_speed(df)
# df_imputed2 = validate_gps_against_odometer(df_imputed1)
# df_imputed = impute_gps_by_speed_and_bearing_v2(df_imputed2)


--- Imputing odometer data ---


In [16]:
print("\n--- Labeling Proximity (Fix Applied) ---")
labeled_df = label_charger_proximity(df_imputed)
labeled_df.head()


--- Labeling Proximity (Fix Applied) ---
Charger Proximity Counts:
NONE: 1387642
GGN: 87869
DDN: 73052
MZF: 36230


Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,vehiclereadycondition,gun_connection_status,ignitionstatus,latitude,longitude,altitude,ground_speed_kmph,charger_proximity,dist_from_loc
0,18,2025-09-20 18:30:00.247,2025-09-20,2025-09-21 00:00:00.247,2025-09-20 18:30:00,1.3,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,28.423038,76.99171,224.1,0.0,GGN,6.2
1,18,2025-09-20 18:30:02.287,2025-09-20,2025-09-21 00:00:02.287,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,28.423037,76.99171,224.1,0.0,GGN,6.31
2,18,2025-09-20 18:30:04.147,2025-09-20,2025-09-21 00:00:04.147,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,28.423035,76.99171,224.1,0.0,GGN,6.52
3,18,2025-09-20 18:30:05.707,2025-09-20,2025-09-21 00:00:05.707,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,28.423035,76.99171,224.1,0.0,GGN,6.52
4,18,2025-09-20 18:30:07.007,2025-09-20,2025-09-21 00:00:07.007,2025-09-20 18:30:00,1.1,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,,,,,NONE,0.0


In [17]:
labeled_df.charger_proximity.unique()

array(['GGN', 'NONE', 'MZF', 'DDN'], dtype=object)

In [18]:
gdf = group_proximity_sessions(labeled_df[labeled_df.charger_proximity != 'NONE'])
gdf.head()

Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,...,ignitionstatus,latitude,longitude,altitude,ground_speed_kmph,charger_proximity,dist_from_loc,source_row_index,nxt_state,session_number
0,18,2025-09-20 18:30:00.247,2025-09-20,2025-09-21 00:00:00.247,2025-09-20 18:30:00,1.3,639.2,0.0,2635.375,0.0,...,1.0,28.423038,76.99171,224.1,0.0,GGN,6.2,0,,1
1,18,2025-09-20 18:30:02.287,2025-09-20,2025-09-21 00:00:02.287,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,...,1.0,28.423037,76.99171,224.1,0.0,GGN,6.31,1,GGN,1
2,18,2025-09-20 18:30:04.147,2025-09-20,2025-09-21 00:00:04.147,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,...,1.0,28.423035,76.99171,224.1,0.0,GGN,6.52,2,GGN,1
3,18,2025-09-20 18:30:05.707,2025-09-20,2025-09-21 00:00:05.707,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,...,1.0,28.423035,76.99171,224.1,0.0,GGN,6.52,3,GGN,1
4,18,2025-09-20 18:31:01.187,2025-09-20,2025-09-21 00:01:01.187,2025-09-20 18:31:00,1.2,639.2,0.0,2635.375,0.0,...,1.0,28.423033,76.99171,224.1,0.0,GGN,6.73,32,GGN,1


In [19]:
gdf[gdf.session_number == 3]

Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,...,ignitionstatus,latitude,longitude,altitude,ground_speed_kmph,charger_proximity,dist_from_loc,source_row_index,nxt_state,session_number
2656,18,2025-09-21 11:40:30.462,2025-09-21,2025-09-21 17:10:30.462,2025-09-21 11:40:00,15.1,622.3,0.0,2990.250,0.00,...,1.0,30.289091,78.000860,617.6,2.99,DDN,232.55,32816,MZF,3
2657,18,2025-09-21 11:40:32.122,2025-09-21,2025-09-21 17:10:32.122,2025-09-21 11:40:00,,,0.0,2990.250,0.00,...,1.0,30.289170,78.000854,605.6,0.00,DDN,237.97,32817,DDN,3
2658,18,2025-09-21 11:41:00.702,2025-09-21,2025-09-21 17:11:00.702,2025-09-21 11:41:00,13.9,622.3,0.0,2990.250,0.00,...,,30.289192,78.000854,606.1,0.00,DDN,239.63,32840,DDN,3
2659,18,2025-09-21 11:41:01.843,2025-09-21,2025-09-21 17:11:01.843,2025-09-21 11:41:00,13.9,622.4,0.0,2990.250,0.00,...,,30.289202,78.000790,605.7,2.08,DDN,235.95,32841,DDN,3
2660,18,2025-09-21 11:41:02.781,2025-09-21,2025-09-21 17:11:02.781,2025-09-21 11:41:00,13.7,622.4,0.0,2990.250,0.00,...,,30.289143,78.000440,608.9,1.47,DDN,208.14,32842,DDN,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3226,18,2025-09-21 13:27:03.982,2025-09-21,2025-09-21 18:57:03.982,2025-09-21 13:27:00,8.2,640.8,2.0,2991.125,5.05,...,1.0,30.288832,77.996796,610.4,0.00,DDN,247.39,37554,DDN,3
3227,18,2025-09-21 13:28:03.282,2025-09-21,2025-09-21 18:58:03.282,2025-09-21 13:28:00,11.3,640.4,2.0,2991.125,1.47,...,,30.288908,77.997086,606.9,7.22,DDN,228.47,37604,DDN,3
3228,18,2025-09-21 13:28:04.287,2025-09-21,2025-09-21 18:58:04.287,2025-09-21 13:28:00,,,2.0,2991.125,1.90,...,,30.288967,77.997220,604.1,5.57,DDN,222.11,37605,DDN,3
3229,18,2025-09-21 13:29:00.941,2025-09-21,2025-09-21 18:59:00.941,2025-09-21 13:29:00,-64.9,641.4,2.0,2991.375,13.11,...,,30.289314,77.998795,602.7,6.99,DDN,178.44,37654,DDN,3


In [20]:
rdf = add_sequential_ranking(gdf)
rdf.head()

Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,...,longitude,altitude,ground_speed_kmph,charger_proximity,dist_from_loc,source_row_index,nxt_state,session_number,session_rank_asc,session_rank_desc
0,18,2025-09-20 18:30:00.247,2025-09-20,2025-09-21 00:00:00.247,2025-09-20 18:30:00,1.3,639.2,0.0,2635.375,0.0,...,76.99171,224.1,0.0,GGN,6.2,0,,1,1,1513
1,18,2025-09-20 18:30:02.287,2025-09-20,2025-09-21 00:00:02.287,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,...,76.99171,224.1,0.0,GGN,6.31,1,GGN,1,2,1512
2,18,2025-09-20 18:30:04.147,2025-09-20,2025-09-21 00:00:04.147,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,...,76.99171,224.1,0.0,GGN,6.52,2,GGN,1,3,1511
3,18,2025-09-20 18:30:05.707,2025-09-20,2025-09-21 00:00:05.707,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,...,76.99171,224.1,0.0,GGN,6.52,3,GGN,1,4,1510
4,18,2025-09-20 18:31:01.187,2025-09-20,2025-09-21 00:01:01.187,2025-09-20 18:31:00,1.2,639.2,0.0,2635.375,0.0,...,76.99171,224.1,0.0,GGN,6.73,32,GGN,1,5,1509


In [21]:
rdf_rnk1 = rdf[(rdf.session_rank_asc == 1)|(rdf.session_rank_desc == 1)]
rdf_rnk1.to_csv('start_end_time.csv')

In [22]:
rdf_rnk1.columns

Index(['vehicle_id', 'timestamp', 'dt', 'IST', 'ts_mins',
       'total_battery_current', 'bat_voltage', 'gear_position',
       'odometerreading', 'vehicle_speed_vcu', 'vehiclereadycondition',
       'gun_connection_status', 'ignitionstatus', 'latitude', 'longitude',
       'altitude', 'ground_speed_kmph', 'charger_proximity', 'dist_from_loc',
       'source_row_index', 'nxt_state', 'session_number', 'session_rank_asc',
       'session_rank_desc'],
      dtype='object')

In [23]:
def format_duration_hms(td):
    """
    Converts a timedelta object into the string format: hh:mm:ss, 
    aggregating days into total hours.
    """
    if pd.isna(td):
        return np.nan
        
    total_seconds = int(td.total_seconds())
    total_hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    
    # Format as string: hh:mm:ss
    return f"{total_hours:02d}:{minutes:02d}:{seconds:02d}"


# --- Final Trip Linking and Calculation Function (Overlap Fix) ---

def generate_final_trip_report(df_rnk1: pd.DataFrame) -> pd.DataFrame:
    """
    Generates the final trip and halt report.
    
    FIX: Filters df_departure to only keep necessary columns for the join, preventing overlap.
    """
    df_rnk1 = df_rnk1.copy()
    
    # Define the core columns needed for aggregation and join keys
    CORE_COLS = ['vehicle_id', 'session_number', 'session_rank_asc', 'session_rank_desc',
                 'IST', 'odometerreading', 'latitude', 'longitude', 'charger_proximity', 'source_row_index']

    # 1. Separate the markers, explicitly selecting and renaming core columns
    df_departure_full = df_rnk1[df_rnk1.session_rank_desc == 1][CORE_COLS].rename(
        columns={'IST': 'Departure_Time', 'odometerreading': 'Departure_ODO', 
                 'charger_proximity': 'Departure_Location', 'latitude': 'Departure_Lat', 
                 'longitude': 'Departure_Lon', 'source_row_index': 'Departure_Index'}
    ).copy()
    
    df_arrival_full = df_rnk1[df_rnk1.session_rank_asc == 1][CORE_COLS].rename(
        columns={'IST': 'Arrival_Time', 'odometerreading': 'Arrival_ODO', 
                 'charger_proximity': 'Arrival_Location', 'latitude': 'Arrival_Lat', 
                 'longitude': 'Arrival_Lon', 'source_row_index': 'Arrival_Index'}
    ).copy()

    # Ensure time columns are datetime for calculations
    df_arrival_full['Arrival_Time'] = pd.to_datetime(df_arrival_full['Arrival_Time'])
    df_departure_full['Departure_Time'] = pd.to_datetime(df_departure_full['Departure_Time'])
    
    # --- DataFrames used in the Join (MINIMAL COLUMNS) ---
    # Create MINIMAL departure and arrival DFs to prevent overlap issues
    
    # Departure DataFrame for Travel Logic (retains only keys and departure values)
    df_departure = df_departure_full[[
        'vehicle_id', 'session_number', 'Departure_Time', 'Departure_ODO', 
        'Departure_Location', 'Departure_Lat', 'Departure_Lon', 'Departure_Index'
    ]].copy()
    
    # Arrival DataFrame for Shifting
    df_arrival = df_arrival_full[[
        'vehicle_id', 'session_number', 'Arrival_Time', 'Arrival_ODO', 
        'Arrival_Location', 'Arrival_Lat', 'Arrival_Lon', 'Arrival_Index'
    ]].copy()

    # --- 2. Calculate Halt Time ---
    df_halt = pd.merge(
        df_arrival[['vehicle_id', 'session_number', 'Arrival_Time']],
        df_departure[['vehicle_id', 'session_number', 'Departure_Time']],
        on=['vehicle_id', 'session_number'],
        how='inner',
    )
    df_halt['HALT_Duration'] = (df_halt['Departure_Time'] - df_halt['Arrival_Time']).dt.total_seconds()
    
    # 3. Link Departure (Session N) to Next Arrival (Session N+1)
    
    df_arrival_shift = df_arrival.sort_values(by=['vehicle_id', 'Arrival_Time'])
    
    df_next_arrival = df_arrival_shift.groupby('vehicle_id').shift(-1).reset_index(drop=True).rename(
        columns={
            'Arrival_Time': 'Next_Arrival_Time', 'Arrival_ODO': 'Next_Arrival_ODO',
            'Arrival_Location': 'Next_Arrival_Location',
            'Arrival_Lat': 'Next_Arrival_Lat', 'Arrival_Lon': 'Next_Arrival_Lon',
            'session_number': 'Next_Session_Number',
            'Arrival_Index': 'Arrival_Index'
        }
    )

    # Align current departure with next arrival using index join
    df_trips = df_departure.sort_values(by=['vehicle_id', 'Departure_Time']).reset_index(drop=True).copy()
    # Join is safe because df_departure and df_next_arrival share NO column names now.
    df_trips = df_trips.join(df_next_arrival) 
    
    df_trips = df_trips[df_trips['Next_Arrival_Time'].notna()].copy()

    # 4. Calculate Travel Metrics
    df_trips['ODO_Distance_km'] = df_trips['Next_Arrival_ODO'] - df_trips['Departure_ODO']
    df_trips.loc[df_trips['ODO_Distance_km'] < 0, 'ODO_Distance_km'] = np.nan 

    df_trips['GPS_Distance_km'] = df_trips.apply(
        lambda row: haversine_distance_km(
            row['Departure_Lat'], row['Departure_Lon'], 
            row['Next_Arrival_Lat'], row['Next_Arrival_Lon']
        ), axis=1
    )
    
    df_trips['TRAVEL_Duration_sec'] = (df_trips['Next_Arrival_Time'] - df_trips['Departure_Time']).dt.total_seconds()
    
    # 5. Final Consolidation
    
    df_halt_merge = df_halt[['vehicle_id', 'session_number', 'HALT_Duration']]
    
    final_df = pd.merge(
        df_trips,
        df_halt_merge,
        on=['vehicle_id', 'session_number'],
        how='left'
    )
    
    final_df['Trip_Name'] = final_df.apply(
        lambda row: f"{row['Departure_Location']}-{row['Next_Arrival_Location']}", axis=1
    )

    final_df['HALT_Duration_sec'] = final_df['HALT_Duration'].fillna(0.0)
    final_df['TOTAL_Duration_sec'] = final_df['TRAVEL_Duration_sec'] + final_df['HALT_Duration_sec']

    # Convert durations to timedelta
    final_df['TRAVEL_Time'] = pd.to_timedelta(final_df['TRAVEL_Duration_sec'], unit='s')
    final_df['HALT_Time'] = pd.to_timedelta(final_df['HALT_Duration_sec'], unit='s')
    final_df['TOTAL_Time'] = pd.to_timedelta(final_df['TOTAL_Duration_sec'], unit='s')

    # 6. Final Output Structure and Formatting
    
    final_output = final_df[[
        'vehicle_id', 'Trip_Name', 'Departure_Time', 'Next_Arrival_Time',
        'Departure_Index', 'Arrival_Index', 
        'ODO_Distance_km', 'GPS_Distance_km', 'TRAVEL_Time', 'HALT_Time', 'TOTAL_Time'
    ]].rename(columns={'Next_Arrival_Time': 'Arrival_Time'})

    # Apply custom formatting (hh:mm:ss) and fix data types
    final_output['TRAVEL_Time'] = final_output['TRAVEL_Time'].apply(format_duration_hms)
    final_output['HALT_Time'] = final_output['HALT_Time'].apply(format_duration_hms)
    final_output['TOTAL_Time'] = final_output['TOTAL_Time'].apply(format_duration_hms)
    
    final_output['Departure_Index'] = final_output['Departure_Index'].astype('Int64')
    final_output['Arrival_Index'] = final_output['Arrival_Index'].astype('Int64')

    final_output['ODO_Distance_km'] = round(final_output['ODO_Distance_km'],2)
    final_output['GPS_Distance_km'] = round(final_output['GPS_Distance_km'],2)
    

    return final_output[['vehicle_id', 'Trip_Name', 'Departure_Time', 'Arrival_Time', 
                        'Departure_Index', 'Arrival_Index', 
                        'ODO_Distance_km', 'GPS_Distance_km', 'TRAVEL_Time', 'HALT_Time', 'TOTAL_Time']]

In [24]:
# Example execution (assuming start_time_trips.csv is available):
# final_dataframe = generate_final_trip_report('start_time_trips.csv')
# print(final_dataframe.to_markdown(index=False))

# Encapsulating the output of the function into the final code block:
start_end_times = generate_final_trip_report(rdf_rnk1)
start_end_times.to_csv('start_end_with_loc.csv')
start_end_times
# print(final_output.to_markdown(index=False))

Unnamed: 0,vehicle_id,Trip_Name,Departure_Time,Arrival_Time,Departure_Index,Arrival_Index,ODO_Distance_km,GPS_Distance_km,TRAVEL_Time,HALT_Time,TOTAL_Time
0,18,GGN-MZF,2025-09-21 05:57:00.294,2025-09-21 10:37:03.519,8946,17186,210.35,144.82,04:40:03,05:57:00,10:37:03
1,18,MZF-DDN,2025-09-21 11:29:04.239,2025-09-21 17:10:30.462,19767,32816,143.64,85.68,05:41:26,00:52:00,06:33:26
2,18,DDN-MZF,2025-09-21 18:59:02.981,2025-09-21 21:54:06.335,37655,42533,124.56,85.69,02:55:03,01:48:32,04:43:35
3,18,MZF-GGN,2025-09-21 22:50:02.369,2025-09-22 02:53:07.486,45046,52224,207.69,144.90,04:03:05,00:55:56,04:59:01
4,18,GGN-MZF,2025-09-22 05:55:07.641,2025-09-22 10:59:00.942,56707,65710,208.75,145.04,05:03:53,03:02:00,08:05:53
...,...,...,...,...,...,...,...,...,...,...,...
117,19,MZF-GGN,2025-10-07 08:13:03.257,2025-10-07 12:55:19.486,1521974,1528407,198.80,144.82,04:42:16,00:35:00,05:17:16
118,19,GGN-MZF,2025-10-07 16:03:00.672,2025-10-08 02:07:02.993,1533492,1549694,203.60,144.84,10:04:02,03:07:41,13:11:43
119,19,MZF-DDN,2025-10-08 03:07:06.643,2025-10-08 05:30:04.224,1552786,1557477,125.62,85.68,02:22:57,01:00:03,03:23:01
120,19,DDN-MZF,2025-10-08 07:53:00.243,2025-10-08 10:21:01.283,1563605,1569993,125.62,85.86,02:28:01,02:22:56,04:50:57


In [25]:
# --- 1. Core Energy Calculation Function (Adapted from user template) ---

def analyze_trip_slice_energy(trip_slice: pd.DataFrame) -> dict:
    """
    Improved energy calculation based on energy_mileage_daily.py approach.
    """
    if trip_slice.empty or 'total_battery_current' not in trip_slice.columns:
        return {'energy_consumed_kwh': 0, 'regen_energy_kwh': 0, 'net_energy_kwh': 0}
    
    trip_slice = trip_slice.copy()
    
    # Calculate time difference and power
    trip_slice['time_diff_seconds'] = trip_slice['IST'].diff().dt.total_seconds().fillna(0)
    trip_slice = trip_slice[trip_slice['time_diff_seconds'] > 0].copy()
    
    if trip_slice.empty:
        return {'energy_consumed_kwh': 0, 'regen_energy_kwh': 0, 'net_energy_kwh': 0}
    
    trip_slice['time_diff_hr'] = trip_slice['time_diff_seconds'] / 3600
    trip_slice['power_kW'] = (trip_slice['bat_voltage'] * trip_slice['total_battery_current']) / 1000
    
    # --- Driving Energy Analysis (using more comprehensive gear position check) ---
    driving_df = trip_slice[trip_slice['gear_position'].isin([1.0, 2.0])].copy()
    
    if not driving_df.empty:
        driving_df['energy_consumption_kwh'] = driving_df.apply(
            lambda row: row['power_kW'] * row['time_diff_hr'] if row['power_kW'] > 0.0 else 0, axis=1)
        driving_df['regen_energy_kwh'] = driving_df.apply(
            lambda row: -row['power_kW'] * row['time_diff_hr'] if row['power_kW'] < 0.0 else 0, axis=1)
        
        driving_energy = driving_df['energy_consumption_kwh'].sum()
        regen_energy = driving_df['regen_energy_kwh'].sum()
    else:
        driving_energy, regen_energy = 0, 0
    
    # --- Idling Energy Analysis (using simpler condition) ---
    stationary_df = trip_slice[(trip_slice['gear_position'] == 0.0) & 
                               (trip_slice['ignitionstatus'] == 1.0)].copy()
    
    if not stationary_df.empty:
        stationary_df['idling_energy_kwh'] = stationary_df.apply(
            lambda row: row['power_kW'] * row['time_diff_hr'] if row['power_kW'] > 0.0 else 0, axis=1)
        idling_energy = stationary_df['idling_energy_kwh'].sum()
    else:
        idling_energy = 0
    
    # Calculate net energy
    net_energy = driving_energy - regen_energy
    
    return {
        'energy_consumed_kwh': round(driving_energy, 3),
        'regen_energy_kwh': round(regen_energy, 3),
        'idling_energy_kwh': round(idling_energy, 3),
        'net_energy_kwh': round(net_energy, 3)
    }

In [26]:
def apply_data_quality_filters(df):
    """
    Apply data quality filters similar to energy_mileage_daily.py.
    """
    # Make a copy to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Filter out extreme current values
    df = df[df['total_battery_current'].abs() <= 3000]
    
    # Drop rows with missing data in key columns
    df.dropna(subset=['odometerreading', 'gear_position', 'ignitionstatus', 
                    'total_battery_current', 'bat_voltage', 'IST'], inplace=True)
    
    # Sort data by vehicle ID and timestamp
    df.sort_values(by=['vehicle_id', 'IST'], inplace=True)
    
    return df

In [27]:
def calculate_energy_rate(row):
    """
    Improved energy rate calculation with better edge case handling.
    """
    # Check if distance is 0, NaN, or very small
    if pd.isna(row['ODO_Distance_km']) or row['ODO_Distance_km'] < 0.1:
        return 0
    
    return row['net_energy_kwh'] / row['ODO_Distance_km']

In [28]:
def calculate_final_energy_report(df_markers: pd.DataFrame, full_df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Improved energy report calculation incorporating elements from energy_mileage_daily.py.
    """
    # Apply data quality filters
    full_df = apply_data_quality_filters(full_df_raw.copy())
    
    # Use improved odometer imputation
    full_df = impute_odometer_using_vcu_speed_optimized(full_df)
    
    # Prepare data for energy calculations
    full_df['IST'] = pd.to_datetime(full_df['IST']) 
    df_markers['Departure_Time'] = pd.to_datetime(df_markers['Departure_Time'])
    df_markers['Arrival_Time'] = pd.to_datetime(df_markers['Arrival_Time'])
    
    # Pre-calculate metrics needed for energy calculation
    full_df['power_kW'] = (full_df['bat_voltage'] * full_df['total_battery_current']) / 1000
    full_df['time_diff_seconds'] = full_df.groupby('vehicle_id')['IST'].diff().dt.total_seconds().fillna(0)
    full_df['time_diff_hr'] = full_df['time_diff_seconds'] / 3600.0
    full_df = full_df[full_df['time_diff_seconds'] > 0].copy()
    
    all_trip_stats = []
    
    # Process each trip
    for _, trip in df_markers.iterrows():
        vid = trip['vehicle_id']
        odo_dist = trip['ODO_Distance_km']
        dep_idx = trip['Departure_Index']
        arr_idx = trip['Arrival_Index']
        
        try:
            trip_slice = full_df.loc[dep_idx:arr_idx].copy()
            trip_slice = trip_slice[trip_slice['vehicle_id'] == vid].copy()
        except KeyError:
            continue
        
        if trip_slice.empty:
            continue
        
        # Use improved energy calculation
        energy_stats = analyze_trip_slice_energy(trip_slice)
        
        # Calculate energy rate with improved function
        energy_rate = calculate_energy_rate({
            'ODO_Distance_km': odo_dist,
            'net_energy_kwh': energy_stats['net_energy_kwh']
        })
        
        # Store results
        all_trip_stats.append({
            'Departure_Index': dep_idx,
            'Arrival_Index': arr_idx,
            'Energy_Consumed_kWh': energy_stats['energy_consumed_kwh'],
            'Regen_Energy_kWh': energy_stats['regen_energy_kwh'],
            'Idling_Energy_kWh': energy_stats['idling_energy_kwh'],
            'Net_Energy_kWh': energy_stats['net_energy_kwh'],
            'Energy_Rate_kWh_per_km': round(energy_rate, 2)
        })
    
    if not all_trip_stats:
        return df_markers
    
    # Merge results back
    df_energy = pd.DataFrame(all_trip_stats)
    df_markers_idx = df_markers.set_index(['Departure_Index', 'Arrival_Index'])
    df_energy_idx = df_energy.set_index(['Departure_Index', 'Arrival_Index'])
    
    final_report = df_markers_idx.join(df_energy_idx, how='left')
    final_report = final_report.reset_index().rename(columns={'Trip_Name': 'Trip'})
    
    # Final output rearrangement
    final_output = final_report[[
        'Departure_Index', 'vehicle_id', 'Trip', 'Departure_Time', 'Arrival_Time', 
        'ODO_Distance_km', 'TRAVEL_Time', 
        'Energy_Consumed_kWh', 'Regen_Energy_kWh', 'Idling_Energy_kWh', 'Net_Energy_kWh', 
        'Energy_Rate_kWh_per_km', 'HALT_Time', 'TOTAL_Time', 'GPS_Distance_km', 'Arrival_Index'
    ]].copy()
    
    return final_output

In [29]:
# res = create_final_energy_report_iteratively(start_end_times,df)
res = calculate_final_energy_report(start_end_times,labeled_df)
res.to_csv('final_outcome_v2.csv')
res

Unnamed: 0,Departure_Index,vehicle_id,Trip,Departure_Time,Arrival_Time,ODO_Distance_km,TRAVEL_Time,Energy_Consumed_kWh,Regen_Energy_kWh,Idling_Energy_kWh,Net_Energy_kWh,Energy_Rate_kWh_per_km,HALT_Time,TOTAL_Time,GPS_Distance_km,Arrival_Index
0,8946,18,GGN-MZF,2025-09-21 05:57:00.294,2025-09-21 10:37:03.519,210.35,04:40:03,235.134,51.101,4.054,184.033,0.87,05:57:00,10:37:03,144.82,17186
1,19767,18,MZF-DDN,2025-09-21 11:29:04.239,2025-09-21 17:10:30.462,143.64,05:41:26,181.938,30.856,9.353,151.082,1.05,00:52:00,06:33:26,85.68,32816
2,37655,18,DDN-MZF,2025-09-21 18:59:02.981,2025-09-21 21:54:06.335,124.56,02:55:03,131.480,26.592,4.121,104.888,0.84,01:48:32,04:43:35,85.69,42533
3,45046,18,MZF-GGN,2025-09-21 22:50:02.369,2025-09-22 02:53:07.486,207.69,04:03:05,225.040,50.465,0.805,174.576,0.84,00:55:56,04:59:01,144.90,52224
4,56707,18,GGN-MZF,2025-09-22 05:55:07.641,2025-09-22 10:59:00.942,208.75,05:03:53,221.180,40.134,5.446,181.046,0.87,03:02:00,08:05:53,145.04,65710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,1521974,19,MZF-GGN,2025-10-07 08:13:03.257,2025-10-07 12:55:19.486,198.80,04:42:16,221.003,45.817,0.098,175.185,0.88,00:35:00,05:17:16,144.82,1528407
118,1533492,19,GGN-MZF,2025-10-07 16:03:00.672,2025-10-08 02:07:02.993,203.60,10:04:02,238.776,43.328,1.854,195.449,0.96,03:07:41,13:11:43,144.84,1549694
119,1552786,19,MZF-DDN,2025-10-08 03:07:06.643,2025-10-08 05:30:04.224,125.62,02:22:57,142.082,22.922,0.117,119.160,0.95,01:00:03,03:23:01,85.68,1557477
120,1563605,19,DDN-MZF,2025-10-08 07:53:00.243,2025-10-08 10:21:01.283,125.62,02:28:01,125.891,37.955,0.710,87.935,0.70,02:22:56,04:50:57,85.86,1569993


Explanation of Energy Parameters
The energy parameters are calculated by integrating the instantaneous electric power over the entire trip time (Δt 
hr
​
 ), using the vehicle's Battery Voltage (V) and Total Battery Current (I). Power is calculated as kW=(V×I)/1000.

1. Energy_Consumed_kWh (Gross Consumption)
What it is: The total electrical energy drawn from the battery for propulsion and all auxiliary systems during the trip.

Calculation: Sum of (Power×Δt 
hr
​
 ) only when Power>0 and the Gear Position=2 (Driving).

2. Regen_Energy_kWh (Energy Recovered)
What it is: The total electrical energy pushed back into the battery through regenerative braking during the trip.

Calculation: Sum of (Power×Δt 
hr
​
 ) only when Power<0 (representing current flowing into the battery). The value is stored as positive.

3. Idling_Energy_kWh
What it is: The energy consumed specifically while the vehicle is stationary but still powered on (running auxiliary systems, air conditioning, etc.).

Calculation: Sum of (Power×Δt 
hr
​
 ) only when:

Power>0

Gear Position=0 (Neutral/Park)

Vehiclereadycondition=1 (Vehicle ready/on)

Gun Connection Status=0 (Not charging)

4. Net_Energy_kWh
What it is: The true energy depletion experienced by the battery for the entire trip.

Calculation: Net Energy=Energy_Consumed_kWh−Regen_Energy_kWh.

5. Energy_Rate_kWh_per_km (Efficiency)
What it is: The energy efficiency of the trip, indicating how many kilowatt-hours were required to travel one kilometer.

Calculation: Energy Rate=Net_Energy_kWh/ODO_Distance_km.