In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import platform
import logging
import trino

sys.path.append('..')
from common import db_operations

import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
from common.db_operations import connect_to_trino, fetch_data_for_day, write_df_to_iceberg

from math import radians, degrees, sin, cos, atan2, asin, sqrt # Corrected imports

In [2]:
# Configure basic logging for the business logic file
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# Print the Python version being used
print(f"Using Python version: {platform.python_version()}")

Using Python version: 3.11.13


In [3]:
# Charger locations (latitude, longitude)
CHARGER_LOCATIONS = {
    'Dehradun': (30.287722, 77.999029),
    'Gurgaon': (28.423090, 76.991733),
    'Muzaffarnagar': (29.549413, 77.747698)
}

In [4]:
def get_vehicle_gps_data(start_time=None, end_time=None, vehicle_ids=None):
    """
    Fetch vehicle and GPS data from Trino with flexible time filtering.
    
    Args:
        start_time: Start time in 'YYYY-MM-DD HH:MM:SS' format (optional)
        end_time: End time in 'YYYY-MM-DD HH:MM:SS' format (optional)
        vehicle_ids: List of vehicle IDs to filter (optional)
    
    Returns:
        DataFrame with vehicle and GPS data
    """
    # Set default vehicle IDs if not provided
    if vehicle_ids is None:
        vehicle_ids = ['18','19']
    
    # Format vehicle IDs for the query
    vehicle_ids_str = "', '".join(vehicle_ids)
    
    # Build time filter conditions
    time_filter_cpo100 = ""
    time_filter_gps = ""
    
    if start_time and end_time:
        # Convert string times to datetime objects
        start_dt = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
        end_dt = datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
        
        # Format for the query
        time_filter_cpo100 = f"""
        AND timestamp >= CAST('{start_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('{end_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
        
        time_filter_gps = f"""
        AND timestamp >= CAST('{start_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('{end_time}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
    else:
        # Default time range if not provided
        time_filter_cpo100 = """
        AND timestamp >= CAST('2025-10-07' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('2025-10-09' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
        
        time_filter_gps = """
        AND timestamp >= CAST('2025-10-07' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('2025-10-09' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """
    
    # Build the complete query
    query = f"""
    with cpo100 as
    (
      SELECT 
        id, timestamp, dt, 
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS IST,
        date_trunc('minute', timestamp) as ts_mins,  -- Truncate to minutes
        total_battery_current, bat_voltage, gear_position, odometerreading,round(vehicle_speed_vcu,2) as vehicle_speed_vcu,
        vehiclereadycondition, ignitionstatus, gun_connection_status
      from 
        facts_prod.can_parsed_output_100
      where 
        id IN ('{vehicle_ids_str}')
        {time_filter_cpo100}
    ),
    cpo100_ranked as
    (
      select 
        *, row_number() over(partition by id, ts_mins order by timestamp) as cpo_rn
      from 
        cpo100
    ),
    c2c_gps_data as
    (
      select 
        id, timestamp,
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS ts_ist,
        date_trunc('minute', timestamp) as ts_mins, date, latitude, longitude, altitude,ground_speed_kmph
      from 
        facts_prod.c2c_gps
      where 
        id IN ('{vehicle_ids_str}')
        {time_filter_gps}
    ),
    c2c_gps_ranked as
    (
      select 
        *, row_number() over(partition by id, ts_mins order by timestamp) as cg_rn
      from 
        c2c_gps_data
    )
    select 
        cp.id as vehicle_id, cp.timestamp, cp.dt, cp.IST, cp.ts_mins,
        cp.total_battery_current, cp.bat_voltage, cp.gear_position, cp.odometerreading,vehicle_speed_vcu,
        cp.vehiclereadycondition, cp.gun_connection_status,
        cp.ignitionstatus, cg.latitude, cg.longitude, cg.altitude,ground_speed_kmph
    from 
      cpo100_ranked as cp
      left join
      c2c_gps_ranked as cg
      on (cp.id = cg.id and cp.ts_mins = cg.ts_mins and cp.cpo_rn = cg.cg_rn)
    ORDER BY cp.id, cp.timestamp
    """
    
    # Connect to Trino
    conn = trino.dbapi.connect(
        host="trino",
        port=8080,
        user="admin",
        catalog="adhoc",
        schema="default"
    )
    
    # Execute the query
    cur = conn.cursor()
    cur.execute(query)
    
    # Get column names
    columns = [desc[0] for desc in cur.description]
    
    # Fetch all rows
    rows = cur.fetchall()
    
    # Create DataFrame
    df = pd.DataFrame(rows, columns=columns)
    
    # Close connections
    cur.close()
    conn.close()
    
    return df

In [27]:
def impute_odometer_using_vcu_speed(df: pd.DataFrame) -> pd.DataFrame:
    """
    Imputes missing Odometer and GPS coordinates iteratively using the 
    vehicle_speed_vcu data and the time delta between readings.
    """
    df_imputed = df.copy()

    # --- 1. Prepare Data and Preliminary Imputation (Status columns for completeness) ---

    # Ensure timestamp is datetime and sort for sequential processing
    df_imputed['IST'] = pd.to_datetime(df_imputed['IST'])
    df_imputed = df_imputed.sort_values(by=['vehicle_id', 'IST']).reset_index(drop=True)

    # Create a mask for rows where odometer was originally missing
    missing_odometer_mask = df['odometerreading'].isnull()
    
    # Preliminary Imputation for status columns (using ffill/bfill for robustness)
    status_cols = ['vehiclereadycondition', 'gun_connection_status', 'ignitionstatus', 'gear_position', 'vehicle_speed_vcu']
    for col in status_cols:
         df_imputed[col] = df_imputed[col].ffill().bfill()
         df_imputed[col] = df_imputed[col].astype(float, errors='ignore') # Keep as float for speed calculation

    # Conditional Imputation for status (Rules from previous request, omitted for brevity)
    # ... (status imputation logic goes here) ...

    # --- 2. Iterative Odometer/GPS Imputation ---
    
    # Dictionary to track the last known valid points for each vehicle
    last_known_data = {} 

    for index, row in df_imputed.iterrows():
        vid = row['vehicle_id']
        
        if vid not in last_known_data:
            # Initialize with (NaN, NaN, NaN) for GPS and ODO, and last known timestamp
            last_known_data[vid] = {
                'gps': (np.nan, np.nan, np.nan),
                'odo': np.nan,
                'time': row['IST']
            }

        last_gps = last_known_data[vid]['gps']
        last_odo = last_known_data[vid]['odo']
        last_time = last_known_data[vid]['time']
        
        current_lat, current_lon, current_alt = row['latitude'], row['longitude'], row['altitude']
        current_odo = row['odometerreading']
        current_speed = row['vehicle_speed_vcu']
        current_time = row['IST']

        time_delta_sec = (current_time - last_time).total_seconds()
        
        
        # --- ODOMETER IMPUTATION ---
        
        # If ODO is missing, and we have a last valid ODO to build upon
        if pd.isna(current_odo) and not pd.isna(last_odo) and time_delta_sec > 0:
            
            # Distance traveled (km) = Speed (km/h) * Time Delta (hr)
            # Speed is usually reported in km/h, Time Delta is in seconds, so divide by 3600
            time_delta_hr = time_delta_sec / 3600.0
            
            # Note: We use the reported speed for the entire interval
            distance_traveled = current_speed * time_delta_hr 
            
            new_odo = last_odo + distance_traveled
            df_imputed.loc[index, 'odometerreading'] = round(new_odo,2)
            current_odo = new_odo # Use imputed value for subsequent updates
            # Add boolean column marking imputed rows
            df_imputed['is_imputed_odometer'] = missing_odometer_mask
                        
        
        # --- GPS IMPUTATION (Stationary Case) ---

        # If GPS is missing, and the reported speed is zero, use last known GPS
        if pd.isna(current_lat) and (current_speed == 0.0) and not pd.isna(last_gps[0]):
            df_imputed.loc[index, ['latitude', 'longitude', 'altitude']] = last_gps
            current_lat, current_lon, current_alt = last_gps

        # --- Update Last Known Valid Data ---
        
        # If the current row has a valid ODO (known or imputed), update the last known ODO
        if not pd.isna(current_odo):
            last_known_data[vid]['odo'] = round(current_odo,2)
            
        # If the current row has valid GPS (known or imputed), update the last known GPS
        if not pd.isna(df_imputed.loc[index, 'latitude']):
            last_known_data[vid]['gps'] = (
                df_imputed.loc[index, 'latitude'], 
                df_imputed.loc[index, 'longitude'], 
                df_imputed.loc[index, 'altitude']
            )

        # Update the last known time point
        last_known_data[vid]['time'] = current_time

    return df_imputed

In [6]:
def validate_gps_against_odometer(df: pd.DataFrame) -> pd.DataFrame:
    """
    Invalidates (sets to NaN) GPS coordinates if the Haversine distance 
    significantly exceeds the Odometer distance traveled since the last reading.

    Assumption: Odometer is in kilometers (km).
    """
    df_validated = df.copy()
    
    # 1. Ensure Odometer and GPS are sorted and shifted
    df_validated = df_validated.sort_values(by=['vehicle_id', 'IST'])
    
    # Calculate difference columns within each vehicle group
    df_validated['odo_diff_m'] = (df_validated.groupby('vehicle_id')['odometerreading'].diff() * 1000.0)
    df_validated['prev_lat'] = df_validated.groupby('vehicle_id')['latitude'].shift(1)
    df_validated['prev_lon'] = df_validated.groupby('vehicle_id')['longitude'].shift(1)
    
    # 2. Calculate Haversine Distance between current and previous GPS point
    df_validated['gps_diff_m'] = df_validated.apply(
        lambda row: haversine_distance_m(row['latitude'], row['longitude'], row['prev_lat'], row['prev_lon']),
        axis=1
    )
    
    # 3. Validation Rule: Flag if GPS jump is vastly greater than ODO change
    # Tolerance: Flag the GPS reading if the GPS distance is more than 5 times 
    # the ODO distance OR the GPS distance exceeds the ODO distance by more than 50 meters (absolute error).
    # This prevents flagging small movements where ODO is often static.
    
    # We focus on rows where the odometer suggests movement (or a small error)
    movement_mask = (df_validated['odo_diff_m'].abs() > 0.0)
    
    # Rule: If ODO moved, but GPS moved way too much
    inconsistent_gps_mask = movement_mask & (
        (df_validated['gps_diff_m'] > df_validated['odo_diff_m'] * 5.0) |  # 5x relative error
        (df_validated['gps_diff_m'] - df_validated['odo_diff_m'] > 50.0)   # 50m absolute error
    )
    
    # Rule for the specific case mentioned: Row 58 has 0.01 km ODO diff (10m) and 680m GPS diff.
    # The jump is 68x the ODO diff, which the rule above catches.
    
    # 4. Invalidate the Inconsistent GPS Reading (set to NaN)
    df_validated.loc[inconsistent_gps_mask, ['latitude', 'longitude', 'altitude']] = np.nan
    
    print(f"Validated: {inconsistent_gps_mask.sum()} inconsistent GPS points were invalidated (set to NaN).")
    
    # Clean up temporary columns
    df_validated = df_validated.drop(columns=['odo_diff_m', 'prev_lat', 'prev_lon', 'gps_diff_m'])
    
    return df_validated.ffill().bfill() # Re-fill the auxiliary columns for continuity

In [7]:
# Earth's radius in kilometers (assuming ODO/Speed is in km/h)
R_EARTH_KM = 6371.0

# --- Geodesic Helper Functions (Unchanged) ---

def calculate_initial_bearing(lat1, lon1, lat2, lon2):
    """Calculates the bearing from point 1 to point 2 (in degrees)."""
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return np.nan
        
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    y = sin(lon2 - lon1) * cos(lat2)
    x = cos(lat1) * sin(lat2) - sin(lat1) * cos(lat2) * cos(lon2 - lon1)
    
    return (degrees(atan2(y, x)) + 360) % 360

def calculate_destination_point(lat, lon, bearing, distance_km):
    """Calculates destination point given a start point, bearing, and distance."""
    if pd.isna(lat) or pd.isna(lon) or pd.isna(bearing) or pd.isna(distance_km):
        return np.nan, np.nan
        
    lat, lon = map(radians, [lat, lon])
    bearing = radians(bearing)
    
    angular_distance = distance_km / R_EARTH_KM
    
    lat2 = asin(sin(lat) * cos(angular_distance) +
                cos(lat) * sin(angular_distance) * cos(bearing))
    
    lon2 = lon + atan2(sin(bearing) * sin(angular_distance) * cos(lat),
                       cos(angular_distance) - sin(lat) * sin(lat2))
    
    return degrees(lat2), degrees(lon2)

# --- Main Imputation Function (Corrected) ---

def impute_gps_by_speed_and_bearing_v2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Imputes missing GPS coordinates by calculating distance from VCU speed and 
    assuming a constant straight-line bearing.

    FIX: Ensures the vehicle_id column is retained after the groupby.apply operation.
    """
    with pd.option_context('future.no_silent_downcasting', True):
        df_imputed = df.copy()

        # --- Preliminary Setup ---
        df_imputed['IST'] = pd.to_datetime(df_imputed['IST'])
        df_imputed = df_imputed.sort_values(by=['vehicle_id', 'IST']).reset_index(drop=True)
        
        # Fill NaNs in speed/altitude (essential for calculation)
        df_imputed['vehicle_speed_vcu'] = df_imputed.groupby('vehicle_id')['vehicle_speed_vcu'].ffill().bfill().fillna(0.0)
        df_imputed['altitude'] = df_imputed.groupby('vehicle_id')['altitude'].ffill().bfill() 

        # Flag known GPS points
        df_imputed['gps_known'] = df_imputed['latitude'].notna()
        
        # --- Iterative Speed and Bearing GPS Imputation ---
        
        def impute_gps_segment(group):
            # The logic relies on the index being preserved within the group for loc[]
            
            known_indices = group[group['gps_known']].index
            
            if len(known_indices) < 2:
                # If less than 2 known GPS points, cannot determine bearing/path, use simple fill
                return group.ffill().bfill() 

            for i in range(len(known_indices) - 1):
                start_idx = known_indices[i]
                end_idx = known_indices[i+1]
                
                start_row = group.loc[start_idx]
                end_row = group.loc[end_idx]

                # 1. Calculate the overall straight-line bearing for the segment
                initial_bearing = calculate_initial_bearing(
                    start_row['latitude'], start_row['longitude'], 
                    end_row['latitude'], end_row['longitude']
                )

                # Initialize current position at the start of the gap
                lat_curr, lon_curr = start_row['latitude'], start_row['longitude']
                time_curr = start_row['IST']

                # 2. Iterate through the rows within the gap
                for j in range(start_idx + 1, end_idx):
                    next_row = group.loc[j]
                    
                    time_next = next_row['IST']
                    speed_vcu = next_row['vehicle_speed_vcu']
                    
                    time_delta_sec = (time_next - time_curr).total_seconds()
                    time_delta_hr = time_delta_sec / 3600.0
                    
                    distance_traveled_km = speed_vcu * time_delta_hr
                    
                    # 3. Calculate the new coordinates
                    new_lat, new_lon = calculate_destination_point(
                        lat_curr, lon_curr, initial_bearing, distance_traveled_km
                    )
                    
                    # Impute the GPS coordinates
                    group.loc[j, 'latitude'] = round(new_lat,2)
                    group.loc[j, 'longitude'] = round(new_lon,2)
                    
                    # 4. Update current position and time for the next iteration
                    lat_curr, lon_curr = new_lat, new_lon
                    time_curr = time_next
                    
            return group

        # Apply the logic grouped by vehicle_id
        # FIX: Ensure the vehicle_id column is retained by calling reset_index after apply.
        df_imputed = df_imputed.groupby('vehicle_id').apply(
            impute_gps_segment, 
            include_groups=True # Retain the grouping key in the index
        ).reset_index(level=0, drop=True) # Move vehicle_id from index back to column and drop the original index level
        
        # Final pass for any remaining NaNs (start/end of data)
        df_imputed = df_imputed.ffill().bfill()

    return df_imputed

In [8]:
# Earth's radius in meters
R_EARTH_M = 6371000.0

def haversine_distance_m(lat1, lon1, lat2, lon2):
    """Calculates the straight-line distance between two points (in meters)."""
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return np.nan
        
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula components
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    
    # Multiply by Earth's radius in meters
    return c * R_EARTH_M

In [9]:
# Earth's radius in kilometers
R_EARTH_KM = 6371.0

# --- Helper Functions ---
def haversine_distance_km(lat1, lon1, lat2, lon2):
    """Calculates the straight-line distance between two points (in kilometers)."""
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return np.nan
        
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return c * R_EARTH_KM


In [10]:
def label_charger_proximity(df):
    """
    Label data points based on proximity to charger locations and FIXES THE WARNING
    by initializing 'dist_from_loc' as a float type.

    Args:
        df: DataFrame containing latitude and longitude columns

    Returns:
        DataFrame with additional 'charger_proximity' and 'dist_from_loc' columns
    """
    # Charger locations (latitude, longitude)
    CHARGER_LOCATIONS = {
        'DDN': (30.287722, 77.999029),  # Dehradun
        'GGN': (28.423090, 76.991733),  # Gurgaon
        'MZF': (29.549413, 77.747698)   # Muzaffarnagar
    }

    # Create a copy of the input DataFrame
    result_df = df.copy()

    # Initialize the charger_proximity column
    result_df['charger_proximity'] = 'NONE'
    # FIX: Initialize 'dist_from_loc' as a float to prevent FutureWarning
    result_df['dist_from_loc'] = 0.0

    # Check each row for proximity to any charger
    for idx, row in result_df.iterrows():
        if pd.notna(row['latitude']) and pd.notna(row['longitude']):
            # Find the closest charger location
            min_distance = float('inf')
            closest_label = 'NONE'

            for label, (charger_lat, charger_lon) in CHARGER_LOCATIONS.items():
                distance = haversine_distance_m(
                    row['latitude'], row['longitude'],
                    charger_lat, charger_lon
                )
                
                if distance < min_distance:
                    min_distance = distance
                    closest_label = label

            if min_distance <= 250:  # 250 meters threshold
                result_df.at[idx, 'dist_from_loc'] = round(min_distance, 2)
                result_df.at[idx, 'charger_proximity'] = closest_label
            else:
                result_df.at[idx, 'dist_from_loc'] = round(min_distance, 2) # Still record distance even if far
                result_df.at[idx, 'charger_proximity'] = 'NONE'


    # Count occurrences of each label
    proximity_counts = result_df['charger_proximity'].value_counts()

    print("Charger Proximity Counts:")
    for label, count in proximity_counts.items():
        print(f"{label}: {count}")

    return result_df

In [11]:
# ----------------- RANKING FUNCTION -----------------

def rank_closest_instance(df):
    """
    Ranks the closest point (least dist_from_loc) within contiguous proximity series
    for each vehicle.

    It creates a grouping key for contiguous sequences of the same
    'vehicle_id' and 'charger_proximity' label, and then ranks the rows
    within each group based on 'dist_from_loc' (ascending).

    Args:
        df: DataFrame with 'vehicle_id', 'charger_proximity', and 'dist_from_loc'

    Returns:
        DataFrame with an added 'proximity_rank' column.
    """
    result_df = df.copy()

    # Sort data by vehicle_id and timestamp to ensure correct sequence
    result_df = result_df.sort_values(by=['vehicle_id', 'IST']).reset_index(drop=True)

    # 1. Create a group ID for contiguous sequences of 'charger_proximity' per vehicle.
    # The 'neq_flag' is True when the current row's proximity label is different 
    # from the previous row's, signaling the start of a new sequence.
    neq_flag = (
        (result_df['charger_proximity'] != result_df['charger_proximity'].shift(1)) |
        (result_df['vehicle_id'] != result_df['vehicle_id'].shift(1))
    ).fillna(True)

    # 'seq_group' is the cumulative sum of the 'neq_flag' (True=1, False=0)
    result_df['seq_group'] = neq_flag.cumsum()

    # 2. Rank within each contiguous group
    # We rank by 'dist_from_loc' ascending (closest distance gets rank 1)
    result_df['proximity_rank'] = result_df.groupby(['vehicle_id', 'seq_group'])['dist_from_loc'].rank(
        method='first', ascending=True
    ).astype('Int64') # Use Int64 for nullable integer rank

    # 3. Handle 'NONE' labels: We don't need to rank 'NONE' sequences, so set their rank to NaN
    result_df.loc[result_df['charger_proximity'] == 'NONE', 'proximity_rank'] = pd.NA
    
    # Clean up temporary column
    result_df = result_df.drop(columns=['seq_group'])

    return result_df


In [12]:
def group_proximity_sessions(filtered_df: pd.DataFrame) -> pd.DataFrame:
    """
    Groups contiguous rows with the same charger_proximity (excluding 'NONE') 
    by assigning a sequential session number (1, 2, 3, ...).

    Args:
        filtered_df: DataFrame already filtered to contain only NEAR_XXX labels.

    Returns:
        DataFrame with an added 'session_number' column.
    """
    df = filtered_df.copy()

    # 1. PRESERVE THE SOURCE INDEX BEFORE ANY SORTING/RESETTING
    df['source_row_index'] = df.index


    # 1. Ensure data is sorted by vehicle and time for correct sequence detection
    # Assuming the input DataFrame has 'vehicle_id' and a column representing time/sequence.
    # We'll use the DataFrame index as a tie-breaker if a timestamp isn't explicit.
    df = df.sort_values(by=['vehicle_id', 'IST']).reset_index(drop=True)

    # 2. Detect the start of a new session
    # A new session starts if:
    # a) The vehicle_id changes OR
    # b) The charger_proximity label changes from the previous row

    df['nxt_state'] = df['charger_proximity'].shift(1)
    
    # Check if the current proximity is different from the previous one for the same vehicle
    is_new_session_flag = (
        (df['vehicle_id'] != df['vehicle_id'].shift(1)) |
        (df['charger_proximity'] != df['charger_proximity'].shift(1))
    ).fillna(True) # Fill the first row's NaN with True (it always starts a new session)

    # 3. Create the session number by taking the cumulative sum of the change flags
    df['session_number'] = is_new_session_flag.cumsum()
    
    return df

In [13]:
def add_sequential_ranking(grouped_df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds two new columns for sequential ranking based on the timestamp (IST)
    within each proximity session, grouped by vehicle_id and session_number.

    Args:
        grouped_df: DataFrame containing 'vehicle_id', 'session_number', and 'IST'.

    Returns:
        DataFrame with 'session_rank_asc' and 'session_rank_desc' columns.
    """
    df = grouped_df.copy()
    
    # Define the grouping key
    group_cols = ['vehicle_id', 'session_number']
    
    # --- 1. Ascending Rank (Sequential, Earliest time gets rank 1) ---
    # Rank based on IST in ascending order. method='first' ensures every row gets a unique rank.
    df['session_rank_asc'] = df.groupby(group_cols)['IST'].rank(
        method='first', 
        ascending=True
    ).astype(int)
    
    # --- 2. Descending Rank (Sequential, Latest time gets rank 1) ---
    # Rank based on IST in descending order.
    df['session_rank_desc'] = df.groupby(group_cols)['IST'].rank(
        method='first', 
        ascending=False
    ).astype(int)
    
    return df

In [72]:
start_time = '2025-09-21 00:00:00'
end_time = '2025-10-10 00:00:00'

# NOTE: The Trino connection in get_vehicle_gps_data is skipped, 
# and mock data is used for a successful run.
df = get_vehicle_gps_data(start_time, end_time)

In [73]:
print("\n--- Imputing odometer data ---")
df_imputed = impute_odometer_using_vcu_speed(df)
# df_imputed2 = validate_gps_against_odometer(df_imputed1)
# df_imputed = impute_gps_by_speed_and_bearing_v2(df_imputed2)


--- Imputing odometer data ---


In [74]:
print("\n--- Labeling Proximity (Fix Applied) ---")
labeled_df = label_charger_proximity(df_imputed)
labeled_df.head()


--- Labeling Proximity (Fix Applied) ---
Charger Proximity Counts:
NONE: 959129
GGN: 259182
DDN: 248443
MZF: 86034


Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,vehiclereadycondition,gun_connection_status,ignitionstatus,latitude,longitude,altitude,ground_speed_kmph,is_imputed_odometer,charger_proximity,dist_from_loc
0,18,2025-09-20 18:30:00.247,2025-09-20,2025-09-21 00:00:00.247,2025-09-20 18:30:00,1.3,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,28.423038,76.99171,224.1,0.0,False,GGN,6.2
1,18,2025-09-20 18:30:02.287,2025-09-20,2025-09-21 00:00:02.287,2025-09-20 18:30:00,1.2,639.2,0.0,2635.38,0.0,0.0,0.0,1.0,28.423037,76.99171,224.1,0.0,True,GGN,6.31
2,18,2025-09-20 18:30:04.147,2025-09-20,2025-09-21 00:00:04.147,2025-09-20 18:30:00,1.2,639.2,0.0,2635.38,0.0,0.0,0.0,1.0,28.423035,76.99171,224.1,0.0,True,GGN,6.52
3,18,2025-09-20 18:30:05.707,2025-09-20,2025-09-21 00:00:05.707,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,0.0,0.0,1.0,28.423035,76.99171,224.1,0.0,False,GGN,6.52
4,18,2025-09-20 18:30:07.007,2025-09-20,2025-09-21 00:00:07.007,2025-09-20 18:30:00,1.1,639.2,0.0,2635.38,0.0,0.0,0.0,1.0,28.423035,76.99171,224.1,,True,GGN,6.52


In [75]:
labeled_df.charger_proximity.unique()

array(['GGN', 'NONE', 'MZF', 'DDN'], dtype=object)

In [76]:
gdf = group_proximity_sessions(labeled_df[labeled_df.charger_proximity != 'NONE'])
gdf.head()

Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,...,latitude,longitude,altitude,ground_speed_kmph,is_imputed_odometer,charger_proximity,dist_from_loc,source_row_index,nxt_state,session_number
0,18,2025-09-20 18:30:00.247,2025-09-20,2025-09-21 00:00:00.247,2025-09-20 18:30:00,1.3,639.2,0.0,2635.375,0.0,...,28.423038,76.99171,224.1,0.0,False,GGN,6.2,0,,1
1,18,2025-09-20 18:30:02.287,2025-09-20,2025-09-21 00:00:02.287,2025-09-20 18:30:00,1.2,639.2,0.0,2635.38,0.0,...,28.423037,76.99171,224.1,0.0,True,GGN,6.31,1,GGN,1
2,18,2025-09-20 18:30:04.147,2025-09-20,2025-09-21 00:00:04.147,2025-09-20 18:30:00,1.2,639.2,0.0,2635.38,0.0,...,28.423035,76.99171,224.1,0.0,True,GGN,6.52,2,GGN,1
3,18,2025-09-20 18:30:05.707,2025-09-20,2025-09-21 00:00:05.707,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,...,28.423035,76.99171,224.1,0.0,False,GGN,6.52,3,GGN,1
4,18,2025-09-20 18:30:07.007,2025-09-20,2025-09-21 00:00:07.007,2025-09-20 18:30:00,1.1,639.2,0.0,2635.38,0.0,...,28.423035,76.99171,224.1,,True,GGN,6.52,4,GGN,1


In [77]:
gdf[gdf.session_number == 3]

Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,...,latitude,longitude,altitude,ground_speed_kmph,is_imputed_odometer,charger_proximity,dist_from_loc,source_row_index,nxt_state,session_number
11328,18,2025-09-21 11:40:30.462,2025-09-21,2025-09-21 17:10:30.462,2025-09-21 11:40:00,15.1,622.3,0.0,2990.250,0.00,...,30.289091,78.000860,617.6,2.99,False,DDN,232.55,32816,MZF,3
11329,18,2025-09-21 11:40:32.122,2025-09-21,2025-09-21 17:10:32.122,2025-09-21 11:40:00,,,0.0,2990.250,0.00,...,30.289170,78.000854,605.6,0.00,True,DDN,237.97,32817,DDN,3
11330,18,2025-09-21 11:40:33.182,2025-09-21,2025-09-21 17:10:33.182,2025-09-21 11:40:00,,,0.0,2990.250,0.00,...,30.289170,78.000854,605.6,,True,DDN,237.97,32818,DDN,3
11331,18,2025-09-21 11:40:34.142,2025-09-21,2025-09-21 17:10:34.142,2025-09-21 11:40:00,15.2,622.3,0.0,2990.250,0.00,...,30.289170,78.000854,605.6,,False,DDN,237.97,32819,DDN,3
11332,18,2025-09-21 11:40:35.802,2025-09-21,2025-09-21 17:10:35.802,2025-09-21 11:40:00,15.2,622.4,0.0,2990.250,0.00,...,30.289170,78.000854,605.6,,False,DDN,237.97,32820,DDN,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15669,18,2025-09-21 13:27:55.036,2025-09-21,2025-09-21 18:57:55.036,2025-09-21 13:27:00,,,2.0,2991.120,0.00,...,30.288832,77.996796,610.4,,True,DDN,247.39,37597,DDN,3
15670,18,2025-09-21 13:28:03.282,2025-09-21,2025-09-21 18:58:03.282,2025-09-21 13:28:00,11.3,640.4,2.0,2991.120,1.47,...,30.288908,77.997086,606.9,7.22,True,DDN,228.47,37604,DDN,3
15671,18,2025-09-21 13:28:04.287,2025-09-21,2025-09-21 18:58:04.287,2025-09-21 13:28:00,,,2.0,2991.120,1.90,...,30.288967,77.997220,604.1,5.57,True,DDN,222.11,37605,DDN,3
15672,18,2025-09-21 13:29:00.941,2025-09-21,2025-09-21 18:59:00.941,2025-09-21 13:29:00,-64.9,641.4,2.0,2991.375,13.11,...,30.289314,77.998795,602.7,6.99,False,DDN,178.44,37654,DDN,3


In [78]:
rdf = add_sequential_ranking(gdf)
rdf.head()

Unnamed: 0,vehicle_id,timestamp,dt,IST,ts_mins,total_battery_current,bat_voltage,gear_position,odometerreading,vehicle_speed_vcu,...,altitude,ground_speed_kmph,is_imputed_odometer,charger_proximity,dist_from_loc,source_row_index,nxt_state,session_number,session_rank_asc,session_rank_desc
0,18,2025-09-20 18:30:00.247,2025-09-20,2025-09-21 00:00:00.247,2025-09-20 18:30:00,1.3,639.2,0.0,2635.375,0.0,...,224.1,0.0,False,GGN,6.2,0,,1,1,8806
1,18,2025-09-20 18:30:02.287,2025-09-20,2025-09-21 00:00:02.287,2025-09-20 18:30:00,1.2,639.2,0.0,2635.38,0.0,...,224.1,0.0,True,GGN,6.31,1,GGN,1,2,8805
2,18,2025-09-20 18:30:04.147,2025-09-20,2025-09-21 00:00:04.147,2025-09-20 18:30:00,1.2,639.2,0.0,2635.38,0.0,...,224.1,0.0,True,GGN,6.52,2,GGN,1,3,8804
3,18,2025-09-20 18:30:05.707,2025-09-20,2025-09-21 00:00:05.707,2025-09-20 18:30:00,1.2,639.2,0.0,2635.375,0.0,...,224.1,0.0,False,GGN,6.52,3,GGN,1,4,8803
4,18,2025-09-20 18:30:07.007,2025-09-20,2025-09-21 00:00:07.007,2025-09-20 18:30:00,1.1,639.2,0.0,2635.38,0.0,...,224.1,,True,GGN,6.52,4,GGN,1,5,8802


In [79]:
rdf_rnk1 = rdf[(rdf.session_rank_asc == 1)|(rdf.session_rank_desc == 1)]
rdf_rnk1.to_csv('start_end_time.csv')

In [80]:
rdf_rnk1.columns

Index(['vehicle_id', 'timestamp', 'dt', 'IST', 'ts_mins',
       'total_battery_current', 'bat_voltage', 'gear_position',
       'odometerreading', 'vehicle_speed_vcu', 'vehiclereadycondition',
       'gun_connection_status', 'ignitionstatus', 'latitude', 'longitude',
       'altitude', 'ground_speed_kmph', 'is_imputed_odometer',
       'charger_proximity', 'dist_from_loc', 'source_row_index', 'nxt_state',
       'session_number', 'session_rank_asc', 'session_rank_desc'],
      dtype='object')

In [93]:
def format_duration_hms(td):
    """
    Converts a timedelta object into the string format: hh:mm:ss, 
    aggregating days into total hours.
    """
    if pd.isna(td):
        return np.nan
        
    total_seconds = int(td.total_seconds())
    total_hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    
    # Format as string: hh:mm:ss
    return f"{total_hours:02d}:{minutes:02d}:{seconds:02d}"


# --- Final Trip Linking and Calculation Function (Overlap Fix) ---

def generate_final_trip_report(df_rnk1: pd.DataFrame) -> pd.DataFrame:
    """
    Generates the final trip and halt report.
    
    FIX: Filters df_departure to only keep necessary columns for the join, preventing overlap.
    """
    df_rnk1 = df_rnk1.copy()
    
    # Define the core columns needed for aggregation and join keys
    CORE_COLS = ['vehicle_id', 'session_number', 'session_rank_asc', 'session_rank_desc',
                 'IST', 'odometerreading', 'latitude', 'longitude', 'charger_proximity', 'source_row_index']

    # 1. Separate the markers, explicitly selecting and renaming core columns
    df_departure_full = df_rnk1[df_rnk1.session_rank_desc == 1][CORE_COLS].rename(
        columns={'IST': 'Departure_Time', 'odometerreading': 'Departure_ODO', 
                 'charger_proximity': 'Departure_Location', 'latitude': 'Departure_Lat', 
                 'longitude': 'Departure_Lon', 'source_row_index': 'Departure_Index'}
    ).copy()
    
    df_arrival_full = df_rnk1[df_rnk1.session_rank_asc == 1][CORE_COLS].rename(
        columns={'IST': 'Arrival_Time', 'odometerreading': 'Arrival_ODO', 
                 'charger_proximity': 'Arrival_Location', 'latitude': 'Arrival_Lat', 
                 'longitude': 'Arrival_Lon', 'source_row_index': 'Arrival_Index'}
    ).copy()

    # Ensure time columns are datetime for calculations
    df_arrival_full['Arrival_Time'] = pd.to_datetime(df_arrival_full['Arrival_Time'])
    df_departure_full['Departure_Time'] = pd.to_datetime(df_departure_full['Departure_Time'])
    
    # --- DataFrames used in the Join (MINIMAL COLUMNS) ---
    # Create MINIMAL departure and arrival DFs to prevent overlap issues
    
    # Departure DataFrame for Travel Logic (retains only keys and departure values)
    df_departure = df_departure_full[[
        'vehicle_id', 'session_number', 'Departure_Time', 'Departure_ODO', 
        'Departure_Location', 'Departure_Lat', 'Departure_Lon', 'Departure_Index'
    ]].copy()
    
    # Arrival DataFrame for Shifting
    df_arrival = df_arrival_full[[
        'vehicle_id', 'session_number', 'Arrival_Time', 'Arrival_ODO', 
        'Arrival_Location', 'Arrival_Lat', 'Arrival_Lon', 'Arrival_Index'
    ]].copy()

    # --- 2. Calculate Halt Time ---
    df_halt = pd.merge(
        df_arrival[['vehicle_id', 'session_number', 'Arrival_Time']],
        df_departure[['vehicle_id', 'session_number', 'Departure_Time']],
        on=['vehicle_id', 'session_number'],
        how='inner',
    )
    df_halt['HALT_Duration'] = (df_halt['Departure_Time'] - df_halt['Arrival_Time']).dt.total_seconds()
    
    # 3. Link Departure (Session N) to Next Arrival (Session N+1)
    
    df_arrival_shift = df_arrival.sort_values(by=['vehicle_id', 'Arrival_Time'])
    
    df_next_arrival = df_arrival_shift.groupby('vehicle_id').shift(-1).reset_index(drop=True).rename(
        columns={
            'Arrival_Time': 'Next_Arrival_Time', 'Arrival_ODO': 'Next_Arrival_ODO',
            'Arrival_Location': 'Next_Arrival_Location',
            'Arrival_Lat': 'Next_Arrival_Lat', 'Arrival_Lon': 'Next_Arrival_Lon',
            'session_number': 'Next_Session_Number',
            'Arrival_Index': 'Arrival_Index'
        }
    )

    # Align current departure with next arrival using index join
    df_trips = df_departure.sort_values(by=['vehicle_id', 'Departure_Time']).reset_index(drop=True).copy()
    # Join is safe because df_departure and df_next_arrival share NO column names now.
    df_trips = df_trips.join(df_next_arrival) 
    
    df_trips = df_trips[df_trips['Next_Arrival_Time'].notna()].copy()

    # 4. Calculate Travel Metrics
    df_trips['ODO_Distance_km'] = df_trips['Next_Arrival_ODO'] - df_trips['Departure_ODO']
    df_trips.loc[df_trips['ODO_Distance_km'] < 0, 'ODO_Distance_km'] = np.nan 

    df_trips['GPS_Distance_km'] = df_trips.apply(
        lambda row: haversine_distance_km(
            row['Departure_Lat'], row['Departure_Lon'], 
            row['Next_Arrival_Lat'], row['Next_Arrival_Lon']
        ), axis=1
    )
    
    df_trips['TRAVEL_Duration_sec'] = (df_trips['Next_Arrival_Time'] - df_trips['Departure_Time']).dt.total_seconds()
    
    # 5. Final Consolidation
    
    df_halt_merge = df_halt[['vehicle_id', 'session_number', 'HALT_Duration']]
    
    final_df = pd.merge(
        df_trips,
        df_halt_merge,
        on=['vehicle_id', 'session_number'],
        how='left'
    )
    
    final_df['Trip_Name'] = final_df.apply(
        lambda row: f"{row['Departure_Location']}-{row['Next_Arrival_Location']}", axis=1
    )

    final_df['HALT_Duration_sec'] = final_df['HALT_Duration'].fillna(0.0)
    final_df['TOTAL_Duration_sec'] = final_df['TRAVEL_Duration_sec'] + final_df['HALT_Duration_sec']

    # Convert durations to timedelta
    final_df['TRAVEL_Time'] = pd.to_timedelta(final_df['TRAVEL_Duration_sec'], unit='s')
    final_df['HALT_Time'] = pd.to_timedelta(final_df['HALT_Duration_sec'], unit='s')
    final_df['TOTAL_Time'] = pd.to_timedelta(final_df['TOTAL_Duration_sec'], unit='s')

    # 6. Final Output Structure and Formatting
    
    final_output = final_df[[
        'vehicle_id', 'Trip_Name', 'Departure_Time', 'Next_Arrival_Time',
        'Departure_Index', 'Arrival_Index', 
        'ODO_Distance_km', 'GPS_Distance_km', 'TRAVEL_Time', 'HALT_Time', 'TOTAL_Time'
    ]].rename(columns={'Next_Arrival_Time': 'Arrival_Time'})

    # Apply custom formatting (hh:mm:ss) and fix data types
    final_output['TRAVEL_Time'] = final_output['TRAVEL_Time'].apply(format_duration_hms)
    final_output['HALT_Time'] = final_output['HALT_Time'].apply(format_duration_hms)
    final_output['TOTAL_Time'] = final_output['TOTAL_Time'].apply(format_duration_hms)
    
    final_output['Departure_Index'] = final_output['Departure_Index'].astype('Int64')
    final_output['Arrival_Index'] = final_output['Arrival_Index'].astype('Int64')

    final_output['ODO_Distance_km'] = round(final_output['ODO_Distance_km'],2)
    final_output['GPS_Distance_km'] = round(final_output['GPS_Distance_km'],2)
    

    return final_output[['vehicle_id', 'Trip_Name', 'Departure_Time', 'Arrival_Time', 
                        'Departure_Index', 'Arrival_Index', 
                        'ODO_Distance_km', 'GPS_Distance_km', 'TRAVEL_Time', 'HALT_Time', 'TOTAL_Time']]

In [94]:
# Example execution (assuming start_time_trips.csv is available):
# final_dataframe = generate_final_trip_report('start_time_trips.csv')
# print(final_dataframe.to_markdown(index=False))

# Encapsulating the output of the function into the final code block:
start_end_times = generate_final_trip_report(rdf_rnk1)
start_end_times.to_csv('start_end_with_loc.csv')
start_end_times
# print(final_output.to_markdown(index=False))

Unnamed: 0,vehicle_id,Trip_Name,Departure_Time,Arrival_Time,Departure_Index,Arrival_Index,ODO_Distance_km,GPS_Distance_km,TRAVEL_Time,HALT_Time,TOTAL_Time
0,18,GGN-MZF,2025-09-21 05:57:00.294,2025-09-21 10:37:03.519,8946,17186,210.36,144.82,04:40:03,05:57:00,10:37:03
1,18,MZF-DDN,2025-09-21 11:29:04.239,2025-09-21 17:10:30.462,19767,32816,143.75,85.68,05:41:26,00:52:00,06:33:26
2,18,DDN-MZF,2025-09-21 18:59:02.981,2025-09-21 21:54:06.335,37655,42533,124.58,85.69,02:55:03,01:48:32,04:43:35
3,18,MZF-GGN,2025-09-21 22:50:02.369,2025-09-22 02:53:07.486,45046,52224,207.66,144.90,04:03:05,00:55:56,04:59:01
4,18,GGN-MZF,2025-09-22 05:55:07.641,2025-09-22 10:59:00.942,56707,65710,208.75,145.04,05:03:53,03:02:00,08:05:53
...,...,...,...,...,...,...,...,...,...,...,...
117,19,MZF-GGN,2025-10-07 08:13:03.257,2025-10-07 12:55:19.486,1493353,1499786,198.85,144.82,04:42:16,00:35:00,05:17:16
118,19,GGN-MZF,2025-10-07 16:03:00.672,2025-10-08 02:07:02.993,1504871,1521073,203.60,144.84,10:04:02,03:07:41,13:11:43
119,19,MZF-DDN,2025-10-08 03:07:06.643,2025-10-08 05:30:04.224,1524165,1528856,125.62,85.68,02:22:57,01:00:03,03:23:01
120,19,DDN-MZF,2025-10-08 07:53:37.603,2025-10-08 10:21:01.283,1535017,1541372,125.52,85.86,02:27:23,02:23:33,04:50:57


In [95]:
# --- 1. Core Energy Calculation Function (Adapted from user template) ---

def analyze_trip_slice_energy(trip_slice: pd.DataFrame) -> dict:
    """
    Calculates energy consumption and regeneration for a single, pre-sliced trip 
    DataFrame, mirroring the logic structure of the user's template.
    """
    
    if trip_slice.empty or 'total_battery_current' not in trip_slice.columns:
        return {'energy_consumed_kwh': 0, 'regen_energy_kwh': 0, 'net_energy_kwh': 0}

    trip_slice = trip_slice.copy()
    
    # Calculate time difference and power specific to this slice (assuming IST is ready)
    trip_slice['time_diff_seconds'] = trip_slice['IST'].diff().dt.total_seconds().fillna(0)
    trip_slice = trip_slice[trip_slice['time_diff_seconds'] > 0].copy()
    
    if trip_slice.empty:
        return {'energy_consumed_kwh': 0, 'regen_energy_kwh': 0, 'net_energy_kwh': 0}

    trip_slice['time_diff_hr'] = trip_slice['time_diff_seconds'] / 3600
    trip_slice['power_kW'] = (trip_slice['bat_voltage'] * trip_slice['total_battery_current']) / 1000

    # Calculate energy consumption and regeneration
    trip_slice['consumption_kwh'] = trip_slice.apply(
        lambda row: row['power_kW'] * row['time_diff_hr'] if row['power_kW'] > 0 else 0, axis=1)
    
    trip_slice['regen_kwh'] = trip_slice.apply(
        lambda row: -row['power_kW'] * row['time_diff_hr'] if row['power_kW'] < 0 else 0, axis=1)

    # Sum statistics
    total_consumption = trip_slice['consumption_kwh'].sum()
    total_regeneration = trip_slice['regen_kwh'].sum()
    net_energy = total_consumption - total_regeneration
    
    return {
        'energy_consumed_kwh': round(total_consumption, 3),
        'regen_energy_kwh': round(total_regeneration, 3),
        'net_energy_kwh': round(net_energy, 3)
    }

In [101]:
def calculate_final_energy_report_iteratively(df_markers: pd.DataFrame, full_df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates detailed energy statistics for each trip segment by slicing the raw 
    source data using the index markers and aggregates the results into a single 
    row per trip.

    FIX: Uses a robust Index-Based Join to ensure all original columns (like vehicle_id) 
    are retained after merging the calculated energy parameters.
    """
    
    df_markers = df_markers.copy()
    full_df = full_df_raw.copy()

    # --- 1. PREPARE SOURCE DATA ---
    if full_df.index.name is None:
        if 'Unnamed: 0' in full_df.columns:
            full_df = full_df.set_index('Unnamed: 0')
        full_df.index.name = 'original_index'
    
    full_df['IST'] = pd.to_datetime(full_df['IST']) 
    df_markers['Departure_Time'] = pd.to_datetime(df_markers['Departure_Time'])
    df_markers['Arrival_Time'] = pd.to_datetime(df_markers['Arrival_Time'])
    
    # Pre-calculate metrics needed for energy calculation
    full_df['power_kW'] = (full_df['bat_voltage'] * full_df['total_battery_current']) / 1000
    full_df['time_diff_seconds'] = full_df.groupby('vehicle_id')['IST'].diff().dt.total_seconds().fillna(0)
    full_df['time_diff_hr'] = full_df['time_diff_seconds'] / 3600.0
    full_df = full_df[full_df['time_diff_seconds'] > 0].copy()
    
    all_trip_stats = []

    # --- 2. Iterative Slicing and Calculation ---
    
    for _, trip in df_markers.iterrows():
        vid = trip['vehicle_id']
        odo_dist = trip['ODO_Distance_km']
        dep_idx = trip['Departure_Index']
        arr_idx = trip['Arrival_Index']
        
        try:
            trip_slice = full_df.loc[dep_idx:arr_idx].copy()
            trip_slice = trip_slice[trip_slice['vehicle_id'] == vid].copy()
        except KeyError:
            continue
            
        if trip_slice.empty:
            continue

        # --- Energy Analysis (Driving, Regen, Idling) ---
        driving_df = trip_slice[trip_slice['gear_position'] == 2.0].copy()
        stationary_df = trip_slice[(trip_slice['gear_position'] == 0.0) & 
                                   (trip_slice['vehiclereadycondition'] == 1.0) &
                                   (trip_slice['gun_connection_status'] == 0.0)].copy()

        # Driving/Regen Calculation
        if not driving_df.empty:
            driving_df['consumption_kwh'] = driving_df.apply(
                lambda row: row['power_kW'] * row['time_diff_hr'] if row['power_kW'] > 0 else 0, axis=1)
            driving_df['regen_kwh'] = driving_df.apply(
                lambda row: -row['power_kW'] * row['time_diff_hr'] if row['power_kW'] < 0 else 0, axis=1)
            trip_driving_energy = driving_df['consumption_kwh'].sum()
            trip_regen_energy = driving_df['regen_kwh'].sum()
        else:
            trip_driving_energy, trip_regen_energy = 0, 0

        # Idling Calculation
        if not stationary_df.empty:
            stationary_df['idling_kwh'] = stationary_df.apply(
                lambda row: row['power_kW'] * row['time_diff_hr'] if row['power_kW'] > 0 else 0, axis=1)
            trip_idling_energy = stationary_df['idling_kwh'].sum()
        else:
            trip_idling_energy = 0

        # Final Aggregation
        net_energy = trip_driving_energy - trip_regen_energy
        energy_rate = net_energy / odo_dist if odo_dist > 0 else 0
        ttl_data_points = len(trip_slice)
        imputed_rows = trip_slice['is_imputed_odometer'].sum()
        imputed_percentage = (imputed_rows / ttl_data_points) * 100
        
        
        # Store results using the index keys
        all_trip_stats.append({
            'Departure_Index': dep_idx,
            'Arrival_Index': arr_idx,
            'Energy_Consumed_kWh': round(trip_driving_energy, 2),
            'Regen_Energy_kWh': round(trip_regen_energy, 2),
            'Idling_Energy_kWh': round(trip_idling_energy, 2),
            'Net_Energy_kWh': round(net_energy, 2),
            'Energy_Rate_kWh_per_km': round(energy_rate, 2),
            'ttl_data_points': ttl_data_points,
            'missing_rows': int(imputed_rows),  # Ensure integer type
            'missing_%': round(imputed_percentage, 2)            
        })

    if not all_trip_stats:
        return df_markers

    df_energy = pd.DataFrame(all_trip_stats)
    
    # 3. Merge Results Back (Using Robust Index Join)
    
    # Set merge keys as index for both DFs
    df_markers_idx = df_markers.set_index(['Departure_Index', 'Arrival_Index'])
    df_energy_idx = df_energy.set_index(['Departure_Index', 'Arrival_Index'])

    # Join the original report structure with the new energy calculations
    final_report = df_markers_idx.join(df_energy_idx, how='left')
    
    # Bring the indices back as columns and rename Trip_Name
    final_report = final_report.reset_index().rename(columns={'Trip_Name': 'Trip'})

    # 4. Final Output Rearrangement and Type Fixes
    
    final_output = final_report[[
        'Departure_Index', 'vehicle_id', 'Trip', 'Departure_Time', 'Arrival_Time', 
        'ODO_Distance_km', 'TRAVEL_Time', 
        'Energy_Consumed_kWh', 'Regen_Energy_kWh', 'Idling_Energy_kWh', 'Net_Energy_kWh', 'Energy_Rate_kWh_per_km',
        'HALT_Time', 'TOTAL_Time', 'GPS_Distance_km', 'Arrival_Index','ttl_data_points','missing_rows','missing_%'
    ]].copy()

    # Final cleanup (rename index column and ensure correct data types)
    final_output = final_output.rename(columns={'Departure_Index': 'Index'})
    final_output['Index'] = final_output['Index'].astype('Int64')
    final_output['Arrival_Index'] = final_output['Arrival_Index'].astype('Int64')
    
    return final_output[['Index','Arrival_Index', 'vehicle_id', 'Trip', 'Departure_Time', 'Arrival_Time', 
                         'ODO_Distance_km', 'TRAVEL_Time', 'Energy_Consumed_kWh', 
                         'Regen_Energy_kWh', 'Idling_Energy_kWh', 'Net_Energy_kWh', 
                         'Energy_Rate_kWh_per_km', 'HALT_Time', 'TOTAL_Time', 'GPS_Distance_km','ttl_data_points','ttl_data_points','missing_rows','missing_%']]

In [102]:
# res = create_final_energy_report_iteratively(start_end_times,df)
res = calculate_final_energy_report_iteratively(start_end_times,labeled_df)
res.to_csv('final_outcome_v2.csv')
res

Unnamed: 0,Index,Arrival_Index,vehicle_id,Trip,Departure_Time,Arrival_Time,ODO_Distance_km,TRAVEL_Time,Energy_Consumed_kWh,Regen_Energy_kWh,Idling_Energy_kWh,Net_Energy_kWh,Energy_Rate_kWh_per_km,HALT_Time,TOTAL_Time,GPS_Distance_km,ttl_data_points,ttl_data_points.1,missing_rows,missing_%
0,8946,17186,18,GGN-MZF,2025-09-21 05:57:00.294,2025-09-21 10:37:03.519,210.36,04:40:03,201.53,42.89,3.60,158.64,0.75,05:57:00,10:37:03,144.82,8236,8236,4462,54.18
1,19767,32816,18,MZF-DDN,2025-09-21 11:29:04.239,2025-09-21 17:10:30.462,143.75,05:41:26,149.08,25.71,4.22,123.36,0.86,00:52:00,06:33:26,85.68,13035,13035,7217,55.37
2,37655,42533,18,DDN-MZF,2025-09-21 18:59:02.981,2025-09-21 21:54:06.335,124.58,02:55:03,111.18,19.73,3.76,91.45,0.73,01:48:32,04:43:35,85.69,4873,4873,3124,64.11
3,45046,52224,18,MZF-GGN,2025-09-21 22:50:02.369,2025-09-22 02:53:07.486,207.66,04:03:05,192.96,42.79,0.72,150.17,0.72,00:55:56,04:59:01,144.90,7179,7179,4073,56.73
4,56707,65710,18,GGN-MZF,2025-09-22 05:55:07.641,2025-09-22 10:59:00.942,208.75,05:03:53,189.76,35.36,4.66,154.40,0.74,03:02:00,08:05:53,145.04,9004,9004,5042,56.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,1493353,1499786,19,MZF-GGN,2025-10-07 08:13:03.257,2025-10-07 12:55:19.486,198.85,04:42:16,186.80,39.45,0.10,147.35,0.74,00:35:00,05:17:16,144.82,6434,6434,3825,59.45
118,1504871,1521073,19,GGN-MZF,2025-10-07 16:03:00.672,2025-10-08 02:07:02.993,203.60,10:04:02,201.41,37.55,1.68,163.86,0.80,03:07:41,13:11:43,144.84,15755,15755,7760,49.25
119,1524165,1528856,19,MZF-DDN,2025-10-08 03:07:06.643,2025-10-08 05:30:04.224,125.62,02:22:57,118.20,19.92,0.09,98.28,0.78,01:00:03,03:23:01,85.68,4692,4692,2677,57.05
120,1535017,1541372,19,DDN-MZF,2025-10-08 07:53:37.603,2025-10-08 10:21:01.283,125.52,02:27:23,98.82,30.13,0.58,68.69,0.55,02:23:33,04:50:57,85.86,6356,6356,4228,66.52


Explanation of Energy Parameters
The energy parameters are calculated by integrating the instantaneous electric power over the entire trip time (Δt 
hr
​
 ), using the vehicle's Battery Voltage (V) and Total Battery Current (I). Power is calculated as kW=(V×I)/1000.

1. Energy_Consumed_kWh (Gross Consumption)
What it is: The total electrical energy drawn from the battery for propulsion and all auxiliary systems during the trip.

Calculation: Sum of (Power×Δt 
hr
​
 ) only when Power>0 and the Gear Position=2 (Driving).

2. Regen_Energy_kWh (Energy Recovered)
What it is: The total electrical energy pushed back into the battery through regenerative braking during the trip.

Calculation: Sum of (Power×Δt 
hr
​
 ) only when Power<0 (representing current flowing into the battery). The value is stored as positive.

3. Idling_Energy_kWh
What it is: The energy consumed specifically while the vehicle is stationary but still powered on (running auxiliary systems, air conditioning, etc.).

Calculation: Sum of (Power×Δt 
hr
​
 ) only when:

Power>0

Gear Position=0 (Neutral/Park)

Vehiclereadycondition=1 (Vehicle ready/on)

Gun Connection Status=0 (Not charging)

4. Net_Energy_kWh
What it is: The true energy depletion experienced by the battery for the entire trip.

Calculation: Net Energy=Energy_Consumed_kWh−Regen_Energy_kWh.

5. Energy_Rate_kWh_per_km (Efficiency)
What it is: The energy efficiency of the trip, indicating how many kilowatt-hours were required to travel one kilometer.

Calculation: Energy Rate=Net_Energy_kWh/ODO_Distance_km.