In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import platform
import logging
import trino

sys.path.append('..')
from common import db_operations

import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
from common.db_operations import connect_to_trino, fetch_data_for_day, write_df_to_iceberg

from math import radians, sin, cos, sqrt, atan2

In [2]:
# Root-logger setup for this notebook: INFO level, timestamped one-line records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Record which interpreter produced the outputs below.
print(f"Using Python version: {platform.python_version()}")

Using Python version: 3.11.13


In [3]:
# Charger sites keyed by name; each value is a (latitude, longitude) pair.
CHARGER_LOCATIONS = dict(
    Dehradun=(30.287722, 77.999029),
    Gurgaon=(28.423090, 76.991733),
    Muzaffarnagar=(29.549413, 77.747698),
)


In [4]:
def get_vehicle_gps_data(start_time=None, end_time=None, vehicle_ids=None):
    """
    Fetch vehicle and GPS data from Trino with flexible time filtering.

    CAN rows (facts_prod.can_parsed_output_100) and GPS rows
    (facts_prod.c2c_gps) are each ranked within (vehicle, minute) and then
    paired with a FULL OUTER JOIN on vehicle id, minute bucket and rank.

    Args:
        start_time: Start time in 'YYYY-MM-DD HH:MM:SS' format (optional)
        end_time: End time in 'YYYY-MM-DD HH:MM:SS' format (optional)
        vehicle_ids: Iterable of vehicle IDs to filter (optional, defaults
            to ['18', '19']). Items are converted with str().

    Returns:
        DataFrame with vehicle and GPS data, one column per selected field.

    Raises:
        ValueError: if start_time / end_time do not match the expected format.

    Notes:
        * The explicit window is used only when BOTH bounds are supplied;
          otherwise the built-in default window (2025-10-07 to 2025-10-09)
          applies.
        * NOTE(review): the filter subtracts 5h30m from each bound, which
          looks like an IST -> UTC shift -- confirm against the session
          time zone.
        * GPS-only rows produced by the full outer join carry NULLs in every
          selected cp.* column, including vehicle_id.
    """
    # Set default vehicle IDs if not provided
    if vehicle_ids is None:
        vehicle_ids = ['18', '19']

    # Render the IDs as SQL string literals. str() tolerates numeric IDs and
    # doubling embedded single quotes keeps a value from escaping its literal.
    vehicle_ids_str = "', '".join(
        str(vehicle_id).replace("'", "''") for vehicle_id in vehicle_ids
    )

    if start_time and end_time:
        # Parse first so malformed input fails here rather than inside Trino,
        # then interpolate the re-formatted (validated) value, not the raw text.
        time_format = '%Y-%m-%d %H:%M:%S'
        window_start = datetime.strptime(start_time, time_format).strftime(time_format)
        window_end = datetime.strptime(end_time, time_format).strftime(time_format)
    else:
        # Default time range if not provided
        window_start = '2025-10-07'
        window_end = '2025-10-09'

    # Both source tables are filtered with the same predicate.
    time_filter = f"""
        AND timestamp >= CAST('{window_start}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        AND timestamp < CAST('{window_end}' AS TIMESTAMP) AT TIME ZONE 'Asia/Kolkata' - INTERVAL '5' HOUR - INTERVAL '30' MINUTE
        """

    # Build the complete query
    query = f"""
    with cpo100 as
    (
      SELECT 
        id, timestamp, dt, 
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS IST,
        date_trunc('minute', timestamp) as ts_mins,  -- Truncate to minutes
        total_battery_current, bat_voltage, gear_position, odometerreading,
        vehiclereadycondition, ignitionstatus, gun_connection_status
      from 
        facts_prod.can_parsed_output_100
      where 
        id IN ('{vehicle_ids_str}')
        {time_filter}
    ),
    cpo100_ranked as
    (
      select 
        *, row_number() over(partition by id, ts_mins order by timestamp) as cpo_rn
      from 
        cpo100
    ),
    c2c_gps_data as
    (
      select 
        id, timestamp,
        CAST(timestamp AT TIME ZONE 'Asia/Kolkata' AS timestamp) AS ts_ist,
        date_trunc('minute', timestamp) as ts_mins, date, latitude, longitude, altitude
      from 
        facts_prod.c2c_gps
      where 
        id IN ('{vehicle_ids_str}')
        {time_filter}
    ),
    c2c_gps_ranked as
    (
      select 
        *, row_number() over(partition by id, ts_mins order by timestamp) as cg_rn
      from 
        c2c_gps_data
    )
    select 
        cp.id as vehicle_id, cp.timestamp, cp.dt, cp.IST, cp.ts_mins,
        cp.total_battery_current, cp.bat_voltage, cp.gear_position, cp.odometerreading,
        cp.vehiclereadycondition, cp.gun_connection_status,
        cp.ignitionstatus, cg.latitude, cg.longitude, cg.altitude
    from 
      cpo100_ranked as cp
      full outer join
      c2c_gps_ranked as cg
      on (cp.id = cg.id and cp.ts_mins = cg.ts_mins and cp.cpo_rn = cg.cg_rn)
    ORDER BY cp.id, cp.timestamp
    """

    # Connect to Trino
    conn = trino.dbapi.connect(
        host="trino",
        port=8080,
        user="admin",
        catalog="adhoc",
        schema="default"
    )

    # try/finally guarantees the cursor and connection are released even when
    # the query fails or the fetch is interrupted.
    try:
        cur = conn.cursor()
        try:
            cur.execute(query)
            columns = [desc[0] for desc in cur.description]
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    return pd.DataFrame(rows, columns=columns)

# Analysis window passed to the loader (end bound is exclusive in the query).
start_time = '2025-09-24 00:00:00'
end_time = '2025-09-30 00:00:00'

df = get_vehicle_gps_data(start_time=start_time, end_time=end_time)

In [5]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points on Earth (in km).

    Args:
        lat1, lon1: Latitude / longitude of the first point, in degrees.
        lat2, lon2: Latitude / longitude of the second point, in degrees.

    Returns:
        Distance in kilometres on a sphere of radius 6371.0 km. NaN inputs
        propagate to a NaN result.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2

    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. A plain comparison
    # is used instead of min()/max() so that a NaN `a` is left as NaN.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1-a))

    # Radius of Earth in kilometers
    r = 6371.0
    return c * r

In [6]:
def is_near_charger(row, location_name, threshold_km=0.5):
    """
    Tell whether a GPS reading lies within threshold_km of a named charger.

    Args:
        row: Mapping-like record exposing 'latitude' and 'longitude'.
        location_name: Key into CHARGER_LOCATIONS.
        threshold_km: Maximum haversine distance (km) that counts as "near".

    Returns:
        False when either coordinate is missing; otherwise whether the
        haversine distance to the charger is <= threshold_km.
    """
    fix_missing = pd.isna(row['latitude']) or pd.isna(row['longitude'])
    if not fix_missing:
        site_lat, site_lon = CHARGER_LOCATIONS[location_name]
        separation_km = haversine_distance(
            row['latitude'], row['longitude'], site_lat, site_lon
        )
        return separation_km <= threshold_km

    # No usable GPS fix on this row.
    return False

In [7]:
def detect_trip_segments(df):
    """
    Flag each row's proximity to the charger locations (DDN, MZF, GGN).

    Returns a copy of df with one near_<location> column per entry of
    CHARGER_LOCATIONS (near_dehradun, near_gurgaon, near_muzaffarnagar).
    The input frame is not modified. Actual trip segmentation is done by
    generate_trips_info.
    """
    # Work on a copy to avoid SettingWithCopyWarning
    flagged = df.copy()

    # Make sure all three flag columns exist before they are filled in
    for flag_column in ('near_dehradun', 'near_gurgaon', 'near_muzaffarnagar'):
        flagged[flag_column] = False

    # Evaluate every row against every charger site
    for site in CHARGER_LOCATIONS:
        def _row_is_near(row, site=site):
            return is_near_charger(row, site)

        flagged[f'near_{site.lower()}'] = flagged.apply(_row_is_near, axis=1)

    return flagged


In [8]:
def generate_trips_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Automatically generates the trips_info DataFrame by detecting transitions
    between key charger locations (DDN, MZF, GGN) based on GPS proximity
    AND vehicle status (including gun connection).

    Args:
        df: Telemetry with columns vehicle_id, IST, latitude, longitude,
            gun_connection_status, vehiclereadycondition and ignitionstatus.
            Consecutive-row comparison is used to find segments, so rows are
            expected in (vehicle, time) order. The frame is not modified.

    Returns:
        DataFrame with columns vehicle_id, Bus Number, Trip, Start, End --
        one row per ON_ROAD segment that is bracketed by stops at two
        different charger locations.
    """
    logging.info("⚙️ Automatically generating trip start/end times and segment labels...")

    # 1. Add proximity flags. detect_trip_segments returns its own copy, so the
    #    caller's frame stays untouched without a second copy here.
    df = detect_trip_segments(df)

    # 2a. Determine if the vehicle is in a true "stopped/off" state.
    # Robust Stop Indicator: Charging (1.0) OR Vehicle Not Ready (0.0) OR Ignition Off (0.0)
    df['is_stopped'] = (df['gun_connection_status'] == 1.0) | \
                       (df['vehiclereadycondition'] == 0.0) | \
                       (df['ignitionstatus'] == 0.0)

    # 2b. Effective *Stop* location: near a charger AND stopped. Computed with
    #     array masks instead of a row-wise apply. Sites are written from the
    #     lowest to the highest priority so that DDN wins over GGN, and GGN
    #     over MZF, when several flags are set on the same row.
    stopped = df['is_stopped'].to_numpy(dtype=bool)
    location = np.array([None] * len(df), dtype=object)
    for flag_column, code in (('near_muzaffarnagar', 'MZF'),
                              ('near_gurgaon', 'GGN'),
                              ('near_dehradun', 'DDN')):
        location[stopped & df[flag_column].to_numpy(dtype=bool)] = code
    df['Location'] = location

    # 3. Create a simplified sequence: 'DDN', 'ON_ROAD', 'MZF', 'ON_ROAD', 'GGN', ...
    df['location_sequence'] = df['Location'].fillna('ON_ROAD')

    # Identify the start of a new continuous location/road block
    df['segment_start'] = df['location_sequence'].ne(df['location_sequence'].shift())
    df['segment_id'] = df['segment_start'].cumsum()

    # 4. Start/end time and location type of each segment
    segment_boundaries = df.groupby(['vehicle_id', 'segment_id']).agg(
        start_time=('IST', 'first'),
        end_time=('IST', 'last'),
        start_location=('Location', 'first'),
        end_location=('Location', 'last'),
        segment_type=('location_sequence', 'first')
    ).reset_index()

    # 5. Link segments to find the start and end point of each road trip
    # Shift by 1 to link the 'ON_ROAD' segment to the preceding location segment (Start Stop)
    segment_boundaries['Trip_Start_Location'] = segment_boundaries.groupby('vehicle_id')['start_location'].shift(1).fillna('')
    # Shift by -1 to link the 'ON_ROAD' segment to the succeeding location segment (End Stop)
    segment_boundaries['Trip_End_Location'] = segment_boundaries.groupby('vehicle_id')['end_location'].shift(-1).fillna('')

    # 6. Filter for the 'ON_ROAD' segments (the actual trips)
    trips_df = segment_boundaries[segment_boundaries['segment_type'] == 'ON_ROAD'].copy()

    # 7. Keep trips that run between two defined charger locations
    #    AND whose start and end locations differ (drops X-X trips)
    final_trips = trips_df[
        (trips_df['Trip_Start_Location'].isin(['DDN', 'GGN', 'MZF'])) &
        (trips_df['Trip_End_Location'].isin(['DDN', 'GGN', 'MZF'])) &
        (trips_df['Trip_Start_Location'] != trips_df['Trip_End_Location'])
    ].copy()

    # 8. Trip naming convention (e.g., GGN-MZF). Vectorized concatenation; the
    #    astype(str) is a no-op on real data and keeps an empty result well-typed.
    final_trips['Trip'] = (
        final_trips['Trip_Start_Location'].astype(str)
        + '-'
        + final_trips['Trip_End_Location'].astype(str)
    )

    # 9. Finalize the trips_info DataFrame structure
    final_trips_info = final_trips[[
        'vehicle_id', 'Trip', 'start_time', 'end_time'
    ]].rename(columns={'start_time': 'Start', 'end_time': 'End'}).reset_index(drop=True)

    # Add Bus Number placeholder for consistency (anything that is not
    # vehicle '18' is labelled with the second registration).
    final_trips_info['Bus Number'] = final_trips_info['vehicle_id'].apply(
        lambda x: 'DL1PD8509' if str(x) == '18' else 'DL1PD8523'
    )
    # Ensure vehicle_id is string
    final_trips_info['vehicle_id'] = final_trips_info['vehicle_id'].astype(str)

    final_trips_info = final_trips_info[[
        'vehicle_id', 'Bus Number', 'Trip', 'Start', 'End'
    ]]

    logging.info(f"✅ Generated {len(final_trips_info)} trip segments.")
    return final_trips_info

In [9]:
def generate_sessions_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates session boundaries (start/end times) based on vehicle operational state:
    Charging, Driving, Idling, or Off.

    Each row is classified, in priority order, as
      CHARGING  gun_connection_status == 1.0
      DRIVING   gear_position == 2.0
      IDLING    gear_position == 0.0, vehiclereadycondition == 1.0 and
                ignitionstatus == 1.0
      OFF       anything else
    and consecutive rows with the same state are merged into one session.

    Args:
        df: Telemetry with columns vehicle_id, IST, odometerreading,
            gun_connection_status, gear_position, vehiclereadycondition and
            ignitionstatus. Consecutive-row comparison is used, so rows are
            expected in (vehicle, time) order. The frame is not modified.

    Returns:
        DataFrame with columns vehicle_id, Bus Number, Trip (the session
        type), Start, End, Start_Odo, End_Odo. OFF sessions are excluded.
    """
    logging.info("⚙️ Automatically generating Charging, Driving, and Idling sessions...")

    # Helper columns are added below; do that on a private copy so the
    # caller's DataFrame does not silently grow extra columns.
    df = df.copy()

    # Classify every row at once instead of calling a Python function per row.
    is_charging = (df['gun_connection_status'] == 1.0).to_numpy()
    is_driving = (df['gear_position'] == 2.0).to_numpy()
    is_idle_ready = (
        (df['gear_position'] == 0.0)
        & (df['vehiclereadycondition'] == 1.0)
        & (df['ignitionstatus'] == 1.0)
    ).to_numpy()

    # Written from lowest to highest priority so later writes win:
    # CHARGING over DRIVING over IDLING over OFF.
    session_type = np.array(['OFF'] * len(df), dtype=object)
    session_type[is_idle_ready] = 'IDLING'
    session_type[is_driving] = 'DRIVING'
    session_type[is_charging] = 'CHARGING'
    df['Session_Type'] = session_type

    # Group consecutive identical states
    df['session_start'] = df['Session_Type'].ne(df['Session_Type'].shift())
    df['session_id'] = df['session_start'].cumsum()

    # Aggregate to find the start and end of each unique session
    sessions_df = df.groupby(['vehicle_id', 'session_id', 'Session_Type']).agg(
        Start=('IST', 'first'),
        End=('IST', 'last'),
        # Get start/end odometer for distance calculation in DRIVING sessions
        Start_Odo=('odometerreading', 'first'),
        End_Odo=('odometerreading', 'last')
    ).reset_index()

    # Filter out the 'OFF' sessions as they are usually not needed for energy analysis
    final_sessions_info = sessions_df[sessions_df['Session_Type'] != 'OFF'].copy()

    # Finalize the sessions_info DataFrame structure
    final_sessions_info = final_sessions_info.rename(
        columns={'Session_Type': 'Trip'}  # Named 'Trip' so the trip analysis function can consume it
    ).reset_index(drop=True)

    # Add Bus Number placeholder (anything that is not vehicle '18' is
    # labelled with the second registration).
    final_sessions_info['Bus Number'] = final_sessions_info['vehicle_id'].apply(
        lambda x: 'DL1PD8509' if str(x) == '18' else 'DL1PD8523'
    )
    final_sessions_info['vehicle_id'] = final_sessions_info['vehicle_id'].astype(str)

    final_sessions_info = final_sessions_info[[
        'vehicle_id', 'Bus Number', 'Trip', 'Start', 'End', 'Start_Odo', 'End_Odo'
    ]]

    logging.info(f"✅ Generated {len(final_sessions_info)} operational sessions.")
    return final_sessions_info



In [10]:
def energy_mileage_trip_stats(df: pd.DataFrame, trips_info: pd.DataFrame):
    """
    Performs trip-level analysis on electric bus data to calculate energy consumption rates
    for specific trips between locations.

    Args:
        df: Telemetry with columns vehicle_id, IST, odometerreading,
            gear_position, vehiclereadycondition, total_battery_current,
            bat_voltage, latitude, longitude and gun_connection_status.
            The frame is NOT modified; all cleaning happens on a copy.
        trips_info: One row per trip/session with columns vehicle_id, Trip,
            Start and End. An End earlier than Start is moved one day forward.

    Returns:
        DataFrame with one row per trip that has telemetry in its window and
        columns vehicle_id, trip, start_time, end_time, distance_km,
        energy_consumed_kwh, regen_energy_kwh, idling_energy_kwh,
        net_energy_kwh, energy_rate_kwh_per_km (numbers rounded to 2
        decimals). An empty DataFrame is returned when nothing matched or
        when an error occurred (the error is logged with its traceback).

    Notes:
        * Rows with a missing value in any required column, or with
          |total_battery_current| > 3000, are ignored.
        * Energy per row is power_kW * elapsed hours since the previous row
          of the same vehicle; the first row of each vehicle is dropped.
        * Distance is last minus first odometer reading in the window,
          floored at 0.
    """
    try:
        # Preprocess data
        logging.info("⚙️ Starting trip-level data processing...")

        # Clean a private copy: the caller's frame must not lose rows or
        # change dtypes as a side effect of computing statistics.
        df = df.copy()

        # Convert IST column to datetime if it's not already
        if not pd.api.types.is_datetime64_any_dtype(df['IST']):
            df['IST'] = pd.to_datetime(df['IST'])

        # Drop rows with missing data in key columns
        df = df.dropna(subset=['odometerreading', 'gear_position', 'vehiclereadycondition',
                               'total_battery_current', 'bat_voltage', 'IST', 'latitude',
                               'longitude', 'gun_connection_status'])

        # Filter out extreme current values
        df = df[df['total_battery_current'].abs() <= 3000]

        # Sort data by vehicle ID and timestamp
        df = df.sort_values(by=['vehicle_id', 'IST'])

        # Calculate time difference between consecutive data points
        df['time_diff_seconds'] = df.groupby('vehicle_id')['IST'].diff().dt.total_seconds().fillna(0)

        # Keep only records with a positive time difference; copy so the
        # column assignments below act on an independent frame.
        df = df[df['time_diff_seconds'] > 0].copy()

        # Power in kW and the signed energy of each sample in kWh
        df['power_kW'] = (df['bat_voltage'] * df['total_battery_current']) / 1000
        df['energy_kwh'] = df['power_kW'] * (df['time_diff_seconds'] / 3600)

        # Split once per vehicle instead of re-filtering the whole frame for
        # every trip.
        frames_by_vehicle = {
            vehicle: frame for vehicle, frame in df.groupby('vehicle_id')
        }

        records = []

        # Process each trip
        for _, trip in trips_info.iterrows():
            vehicle_id = trip['vehicle_id']
            trip_name = trip['Trip']
            start_time = pd.to_datetime(trip['Start'])
            end_time = pd.to_datetime(trip['End'])

            # Adjust end_time if it's on the next day
            if end_time < start_time:
                end_time += timedelta(days=1)

            vehicle_df = frames_by_vehicle.get(vehicle_id)
            if vehicle_df is None:
                continue

            # Telemetry inside the trip window (both bounds inclusive)
            trip_df = vehicle_df[(vehicle_df['IST'] >= start_time) &
                                 (vehicle_df['IST'] <= end_time)]

            if trip_df.empty:
                continue

            # --- DISTANCE CALCULATION: USE ODOMETER START/END DIFFERENCE ---
            valid_odometer_readings = trip_df.dropna(subset=['odometerreading'])
            trip_distance = 0  # Default to 0

            if not valid_odometer_readings.empty:
                odometer_start = valid_odometer_readings['odometerreading'].iloc[0]
                odometer_end = valid_odometer_readings['odometerreading'].iloc[-1]

                # max(0, ...) guards against odometer resets.
                trip_distance = max(0, odometer_end - odometer_start)
            else:
                logging.warning(f"No valid odometer readings available for trip {trip_name}.")

            # --- Analysis 1: Driving and Regenerative Braking Statistics (Energy) ---
            driving_df = trip_df[trip_df['gear_position'] == 2.0]
            # Positive power is consumption; negative power is regeneration
            # (reported as a positive amount of energy).
            trip_driving_energy = (
                driving_df['energy_kwh'].where(driving_df['power_kW'] > 0, 0.0).sum()
            )
            trip_regen_energy = (
                (-driving_df['energy_kwh']).where(driving_df['power_kW'] < 0, 0.0).sum()
            )

            # --- Analysis 2: Idling Energy Consumption ---
            # Charging is excluded from idling to keep the logic clean.
            stationary_df = trip_df[(trip_df['gear_position'] == 0.0) &
                                    (trip_df['vehiclereadycondition'] == 1.0) &
                                    (trip_df['gun_connection_status'] == 0.0)]
            trip_idling_energy = (
                stationary_df['energy_kwh'].where(stationary_df['power_kW'] > 0, 0.0).sum()
            )

            # Calculate net energy and consumption rate
            net_energy = trip_driving_energy - trip_regen_energy
            energy_rate = net_energy / trip_distance if trip_distance > 0 else 0

            records.append({
                'vehicle_id': vehicle_id,
                'trip': trip_name,
                'start_time': start_time,
                'end_time': end_time,
                'distance_km': round(trip_distance, 2),
                'energy_consumed_kwh': round(trip_driving_energy, 2),
                'regen_energy_kwh': round(trip_regen_energy, 2),
                'idling_energy_kwh': round(trip_idling_energy, 2),
                'net_energy_kwh': round(net_energy, 2),
                'energy_rate_kwh_per_km': round(energy_rate, 2)
            })

        if records:
            # Build the result in one go rather than concatenating one-row frames.
            final_df = pd.DataFrame(records)
            logging.info("✅ All trip data processed successfully.")
            return final_df
        else:
            return pd.DataFrame()

    except Exception as e:
        # Best-effort contract kept: log (now with traceback) and return empty.
        logging.exception(f"An error occurred: {e}")
        return pd.DataFrame()

In [11]:
# Derive operational sessions from the telemetry and preview the first ten.
sessions_info = generate_sessions_info(df)
print("Auto-Generated sessions_info (Charging, Driving, Idling):")
display(
    sessions_info.head(10)[['vehicle_id', 'Trip', 'Start', 'End', 'Start_Odo', 'End_Odo']]
)

2025-10-12 05:18:48 - INFO - ⚙️ Automatically generating Charging, Driving, and Idling sessions...
2025-10-12 05:18:53 - INFO - ✅ Generated 14826 operational sessions.


Auto-Generated sessions_info (Charging, Driving, Idling):


Unnamed: 0,vehicle_id,Trip,Start,End,Start_Odo,End_Odo
0,18,DRIVING,2025-09-24 00:00:01.798,2025-09-24 00:10:54.378,4634.0,4643.5
1,18,DRIVING,2025-09-24 00:10:56.058,2025-09-24 00:19:58.918,4643.625,4649.75
2,18,IDLING,2025-09-24 00:20:02.319,2025-09-24 00:20:09.018,4649.75,4649.75
3,18,IDLING,2025-09-24 00:20:16.399,2025-09-24 00:20:23.499,4649.75,4649.75
4,18,IDLING,2025-09-24 00:20:30.559,2025-09-24 00:21:05.118,4649.75,4649.75
5,18,IDLING,2025-09-24 00:21:06.998,2025-09-24 00:21:30.241,4649.75,4649.75
6,18,IDLING,2025-09-24 00:21:35.978,2025-09-24 00:21:38.219,,
7,18,IDLING,2025-09-24 00:21:42.758,2025-09-24 00:21:51.178,4649.75,4649.75
8,18,IDLING,2025-09-24 00:21:55.799,2025-09-24 00:22:17.150,4649.75,4649.75
9,18,IDLING,2025-09-24 00:22:21.799,2025-09-24 00:22:32.678,4649.75,4649.75


In [12]:
# Full session table (pandas truncates the rendering of long frames).
display(sessions_info.loc[:, ['vehicle_id', 'Trip', 'Start', 'End', 'Start_Odo', 'End_Odo']])

Unnamed: 0,vehicle_id,Trip,Start,End,Start_Odo,End_Odo
0,18,DRIVING,2025-09-24 00:00:01.798,2025-09-24 00:10:54.378,4634.000,4643.500
1,18,DRIVING,2025-09-24 00:10:56.058,2025-09-24 00:19:58.918,4643.625,4649.750
2,18,IDLING,2025-09-24 00:20:02.319,2025-09-24 00:20:09.018,4649.750,4649.750
3,18,IDLING,2025-09-24 00:20:16.399,2025-09-24 00:20:23.499,4649.750,4649.750
4,18,IDLING,2025-09-24 00:20:30.559,2025-09-24 00:21:05.118,4649.750,4649.750
...,...,...,...,...,...,...
14821,19,DRIVING,2025-09-29 23:55:46.218,2025-09-29 23:55:56.298,,
14822,19,DRIVING,2025-09-29 23:55:57.899,2025-09-29 23:56:31.169,9247.000,9247.375
14823,19,DRIVING,2025-09-29 23:56:32.379,2025-09-29 23:57:02.418,9247.500,9248.000
14824,19,DRIVING,2025-09-29 23:57:05.618,2025-09-29 23:57:32.978,9248.125,9248.500


In [13]:
# Energy statistics per operational session (sessions reuse the trip schema)
session_stats = energy_mileage_trip_stats(df=df, trips_info=sessions_info)

# Preview the first ten sessions
print("\nSession Statistics:")
display(session_stats.iloc[:10])

2025-10-12 05:18:53 - INFO - ⚙️ Starting trip-level data processing...
2025-10-12 05:20:00 - INFO - ✅ All trip data processed successfully.



Session Statistics:


Unnamed: 0,vehicle_id,trip,start_time,end_time,distance_km,energy_consumed_kwh,regen_energy_kwh,idling_energy_kwh,net_energy_kwh,energy_rate_kwh_per_km
0,18,DRIVING,2025-09-24 00:00:01.798,2025-09-24 00:10:54.378,7.62,8.79,0.05,0.0,8.74,1.15
1,18,DRIVING,2025-09-24 00:10:56.058,2025-09-24 00:19:58.918,5.75,4.91,4.44,0.0,0.47,0.08
2,18,IDLING,2025-09-24 00:20:02.319,2025-09-24 00:20:09.018,0.0,0.0,0.0,0.08,0.0,0.0
3,18,IDLING,2025-09-24 00:20:30.559,2025-09-24 00:21:05.118,0.0,0.0,0.0,0.03,0.0,0.0
4,18,IDLING,2025-09-24 00:21:06.998,2025-09-24 00:21:30.241,0.0,0.0,0.0,0.01,0.0,0.0
5,18,DRIVING,2025-09-24 00:22:34.718,2025-09-24 00:33:36.738,7.62,1.82,2.25,0.0,-0.43,-0.06
6,18,DRIVING,2025-09-24 00:33:39.279,2025-09-24 00:35:13.018,0.0,0.25,0.0,0.0,0.25,0.0
7,18,IDLING,2025-09-24 00:35:49.278,2025-09-24 00:36:07.538,0.0,0.0,0.0,0.08,0.0,0.0
8,18,IDLING,2025-09-24 00:36:10.658,2025-09-24 00:36:25.339,0.0,0.0,0.0,0.0,0.0,0.0
9,18,IDLING,2025-09-24 00:36:52.719,2025-09-24 00:41:41.362,0.0,0.0,0.0,0.18,0.0,0.0


In [14]:
# Trip detection should only see complete telemetry rows. Drop incomplete rows
# here, explicitly, so the result of this cell does not depend on whether an
# earlier cell already removed them (the operation is a no-op if it did).
df = df.dropna(subset=['odometerreading', 'gear_position', 'vehiclereadycondition',
                       'total_battery_current', 'bat_voltage', 'IST', 'latitude',
                       'longitude', 'gun_connection_status'])

# Keep the charger-proximity flags on df for inspection. generate_trips_info
# computes the same flags again on its own copy, so it does not rely on this.
df = detect_trip_segments(df)

# --- AUTOMATICALLY GENERATE trip_info ---
trips_info = generate_trips_info(df)
print("Auto-Generated trips_info:")
display(trips_info)
# ----------------------------------------

2025-10-12 05:20:01 - INFO - ⚙️ Automatically generating trip start/end times and segment labels...
2025-10-12 05:20:02 - INFO - ✅ Generated 43 trip segments.


Auto-Generated trips_info:


Unnamed: 0,vehicle_id,Bus Number,Trip,Start,End
0,18,DL1PD8509,GGN-MZF,2025-09-24 06:01:04.506,2025-09-24 10:36:01.279
1,18,DL1PD8509,MZF-DDN,2025-09-24 11:23:46.798,2025-09-24 14:08:00.191
2,18,DL1PD8509,DDN-MZF,2025-09-24 16:12:02.532,2025-09-24 19:55:03.100
3,18,DL1PD8509,MZF-GGN,2025-09-24 20:30:02.300,2025-09-25 01:24:24.799
4,18,DL1PD8509,GGN-MZF,2025-09-25 05:30:01.759,2025-09-25 10:31:09.886
5,18,DL1PD8509,MZF-DDN,2025-09-25 10:59:03.006,2025-09-25 13:55:11.070
6,18,DL1PD8509,DDN-MZF,2025-09-25 16:48:01.232,2025-09-25 19:39:14.022
7,18,DL1PD8509,MZF-GGN,2025-09-25 20:10:00.562,2025-09-26 02:57:00.087
8,18,DL1PD8509,GGN-MZF,2025-09-26 06:01:00.183,2025-09-26 10:41:03.289
9,18,DL1PD8509,MZF-DDN,2025-09-26 11:30:44.715,2025-09-26 14:29:04.174


In [15]:
# Energy statistics for each detected charger-to-charger trip
trip_stats = energy_mileage_trip_stats(df=df, trips_info=trips_info)

# Show the per-trip table
print("\nTrip Statistics:")
display(trip_stats)

2025-10-12 05:20:02 - INFO - ⚙️ Starting trip-level data processing...
2025-10-12 05:20:02 - INFO - ✅ All trip data processed successfully.



Trip Statistics:


Unnamed: 0,vehicle_id,trip,start_time,end_time,distance_km,energy_consumed_kwh,regen_energy_kwh,idling_energy_kwh,net_energy_kwh,energy_rate_kwh_per_km
0,18,GGN-MZF,2025-09-24 06:01:04.506,2025-09-24 10:36:01.279,197.12,214.72,47.13,6.82,167.59,0.85
1,18,MZF-DDN,2025-09-24 11:23:46.798,2025-09-24 14:08:00.191,125.12,170.71,9.71,3.5,161.01,1.29
2,18,DDN-MZF,2025-09-24 16:12:02.532,2025-09-24 19:55:03.100,127.5,145.37,39.3,7.16,106.08,0.83
3,18,MZF-GGN,2025-09-24 20:30:02.300,2025-09-25 01:24:24.799,216.62,239.03,68.69,4.28,170.33,0.79
4,18,GGN-MZF,2025-09-25 05:30:01.759,2025-09-25 10:31:09.886,207.5,213.63,53.69,3.43,159.94,0.77
5,18,MZF-DDN,2025-09-25 10:59:03.006,2025-09-25 13:55:11.070,125.5,130.42,23.62,6.85,106.8,0.85
6,18,DDN-MZF,2025-09-25 16:48:01.232,2025-09-25 19:39:14.022,127.62,174.56,17.64,3.42,156.92,1.23
7,18,MZF-GGN,2025-09-25 20:10:00.562,2025-09-26 02:57:00.087,209.62,291.79,52.7,5.48,239.1,1.14
8,18,GGN-MZF,2025-09-26 06:01:00.183,2025-09-26 10:41:03.289,197.88,224.76,39.75,3.69,185.01,0.93
9,18,MZF-DDN,2025-09-26 11:30:44.715,2025-09-26 14:29:04.174,125.5,198.17,15.63,6.13,182.54,1.45


In [16]:
# Persist the detected trips (the row index is written as the first column).
trips_info.to_csv(path_or_buf='trips_info.csv')

In [17]:
# Persist the per-trip statistics (the row index is written as the first column).
trip_stats.to_csv(path_or_buf='trip_stats.csv')

In [18]:
# Trips with a positive odometer distance
trip_stats.loc[trip_stats['distance_km'] > 0]

Unnamed: 0,vehicle_id,trip,start_time,end_time,distance_km,energy_consumed_kwh,regen_energy_kwh,idling_energy_kwh,net_energy_kwh,energy_rate_kwh_per_km
0,18,GGN-MZF,2025-09-24 06:01:04.506,2025-09-24 10:36:01.279,197.12,214.72,47.13,6.82,167.59,0.85
1,18,MZF-DDN,2025-09-24 11:23:46.798,2025-09-24 14:08:00.191,125.12,170.71,9.71,3.5,161.01,1.29
2,18,DDN-MZF,2025-09-24 16:12:02.532,2025-09-24 19:55:03.100,127.5,145.37,39.3,7.16,106.08,0.83
3,18,MZF-GGN,2025-09-24 20:30:02.300,2025-09-25 01:24:24.799,216.62,239.03,68.69,4.28,170.33,0.79
4,18,GGN-MZF,2025-09-25 05:30:01.759,2025-09-25 10:31:09.886,207.5,213.63,53.69,3.43,159.94,0.77
5,18,MZF-DDN,2025-09-25 10:59:03.006,2025-09-25 13:55:11.070,125.5,130.42,23.62,6.85,106.8,0.85
6,18,DDN-MZF,2025-09-25 16:48:01.232,2025-09-25 19:39:14.022,127.62,174.56,17.64,3.42,156.92,1.23
7,18,MZF-GGN,2025-09-25 20:10:00.562,2025-09-26 02:57:00.087,209.62,291.79,52.7,5.48,239.1,1.14
8,18,GGN-MZF,2025-09-26 06:01:00.183,2025-09-26 10:41:03.289,197.88,224.76,39.75,3.69,185.01,0.93
9,18,MZF-DDN,2025-09-26 11:30:44.715,2025-09-26 14:29:04.174,125.5,198.17,15.63,6.13,182.54,1.45


In [19]:
# Distinct route labels present in the statistics
trip_stats['trip'].unique()

array(['GGN-MZF', 'MZF-DDN', 'DDN-MZF', 'MZF-GGN', 'DDN-GGN', 'GGN-DDN'],
      dtype=object)