In [1]:
import sys
import platform
import logging
sys.path.append('..')

import pandas as pd
import numpy as np
import trino
from datetime import datetime, date, timedelta
from db_operations import connect_to_trino, write_df_to_iceberg

In [2]:
# Configure basic logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')


In [3]:
# ---- reporting config (edit ONLY this) ----
TABLE_NAME = "power_consumption_report"   # <‚Äî This is now the source of truth for the table name


# --------------------
# Step 2: Function to fetch data for a given day
# --------------------
def fetch_data_for_day(conn, date_str: str, ids: list = None) -> pd.DataFrame:

    logging.info(f"üì• [2/5] STEP 2a: Validating and fetching data for {date_str}...")

    try:
        target_date = datetime.strptime(date_str, "%Y-%m-%d").date()
        yesterday_date = target_date - timedelta(days=1)
    except ValueError as e:
        logging.error(f"Invalid date format: {e}")
        return pd.DataFrame()    
    
    # Conditional WHERE clause based on the 'ids' parameter
    where_clause = ""
    if ids is not None and len(ids) > 0:
        id_list_str = ", ".join(f"'{id}'" for id in ids)
        where_clause = f"AND id IN ({id_list_str})"

    query = f"""
        WITH two_days_data AS (
            SELECT 
                "id",
                at_timezone("timestamp", 'Asia/Kolkata') AS IST,
                "BAT_SOC", "Bat_Voltage", "Total_Battery_Current", "GUN_Connection_Status",
                "OdoMeterReading", "Gear_Position", "Vehiclereadycondition",
                "Chargingcontactor1positive", "Chargingcontactor1negative",
                "Chargingcontactor2positive", "Chargingcontactor2negative"
            FROM
                facts_prod.can_parsed_output_100
            WHERE
                dt = DATE '{target_date.isoformat()}' OR dt = DATE '{yesterday_date.isoformat()}'
                {where_clause}                
        )
        SELECT
            *
        FROM
            two_days_data
        WHERE
            CAST(IST AS DATE) = DATE '{target_date.isoformat()}'
    """
    try:
        cursor = conn.cursor()
        logging.info(f"‚öôÔ∏è [2/5] STEP 2b: Executing query for {target_date} and {yesterday_date}...")
        cursor.execute(query)
    
        logging.info("‚úÖ [2/5] STEP 2c: Query executed successfully!")
        data = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(data, columns=columns)
    except Exception as e:
        logging.error(f"‚ùå Error during data fetching: {e}")
        return pd.DataFrame()    

    return df

In [4]:
def process_soc_charging_data(df: pd.DataFrame):
    """
    Generates a summary, and performs outlier analysis for charging events
    across multiple device IDs.
    
    Args:
        df (pd.DataFrame): The input DataFrame containing raw data for one or more devices.
        
    Returns:
        pd.DataFrame: A DataFrame summarizing charging events for each device.
    """

    # Create a copy of the DataFrame to avoid modifying a slice.
    logging.info("‚öôÔ∏è STEP 3a: Starting data processing...")
    df = df.copy()    

    if not df.empty:
        df['id'] = df['id'].astype(str)
        df['Total_Battery_Current'] = pd.to_numeric(df['Total_Battery_Current'], errors='coerce')
        df['Total_Battery_Current'] = df['Total_Battery_Current'].astype(float)
        logging.info(f"‚úÖ [3/5] STEP 3b: Data fetching for {date_str} completed. Rows fetched: {len(df)}")
    else:
        logging.warning(f"‚ö†Ô∏è [3/5] STEP 3b: No data found for {date_str}. Returning empty DataFrame.")
    
    if df.empty or 'id' not in df.columns:
        logging.warning("Input DataFrame is empty or does not contain an 'id' column.")
        return pd.DataFrame()
        
    device_ids = df['id'].unique().tolist()
    all_summary_data = []
    
    for device_id in device_ids:
        logging.info(f"‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: {device_id}")
        device_df = df[df['id'] == device_id].copy()

        # Check if the grouped DataFrame is empty
        if device_df.empty:
            logging.warning(f"No charging events were detected for device {device_id}. Skipping.")
            continue        
        
        if 'timestamp' in device_df.columns:
            device_df.loc[:, 'ist_timestamp'] = pd.to_datetime(device_df['timestamp'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Kolkata')
        elif 'IST' in device_df.columns:
            device_df.loc[:, 'ist_timestamp'] = pd.to_datetime(device_df['IST'])
        else:
            logging.info(f"Neither 'timestamp' nor 'IST' column found for device {device_id}. Cannot proceed.")
            continue
            
        device_df.sort_values(by='ist_timestamp', inplace=True)
        device_df.reset_index(drop=True, inplace=True)

        current_threshold = 3200
        device_df = device_df[(device_df['Total_Battery_Current'].abs() != current_threshold)].copy()
        
        for col in ['BAT_SOC', 'Bat_Voltage', 'Chargingcontactor1positive',
                    'Chargingcontactor1negative', 'Chargingcontactor2positive',
                    'Chargingcontactor2negative', 'GUN_Connection_Status']:
            device_df.loc[:, col] = device_df[col].fillna(0)
        
        for col in ['BAT_SOC', 'Bat_Voltage']:
            device_df.loc[:, col] = device_df[col].replace(0.0, np.nan).ffill().bfill()
        
        device_df.loc[:, 'Total_Battery_Current'] = device_df['Total_Battery_Current'].fillna(0)

        # --- MODIFICATION START ---
        # Use GUN_Connection_Status for session detection
        charging_start_indices = device_df[device_df['GUN_Connection_Status'].diff() == 1].index.tolist()
        charging_end_indices = device_df[device_df['GUN_Connection_Status'].diff() == -1].index.tolist()
        # --- MODIFICATION END ---
        
        if not charging_start_indices and not device_df.empty and device_df.iloc[0]['GUN_Connection_Status'] == 1:
            charging_start_indices.insert(0, device_df.index[0])
        
        if len(charging_end_indices) < len(charging_start_indices):
            if len(charging_start_indices) > 0:
                charging_end_indices.append(device_df.index[-1])
            
        merged_events = []
        if len(charging_start_indices) > 0 and len(charging_end_indices) > 0:
            num_sessions = min(len(charging_start_indices), len(charging_end_indices))

            for i in range(num_sessions):
                start_index = charging_start_indices[i]
                end_index = charging_end_indices[i]
                
                if i > 0:
                    prev_end_time = device_df.loc[charging_end_indices[i-1], 'ist_timestamp']
                    current_start_time = device_df.loc[start_index, 'ist_timestamp']
                    prev_end_soc = device_df.loc[charging_end_indices[i-1], 'BAT_SOC']
                    current_start_soc = device_df.loc[start_index, 'BAT_SOC']

                    time_diff = (current_start_time - prev_end_time).total_seconds()
                    soc_diff = abs(current_start_soc - prev_end_soc)
                    
                    if (time_diff <= 5 * 60 and soc_diff <= 1.0) or (time_diff <= 60):
                        merged_events[-1] = (merged_events[-1][0], end_index)
                        continue
                
                merged_events.append((start_index, end_index))
        else:
            logging.info(f"No charging events were detected for device {device_id}.")
            continue

        summary_data_device = []
        BATTERY_CAPACITY_KWH = 423
        
        for start_index, end_index in merged_events:
            event_df = device_df.loc[start_index:end_index].copy()

            if event_df.empty:
                logging.info(f"Warning: Empty event data found for device {device_id}. Skipping.")
                continue            
                
            # MODIFICATION: Use 'GUN_Connection_Status' to filter for charging periods
            charging_periods = event_df[event_df['GUN_Connection_Status'] == 1].copy()
            
            total_duration = 0
            if not charging_periods.empty:
                charging_periods.loc[:, 'time_diff'] = charging_periods['ist_timestamp'].diff().dt.total_seconds().fillna(0)
                total_duration = int(charging_periods['time_diff'].sum())

            start_row = event_df.iloc[0].copy()
            end_row = event_df.iloc[-1].copy()
            
            energy_Wh = 0
            if not charging_periods.empty:
                charging_periods.loc[:, 'power_W'] = charging_periods['Bat_Voltage'] * charging_periods['Total_Battery_Current'].abs()
                energy_Wh = np.trapezoid(charging_periods['power_W'], x=charging_periods['ist_timestamp'].astype(np.int64) / 10**9) / 3600
            
            total_kwh_consumed_tpc = energy_Wh / 1000

            total_kwh_consumed_soc = (end_row['BAT_SOC'] - start_row['BAT_SOC']) * BATTERY_CAPACITY_KWH / 100
            total_kwh_consumed_soc = abs(total_kwh_consumed_soc)

            percent_diff = 0
            if total_kwh_consumed_tpc + total_kwh_consumed_soc != 0:
                percent_diff = (abs(total_kwh_consumed_tpc - total_kwh_consumed_soc) / 
                                ((total_kwh_consumed_tpc + total_kwh_consumed_soc) / 2)) * 100
            
            summary_data_device.append({
                'vehicle_id': device_id,
                'start_time': start_row['ist_timestamp'],
                'end_time': end_row['ist_timestamp'],
                'charge_dur_seconds': round(total_duration,0),
                'soc_start': start_row['BAT_SOC'],
                'soc_end': end_row['BAT_SOC'],
                'tpc_kwh': round(total_kwh_consumed_tpc,2),
                'soc_kwh': round(total_kwh_consumed_soc,2),
                'diff_kwh_percent': round(percent_diff,2)
            })
        
        all_summary_data.extend(summary_data_device)

    logging.info("‚úÖ STEP 3d: All vehicle data processed.")
    return pd.DataFrame(all_summary_data)

In [5]:
# --------------------
# Main execution logic
# --------------------

conn = connect_to_trino()

if conn:
    try:
        # Define the vehicle IDs for the report
        vehicle_ids_for_report = ['3', '16', '18', '19']

        # Get the date to process
        yesterday = date.today() - timedelta(days=1)
        date_str = yesterday.isoformat()
        
        logging.info(f"‚ñ∂Ô∏è Starting daily report job for {date_str}")

        # Fetch data using the logic from this file
        df_raw = fetch_data_for_day(conn, date_str, vehicle_ids_for_report)
        
        if not df_raw.empty:
            # Process the data
            df_processed = process_soc_charging_data(df_raw)
            
            if not df_processed.empty:
                # Write to the database (uncomment to enable)
                # write_df_to_iceberg(df_processed)
                logging.info("‚úÖ STEP 4: Processing and write for all IDs complete.")
            else:
                logging.info("Processed DataFrame is empty. No data to write.")
        else:
            logging.info("Raw DataFrame is empty. No processing needed.")

    except Exception as e:
        logging.critical(f"‚ùå A critical error occurred in the main script: {e}")
        
    finally:
        logging.info("üîí STEP 5: Closing Trino connection...")
        conn.close()
        logging.info("‚úÖ STEP 5: Connection closed.")
else:
    logging.critical("‚ùå Failed to establish a database connection. Exiting.")

2025-09-15 18:14:05 - INFO - ‚ñ∂Ô∏è Starting daily report job for 2025-09-14
2025-09-15 18:14:05 - INFO - üì• [2/5] STEP 2a: Validating and fetching data for 2025-09-14...
2025-09-15 18:14:05 - INFO - ‚öôÔ∏è [2/5] STEP 2b: Executing query for 2025-09-14 and 2025-09-13...


üîå [1/5] STEP 1: Connecting to Trino...
‚úÖ [1/5] STEP 1: Connected to Trino


2025-09-15 18:14:20 - INFO - ‚úÖ [2/5] STEP 2c: Query executed successfully!
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3a: Starting data processing...
2025-09-15 18:14:25 - INFO - ‚úÖ [3/5] STEP 3b: Data fetching for 2025-09-14 completed. Rows fetched: 302586
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: 13
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: 15
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: 16
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: 3
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: 7
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: 11
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: 14
2025-09-15 18:14:25 - INFO - ‚öôÔ∏è STEP 3c: Processing charging data for vehicle ID: 6
2025-09-15 18:14:25 - INFO - No 