In [5]:
# cols = df.filter(regex='^(Pack1_cell)', axis=1).columns
# cols

In [1]:
!python3 --version 

Python 3.13.7


In [121]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os
import logging
import math

# Send INFO-and-above messages to the notebook output, each prefixed with a timestamp and level.
# `logger` is the logger the analysis function below uses for its errors, warnings and status lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def generate_and_analyze_report(raw_file_path):
    """
    Build a PDF report and a CSV summary of the charging events in a raw vehicle log.

    A charging event is a span in which ``Chargingcontactor1positive`` steps from
    0 to 1 and later back to 0. Consecutive spans are merged when they are at most
    60 s apart, or at most 5 min apart with an SOC change of at most 1 point.
    For every merged event the function records:

    * ``ttl_dur``        - seconds covered by the rows with ``GUN_Connection_Status == 1``
    * ``tpc_kwh``        - energy integrated from ``Bat_Voltage * |Total_Battery_Current|``
    * ``soc_kwh``        - energy implied by the SOC change and the pack capacity (signed)
    * ``diff_kw_percent``- symmetric percentage difference between the two energies
    * ``full_soc_time``  - seconds until SOC first reached 100 (only if the event ends at 100)

    Outputs go to a ``socAnalysis`` folder next to the input file:
    ``<name>_charging_event_summary.pdf`` (one SOC/current plot per event, a summary
    table and two bar charts) and ``<name>_charging_event_summary.csv`` (unformatted
    summary values).

    Args:
        raw_file_path (str): The path to the raw data CSV file. The file needs a
            ``timestamp`` (epoch milliseconds) or ``IST`` column plus ``BAT_SOC``,
            ``Bat_Voltage``, ``Total_Battery_Current``, ``GUN_Connection_Status``
            and ``Chargingcontactor1positive``.

    Returns:
        None. Problems (missing file, missing time column, no events) are logged
        and the function returns early.
    """

    def _format_hms(seconds):
        """Render a duration in seconds as HH:MM:SS, or 'N/A' when it is missing."""
        if not pd.notna(seconds):
            return 'N/A'
        return f"{int(seconds // 3600):02d}:{int((seconds % 3600) // 60):02d}:{int(seconds % 60):02d}"

    # Read the input first: creating the output folder before this point would
    # build stray directories whenever the path is mistyped.
    try:
        df = pd.read_csv(raw_file_path)
    except FileNotFoundError:
        logger.error(f"Error: The raw file '{raw_file_path}' was not found.")
        return

    # Output locations are derived from the input file name.
    base_dir = os.path.dirname(raw_file_path)
    output_folder = os.path.join(base_dir, 'socAnalysis')
    # splitext keeps dots that are part of the file name (split('.')[0] cut them off).
    output_base_name = os.path.splitext(os.path.basename(raw_file_path))[0]
    output_csv_filename = os.path.join(output_folder, f'{output_base_name}_charging_event_summary.csv')
    output_pdf_filename = os.path.join(output_folder, f'{output_base_name}_charging_event_summary.pdf')

    if 'timestamp' in df.columns:
        # Epoch milliseconds -> tz-aware timestamps in Asia/Kolkata.
        df['IST'] = pd.to_datetime(df['timestamp'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Kolkata')
    elif 'IST' in df.columns:
        df['IST'] = pd.to_datetime(df['IST'])
    else:
        logger.error("Neither 'timestamp' nor 'IST' column found. Cannot proceed.")
        return

    df = df.sort_values(by='IST').reset_index(drop=True)

    # Rows whose |current| equals exactly 3200 are dropped.
    # NOTE(review): presumably a sensor sentinel/invalid reading - confirm.
    current_threshold = 3200
    df = df[df['Total_Battery_Current'].abs() != current_threshold].copy()
    # A reading of 0.0 for SOC / voltage is treated as missing and filled from neighbours.
    for col in ['BAT_SOC', 'Bat_Voltage']:
        df[col] = df[col].replace(0.0, np.nan).ffill().bfill()

    # Missing current samples count as zero current.
    df['Total_Battery_Current'] = df['Total_Battery_Current'].fillna(0)

    df = df.dropna(subset=['BAT_SOC', 'Bat_Voltage', 'Chargingcontactor1positive'])

    if df.empty:
        # Nothing survived cleaning; df.iloc[0] below would raise IndexError.
        logger.warning("No charging events were detected.")
        return

    # Contactor edges: +1 marks the first row of a charging span, -1 the first row after it.
    contactor_step = df['Chargingcontactor1positive'].diff()
    charging_start_indices = df[contactor_step == 1].index.tolist()
    charging_end_indices = df[contactor_step == -1].index.tolist()

    # The log may begin or end in the middle of a charging span.
    if df.iloc[0]['Chargingcontactor1positive'] == 1 and (not charging_start_indices or charging_start_indices[0] != df.index[0]):
        charging_start_indices.insert(0, df.index[0])

    if df.iloc[-1]['Chargingcontactor1positive'] == 1 and len(charging_start_indices) > len(charging_end_indices):
        charging_end_indices.append(df.index[-1])

    if not charging_start_indices or not charging_end_indices:
        logger.warning("No charging events were detected.")
        return

    # Merge spans separated only by a short interruption.
    time_threshold_seconds = 5 * 60
    soc_threshold = 1.0

    merged_events = []
    current_start_index = charging_start_indices[0]
    current_end_index = charging_end_indices[0]

    for i in range(1, len(charging_start_indices)):
        prev_end_index = charging_end_indices[i - 1]
        next_start_index = charging_start_indices[i]

        time_diff = (df.loc[next_start_index, 'IST'] - df.loc[prev_end_index, 'IST']).total_seconds()
        soc_diff = abs(df.loc[next_start_index, 'BAT_SOC'] - df.loc[prev_end_index, 'BAT_SOC'])

        if (time_diff <= time_threshold_seconds and soc_diff <= soc_threshold) or (time_diff <= 60):
            # Same event: just extend its end.
            current_end_index = charging_end_indices[i]
        else:
            merged_events.append((current_start_index, current_end_index))
            current_start_index = charging_start_indices[i]
            current_end_index = charging_end_indices[i]

    merged_events.append((current_start_index, current_end_index))

    summary_data = []
    BATTERY_CAPACITY_KWH = 423
    # np.trapezoid only exists in NumPy >= 2.0; older releases call it np.trapz.
    integrate = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

    os.makedirs(output_folder, exist_ok=True)

    with PdfPages(output_pdf_filename) as pdf:
        for i, (start_index, end_index) in enumerate(merged_events):
            event_df = df.loc[start_index:end_index]

            # Only rows with the charging gun connected count towards duration and energy.
            charging_periods = event_df[event_df['GUN_Connection_Status'] == 1]

            total_duration = 0
            if not charging_periods.empty:
                total_duration = charging_periods['IST'].diff().dt.total_seconds().fillna(0).sum()

            start_row = event_df.iloc[0]
            end_row = event_df.iloc[-1]

            # Time needed to reach 100 % SOC, reported only when the event ends full.
            full_soc_time = None
            if end_row['BAT_SOC'] >= 100.0:
                at_full = charging_periods[charging_periods['BAT_SOC'] >= 100.0]
                if not at_full.empty:
                    first_full_timestamp = at_full['IST'].iloc[0]
                    up_to_full = charging_periods[charging_periods['IST'] <= first_full_timestamp]
                    full_soc_time = up_to_full['IST'].diff().dt.total_seconds().fillna(0).sum()

            energy_Wh = 0
            if not charging_periods.empty:
                power_W = charging_periods['Bat_Voltage'] * charging_periods['Total_Battery_Current'].abs()
                # Seconds since the first sample: independent of the datetime
                # resolution (ns/us/ms), unlike casting to int64 and dividing by 1e9.
                elapsed_s = (charging_periods['IST'] - charging_periods['IST'].iloc[0]).dt.total_seconds()
                energy_Wh = integrate(power_W.to_numpy(), x=elapsed_s.to_numpy()) / 3600

            total_kwh_consumed_tpc = energy_Wh / 1000

            total_kwh_consumed_soc = (end_row['BAT_SOC'] - start_row['BAT_SOC']) * BATTERY_CAPACITY_KWH / 100

            # Compare magnitudes: tpc_kwh is never negative, so a falling SOC would
            # otherwise push the mean in the denominator to zero or below.
            soc_kwh_magnitude = abs(total_kwh_consumed_soc)
            percent_diff = 0
            mean_kwh = (total_kwh_consumed_tpc + soc_kwh_magnitude) / 2
            if mean_kwh != 0:
                percent_diff = (abs(total_kwh_consumed_tpc - soc_kwh_magnitude) / mean_kwh) * 100

            summary_data.append({
                'evnt_id': (i + 1),
                'start_time': start_row['IST'],
                'end_time': end_row['IST'],
                'ttl_dur': total_duration,
                'soc_start': start_row['BAT_SOC'],
                'soc_end': end_row['BAT_SOC'],
                'tpc_kwh': total_kwh_consumed_tpc,
                'soc_kwh': total_kwh_consumed_soc,
                'diff_kw_percent': percent_diff,
                'full_soc_time': full_soc_time
            })

            # --- One page per event: SOC (left axis) and |current| (right axis) ---
            fig, ax1 = plt.subplots(figsize=(12, 6))

            color = 'tab:blue'
            ax1.set_xlabel('Timestamp (IST)')
            ax1.set_ylabel('Battery SOC (%)', color=color)
            ax1.plot(event_df['IST'], event_df['BAT_SOC'], marker='o', linestyle='-', color=color, label='Battery SOC')
            ax1.tick_params(axis='y', labelcolor=color)
            ax1.grid(True)

            ax2 = ax1.twinx()
            color = 'tab:red'
            ax2.set_ylabel('Total Battery Current (A)', color=color)
            ax2.plot(event_df['IST'], event_df['Total_Battery_Current'].abs(), 'x', markersize=2, linestyle='--', color='r', label='Total Battery Current (A)')
            ax2.tick_params(axis='y', labelcolor=color)

            plt.title(f'Battery SOC and Current vs. Time for Charging Event {i+1}')
            fig.tight_layout()
            pdf.savefig(fig)
            plt.close(fig)

        summary_df = pd.DataFrame(summary_data)
        # Events whose SOC did not change are left out of the table, CSV and bar charts.
        summary_df = summary_df[summary_df['soc_start'] != summary_df['soc_end']].reset_index(drop=True)

        if summary_df.empty:
            # The per-event pages already added above are still written when the PDF closes.
            logger.warning("No meaningful charging events found after filtering. No report will be generated.")
            return

        # Human-readable copy for the PDF table; summary_df keeps the raw values.
        summary_df_table = summary_df.copy()
        summary_df_table['start_time'] = summary_df_table['start_time'].dt.strftime('%d/%m %H:%M:%S')
        summary_df_table['end_time'] = summary_df_table['end_time'].dt.strftime('%d/%m %H:%M:%S')

        summary_df_table['ttl_dur'] = summary_df_table['ttl_dur'].apply(_format_hms)
        summary_df_table['full_soc_time'] = summary_df_table['full_soc_time'].apply(_format_hms)

        summary_df_table['tpc_kwh'] = summary_df_table['tpc_kwh'].round(2)
        summary_df_table['soc_kwh'] = summary_df_table['soc_kwh'].round(2)
        summary_df_table['diff_kw_percent'] = summary_df_table['diff_kw_percent'].round(2)

        # The CSV receives the unformatted values (full timestamps, durations in seconds).
        summary_df.to_csv(output_csv_filename, index=False)

        # --- Summary table page ---
        fig_summary, ax_summary = plt.subplots(figsize=(11.69, 8.27))  # A4 size in inches
        ax_summary.axis('off')

        table = ax_summary.table(
            cellText=summary_df_table.values,
            colLabels=summary_df_table.columns,
            loc='center',
            cellLoc='center',
            # One width per summary column (10 columns).
            colWidths=[0.05, 0.12, 0.12, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08]
        )
        table.auto_set_font_size(False)
        table.set_fontsize(8)
        table.scale(1.2, 1.2)
        ax_summary.set_title("Summary of Charging Events", fontsize=16)

        fig_summary.tight_layout()
        pdf.savefig(fig_summary)
        plt.close(fig_summary)

        # --- Aggregated bar charts ---
        # Position bars by evnt_id so they carry the same event numbers as the
        # per-event page titles and the summary table.
        bar_width = 0.4
        event_ids = summary_df['evnt_id'].to_numpy()

        # Plot 1: start vs. end SOC per event
        fig_soc, ax_soc = plt.subplots(figsize=(12, 6))
        ax_soc.bar(event_ids - bar_width / 2, summary_df['soc_start'], bar_width, label='Start SOC')
        ax_soc.bar(event_ids + bar_width / 2, summary_df['soc_end'], bar_width, label='End SOC')
        ax_soc.set_xlabel('Charging Event Index')
        ax_soc.set_ylabel('SOC (%)')
        ax_soc.set_title('Start and End SOC for Each Charging Event')
        ax_soc.legend()
        fig_soc.tight_layout()
        pdf.savefig(fig_soc)
        plt.close(fig_soc)

        # Plot 2: integrated energy vs. SOC-implied energy per event
        fig_energy, ax_energy = plt.subplots(figsize=(12, 6))
        ax_energy.bar(event_ids - bar_width / 2, summary_df['tpc_kwh'], bar_width, label='TPC kWh')
        ax_energy.bar(event_ids + bar_width / 2, summary_df['soc_kwh'], bar_width, label='SOC kWh')
        ax_energy.set_xlabel('Charging Event Index')
        ax_energy.set_ylabel('Energy (kWh)')
        ax_energy.set_title('TPC kWh vs. SOC kWh for Each Charging Event')
        ax_energy.legend()
        fig_energy.tight_layout()
        pdf.savefig(fig_energy)
        plt.close(fig_energy)

    logger.info(f"\nPDF report saved to '{output_pdf_filename}'.")
    logger.info(f"\nCSV summary saved to '{output_csv_filename}'.")
    

# Generate the report for one vehicle log at a time; the commented calls are the other
# available logs (HR55AY7626, HR55AY9237, T0825AP8224BE, T0825AP8227BE).
# generate_and_analyze_report('HR55AY9237/HR55AY9237_01310825.csv')
# generate_and_analyze_report('HR55AY7626/HR55AY7626_01310825.csv')
# generate_and_analyze_report('T0825AP8224BE/T0825AP8224BE_01310825.csv')
generate_and_analyze_report('T0825AP8227BE/T0825AP8227BE_01310825.csv')

2025-09-09 15:44:02,220 - INFO - 
PDF report saved to 'T0825AP8227BE/socAnalysis/T0825AP8227BE_01310825_charging_event_summary.pdf'.
2025-09-09 15:44:02,220 - INFO - 
CSV summary saved to 'T0825AP8227BE/socAnalysis/T0825AP8227BE_01310825_charging_event_summary.csv'.


In [None]:
Logic to run the charging-event analysis and build the summary table for the DB:

In [8]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os
import logging
import math

# Same logging setup as in the report cell earlier in the notebook. logging.basicConfig()
# does nothing when the root logger already has handlers, so running this again is harmless.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


In [9]:
# Vehicle logs available: HR55AY7626, HR55AY9237, T0825AP8224BE, T0825AP8227BE

# Columns kept for the charging-event analysis.
cols = [
    "id", "IST", "BAT_SOC", "Bat_Voltage", "Total_Battery_Current", "GUN_Connection_Status",
    "GUNA_DCP_Temperature", "GUNA_DCM_Temperature",
    "GUNB_DCP_Temperature", "GUNB_DCM_Temperature",
    "Chargingcontactor1positive", "Chargingcontactor1negative",
    "Chargingcontactor2positive", "Chargingcontactor2negative",
]

df = pd.read_csv('HR55AY7626/HR55AY7626_01310825.csv')
# `timestamp` holds epoch milliseconds; expose it as tz-aware Asia/Kolkata time.
df['IST'] = (
    pd.to_datetime(df['timestamp'], unit='ms')
    .dt.tz_localize('UTC')
    .dt.tz_convert('Asia/Kolkata')
)
df = df[cols]

In [10]:
def process_soc_charging_data(df: pd.DataFrame):
    """
    Summarise the charging events found in a raw vehicle-log DataFrame.

    A charging event is a span in which ``Chargingcontactor1positive`` steps from
    0 to 1 and later back to 0. Consecutive spans are merged when they are at most
    60 s apart, or at most 5 min apart with an SOC change of at most 1 point.
    The input frame is not modified; all work happens on a copy.

    Args:
        df (pd.DataFrame): Raw log rows. Needs a ``timestamp`` (epoch milliseconds)
            or ``IST`` column plus ``BAT_SOC``, ``Bat_Voltage``,
            ``Total_Battery_Current``, ``GUN_Connection_Status`` and
            ``Chargingcontactor1positive``. An ``id`` column, when present,
            supplies ``veh_id`` (taken from the first row).

    Returns:
        pd.DataFrame | None: One row per merged event with the columns
        ``veh_id``, ``start_time`` / ``end_time`` (tz-naive local wall time,
        truncated to whole seconds), ``charge_dur`` (``HH:MM:SS`` string),
        ``soc_start``, ``soc_end``, ``tpc_kwh`` (energy integrated from
        voltage x |current|), ``soc_kwh`` (|SOC change| x pack capacity) and
        ``diff_kw_percent`` (symmetric percentage difference), the numeric
        energy columns rounded to 2 decimals. ``None`` is returned, after
        logging, when no time column exists or no charging event is found.
    """
    # Work on a private copy: the steps below add a column, re-sort and re-index,
    # none of which should leak into the caller's frame.
    df = df.copy()

    if 'timestamp' in df.columns:
        # Epoch milliseconds -> tz-aware timestamps in Asia/Kolkata.
        df['IST'] = pd.to_datetime(df['timestamp'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Kolkata')
    elif 'IST' in df.columns:
        df['IST'] = pd.to_datetime(df['IST'])
    else:
        logger.error("Neither 'timestamp' nor 'IST' column found. Cannot proceed.")
        return

    df = df.sort_values(by='IST').reset_index(drop=True)

    # Rows whose |current| equals exactly 3200 are dropped.
    # NOTE(review): presumably a sensor sentinel/invalid reading - confirm.
    current_threshold = 3200
    df = df[df['Total_Battery_Current'].abs() != current_threshold].copy()
    # A reading of 0.0 for SOC / voltage is treated as missing and filled from neighbours.
    for col in ['BAT_SOC', 'Bat_Voltage']:
        df[col] = df[col].replace(0.0, np.nan).ffill().bfill()

    # Missing current samples count as zero current.
    df['Total_Battery_Current'] = df['Total_Battery_Current'].fillna(0)

    df = df.dropna(subset=['BAT_SOC', 'Bat_Voltage', 'Chargingcontactor1positive'])

    if df.empty:
        # Nothing survived cleaning; df.iloc[0] below would raise IndexError.
        logger.warning("No charging events were detected.")
        return

    # Contactor edges: +1 marks the first row of a charging span, -1 the first row after it.
    contactor_step = df['Chargingcontactor1positive'].diff()
    charging_start_indices = df[contactor_step == 1].index.tolist()
    charging_end_indices = df[contactor_step == -1].index.tolist()

    # The log may begin or end in the middle of a charging span.
    if df.iloc[0]['Chargingcontactor1positive'] == 1 and (not charging_start_indices or charging_start_indices[0] != df.index[0]):
        charging_start_indices.insert(0, df.index[0])

    if df.iloc[-1]['Chargingcontactor1positive'] == 1 and len(charging_start_indices) > len(charging_end_indices):
        charging_end_indices.append(df.index[-1])

    if not charging_start_indices or not charging_end_indices:
        logger.warning("No charging events were detected.")
        return

    # Merge spans separated only by a short interruption.
    time_threshold_seconds = 5 * 60
    soc_threshold = 1.0

    merged_events = []
    current_start_index = charging_start_indices[0]
    current_end_index = charging_end_indices[0]

    for i in range(1, len(charging_start_indices)):
        prev_end_index = charging_end_indices[i - 1]
        next_start_index = charging_start_indices[i]

        time_diff = (df.loc[next_start_index, 'IST'] - df.loc[prev_end_index, 'IST']).total_seconds()
        soc_diff = abs(df.loc[next_start_index, 'BAT_SOC'] - df.loc[prev_end_index, 'BAT_SOC'])

        if (time_diff <= time_threshold_seconds and soc_diff <= soc_threshold) or (time_diff <= 60):
            # Same event: just extend its end.
            current_end_index = charging_end_indices[i]
        else:
            merged_events.append((current_start_index, current_end_index))
            current_start_index = charging_start_indices[i]
            current_end_index = charging_end_indices[i]

    merged_events.append((current_start_index, current_end_index))

    summary_data = []
    BATTERY_CAPACITY_KWH = 423
    # np.trapezoid only exists in NumPy >= 2.0; older releases call it np.trapz.
    integrate = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
    # The vehicle id is the same for every event, so look it up once.
    # A missing 'id' column yields None instead of a KeyError.
    veh_id = df['id'].iloc[0] if 'id' in df.columns else None

    for start_index, end_index in merged_events:
        event_df = df.loc[start_index:end_index]

        # Only rows with the charging gun connected count towards duration and energy.
        charging_periods = event_df[event_df['GUN_Connection_Status'] == 1]

        total_duration = 0
        energy_Wh = 0
        if not charging_periods.empty:
            total_duration = charging_periods['IST'].diff().dt.total_seconds().fillna(0).sum()

            power_W = charging_periods['Bat_Voltage'] * charging_periods['Total_Battery_Current'].abs()
            # Seconds since the first sample: independent of the datetime
            # resolution (ns/us/ms), unlike casting to int64 and dividing by 1e9.
            elapsed_s = (charging_periods['IST'] - charging_periods['IST'].iloc[0]).dt.total_seconds()
            energy_Wh = integrate(power_W.to_numpy(), x=elapsed_s.to_numpy()) / 3600

        start_row = event_df.iloc[0]
        end_row = event_df.iloc[-1]

        total_kwh_consumed_tpc = energy_Wh / 1000
        total_kwh_consumed_soc = abs((end_row['BAT_SOC'] - start_row['BAT_SOC']) * BATTERY_CAPACITY_KWH / 100)

        # Symmetric percentage difference; both energies are non-negative here.
        percent_diff = 0
        mean_kwh = (total_kwh_consumed_tpc + total_kwh_consumed_soc) / 2
        if mean_kwh != 0:
            percent_diff = (abs(total_kwh_consumed_tpc - total_kwh_consumed_soc) / mean_kwh) * 100

        summary_data.append({
            'veh_id': veh_id,
            'start_time': start_row['IST'],
            'end_time': end_row['IST'],
            'charge_dur': total_duration,
            'soc_start': start_row['BAT_SOC'],
            'soc_end': end_row['BAT_SOC'],
            'tpc_kwh': total_kwh_consumed_tpc,
            'soc_kwh': total_kwh_consumed_soc,
            'diff_kw_percent': percent_diff
        })

    summary_df = pd.DataFrame(summary_data)

    if summary_df.empty:
        logger.warning("No meaningful charging events found after filtering. No report will be generated.")
        return

    summary_df_table = summary_df.copy()
    # Keep the local wall-clock time but drop the timezone and sub-second part
    # (what the former strftime -> to_datetime round trip produced).
    for time_col in ('start_time', 'end_time'):
        summary_df_table[time_col] = summary_df_table[time_col].dt.tz_localize(None).dt.floor('s')

    summary_df_table['charge_dur'] = summary_df_table['charge_dur'].apply(
        lambda x: f"{int(x // 3600):02d}:{int((x % 3600) // 60):02d}:{int(x % 60):02d}" if pd.notna(x) else 'N/A'
    )
    summary_df_table['tpc_kwh'] = summary_df_table['tpc_kwh'].round(2)
    summary_df_table['soc_kwh'] = summary_df_table['soc_kwh'].round(2)
    summary_df_table['diff_kw_percent'] = summary_df_table['diff_kw_percent'].round(2)

    return summary_df_table

# process_soc_charging_data() returns None when it cannot build a summary; wrapping the
# result in pd.DataFrame(...) turns that case into an empty frame instead of None.
df_final = pd.DataFrame(process_soc_charging_data(df))
df_final.head()

Unnamed: 0,veh_id,start_time,end_time,charge_dur,soc_start,soc_end,tpc_kwh,soc_kwh,diff_kw_percent
0,3,2025-08-01 19:12:30,2025-08-01 19:30:09,00:17:38,22.4,28.8,29.07,27.07,7.1
1,3,2025-08-14 16:08:09,2025-08-14 16:21:37,00:13:28,20.8,32.8,53.52,50.76,5.29
2,3,2025-08-18 15:13:42,2025-08-18 15:15:44,00:02:02,22.8,23.2,1.7,1.69,0.26
3,3,2025-08-18 15:29:21,2025-08-18 15:38:54,00:09:32,28.8,32.4,15.79,15.23,3.65
4,3,2025-08-18 15:44:31,2025-08-18 15:55:51,00:11:20,32.4,37.2,22.94,20.3,12.19


In [11]:

# Number of charging events in the summary (one row per merged event).
len(df_final)

31