In [21]:
!pwd

/Users/apple/Documents/naarni/repo/explore_data/4.BCS_TMS_analysis


In [1]:
# # BCS / TMS Analysis ‚Äì Staged, Memory-Safe Pipeline
#
# This notebook restructures the original `3.bcs_tms_analysis.py` into
# **stages**, so that:
#
# - Large DataFrames are created once per stage.
# - Intermediate results are saved to disk (Feather/CSV) and reloaded later.
# - We explicitly delete big objects and trigger garbage collection.
# - PDF/table generation releases figure memory after use.
#
# Stages:
# 1. Load raw monthly CSV ‚Üí preprocess ‚Üí add state ‚Üí save `df_with_state.feather`
# 2. Battery condition analysis (vehicle-wise + fleet) using `df_with_state.feather`
# 3. SoC session analysis + SoC accuracy PDF using `df_with_state.feather`
# 4. Time-weighted energy metrics using saved SoC sessions
#
# Run one stage at a time to keep memory low on an 8 GB M1.
import os
import sys
import gc
import ctypes
import numpy as np
import pandas as pd
import platform
import logging
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
from datetime import datetime, timedelta
import pyarrow.feather as ft

In [2]:
pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 50)
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
print(f"Using Python version: {platform.python_version()}")
repo_path = '/Users/apple/Documents/naarni/repo/dview-naarni-data-platform'
if repo_path not in sys.path:
    sys.path.append(os.path.join(repo_path, 'tasks'))
from common.db_operations import connect_to_trino, fetch_data_for_day
def free_mem():
    """Try to return freed memory back to the OS (no-op on some platforms)."""
    try:
        libc = ctypes.CDLL(None)
        if hasattr(libc, "malloc_trim"):
            libc.malloc_trim(0)
    except Exception:
        pass

Using Python version: 3.13.7


In [3]:
CORE_COLS = [
    "id", "timestamp", "dt",
    "vehiclereadycondition", "gun_connection_status", "ignitionstatus",
    "vehicle_speed_vcu", "gear_position",
    "bat_soc", "soh", "total_battery_current",
    "pack1_cellmax_temperature", "pack1_cell_min_temperature",
    "pack1_maxtemperature_cell_number", "pack1_celltemperature_cellnumber",
    "bat_voltage", "cellmax_voltagecellnumber", "cellmax_voltagecellnumber",
    "cellminvoltagecellnumber", "cell_min_voltage",
    "dcdcbus",
]

In [4]:
def rename_battery_temp_columns(df: pd.DataFrame) -> pd.DataFrame:
    rename_map = {
        "pack1_cellmax_temperature": "batt_maxtemp",
        "pack1_cell_min_temperature": "batt_mintemp",
        "pack1_maxtemperature_cell_number":"batt_maxtemp_pack", 
        "pack1_celltemperature_cellnumber":"batt_mintemp_pack",
        "batt_maxvolt":"batt_maxvolt",
        "cellmax_voltagecellnumber":"batt_maxvolt_cell",
        "cell_min_voltage":"batt_minvolt",
        "cellminvoltagecellnumber":"batt_minvolt_cell", 
    }
    existing = {k: v for k, v in rename_map.items() if k in df.columns}
    if not existing:
        logging.warning("No matching temperature columns found to rename.")
        return df
    return df.rename(columns=existing)

In [5]:
def impute_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values(["id", "timestamp"])
    for vid, grp in df.groupby("id", sort=False):
        idx = grp.index
        if "batt_maxtemp" in df.columns:
            df.loc[idx, "batt_maxtemp"] = grp["batt_maxtemp"].ffill(limit=60)
        if "batt_mintemp" in df.columns:
            df.loc[idx, "batt_mintemp"] = grp["batt_mintemp"].ffill(limit=60)
        if "batt_maxvolt" in df.columns:
            df.loc[idx, "batt_maxvolt"] = grp["batt_maxvolt"].ffill(limit=30)
        if "batt_minvolt" in df.columns:
            df.loc[idx, "batt_minvolt"] = grp["batt_minvolt"].ffill(limit=30)
        if "bat_voltage" in df.columns:
            df.loc[idx, "bat_voltage"] = grp["bat_voltage"].ffill(limit=20)
        if "bat_soc" in df.columns:
            df.loc[idx, "bat_soc"] = grp["bat_soc"].ffill(limit=300)
        if "soh" in df.columns:
            df.loc[idx, "soh"] = grp["soh"].ffill(limit=300)
        if "total_battery_current" in df.columns:
            df.loc[idx, "total_battery_current"] = grp["total_battery_current"].interpolate(
                limit=10, limit_direction="both"
            )
        if "vehiclereadycondition" in df.columns:
            df.loc[idx, "vehiclereadycondition"] = grp["vehiclereadycondition"].ffill()
        if "gun_connection_status" in df.columns:
            df.loc[idx, "gun_connection_status"] = grp["gun_connection_status"].ffill()
    return df

In [6]:
def prepare_df_with_state(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["timestamp"] = pd.to_datetime(out["timestamp"], errors="coerce")
    out = out.dropna(subset=["timestamp"]).sort_values(["id", "timestamp"]).reset_index(drop=True)
    gcs_raw = out["gun_connection_status"]
    gcs_num = pd.to_numeric(gcs_raw, errors="coerce")
    gcs_str = gcs_raw.astype(str).str.strip().str.lower()
    gun_connected = (gcs_num == 1) | gcs_str.isin({"1", "true", "yes", "y", "connected", "on"})
    out["mode"] = np.where(gun_connected, "CHARGING", "DISCHARGING")
    out["batt_maxtemp"] = pd.to_numeric(out.get("batt_maxtemp"), errors="coerce")
    out["batt_mintemp"] = pd.to_numeric(out.get("batt_mintemp"), errors="coerce")
    out["batt_maxvolt"] = pd.to_numeric(out.get("batt_maxvolt"), errors="coerce")
    out["batt_minvolt"] = pd.to_numeric(out.get("batt_minvolt"), errors="coerce")
    out["batt_temp_delta"] = out["batt_maxtemp"] - out["batt_mintemp"]
    out["volt_delta_mv"] = (out["batt_maxvolt"] - out["batt_minvolt"]) * 1000.0
    out["dt_sec"] = out.groupby("id")["timestamp"].diff().dt.total_seconds().fillna(0)
    cols_keep = [
        "id", "timestamp", "mode",
        "vehiclereadycondition", "gun_connection_status",
        "batt_maxtemp", "batt_mintemp", "batt_temp_delta",
        "batt_maxvolt", "batt_minvolt", "volt_delta_mv",
        "batt_maxtemp_pack","batt_mintemp_pack",
        "batt_maxvolt_cell","batt_minvolt_cell",
        "bat_voltage", "total_battery_current",
        "bat_soc", "soh", "dt_sec",
    ]
    cols_keep = [c for c in cols_keep if c in out.columns]
    out = out[cols_keep]
    return out

In [7]:
# =====================================================================
# PDF REPORT ENGINE (MODULAR + REUSABLE)
# =====================================================================
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import gc

class PdfReport:
    """
    A modular wrapper for PDF generation.
    - Handles PdfPages context
    - Provides create_page() for new fig/ax
    - Provides add_table() for uniform styling
    """

    def __init__(
        self,
        output_path: str,
        font_family: str = "DejaVu Sans Mono",
        font_size: float = 8.0,
        show_grid: bool = True
    ):
        self.output_path = output_path
        self.font_family = font_family
        self.font_size = font_size
        self.show_grid = show_grid

    def __enter__(self):
        self.pdf = PdfPages(self.output_path)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pdf.close()

    # ----------------------------------------------------------------------
    def new_page(self, figsize=(11.69, 8.27), title=None, title_size=14):
        """Return a fresh (fig, ax) pair ready for drawing."""
        fig, ax = plt.subplots(figsize=figsize)
        if title:
            fig.suptitle(title, fontsize=title_size, fontweight="bold")
        return fig, ax

    # ----------------------------------------------------------------------
    def add_figure(self, fig):
        """Add a finished figure to PDF."""
        self.pdf.savefig(fig)
        plt.close(fig)
        gc.collect()

    # ----------------------------------------------------------------------
    def draw_table(
        self,
        ax,
        df,
        title=None,
        col_widths=None
    ):
        """Draw a styled table on the given axes."""
        ax.axis("off")
        if title:
            ax.set_title(title, fontsize=12, pad=10)

        table = ax.table(
            cellText=df.values,
            colLabels=df.columns,
            rowLabels=getattr(df, "index", None),
            loc="center",
            cellLoc="center"
        )

        # Font settings
        table.auto_set_font_size(False)
        table.set_fontsize(self.font_size)
        table.scale(1.0, 1.3)

        for (r, c), cell in table.get_celld().items():
            # Font family for each cell
            cell.get_text().set_fontfamily(self.font_family)

            # Gridlines
            if self.show_grid:
                cell.set_edgecolor("black")
                cell.set_linewidth(0.25)

        # Custom column widths
        if col_widths:
            for col_idx, width in enumerate(col_widths):
                for (r, c), cell in table.get_celld().items():
                    if c == col_idx:
                        cell.set_width(width)

        return table

In [8]:
# =====================================================================
# UNIFIED BATTERY CONDITION REPORT EXPORTER
# (USES PdfReport TO AVOID REPETITION)
# =====================================================================

def export_battery_condition_report_modular(
    vehicle_results: dict,
    fleet_mode_summary: dict,
    fleet_overall_summary: dict,
    output_pdf: str,
    font_family: str = "Courier New"
):
    """
    Modular, PdfReport-based unified report generator for:
      - Fleet summary (mode-wise)
      - Fleet summary (overall)
      - Vehicle-wise summaries
    """

    with PdfReport(output_pdf, font_family=font_family) as rep:

        # ------------------------------------------------------------
        # PAGE 1 ‚Äî Fleet Summary (By Mode)
        # ------------------------------------------------------------
        fig, axes = plt.subplots(1, 3, figsize=(11.69, 8.27))
        fig.suptitle("Fleet Battery Condition ‚Äî By Mode", fontsize=16, fontweight="bold")

        rep.draw_table(axes[0], fleet_mode_summary["temp"], "Max Temp %")
        rep.draw_table(axes[1], fleet_mode_summary["delta"], "ŒîT (¬∞C) %")
        rep.draw_table(axes[2], fleet_mode_summary["volt"], "Volt Œî (mV) %")

        rep.add_figure(fig)

        # ------------------------------------------------------------
        # PAGE 2 ‚Äî Fleet Summary (Mode-Agnostic)
        # ------------------------------------------------------------
        fig, axes = plt.subplots(1, 3, figsize=(11.69, 8.27))
        fig.suptitle("Fleet Battery Condition ‚Äî Overall Summary", fontsize=16, fontweight="bold")

        rep.draw_table(axes[0], fleet_overall_summary["temp"], "Max Temp % ‚Äî Fleet")
        rep.draw_table(axes[1], fleet_overall_summary["delta"], "ŒîT (¬∞C) % ‚Äî Fleet")
        rep.draw_table(axes[2], fleet_overall_summary["volt"], "Voltage Œî (mV) % ‚Äî Fleet")

        rep.add_figure(fig)

        # ------------------------------------------------------------
        # VEHICLE-WISE TABLES
        # ------------------------------------------------------------
        for vid, tables in vehicle_results.items():

            # tables = { "temp_df": df, "delta_df": df, "volt_df": df }
            for label, df_table in tables.items():

                fig, ax = rep.new_page(
                    figsize=(8.27, 11.69),
                    title=f"Vehicle {vid} ‚Äî {label}"
                )
                rep.draw_table(ax, df_table)
                rep.add_figure(fig)


In [19]:
def analyze_battery_conditions_vehiclewise(df, output_pdf=None):
    """
    Compute per-vehicle battery condition tables.
    Return:
        vehicle_results = {
            vehicle_id: {
                "temp_df": df,
                "delta_df": df,
                "volt_df": df
            }
        }
    If output_pdf is provided, also generates a PDF.
    """

    vehicle_results = {}
    generate_pdf = output_pdf is not None

    # --------------------------------------------------------
    # Build all per-vehicle tables FIRST (always)
    # --------------------------------------------------------
    for vid, group in df.groupby("id"):

        mode_results = {}

        for mode, subset in group.groupby("mode"):
            mode_results[mode] = {
                "Battery Max Temp (%)": (
                    subset["temp_bucket"].value_counts(normalize=True) * 100
                ).round(2),
                "ŒîT (¬∞C) Range (%)": (
                    subset["temp_delta_bucket"].value_counts(normalize=True) * 100
                ).round(2),
                "Voltage Œî (mV) (%)": (
                    subset["volt_delta_bucket"].value_counts(normalize=True) * 100
                ).round(2),
            }

        temp_df = pd.concat(
            {m: r["Battery Max Temp (%)"] for m, r in mode_results.items()},
            axis=1
        ).fillna(0)

        delta_df = pd.concat(
            {m: r["ŒîT (¬∞C) Range (%)"] for m, r in mode_results.items()},
            axis=1
        ).fillna(0)

        volt_df = pd.concat(
            {m: r["Voltage Œî (mV) (%)"] for m, r in mode_results.items()},
            axis=1
        ).fillna(0)

        vehicle_results[vid] = {
            "temp_df": temp_df,
            "delta_df": delta_df,
            "volt_df": volt_df
        }

    # --------------------------------------------------------
    # PDF EXPORT (optional)
    # --------------------------------------------------------
    if generate_pdf:
        with PdfPages(output_pdf) as pdf:
            for vid, tables in vehicle_results.items():
                temp_df = tables["temp_df"]
                delta_df = tables["delta_df"]
                volt_df = tables["volt_df"]

                fig, axes = plt.subplots(3, 1, figsize=(8.27, 11.69))
                fig.suptitle(f"Vehicle ID: {vid}", fontsize=14, fontweight="bold")

                def draw(ax, df_table, title):
                    ax.axis("off")
                    ax.set_title(title, fontsize=11, pad=10)
                    tbl = ax.table(
                        cellText=df_table.values,
                        rowLabels=df_table.index,
                        colLabels=df_table.columns,
                        cellLoc="center",
                        loc="center",
                    )
                    tbl.auto_set_font_size(False)
                    tbl.set_fontsize(7.5)
                    tbl.scale(1.1, 1.2)

                draw(axes[0], temp_df, "Battery Max Temperature Distribution (%)")
                draw(axes[1], delta_df, "Temperature Delta (¬∞C) Distribution (%)")
                draw(axes[2], volt_df, "Voltage Delta (mV) Distribution (%)")

                plt.tight_layout(rect=[0, 0, 1, 0.97])
                pdf.savefig(fig)
                plt.close(fig)

    # --------------------------------------------------------
    # CRITICAL: Return results ALWAYS
    # --------------------------------------------------------
    return vehicle_results



In [9]:
def compute_fleet_summary(vehicle_results: dict, mode_agnostic: bool = False):
    temp_list, delta_list, volt_list = [], [], []
    for vid, res in vehicle_results.items():
        temp = res["temp_df"]
        delt = res["delta_df"]
        volt = res["volt_df"]
        if mode_agnostic:
            temp_list.append(temp.sum(axis=1))
            delta_list.append(delt.sum(axis=1))
            volt_list.append(volt.sum(axis=1))
        else:
            temp_list.append(temp)
            delta_list.append(delt)
            volt_list.append(volt)
    def combine_mode_wise(frames):
        combined = pd.concat(frames, axis=0)
        summed = combined.groupby(combined.index).sum()
        normalized = (summed.div(summed.sum()) * 100).round(2)
        return normalized
    def combine_mode_agnostic(frames):
        s = pd.concat(frames, axis=1).sum(axis=1)
        out = (s / s.sum() * 100).round(2).to_frame("Fleet %")
        return out
    if mode_agnostic:
        return {
            "temp": combine_mode_agnostic(temp_list),
            "delta": combine_mode_agnostic(delta_list),
            "volt": combine_mode_agnostic(volt_list),
        }
    else:
        return {
            "temp": combine_mode_wise(temp_list),
            "delta": combine_mode_wise(delta_list),
            "volt": combine_mode_wise(volt_list),
        }

In [10]:
def calc_soc_accuracy_sessions(df: pd.DataFrame,
                               capacity_kwh: float = 423.0,
                               max_gap_sec: int = 300) -> pd.DataFrame:
    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    df = df.dropna(subset=["timestamp"]).sort_values(["id", "timestamp"]).reset_index(drop=True)
    df["dt_sec"] = df.groupby("id")["timestamp"].diff().dt.total_seconds().fillna(0)
    df.loc[df["dt_sec"] < 0, "dt_sec"] = 0
    gcs_raw = df["gun_connection_status"]
    gcs_num = pd.to_numeric(gcs_raw, errors="coerce")
    gcs_str = gcs_raw.astype(str).str.strip().str.lower()
    gun_connected = (gcs_num == 1) | gcs_str.isin({"1", "true", "yes", "y", "connected", "on"})
    df["mode"] = np.where(gun_connected, "CHARGING", "DISCHARGING")
    CURRENT_LIMIT = 1000
    def clean_current(series):
        s = pd.to_numeric(series, errors="coerce").copy()
        s[s.abs() > CURRENT_LIMIT] = np.nan
        return s.interpolate(limit=30, limit_direction="both").ffill().bfill()
    df["total_battery_current"] = (
        df.groupby("id", group_keys=False)["total_battery_current"].apply(clean_current)
    )
    mode_change = df["mode"] != df["mode"].shift(fill_value=df["mode"].iloc[0])
    new_vehicle = df["id"] != df["id"].shift(fill_value=df["id"].iloc[0])
    gap_break = df["dt_sec"] > max_gap_sec
    df["session_break"] = (mode_change | new_vehicle | gap_break).astype(int)
    df["session_id"] = df["session_break"].cumsum()
    ACTIVE_I = 10
    MAX_DT = 60
    results = []
    for (vid, sid), g in df.groupby(["id", "session_id"], sort=False):
        g = g.sort_values("timestamp")
        if len(g) < 2:
            continue
        mode = g["mode"].iloc[0]
        if mode not in ["CHARGING", "DISCHARGING"]:
            continue
        g["dt_sess"] = g["dt_sec"].clip(upper=MAX_DT)
        g_active = g[g["total_battery_current"].abs() > ACTIVE_I]
        if g_active.empty:
            continue
        g["bat_soc"] = pd.to_numeric(g["bat_soc"], errors="coerce")
        g.loc[(g["bat_soc"] <= 0) | (g["bat_soc"] > 100), "bat_soc"] = np.nan
        g["bat_soc"] = g["bat_soc"].ffill().bfill()
        soc_start = g["bat_soc"].iloc[0]
        soc_end = g["bat_soc"].iloc[-1]
        if mode == "DISCHARGING" and soc_end > soc_start:
            soc_end = soc_start
        if mode == "CHARGING" and soc_end < soc_start:
            soc_end = soc_start
        soh_avg = pd.to_numeric(g["soh"], errors="coerce").mean()
        if mode == "CHARGING":
            delta_soc = soc_end - soc_start
        else:
            delta_soc = soc_start - soc_end
        energy_soc_kwh = abs(delta_soc * soh_avg * capacity_kwh / 10000.0)
        e_meas_kwh = (
            g_active["bat_voltage"] *
            g_active["total_battery_current"] *
            g_active["dt_sess"]
        ).sum() / 3.6e6
        e_meas_kwh = abs(e_meas_kwh)
        accuracy = np.nan
        if energy_soc_kwh > 1e-6:
            accuracy = (1 - abs(e_meas_kwh - energy_soc_kwh) / energy_soc_kwh) * 100
        dur_min = (g["timestamp"].iloc[-1] - g["timestamp"].iloc[0]).total_seconds() / 60
        results.append({
            "vehicle_id": vid,
            "session_id": sid,
            "mode": mode,
            "start_time": g["timestamp"].iloc[0],
            "end_time": g["timestamp"].iloc[-1],
            "duration_min": round(dur_min, 2),
            "soc_start": round(soc_start, 2),
            "soc_end": round(soc_end, 2),
            "soh_avg": round(soh_avg, 2),
            "energy_soc_kwh": round(energy_soc_kwh, 3),
            "energy_measured_kwh": round(e_meas_kwh, 3),
            "accuracy_percent": round(accuracy, 2),
        })
    return pd.DataFrame(results).sort_values(
        ["vehicle_id", "start_time"]
    ).reset_index(drop=True)

In [11]:
def export_soc_accuracy_report_modular(
    df: pd.DataFrame,
    output_pdf: str,
    font_family: str = "Courier New",
    max_rows_per_page: int = 40,
    col_widths: list[float] | None = None
):
    """
    Modular SoC-Accuracy PDF exporter using PdfReport.
    Produces paginated vehicle-wise tables.
    """

    with PdfReport(output_pdf, font_family=font_family) as rep:

        vehicle_ids = sorted(df["vehicle_id"].unique())

        for vid in vehicle_ids:
            vdf = df[df["vehicle_id"] == vid].copy()

            if vdf.empty:
                continue

            total_rows = len(vdf)
            num_pages = int(np.ceil(total_rows / max_rows_per_page))

            # Build column widths only once per vehicle
            columns = list(vdf.columns)
            if col_widths is None:
                # Auto-generate reasonable weights:
                # - wider for timestamps
                # - medium for energy fields
                # - narrow for ids
                width_map = {
                    "vehicle_id": 3,
                    "session_id": 3,
                    "mode": 4,
                    "start_time": 12,
                    "end_time": 12,
                    "duration_min": 4,
                    "soc_start": 4,
                    "soc_end": 4,
                    "soh_avg": 4,
                    "energy_soc_kwh": 6,
                    "energy_measured_kwh": 6,
                    "accuracy_percent": 4,
                }
                col_widths_final = [width_map.get(c, 5) for c in columns]
            else:
                col_widths_final = col_widths

            for page_idx in range(num_pages):
                start = page_idx * max_rows_per_page
                end = start + max_rows_per_page
                chunk = vdf.iloc[start:end]

                title = (
                    f"Vehicle {vid} ‚Äî SoC Accuracy "
                    f"(Page {page_idx+1} / {num_pages})"
                )

                # Create a page in the PDF
                fig, ax = rep.new_page(
                    figsize=(11.69, 8.27),
                    title=title,
                    title_size=14
                )

                rep.draw_table(
                    ax,
                    chunk,
                    col_widths=col_widths_final
                )

                rep.add_figure(fig)

In [12]:
# =====================================================================
# FUNCTION ‚Äî Compute Time-Weighted Energy Metrics
# =====================================================================

def compute_time_weighted_energy(soc_accuracy_df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute weighted-average energy measurements using duration-weighting.

    Input columns required:
        - vehicle_id
        - mode (CHARGING / DISCHARGING)
        - duration_min
        - energy_soc_kwh
        - energy_measured_kwh

    Returns:
        DataFrame with columns:
            vehicle_id, mode, total_time_hr,
            weighted_avg_energy_soc_kwh,
            weighted_avg_energy_measured_kwh,
            difference_kwh,
            difference_percent
    """
    df = soc_accuracy_df.copy()

    # Convert duration to hours
    df["duration_hr"] = df["duration_min"] / 60.0
    df = df[df["duration_hr"] > 0].copy()

    # Ensure required columns exist
    required = {
        "vehicle_id", "mode", "duration_hr",
        "energy_soc_kwh", "energy_measured_kwh"
    }
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    results = []
    for (vid, mode), g in df.groupby(["vehicle_id", "mode"], sort=False):

        total_time = g["duration_hr"].sum()
        if total_time <= 0:
            continue

        # Weighted averages
        w_avg_soc = (
            (g["energy_soc_kwh"] * g["duration_hr"]).sum() / total_time
        )
        w_avg_meas = (
            (g["energy_measured_kwh"] * g["duration_hr"]).sum() / total_time
        )

        diff_kwh = w_avg_meas - w_avg_soc

        # Avoid division by zero for percent calculation
        if w_avg_soc > 1e-6:
            diff_pct = (1 - abs(diff_kwh) / w_avg_soc) * 100
            diff_pct = round(diff_pct, 2)
        else:
            diff_pct = np.nan

        results.append({
            "vehicle_id": vid,
            "mode": mode,
            "total_time_hr": round(total_time, 3),
            "weighted_avg_energy_soc_kwh": round(w_avg_soc, 3),
            "weighted_avg_energy_measured_kwh": round(w_avg_meas, 3),
            "difference_kwh": round(diff_kwh, 3),
            "difference_percent": diff_pct,
        })

    return pd.DataFrame(results).sort_values(
        ["vehicle_id", "mode"]
    ).reset_index(drop=True)


In [13]:
# =====================================================================
# STAGE 1 ‚Äî Load raw monthly CSV ‚Üí preprocess ‚Üí df_with_state.feather
# =====================================================================

date_str = "2025-10-01"
target_date = datetime.strptime(date_str, "%Y-%m-%d").date()
ist_start = datetime.combine(target_date, datetime.min.time())
ist_end = ist_start + timedelta(days=31)
utc_start = ist_start - timedelta(hours=5, minutes=30)
utc_end = ist_end - timedelta(hours=5, minutes=30)
logging.info(f"üîç Query window (UTC): {utc_start} ‚Üí {utc_end}")


vehicle_ids = ['3','16','18','19','32','42','6','7','9','11','12','13','14','15','20','25','27','28','29','30','31','33','35','41','46']

csv_path = "oct25_can_parsed_data.csv"
logging.info(f"üìÅ Loading monthly CAN data from CSV: {csv_path}")

# figure out available columns then load only needed ones
head_cols = pd.read_csv(csv_path, nrows=0).columns.tolist()
usecols = [c for c in CORE_COLS if c in head_cols]
df_cpo100 = pd.read_csv(csv_path, usecols=usecols)
df_cpo100 = rename_battery_temp_columns(df_cpo100)
logging.info(f"Raw df_cpo100 loaded with {len(df_cpo100):,} rows and {df_cpo100.shape[1]} columns")

if "id" in df_cpo100.columns:
    df_cpo100["id"] = df_cpo100["id"].astype(str)
    df_cpo100 = df_cpo100[df_cpo100["id"].isin(vehicle_ids)]
    logging.info(f"Filtered by vehicle_ids ‚Üí {len(df_cpo100):,} rows")

df_cpo100["timestamp"] = pd.to_datetime(df_cpo100["timestamp"], errors="coerce")
df_cpo100 = df_cpo100.dropna(subset=["timestamp"])
df_cpo100 = df_cpo100[(df_cpo100["timestamp"] >= utc_start) & (df_cpo100["timestamp"] <= utc_end)]
logging.info(f"After date filter ‚Üí {len(df_cpo100):,} rows")

logging.info("üßπ Imputing missing values ...")
df_cpo100 = impute_missing_values(df_cpo100)

logging.info("üß† Preparing df_with_state (mode + temp/volt deltas)...")
df_with_state = prepare_df_with_state(df_cpo100)
logging.info(f"df_with_state has {len(df_with_state):,} rows and {df_with_state.shape[1]} columns")

feather_path = "df_with_state_30days.feather"
df_with_state.to_feather(feather_path)
logging.info(f"üíæ Saved df_with_state ‚Üí {feather_path}")

del df_cpo100
del df_with_state
gc.collect()
free_mem()

logging.info("‚úÖ Stage 1 complete. Run Stage 2 next.")

2025-11-16 21:01:13 - INFO - üîç Query window (UTC): 2025-09-30 18:30:00 ‚Üí 2025-10-31 18:30:00
2025-11-16 21:01:13 - INFO - üìÅ Loading monthly CAN data from CSV: oct25_can_parsed_data.csv
2025-11-16 21:01:39 - INFO - Raw df_cpo100 loaded with 24,269,440 rows and 19 columns
2025-11-16 21:01:50 - INFO - Filtered by vehicle_ids ‚Üí 24,269,440 rows
2025-11-16 21:02:08 - INFO - After date filter ‚Üí 23,616,456 rows
2025-11-16 21:02:08 - INFO - üßπ Imputing missing values ...
2025-11-16 21:02:58 - INFO - üß† Preparing df_with_state (mode + temp/volt deltas)...
2025-11-16 21:03:38 - INFO - df_with_state has 23,616,456 rows and 20 columns
2025-11-16 21:03:41 - INFO - üíæ Saved df_with_state ‚Üí df_with_state_30days.feather
2025-11-16 21:03:46 - INFO - ‚úÖ Stage 1 complete. Run Stage 2 next.


In [20]:
# =====================================================================
# STAGE 2 ‚Äî CLEAN + MODULAR + PDF ENGINE DRIVEN
# =====================================================================

feather_path = "df_with_state_30days.feather"
logging.info(f"üìÅ Loading df_with_state from {feather_path} for battery condition analysis...")

import pyarrow.feather as ft
meta = ft.read_table(feather_path).schema
cols_available = [f.name for f in meta]
cols_needed = ["id", "mode", "batt_maxtemp", "batt_mintemp", "batt_maxvolt", "batt_minvolt"]
cols_to_load = [c for c in cols_needed if c in cols_available]

df_cond = pd.read_feather(feather_path, columns=cols_to_load)
logging.info(f"df_cond loaded with {len(df_cond):,} rows and {df_cond.shape[1]} columns")

logging.info("üìä Running vehicle-wise battery condition analysis...")
vehicle_results = analyze_battery_conditions_vehiclewise(
    df_cond,
    output_pdf=None      # Disable old per-vehicle PDF
)

logging.info("üìä Computing fleet summaries...")
fleet_mode = compute_fleet_summary(vehicle_results, mode_agnostic=False)
fleet_overall = compute_fleet_summary(vehicle_results, mode_agnostic=True)

logging.info("üìÑ Exporting consolidated fleet battery condition report...")
export_battery_condition_report_modular(
    vehicle_results=vehicle_results,
    fleet_mode_summary=fleet_mode,
    fleet_overall_summary=fleet_overall,
    output_pdf="battery_condition_fleet_report_30days.pdf",
    font_family="Courier New"
)

del df_cond, vehicle_results, fleet_mode, fleet_overall
gc.collect()
free_mem()

logging.info("‚úÖ Stage 2 complete. Run Stage 3 next.")

2025-11-16 21:09:19 - INFO - üìÅ Loading df_with_state from df_with_state_30days.feather for battery condition analysis...
2025-11-16 21:09:21 - INFO - df_cond loaded with 23,616,456 rows and 6 columns
2025-11-16 21:09:21 - INFO - üìä Running vehicle-wise battery condition analysis...


KeyError: 'temp_bucket'

In [None]:
# =====================================================================
# STAGE 3 ‚Äî SoC Sessions + SoC Accuracy PDF
# =====================================================================

feather_path = "df_with_state_30days.feather"
logging.info(f"üìÅ Loading df_with_state from {feather_path} for SoC analysis...")

import pyarrow.feather as ft

meta = ft.read_table(feather_path).schema
cols_available = [f.name for f in meta]

cols_needed = [
    "id",
    "timestamp",
    "gun_connection_status",
    "bat_soc",
    "soh",
    "bat_voltage",
    "total_battery_current"
]

cols_to_load = [c for c in cols_needed if c in cols_available]

# If you want to load only necessary columns replace columns=cols_available with cols_to_load
df_soc_base = pd.read_feather(feather_path, columns=cols_available)
logging.info(f"df_soc_base loaded with {len(df_soc_base):,} rows and {df_soc_base.shape[1]} columns")

# ---------------------------------------------------------------------
# SoC session extraction
# ---------------------------------------------------------------------
logging.info("‚ö° Computing SoC accuracy sessions...")
soc_accuracy_df = calc_soc_accuracy_sessions(df_soc_base)
logging.info(f"SoC sessions computed: {len(soc_accuracy_df):,} rows")

# ---------------------------------------------------------------------
# Save to parquet
# ---------------------------------------------------------------------
soc_path = "soc_accuracy_sessions_30days.parquet"
soc_accuracy_df.to_parquet(soc_path, index=False)
logging.info(f"üíæ Saved SoC sessions ‚Üí {soc_path}")

# ---------------------------------------------------------------------
# PDF Export (modular)
# ---------------------------------------------------------------------
logging.info("üìÑ Exporting SoC accuracy PDF (modular engine)...")

export_soc_accuracy_report_modular(
    soc_accuracy_df,
    output_pdf="vehicle_wise_soc_accuracy_30days_clean.pdf",
    font_family="Courier New",
    max_rows_per_page=40,
    col_widths=None      # Set list of widths here if manual tuning desired
)

# ---------------------------------------------------------------------
# Cleanup
# ---------------------------------------------------------------------
del df_soc_base
del soc_accuracy_df

gc.collect()
free_mem()

logging.info("‚úÖ Stage 3 complete. Run Stage 4 next.")

In [None]:
# =====================================================================
# STAGE 4 ‚Äî Time-weighted energy metrics from SoC sessions
# =====================================================================

soc_path = "soc_accuracy_sessions_30days.parquet"
logging.info(f"üìÅ Loading SoC sessions from {soc_path} for energy summary...")

soc_accuracy_df = pd.read_parquet(soc_path)
logging.info(f"SoC sessions loaded: {len(soc_accuracy_df):,} rows")

logging.info("üî¢ Computing time-weighted energy metrics...")
weighted_energy_summary = compute_time_weighted_energy(soc_accuracy_df)

logging.info("üìÑ Time-weighted energy summary preview:")
try:
    display(weighted_energy_summary.head())
except:
    print(weighted_energy_summary.head())

out_csv = "weighted_energy_summary_30days.csv"
weighted_energy_summary.to_csv(out_csv, index=False)
logging.info(f"‚úÖ Time-weighted energy summary saved ‚Üí {out_csv}")

del soc_accuracy_df
del weighted_energy_summary
gc.collect()
free_mem()

logging.info("üéâ Stage 4 complete. All stages complete.")

In [None]:
# ---------------------------------------------------------------------
# STAGE 5 ‚Äî SoC Dynamics PDF (Unified Modular PDF Renderer)
# ---------------------------------------------------------------------

def render_soc_dynamics_report(
    pdf_path: str,
    fleet_soc_table: pd.DataFrame,
    veh_soc_tables: dict,
    fleet_jump_table: pd.DataFrame,
    veh_jump_tables: dict,
    fleet_c_table: pd.DataFrame,
    veh_c_table: pd.DataFrame,
    font_family: str = "Courier New"
):
    """
    Unified PDF renderer for SoC Dynamics + Jump/Drop tables + C-rate tables
    using the PdfReport class.
    Tables are rendered in a minimal grid style resembling Excel.
    """

    with PdfReport(pdf_path, font_family=font_family, font_size=8.0) as rep:

        # -------------------------
        # PAGE 1 ‚Äî Fleet SoC occupancy
        # -------------------------
        fig, ax = rep.new_page(figsize=(10, 6), title="Fleet-Level SoC Occupancy (%)")
        rep.draw_table(ax, fleet_soc_table)
        rep.add_figure(fig)

        # -------------------------
        # PAGE 2 ‚Äî Vehicle-wise SoC occupancy
        # -------------------------
        for vid, tab in veh_soc_tables.items():
            fig, ax = rep.new_page(figsize=(10, 6), title=f"Vehicle {vid} ‚Äî SoC Occupancy (%)")
            rep.draw_table(ax, tab)
            rep.add_figure(fig)

        # -------------------------
        # PAGE 3 ‚Äî Fleet Jump/Drop table
        # -------------------------
        fig, ax = rep.new_page(figsize=(10, 6), title="Fleet-Level SoC Jumps/Drops (Count + %)")
        rep.draw_table(ax, fleet_jump_table)
        rep.add_figure(fig)

        # -------------------------
        # PAGE 4 ‚Äî Vehicle-wise Jump/Drop tables
        # -------------------------
        for vid, tab in veh_jump_tables.items():
            fig, ax = rep.new_page(figsize=(10, 6), title=f"Vehicle {vid} ‚Äî SoC Jumps/Drops (Count + %)")
            rep.draw_table(ax, tab)
            rep.add_figure(fig)

        # -------------------------
        # PAGE 5 ‚Äî Fleet C-Rate Summary
        # -------------------------
        fig, ax = rep.new_page(figsize=(10, 6), title="Fleet-Level C-Rate Summary")
        rep.draw_table(ax, fleet_c_table)
        rep.add_figure(fig)

        # -------------------------
        # PAGE 6 ‚Äî Vehicle-wise C-Rate Summary
        # -------------------------
        fig, ax = rep.new_page(figsize=(11, 8), title="Vehicle-Wise C-Rate Summary (All Vehicles)")
        rep.draw_table(ax, veh_c_table)
        rep.add_figure(fig)


# ---------------------------------------------------------------------
# Execute Stage 5 PDF generation
# ---------------------------------------------------------------------
pdf_path = "soc_dynamics_and_c_rate_report_30days.pdf"
render_soc_dynamics_report(
    pdf_path,
    fleet_soc_table=fleet_soc_table,
    veh_soc_tables=veh_soc_tables,
    fleet_jump_table=fleet_jump_table,
    veh_jump_tables=veh_jump_tables,
    fleet_c_table=fleet_c_table,
    veh_c_table=veh_c_table,
    font_family="Courier New"
)

del df_soc_dyn
gc.collect()
free_mem()

logging.info("‚úÖ Stage 5 complete (modular).")

In [None]:
# =====================================================================
# STAGE 6 ‚Äî Hotspot Analysis (Temperature + Voltage)
# PACK-LEVEL (108 TC) + CELL-LEVEL (576 CELLS)
# =====================================================================

logging.info("üìÅ Loading df_with_state for Hotspot Analysis...")

feather_path = "df_with_state_30days.feather"
schema = ft.read_table(feather_path).schema
cols_available = [f.name for f in schema]

cols_needed = [
    "id", "mode",
    "batt_maxtemp_pack", "batt_mintemp_pack",
    "batt_maxvolt", "batt_maxvolt_cell",
    "batt_minvolt", "batt_minvolt_cell"
]
cols_to_load = [c for c in cols_needed if c in cols_available]

df_hot = pd.read_feather(feather_path, columns=cols_to_load)
logging.info(f"df_hot loaded: {len(df_hot):,} rows")

if "mode" not in df_hot.columns:
    raise ValueError("Column 'mode' missing. Run Stage 5 before Stage 6.")

# =====================================================================
# üîß GENERIC HOTSPOT BUILDER (Fleet or Per-Vehicle)
# =====================================================================

def build_hotspot_tables(df: pd.DataFrame, max_col: str, min_col: str, key: str):
    """
    Build hotspot tables for MAX and MIN values on either:
        - pack-level temperature  (key='pack_tc')
        - cell-level voltage      (key='cell_id')
    Returns:
        (fleet_table, veh_tables)
    """

    fleet_tables = {}
    veh_tables = {}

    # ---------- Fleet MAX ----------
    fleet_max_raw = (
        df.groupby(["mode", max_col], observed=False)
        .size()
        .reset_index(name="count")
    )

    fleet_max = {}
    for mode in ["CHARGING", "DISCHARGING"]:
        sub = fleet_max_raw[fleet_max_raw["mode"] == mode].copy()
        total = sub["count"].sum() or 1
        sub["percent"] = (sub["count"] / total * 100).round(2)

        sub = sub.rename(columns={
            max_col: key,
            "count": f"{mode.lower()}_max_count",
            "percent": f"{mode.lower()}_max_pct"
        })

        fleet_max[mode] = sub.drop(columns=["mode"], errors="ignore").set_index(key)

    # ---------- Fleet MIN ----------
    fleet_min_raw = (
        df.groupby(["mode", min_col], observed=False)
        .size()
        .reset_index(name="count")
    )

    fleet_min = {}
    for mode in ["CHARGING", "DISCHARGING"]:
        sub = fleet_min_raw[fleet_min_raw["mode"] == mode].copy()
        total = sub["count"].sum() or 1
        sub["percent"] = (sub["count"] / total * 100).round(2)

        sub = sub.rename(columns={
            min_col: key,
            "count": f"{mode.lower()}_min_count",
            "percent": f"{mode.lower()}_min_pct"
        })

        fleet_min[mode] = sub.drop(columns=["mode"], errors="ignore").set_index(key)

    # ---------- Combine Fleet ----------
    fleet_table = (
        fleet_max["CHARGING"]
        .join(fleet_max["DISCHARGING"], how="outer")
        .join(fleet_min["CHARGING"], how="outer")
        .join(fleet_min["DISCHARGING"], how="outer")
        .fillna(0)
    )

    # =================================================================
    # PER-VEHICLE TABLES
    # =================================================================
    for vid, g in df.groupby("id"):

        v = {}

        # MAX
        vmax_raw = (
            g.groupby(["mode", max_col], observed=False)
            .size()
            .reset_index(name="count")
        )

        vmax = {}
        for mode in ["CHARGING", "DISCHARGING"]:
            sub = vmax_raw[vmax_raw["mode"] == mode].copy()
            total = sub["count"].sum() or 1
            sub["percent"] = (sub["count"] / total * 100).round(2)

            sub = sub.rename(columns={
                max_col: key,
                "count": f"{mode.lower()}_max_count",
                "percent": f"{mode.lower()}_max_pct"
            })

            vmax[mode] = sub.drop(columns=["mode"], errors="ignore").set_index(key)

        # MIN
        vmin_raw = (
            g.groupby(["mode", min_col], observed=False)
            .size()
            .reset_index(name="count")
        )

        vmin = {}
        for mode in ["CHARGING", "DISCHARGING"]:
            sub = vmin_raw[vmin_raw["mode"] == mode].copy()
            total = sub["count"].sum() or 1
            sub["percent"] = (sub["count"] / total * 100).round(2)

            sub = sub.rename(columns={
                min_col: key,
                "count": f"{mode.lower()}_min_count",
                "percent": f"{mode.lower()}_min_pct"
            })

            vmin[mode] = sub.drop(columns=["mode"], errors="ignore").set_index(key)

        # Join per-vehicle
        veh_tables[vid] = (
            vmax["CHARGING"]
            .join(vmax["DISCHARGING"], how="outer")
            .join(vmin["CHARGING"], how="outer")
            .join(vmin["DISCHARGING"], how="outer")
            .fillna(0)
        )

    return fleet_table, veh_tables


# =====================================================================
# SECTION 1 ‚Äî TEMPERATURE HOTSPOTS (Pack-Level)
# =====================================================================

logging.info("üî• Computing Temperature Hotspots...")
fleet_temp_hotspots, veh_temp_hotspots = build_hotspot_tables(
    df_hot,
    max_col="batt_maxtemp_pack",
    min_col="batt_mintemp_pack",
    key="pack_tc"
)
logging.info("üî• Temperature Hotspot Tables ready.")

# =====================================================================
# SECTION 2 ‚Äî VOLTAGE HOTSPOTS (Cell-Level)
# =====================================================================

logging.info("‚ö° Computing Voltage Hotspots...")
fleet_voltage_hotspots, veh_voltage_hotspots = build_hotspot_tables(
    df_hot,
    max_col="batt_maxvolt_cell",
    min_col="batt_minvolt_cell",
    key="cell_id"
)
logging.info("‚ö° Voltage Hotspot Tables ready.")

# =====================================================================
# OUTPUT
# =====================================================================

fleet_temp_hotspots
veh_temp_hotspots
fleet_voltage_hotspots
veh_voltage_hotspots

logging.info("‚úÖ Stage 6 complete.")