- ASC data for vehicle 4268 bearing device ID: 6
- Oct 27th 2025. 7:11pm - 7:28pm

In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import sys
import platform
import logging
import argparse
# import trino
import io
import boto3
from itertools import islice
from datetime import datetime, date, timedelta
import pendulum
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import math



pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
# Add parent directory to path
repo_path = '/Users/kenobi/Documents/naarni/repo/dview-naarni-data-platform'
sys.path.append(os.path.join(repo_path, 'tasks'))

# Import necessary files and its respective functions
from common.db_operations import connect_to_trino, fetch_data_for_day, write_df_to_iceberg,drop_table,execute_query
from common.optimizer_logic import optimize_dataframe_memory

# Import business logic functions
from biz_logic.energy_mileage.energy_mileage_daily_v0 import energy_mileage_stats ,impute_odometer_readings

from biz_logic.energy_consumption.energy_consumption_report import energy_consumption_stats

# Configure basic logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# Print the Python version being used
print(f"Using Python version: {platform.python_version()}")

Using Python version: 3.14.0


In [None]:
# ---- reporting config (edit ONLY this) ----
TABLE_NAME = "can_parsed_output_100"   # <— change only this

# derived (don’t edit)
REPORT_TABLE = f"adhoc.facts_prod.{TABLE_NAME}"
REPORT_S3_LOCATION = f"s3a://naarni-data-lake/aqua/warehouse/facts_prod.db/{TABLE_NAME}/"

In [4]:
def fetch_data(start_date, end_date, vehicle_ids):
    """
    Fetch raw battery data from the database for the specified date range and vehicle IDs.
    
    Args:
        start_date: Start date in 'YYYY-MM-DD' format
        end_date: End date in 'YYYY-MM-DD' format
        vehicle_ids: List of vehicle IDs
        
    Returns:
        Tuple of (df_cpo100, df_can_ac) containing raw data from both tables
    """
    logging.info(f"Fetching raw battery data from {start_date} to {end_date} for vehicles {vehicle_ids}")
    
    # Format vehicle IDs for the query
    vehicle_ids_str = ', '.join([f"'{vid}'" for vid in vehicle_ids])
    
    # Connect to Trino
    # conn = connect_to_trino(host="analytics.internal.naarni.com", port=443, user="admin", catalog="adhoc", schema="default")
    conn = connect_to_trino(host="trino.naarni.internal",port=80,user="admin",catalog="adhoc",schema="default")


    # Query for cpo100 data
    cpo100_query = f"""
    SELECT 
        *
    FROM
        facts_prod.can_parsed_output_100
    WHERE 
        id in ({vehicle_ids_str})
        and date(timestamp AT TIME ZONE 'Asia/Kolkata') between DATE('{start_date}') AND DATE('{end_date}')
    """

    # Execute queries and fetch data
    cur = conn.cursor()

    # Fetch cpo100 data
    cur.execute(cpo100_query)
    cpo100_columns = [desc[0] for desc in cur.description]
    cpo100_rows = cur.fetchall()
    df_cpo100 = pd.DataFrame(cpo100_rows, columns=cpo100_columns)

    logging.info(f"Done Fetching data.")
    logging.info(f"Retrieved {len(df_cpo100)} cpo100 records from the database.")
    
    # Close connections
    cur.close()
    conn.close()
    
    return df_cpo100

In [5]:
# conn = connect_to_trino(host="analytics.internal.naarni.com",port=443,user="admin",catalog="adhoc",schema="default")

# vehicle_ids=["6"]
# start_date = "2025-10-01"
# end_date = "2025-10-02"
# df_lakehouse = fetch_data(start_date, end_date, vehicle_ids)
# display(df_lakehouse.head(20))
# # df.to_csv("can_parsed_output_100_sample.csv", index=False)

In [6]:
# display(df_lakehouse.sort_values(by=["sequence"], ascending=False).head())

In [8]:
df_lakehouse = pd.read_csv("can_parsed_output_100_clickhouse.csv")
df_clickhouse = pd.read_csv("can_parsed_output_100_clickhouse.csv")
df_raw = pd.read_csv("c2c_can_01Oct2025.csv")
df_lakehouse.columns = df_lakehouse.columns.str.strip().str.lower()
df_clickhouse.columns = df_clickhouse.columns.str.strip().str.lower()
df_raw.columns = df_raw.columns.str.strip().str.lower()

def clean_cols(*dfs):
    """
    Cleans column names by:
      - stripping leading/trailing whitespace
      - converting to lowercase
    Returns cleaned DataFrames in the same order.
    """
    cleaned = []
    for df in dfs:
        df.columns = df.columns.str.strip().str.lower()
        cleaned.append(df)
    return cleaned

# Apply the cleaning function to all three
df_lakehouse, df_clickhouse, df_raw = clean_cols(df_lakehouse, df_clickhouse, df_raw)

In [11]:
def to_epoch_ms_from_str(ts_str):
    """Convert '2025-10-01 00:00:00.776 115875' → epoch ms."""
    if pd.isna(ts_str):
        return np.nan
    match = re.match(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)", str(ts_str))
    if not match:
        return np.nan
    dt_part = match.group(1)
    dt = pd.to_datetime(dt_part, errors='coerce')
    return int(dt.timestamp() * 1000) if pd.notna(dt) else np.nan

# df_lakehouse["timestamp_epoch"] = df_lakehouse["timestamp"].apply(to_epoch_ms_from_str)

In [14]:
def move_column(df, col_to_move, col_after):
    """
    Moves column `col_to_move` to appear right after `col_after`.
    """
    cols = list(df.columns)
    if col_to_move not in cols or col_after not in cols:
        return df  # nothing to do if columns missing

    cols.insert(cols.index(col_after) + 1, cols.pop(cols.index(col_to_move)))
    return df[cols]

# Apply it
df_lakehouse = move_column(df_lakehouse, "timestamp_epoch", "id")

df_lakehouse = df_lakehouse.drop(['timestamp', 'date', 'dt', 'rank'], axis=1, errors='ignore')
df_lakehouse.rename(columns={"timestamp_epoch": "timestamp"}, inplace=True)
# df_lakehouse.drop('insert_timestamp', axis=1, inplace=True)
# df_clickhouse.drop('timestamp.1', axis=1, inplace=True)

In [None]:
print("Min timestamp and number of records in SAMPLE:", df_lakehouse.timestamp.min(), len(df_lakehouse))
print("Max timestamp and number of records in SAMPLE:", df_lakehouse.timestamp.max(), len(df_lakehouse))
print("Min timestamp and number of records in CLICKHOUSE:", df_clickhouse.timestamp.min(), len(df_clickhouse))
print("Max timestamp and number of records in CLICKHOUSE:", df_clickhouse.timestamp.max(), len(df_clickhouse))
print("Min timestamp and number of records in RAW:", df_raw.timestamp.min(), len(df_raw))      #Correct IST timestamp

AttributeError: 'DataFrame' object has no attribute 'timestamp'

In [16]:
df_lakehouse = df_lakehouse.sort_values(by="timestamp").copy()
display(df_lakehouse.head())

df_clickhouse = df_clickhouse[(df_clickhouse.timestamp>=df_lakehouse.timestamp.min()) & (df_clickhouse.timestamp<=df_lakehouse.timestamp.max())].copy()
df_clickhouse = df_clickhouse.reindex(columns=df_lakehouse.columns)
df_clickhouse = df_clickhouse.sort_values(by=["timestamp","sequence"]).copy()
display(df_clickhouse.head())

df_raw = df_raw[(df_raw.timestamp>=df_lakehouse.timestamp.min()) & (df_raw.timestamp<=df_lakehouse.timestamp.max())].copy()
df_raw = df_raw.sort_values(by=["timestamp","sequence"]).copy()
display(df_raw.head())

KeyError: 'timestamp'

In [None]:
print("Min timestamp and number of records in SAMPLE:", df_lakehouse.timestamp.min(), len(df_lakehouse))
print("Max timestamp and number of records in SAMPLE:", df_lakehouse.timestamp.max(), len(df_lakehouse))
print("Min timestamp and number of records in CLICKHOUSE:", df_clickhouse.timestamp.min(), len(df_clickhouse))
print("Max timestamp and number of records in CLICKHOUSE:", df_clickhouse.timestamp.max(), len(df_clickhouse))
print("Min timestamp and number of records in RAW:", df_raw.timestamp.min(), len(df_raw))      #Correct IST timestamp

In [None]:
# Reorder ClickHouse columns to match Lakehouse
df_clickhouse = df_clickhouse.reindex(columns=df_lakehouse.columns)

# Rename ClickHouse timestamp for join clarity
df_lakehouse = df_lakehouse.rename(columns={"timestamp": "timestamp_lakehouse","sequence":"sequence_lakehouse"})
df_clickhouse = df_clickhouse.rename(columns={"timestamp": "timestamp_clickhouse","sequence":"sequence_clickhouse"})

# Join on epoch timestamp
df_merged = df_lakehouse.merge(df_clickhouse,left_on=["timestamp_lakehouse","sequence_lakehouse"],right_on=["timestamp_clickhouse","sequence_clickhouse"],how="outer",suffixes=("_lakehouse", "_clickhouse"))

df_merged["timestamp_lakehouse"] = (
    df_merged["timestamp_lakehouse"]
    .round(0)            # ensure clean integers if floats snuck in
    .astype("Int64")     # converts safely while allowing NaN
)

In [None]:
# df_clickhouse[(df_clickhouse.timestamp_clickhouse >= 1759276800776) and (df_clickhouse.timestamp_clickhouse <= 1759276800776)]["timestamp_clickhouse"].value_counts()
# 1759369880889: Thursday, 2 October 2025 01:51:20.889
# 1759369881129: Thursday, 2 October 2025 01:51:21.129
df_clickhouse_cp = df_clickhouse[(df_clickhouse["timestamp_clickhouse"]>=1759276800776) & (df_clickhouse["timestamp_clickhouse"]<=1759277399996)]#["timestamp_clickhouse"].value_counts()

In [None]:
print("Min timestamp and Max timestamp in CLICKHOUSE:", df_clickhouse_cp.timestamp_clickhouse.min(), df_clickhouse_cp.timestamp_clickhouse.max())
print("Min timestamp and Min timestamp in C2C_CAN RAW data:", df_raw.timestamp.min(), df_raw.timestamp.max())      #Correct IST timestamp

In [None]:
df_raw["timestamp"].value_counts().describe()

In [None]:
df_clickhouse_cp["timestamp_clickhouse"].value_counts().describe()

In [None]:
len(df_clickhouse_cp["timestamp_clickhouse"].unique()),len(df_raw.timestamp.unique())

In [None]:
df_lakehouse[(df_lakehouse["timestamp_lakehouse"]>1759276800776) & (df_lakehouse["timestamp_lakehouse"]<1759277399996)]["timestamp_lakehouse"].value_counts().describe()

In [None]:
df_raw[(df_raw["timestamp"]>1759369880000) & (df_raw["timestamp"]<1759369881150)].sort_values(by=["timestamp","sequence"])["timestamp"].value_counts().sort_values()

In [None]:
df_merged.head()

In [None]:
ts_counts = df_lakehouse.timestamp_lakehouse.value_counts()
ts_counts[ts_counts>1]

In [None]:
# 1759369880889: Thursday, 2 October 2025 01:51:20.889
# 1759369881129: Thursday, 2 October 2025 01:51:21.129
df_lakehouse[(df_lakehouse["timestamp_lakehouse"]>1759369880000) & (df_lakehouse["timestamp_lakehouse"]<1759369881150)]["timestamp_lakehouse"].value_counts()

In [None]:
df_clickhouse[df_clickhouse.timestamp_clickhouse==1759369881150].sort_values(by="sequence_clickhouse")#.to_csv("can_data_ts_1759369881150.csv", index=False)

In [None]:
df_lakehouse[df_lakehouse.timestamp_lakehouse==1759369881078].sort_values(by="sequence_lakehouse")#.to_csv("can_data_ts_1759369881078.csv", index=False)

In [None]:
df_review = pd.read_csv("c2c_candata_021025.csv")
df_review.head()

In [None]:
print(df_review.timestamp.min(),df_review.timestamp.max())

In [None]:
len(ts_counts),len(df_review)

In [None]:
df_lakehouse[df_lakehouse.timestamp_lakehouse==1759369802571].sort_values(by="sequence_lakehouse")

In [None]:
df_review[(df_review["timestamp"]>1759369880000) & (df_review["timestamp"]<1759369881150)].sort_values(by=["timestamp","sequence"])["timestamp"].value_counts().sort_values()

In [None]:
df_review[df_review.timestamp == 1759369802571].sort_values(by="sequence")

In [None]:
df_review.timestamp.min(), df_review.timestamp.max()

In [None]:
len(df_lakehouse[(df_lakehouse.timestamp_lakehouse>=df_review.timestamp.min()) & (df_lakehouse.timestamp_lakehouse<=df_review.timestamp.max())].sort_values(by="sequence_lakehouse"))

In [None]:
ts_counts = df_review.timestamp.value_counts().sort_values(ascending=False)
ts_counts.head()

In [None]:
ts_counts.describe()

In [None]:
ts_counts[ts_counts>20].sort_values()

In [None]:
wait

In [None]:
# Define suffixes
suffix_l, suffix_r = "_lakehouse", "_clickhouse"

# --- Identify shared base columns ---
base_cols = sorted(
    list(
        set(c.replace(suffix_l, "")
            for c in df_merged.columns if c.endswith(suffix_l))
        & set(c.replace(suffix_r, "")
            for c in df_merged.columns if c.endswith(suffix_r))
    )
)

print(f"Comparing {len(base_cols)} common columns...")

# --- Define the row comparison function ---
def row_diff(row):
    for col in base_cols:
        l, r = f"{col}{suffix_l}", f"{col}{suffix_r}"
        val_l = row.get(l, None)
        val_r = row.get(r, None)

        # Handle missing values cleanly
        if pd.isna(val_l) and pd.isna(val_r):
            continue

        # Robust comparison (handles pd.NA and mixed types)
        try:
            if val_l != val_r:
                return True
        except TypeError:
            if str(val_l) != str(val_r):
                return True
    return False

# --- Apply comparison row-wise ---
df_merged["is_diff"] = df_merged.apply(row_diff, axis=1)
df_diff = df_merged[df_merged["is_diff"]]

print("Number of differing rows:", len(df_diff))


In [None]:
# Filter for differing rows
# df_diff = df_merged[df_merged["is_diff"]].copy()

print(f"Number of differing rows: {len(df_diff)}")

# Display first few differences for a quick inspection
pd.set_option("display.max_columns", None)   # so all columns are visible
pd.set_option("display.max_colwidth", None)
display(df_diff.head(10))


In [None]:
df_clickhouse.head()


In [None]:
df_lakehouse.head()

In [None]:
# # assuming df_lakehouse and df_clickhouse are already aligned and cleaned
# cols = df_lakehouse.columns.tolist()
# chunk_size = 57
# num_chunks = math.ceil(len(cols) / chunk_size)

In [None]:
def plot_chunked_heatmaps(df_merged, suffix_left="_lakehouse", suffix_right="_clickhouse",
                          chunk_size=57, label_left="Lakehouse", label_right="ClickHouse"):
    """
    Plot grouped heatmaps comparing missing values between two suffixed sets of columns
    within a merged DataFrame.

    Parameters:
    - df_merged: merged DataFrame with suffixed columns
    - suffix_left: suffix for the first dataset (default '_lakehouse')
    - suffix_right: suffix for the second dataset (default '_clickhouse')
    - chunk_size: number of columns to display per comparison pair
    - label_left / label_right: titles for plots
    """

    # Identify matching column roots (without suffix)
    base_cols = sorted(
        list(
            set(
                c.replace(suffix_left, "")
                for c in df_merged.columns
                if c.endswith(suffix_left)
            )
            & set(
                c.replace(suffix_right, "")
                for c in df_merged.columns
                if c.endswith(suffix_right)
            )
        )
    )

    n_chunks = math.ceil(len(base_cols) / chunk_size)

    for i in range(n_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, len(base_cols))
        group = base_cols[start:end]

        # Build lists of suffixed column names for each side
        left_cols = [f"{c}{suffix_left}" for c in group]
        right_cols = [f"{c}{suffix_right}" for c in group]

        fig, axes = plt.subplots(1, 2, figsize=(25, 10), sharey=True)

        sns.heatmap(df_merged[left_cols].isnull(), cmap=['#007f5f', '#f94144'],
                    cbar=False, yticklabels=False, ax=axes[0])
        axes[0].set_title(f"{label_left} — Columns {start+1} to {end}", fontsize=13)

        sns.heatmap(df_merged[right_cols].isnull(), cmap=['#007f5f', '#f94144'],
                    cbar=False, yticklabels=False, ax=axes[1])
        axes[1].set_title(f"{label_right} — Columns {start+1} to {end}", fontsize=13)

        plt.suptitle(f"Null Comparison Heatmap (Columns {group[0]} → {group[-1]})", fontsize=15)
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()

In [None]:
plot_chunked_heatmaps(df_merged,suffix_left="_lakehouse",suffix_right="_clickhouse",chunk_size=57)

In [None]:
# Filter only matching timestamp rows
df_matched = df_merged[
    (df_merged["timestamp_lakehouse"].notna()) &
    (df_merged["timestamp_clickhouse"].notna()) &
    (df_merged["timestamp_lakehouse"] == df_merged["timestamp_clickhouse"])
].copy()

In [None]:
suffix_l, suffix_r = "_lakehouse", "_clickhouse"

# Identify shared base columns
base_cols = sorted(
    list(
        set(c.replace(suffix_l, "")
            for c in df_matched.columns if c.endswith(suffix_l))
        & set(c.replace(suffix_r, "")
            for c in df_matched.columns if c.endswith(suffix_r))
    )
)

In [None]:
mismatch_counts = {}
for col in base_cols:
    left = f"{col}{suffix_l}"
    right = f"{col}{suffix_r}"
    mismatches = (df_matched[left] != df_matched[right]) & (
        ~(df_matched[left].isna() & df_matched[right].isna())
    )
    mismatch_counts[col] = mismatches.sum()

# Convert to sorted Series for easy viewing
mismatch_summary = pd.Series(mismatch_counts).sort_values(ascending=False)

In [None]:
mismatch_summary.value_counts()

In [None]:
# Find all rows with at least one differing value
def has_diff(row):
    for col in base_cols:
        l, r = f"{col}{suffix_l}", f"{col}{suffix_r}"
        val_l, val_r = row[l], row[r]
        if pd.isna(val_l) and pd.isna(val_r):
            continue
        if val_l != val_r:
            return True
    return False

df_matched["is_diff"] = df_matched.apply(has_diff, axis=1)
df_diff_rows = df_matched[df_matched["is_diff"]].copy()


In [None]:
def extract_diffs(row):
    diffs = {}
    for col in base_cols:
        l, r = f"{col}{suffix_l}", f"{col}{suffix_r}"
        if l in row and r in row:
            val_l, val_r = row[l], row[r]
            if pd.isna(val_l) and pd.isna(val_r):
                continue
            if val_l != val_r:
                diffs[col] = (val_l, val_r)
    return diffs

df_diff_rows["diff_columns"] = df_diff_rows.apply(extract_diffs, axis=1)
df_diff_view = df_diff_rows[["timestamp_lakehouse", "diff_columns"]]
display(df_diff_view.head(20))


In [None]:
print(len(df_lakehouse),len(df_lakehouse.timestamp_lakehouse.unique()))
print(len(df_clickhouse),len(df_clickhouse.timestamp_clickhouse.unique()))

In [None]:
len(df_clickhouse.timestamp_clickhouse.unique())

In [None]:
df_lakehouse.loc[(df_lakehouse["timestamp_lakehouse"]>1759369871078) & (df_lakehouse["timestamp_lakehouse"]<1759369881078)].sort_values(by=["timestamp_lakehouse"])

In [None]:
df_lakehouse[df_lakehouse.timestamp_lakehouse==1759369881078].sort_values(by="sequence_lakehouse")

In [None]:
df_clickhouse[df_clickhouse.timestamp_clickhouse==1759369881078].sort_values(by="sequence_clickhouse")

In [None]:
len(df_merged),len(df_lakehouse),len(df_clickhouse)

In [None]:
# Voltage columns: pack_cellvoltage_1 ... pack_cellvoltage_380
voltage_cols = sorted(
    [c for c in df_lakehouse.columns if re.match(r"pack_cellvoltage_\d+", c)],
    key=lambda x: int(re.findall(r"\d+", x)[0])
)

# Temperature columns: pack_temperature_1 ... pack_temperature_80
temp_cols = sorted(
    [c for c in df_lakehouse.columns if re.match(r"pack_temperature\d+", c)],
    key=lambda x: int(re.findall(r"\d+", x)[0])
)

print(f"Voltage columns: {len(voltage_cols)}")
print(f"Temperature columns: {len(temp_cols)}")

In [None]:
def compute_voltage_coverage_intervals(df, voltage_cols, timestamp_col):
    """
    Computes how long it takes to receive a full set of 380 voltages 
    by accumulating non-null voltages across timestamps.
    """

    df = df.sort_values(timestamp_col).reset_index(drop=True)

    intervals = []
    seen = set()
    start_ts = None

    for idx, row in df.iterrows():
        # Start new interval if needed
        if start_ts is None:
            start_ts = row[timestamp_col]

        # Add non-null voltage columns seen in this row
        for col in voltage_cols:
            if pd.notna(row[col]):
                seen.add(col)

        # If full coverage achieved
        if len(seen) == len(voltage_cols):   # 380
            end_ts = row[timestamp_col]

            # Compute time difference
            duration_ms = end_ts - start_ts
            duration_sec = duration_ms / 1000

            intervals.append({
                "start_ts": start_ts,
                "end_ts": end_ts,
                "duration_ms": duration_ms,
                "duration_sec": duration_sec,
                "duration_rows": idx   # number of rows needed
            })

            # Reset for the next cycle
            seen = set()
            start_ts = None

    return pd.DataFrame(intervals)


voltage_intervals_lake = compute_voltage_coverage_intervals(df_lakehouse, voltage_cols, "timestamp_lakehouse")
voltage_intervals_click = compute_voltage_coverage_intervals(df_clickhouse, voltage_cols, "timestamp_clickhouse")


plt.figure(figsize=(12,6))
sns.histplot(voltage_intervals_lake["duration_sec"], bins=30, kde=True)
plt.title("Distribution of Full-Voltage Coverage Duration (Lakehouse)")
plt.xlabel("Seconds to receive all 380 cell voltages")
plt.ylabel("Frequency")
plt.show()

In [None]:
voltage_intervals_lake.duration_sec.describe(percentiles=[0.25, 0.5, 0.75, 0.8,0.85,0.9,0.95, 0.99,0.995, 0.999])

In [None]:
def compute_temperature_coverage_intervals(df, temperature_cols, timestamp_col):
    """
    Computes how long it takes to receive a full set of 80 temperatures
    by accumulating non-null temperatures across timestamps.
    """

    df = df.sort_values(timestamp_col).reset_index(drop=True)

    intervals = []
    seen = set()
    start_ts = None

    for idx, row in df.iterrows():
        # Start new interval if needed
        if start_ts is None:
            start_ts = row[timestamp_col]

        # Add non-null temperature columns seen in this row
        for col in temperature_cols:
            if pd.notna(row[col]):
                seen.add(col)

        # If full coverage achieved
        if len(seen) == len(temperature_cols):   # 80
            end_ts = row[timestamp_col]

            # Compute time difference
            duration_ms = end_ts - start_ts
            duration_sec = duration_ms / 1000

            intervals.append({
                "start_ts": start_ts,
                "end_ts": end_ts,
                "duration_ms": duration_ms,
                "duration_sec": duration_sec,
                "duration_rows": idx   # number of rows needed
            })

            # Reset for the next cycle
            seen = set()
            start_ts = None

    return pd.DataFrame(intervals)


temperature_intervals_lake = compute_temperature_coverage_intervals(df_lakehouse, temp_cols, "timestamp_lakehouse")
temperature_intervals_click = compute_temperature_coverage_intervals(df_clickhouse, temp_cols, "timestamp_clickhouse")


plt.figure(figsize=(12,6))
sns.histplot(temperature_intervals_lake["duration_sec"], bins=30, kde=True)
plt.title("Distribution of Full-Temperature Coverage Duration (Lakehouse)")
plt.xlabel("Seconds to receive all 80 cell temperatures")
plt.ylabel("Frequency")
plt.show()

In [None]:
temperature_intervals_lake.duration_sec.describe(percentiles=[0.25, 0.5, 0.75, 0.8,0.85,0.9,0.95, 0.99,0.995, 0.999])

### IDs & Timestamp

#### Discrete Variables
- id: Related to vehicle id
- sequence_lakehouse: 
- number_of_can_ids
- number_of_can_records
- vcuversioninformation

#### Timestamp
- timestamp: YYYY-MM-DD HH:MM:SS format

#### Continuous Variables
- percentage_of_can_ids

In [None]:
l = ["id","timestamp_lakehouse","sequence_lakehouse","number_of_can_ids","number_of_records","percentage_of_can_ids","vcuversioninformation"]
for col in l:
    print(f"Describing {col}:")
    print(df_lakehouse[col].describe())

In [None]:
df_lakehouse["lowpressureoilpumpfaultcode"].value_counts(dropna=False)

In [None]:
l = ["lowpressureoilpumpfaultcode","bms_fault_code","vcu_fault_code","fiveinone_faultcode"]
for col in l:
    print(f"Describing {col}:")
    print(df_lakehouse[col].describe())