In [1]:
import gc
import os
import sys
import platform
import logging
import argparse
from datetime import datetime, date, timedelta
import pendulum
import pandas as pd
import numpy as np

# Extend the import search path with the repository's `test` directory,
# presumably so that the `common.*` imports further down resolve.
# NOTE(review): the absolute path is environment-specific — confirm it matches
# the machine/image this notebook runs on.
repo_path = '/home/notebook'
sys.path.append(os.path.join(repo_path, 'test'))


# from common.cpoall_aggregator import run_daily_tms_analysis
# Import necessary files and its respective functions
from common.db_operations import connect_to_trino,execute_query, fetch_distinct_ids_for_day_trino,fetch_data_for_day_trino, write_df_to_iceberg, parse_arguments, get_target_date
from common.cpoall_aggregator import run_daily_tms_analysis,CORE_COLS

In [2]:
# Columns passed as `core_cols` when fetching the day's data: the three key
# columns (id, date, timestamp) followed by the signal columns.
# NOTE(review): this rebinds the CORE_COLS name that the first cell also imports
# from common.cpoall_aggregator — confirm the two lists are meant to agree, or
# drop one of them.
CORE_COLS = [
    "id", "date", "timestamp", 
    "b2t_tms_control_cmd",
    "b2t_set_water_out_temp",
    "b2t_battery_min_temp",
    "b2t_battery_max_temp",
    "tms_working_mode",
    "tms_fault_code",
    "coolant_out_temp",
    "coolant_in_temp",
    "hv_voltage",
    "comp_target_hz",
    "comp_status",
    "comp_running_frequency",
    "ac_fault_code",
    "outside_temp",
    "cabin_temp",
    "blower_speed",
    "ac_status",
    "ac_set_temp",
    "ac_operating_mode",
    "v2t_vehicle_coolant_low",
    "comp_current",
    "ac_system_ipm_module_temp",
    "ac_system_eva_temp",
]


In [3]:
# --- What to fetch -----------------------------------------------------------
source_schema = "facts_prod"
source_table = "can_output_ac"
day_str = "2025-12-01"
ids = ["3", "6", "18", "19"]
core_cols = CORE_COLS

# --- Fetch one day of rows for the selected ids ------------------------------
conn = connect_to_trino()  # project helper from common.db_operations
df_raw = fetch_data_for_day_trino(
    conn=conn,
    day_str=day_str,
    ids=ids,
    core_cols=core_cols,
    table=source_table,
    schema=source_schema,
)

2025-12-23 14:27:24 - INFO - 🔌 STEP 1: Connecting to Trino...
2025-12-23 14:27:24 - INFO - ✅ STEP 1: Connected to Trino


In [8]:
# Print per-column dtypes and non-null counts of the fetched frame.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239658 entries, 0 to 239657
Data columns (total 26 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   id                         239658 non-null  object        
 1   date                       239658 non-null  object        
 2   timestamp                  239658 non-null  datetime64[ns]
 3   b2t_tms_control_cmd        220321 non-null  object        
 4   b2t_set_water_out_temp     220321 non-null  float64       
 5   b2t_battery_min_temp       220321 non-null  float64       
 6   b2t_battery_max_temp       220321 non-null  float64       
 7   tms_working_mode           147327 non-null  object        
 8   tms_fault_code             147327 non-null  object        
 9   coolant_out_temp           147327 non-null  float64       
 10  coolant_in_temp            147327 non-null  float64       
 11  hv_voltage                 147385 non-null  float64 

In [4]:
import pandas as pd
import numpy as np

# Columns handed to profile_categoricals() as the categorical signals to profile.
CAT_COLS = [
    "b2t_tms_control_cmd",
    "tms_working_mode",
    "tms_fault_code",
    "ac_operating_mode",
    "comp_status",
    "ac_status",
    "v2t_vehicle_coolant_low",
]

def profile_categoricals(df: pd.DataFrame, cols, top_n: int = 15) -> pd.DataFrame:
    """Profile string-like categorical columns of ``df``.

    Each requested column is cast to pandas' ``string`` dtype and stripped of
    surrounding whitespace. Values that match a small set of "missing-like"
    tokens (compared case-insensitively, e.g. ``"n/a"``, ``"null"``, ``"--"``)
    are masked to ``NA`` before distinct values and top values are computed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to profile.
    cols : iterable of str
        Column names to profile. Names that are not in ``df`` are skipped.
    top_n : int, default 15
        Maximum number of most frequent values listed in ``top_values``.

    Returns
    -------
    pd.DataFrame
        One row per profiled column with the columns ``column``,
        ``pandas_dtype_now``, ``non_null_pct``, ``nunique``, ``top_values`` and
        ``missing_like_cnt``, sorted by ascending ``non_null_pct`` and then
        descending ``nunique``. If none of ``cols`` is present in ``df`` an
        empty frame with those same columns is returned.

    Notes
    -----
    ``non_null_pct`` and the percentages inside ``top_values`` are both based
    on the non-null count *before* missing-like tokens are masked, so a
    missing-like token still counts as non-null there; such tokens are
    reported separately in ``missing_like_cnt``.
    """
    out_columns = [
        "column", "pandas_dtype_now", "non_null_pct",
        "nunique", "top_values", "missing_like_cnt",
    ]
    rows = []
    n = len(df)

    # common junk tokens you may want to treat as missing
    missing_like = {"", " ", "na", "n/a", "null", "none", "nan", "-", "--"}

    for col in cols:
        if col not in df.columns:
            continue

        s = df[col].astype("string")

        # normalize strings a bit for consistent uniques
        s_norm = s.str.strip()
        s_lower = s_norm.str.lower()

        nn = int(s_norm.notna().sum())
        non_null_pct = (nn / n * 100) if n else np.nan

        # treat missing-like tokens as missing
        is_missing_like = s_lower.isin(missing_like)
        s_clean = s_norm.mask(is_missing_like, pd.NA)

        nunique = int(s_clean.nunique(dropna=True))
        top = s_clean.value_counts(dropna=True).head(top_n)

        # format "value (count, pct%)" list for easy reading
        top_fmt = "; ".join(
            f"{idx} ({cnt}, {cnt / nn * 100:.1f}%)" for idx, cnt in top.items()
        ) if nn else ""

        rows.append({
            "column": col,
            "pandas_dtype_now": str(df[col].dtype),
            "non_null_pct": non_null_pct,
            "nunique": nunique,
            "top_values": top_fmt,
            "missing_like_cnt": int(is_missing_like.sum()),
        })

    # Without rows, DataFrame(rows) has no columns and sort_values would raise
    # KeyError; hand back an empty frame that still carries the schema.
    if not rows:
        return pd.DataFrame(columns=out_columns)

    return (
        pd.DataFrame(rows, columns=out_columns)
        .sort_values(["non_null_pct", "nunique"], ascending=[True, False])
        .reset_index(drop=True)
    )

# Profile the categorical signals of the fetched day, listing up to 20 top
# values per column; the bare name on the last line displays the result.
cat_profile_df = profile_categoricals(df_raw, CAT_COLS, top_n=20)
cat_profile_df


Unnamed: 0,column,pandas_dtype_now,non_null_pct,nunique,top_values,missing_like_cnt
0,ac_operating_mode,object,61.321967,7,"Free (80354, 54.7%); Cooling (36925, 25.1%); Air_Supply_Mode (19071, 13.0%); Defrosting (5592, 3.8%); Heating_Mode (5008, 3.4%); Forced_Heating (12, 0.0%); Forced_Cooling (1, 0.0%)",0
1,ac_status,object,61.321967,2,"Start (124273, 84.6%); Stop (22690, 15.4%)",0
2,tms_working_mode,object,61.47385,4,"Off (98200, 66.7%); Charging_Cooling (28117, 19.1%); Fast_Discharge_Cooling (11290, 7.7%); Self_Circulation (9720, 6.6%)",0
3,tms_fault_code,object,61.47385,1,"No Fault (147327, 100.0%)",0
4,comp_status,object,61.498051,2,"Off (105128, 71.3%); On (42257, 28.7%)",0
5,b2t_tms_control_cmd,object,91.931419,4,"Off (154173, 70.0%); Charging_Cooling (34630, 15.7%); Self_Circulation (16050, 7.3%); Fast_Discharge_Cooling (15468, 7.0%)",0
6,v2t_vehicle_coolant_low,object,98.301329,1,"Normal (235587, 100.0%)",0


In [5]:
import numpy as np
import pandas as pd

PCTS = [0.01, 0.05, 0.50, 0.95, 0.99]

def profile_numeric_bounds(df: pd.DataFrame, cols: list[str], pcts=PCTS) -> pd.DataFrame:
    """Profile numeric-ish columns using observed min/max and percentiles.

    Every column is coerced with ``pd.to_numeric(..., errors="coerce")`` so
    unparsable values become NaN and are excluded from the statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to profile.
    cols : list of str
        Column names to profile. Names that are not in ``df`` are skipped
        (same convention as ``profile_categoricals``).
    pcts : iterable of float, default ``PCTS``
        Quantiles in ``[0, 1]``. Each one becomes a ``pXX`` output column
        (``0.05 -> "p05"``; a fractional percent such as ``0.999`` becomes
        ``"p99_9"``).

    Returns
    -------
    pd.DataFrame
        One row per profiled column with ``column``, ``pandas_dtype_now``,
        ``non_null_pct``, ``obs_min``, the ``pXX`` columns in the order of
        ``pcts`` and ``obs_max``. Rows are sorted by ascending ``non_null_pct``
        and then by descending observed spread (``obs_max - obs_min``).
        Columns with no numeric values get ``non_null_pct == 0.0`` and NaN
        statistics. An empty frame with the same columns is returned when
        nothing was profiled.
    """
    def _pct_label(p: float) -> str:
        # Round before formatting: 0.29 * 100 is 28.999999999999996 in binary
        # floating point and would otherwise be labelled "p28".
        pct = round(float(p) * 100, 6)
        if pct == int(pct):
            return f"p{int(pct):02d}"
        return "p" + f"{pct:g}".replace(".", "_")

    pcts = list(pcts)
    pct_cols = [_pct_label(p) for p in pcts]
    desired_order = [
        "column", "pandas_dtype_now", "non_null_pct",
        "obs_min", *pct_cols, "obs_max",
    ]

    rows = []
    n = len(df)

    for col in cols:
        if col not in df.columns:
            continue

        s_raw = df[col]
        s_num = pd.to_numeric(s_raw, errors="coerce")
        nn = int(s_num.notna().sum())

        if nn == 0:
            # Nothing numeric in this column: keep the row, blank the stats.
            non_null_pct = 0.0
            obs_min = obs_max = np.nan
            quantiles = dict.fromkeys(pct_cols, np.nan)
        else:
            # nn > 0 implies n > 0, so the division is safe.
            non_null_pct = nn / n * 100
            obs_min = float(s_num.min())
            obs_max = float(s_num.max())
            # Pair labels with values positionally instead of looking floats
            # up by key, so any requested percentile works.
            q_vals = s_num.quantile(pcts).to_numpy()
            quantiles = {label: float(val) for label, val in zip(pct_cols, q_vals)}

        rows.append({
            "column": col,
            "pandas_dtype_now": str(s_raw.dtype),
            "non_null_pct": non_null_pct,
            "obs_min": obs_min,
            **quantiles,
            "obs_max": obs_max,
        })

    if not rows:
        return pd.DataFrame(columns=desired_order)

    out = pd.DataFrame(rows, columns=desired_order)

    # sort: low coverage first, then widest observed spread
    spread = (out["obs_max"] - out["obs_min"]).replace([np.inf, -np.inf], np.nan)
    out = (
        out.assign(obs_spread=spread)
        .sort_values(["non_null_pct", "obs_spread"], ascending=[True, False])
        .drop(columns=["obs_spread"])
    )
    return out[desired_order].reset_index(drop=True)

In [6]:
# Bookkeeping columns that are excluded from numeric profiling.
AC_META = {
    "id",
    "date",
    "timestamp",
    "insert_timestamp",
    "dt",
    "sequence",
    "number_of_can_ids",
    "number_of_records",
    "percentage_of_can_ids",
}

# Columns treated as categoricals. "ac_fault_code" stays here for now; it can
# be profiled numerically later if it turns out to hold truly numeric codes.
KNOWN_CATEGORICAL = {
    "b2t_tms_control_cmd",
    "tms_working_mode",
    "tms_fault_code",
    "ac_operating_mode",
    "comp_status",
    "ac_status",
    "v2t_vehicle_coolant_low",
    "ac_fault_code",
}

# Numeric candidates: whatever is left once metadata and categoricals are removed.
num_candidates = [
    name
    for name in df_raw.columns
    if not (name in AC_META or name in KNOWN_CATEGORICAL)
]

ac_bounds_df = profile_numeric_bounds(df_raw, num_candidates)
ac_bounds_df


Unnamed: 0,column,pandas_dtype_now,non_null_pct,obs_min,p01,p05,p50,p95,p99,obs_max
0,outside_temp,float64,61.321967,0.0,9.0,10.0,22.0,27.0,28.0,32.0
1,cabin_temp,float64,61.321967,0.0,11.0,12.0,22.0,25.0,28.0,31.0
2,ac_set_temp,float64,61.321967,20.0,20.0,20.0,24.0,27.0,32.0,32.0
3,blower_speed,float64,61.321967,0.0,0.0,0.0,3.0,9.0,9.0,9.0
4,ac_system_eva_temp,float64,61.467591,-1.0,1.0,4.0,18.0,27.0,48.0,56.0
5,ac_system_ipm_module_temp,float64,61.467591,0.0,0.0,0.0,29.0,45.0,47.0,48.0
6,comp_current,float64,61.467591,0.0,0.0,0.0,0.0,14.8,18.7,21.4
7,coolant_in_temp,float64,61.47385,0.0,10.0,14.0,20.0,28.0,37.0,40.0
8,coolant_out_temp,float64,61.47385,8.0,13.0,14.0,20.0,27.0,35.0,38.0
9,hv_voltage,float64,61.498051,0.0,0.0,0.0,632.0,655.0,662.0,670.0


In [None]:
import numpy as np
import pandas as pd

PCTS = [0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]

def diff_gap_profile(
    df: pd.DataFrame,
    vars_: list[str],
    id_col: str = "id",
    ts_col: str = "timestamp",
    pcts: list[float] = PCTS,
) -> pd.DataFrame:
    """Summarise the time gaps between consecutive non-null samples per variable.

    For each variable ``v`` in ``vars_``:
      - keep rows where ``v`` is non-null (rows with a null id or timestamp
        are dropped up front)
      - sort by ``(id_col, ts_col)``
      - compute the difference in seconds between consecutive rows within
        each id
      - summarise those differences (mean/std/min/max + percentiles)

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    vars_ : iterable of str
        Variables (column names) to profile.
    id_col, ts_col : str
        Names of the grouping and timestamp columns. A non-datetime timestamp
        column is parsed with ``pd.to_datetime(..., errors="coerce")``;
        timezone-aware timestamps are accepted as-is.
    pcts : iterable of float, default ``PCTS``
        Quantiles in ``[0, 1]``. Each one becomes a ``pXX_s`` column
        (``0.05 -> "p05_s"``; a fractional percent such as ``0.999`` becomes
        ``"p99_9_s"``).

    Returns
    -------
    pd.DataFrame
        One row per variable with ``var``, ``n_points``, ``n_diffs``,
        ``non_null_pct`` (share of rows that have both id and timestamp),
        ``mean_s``, ``std_s``, ``min_s``, ``max_s`` and the percentile
        columns. Variables with no gaps to measure get NaN statistics. When
        ``p99_s`` is among the columns, rows are ordered by ascending
        ``non_null_pct`` then descending ``p99_s``; otherwise input order is
        kept.
    """
    def _pct_label(p: float) -> str:
        # int(p * 100) truncates float noise (0.29 * 100 == 28.999999999999996
        # would become "p28_s"), so round first.
        pct = round(float(p) * 100, 6)
        if pct == int(pct):
            return f"p{int(pct):02d}_s"
        return "p" + f"{pct:g}".replace(".", "_") + "_s"

    vars_ = list(vars_)
    pcts = list(pcts)
    pct_cols = [_pct_label(p) for p in pcts]
    stat_cols = ["mean_s", "std_s", "min_s", "max_s"]
    out_columns = ["var", "n_points", "n_diffs", "non_null_pct", *stat_cols, *pct_cols]

    # Ensure timestamp is datetime. is_datetime64_any_dtype also recognises
    # tz-aware columns, on which np.issubdtype raises TypeError.
    if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
        # assign() builds a new frame, so the caller's df is left untouched.
        df = df.assign(**{ts_col: pd.to_datetime(df[ts_col], errors="coerce")})

    base = df[[id_col, ts_col] + vars_].dropna(subset=[id_col, ts_col])
    n_base = len(base)

    out_rows = []

    for v in vars_:
        sub = base.loc[base[v].notna(), [id_col, ts_col]]
        n_points = len(sub)
        coverage = (n_points / n_base * 100.0) if n_base else np.nan

        if n_points <= 1:
            # Fewer than two samples: there is no gap to measure.
            out_rows.append({
                "var": v,
                "n_points": n_points,
                "n_diffs": 0,
                "non_null_pct": coverage,
                **dict.fromkeys(stat_cols + pct_cols, np.nan),
            })
            continue

        # Stable sort keeps the original order of rows with identical keys.
        sub = sub.sort_values([id_col, ts_col], kind="mergesort")
        # The first row of each id has no predecessor -> NaT -> dropped.
        diff_s = sub.groupby(id_col)[ts_col].diff().dt.total_seconds().dropna()

        # If you want to ignore duplicates/bursts, you can uncomment:
        # diff_s = diff_s[diff_s > 0]

        desc = diff_s.describe()
        q_vals = diff_s.quantile(pcts).to_numpy()

        out_rows.append({
            "var": v,
            "n_points": n_points,
            "n_diffs": int(diff_s.shape[0]),
            "non_null_pct": coverage,
            "mean_s": float(desc["mean"]),
            "std_s": float(desc["std"]),
            "min_s": float(desc["min"]),
            "max_s": float(desc["max"]),
            **{label: float(val) for label, val in zip(pct_cols, q_vals)},
        })

    out = pd.DataFrame(out_rows, columns=out_columns)

    # Nice-to-have: order by coverage then by "tail risk" (p99)
    if "p99_s" in out.columns:
        out = out.sort_values(["non_null_pct", "p99_s"], ascending=[True, False])

    return out.reset_index(drop=True)


# ---- Usage example ----
vars_to_check = [
    # "outside_temp", "cabin_temp", "hv_voltage", "comp_current",
    # "coolant_out_temp", "coolant_in_temp",
    # "ac_status", "ac_operating_mode", "tms_working_mode", "comp_status",
    "b2t_tms_control_cmd",
    "b2t_set_water_out_temp",
    "b2t_battery_min_temp",
    "b2t_battery_max_temp",
    "tms_working_mode",
    "tms_fault_code",
    "coolant_out_temp",
    "coolant_in_temp",
    "hv_voltage",
    "comp_target_hz",
    "comp_status",
    "comp_running_frequency",
    "ac_fault_code",
    "outside_temp",
    "cabin_temp",
    "blower_speed",
    "ac_status",
    "ac_set_temp",
    "ac_operating_mode",
    "v2t_vehicle_coolant_low",
    "comp_current",
    "ac_system_ipm_module_temp",
    "ac_system_eva_temp",    
]



gap_profile_df = diff_gap_profile(df_raw, vars_to_check, id_col="id", ts_col="timestamp")
display(gap_profile_df)


Unnamed: 0,var,n_points,n_diffs,non_null_pct,mean_s,std_s,min_s,max_s,p05_s,p10_s,p25_s,p50_s,p75_s,p90_s,p95_s,p99_s
0,outside_temp,146963,146959,61.321967,2.325532,36.608102,0.022,9021.727,0.94,0.98,1.04,1.16,2.12,3.3274,4.46,8.5684
1,cabin_temp,146963,146959,61.321967,2.325532,36.608102,0.022,9021.727,0.94,0.98,1.04,1.16,2.12,3.3274,4.46,8.5684
2,blower_speed,146963,146959,61.321967,2.325532,36.608102,0.022,9021.727,0.94,0.98,1.04,1.16,2.12,3.3274,4.46,8.5684
3,ac_status,146963,146959,61.321967,2.325532,36.608102,0.022,9021.727,0.94,0.98,1.04,1.16,2.12,3.3274,4.46,8.5684
4,ac_set_temp,146963,146959,61.321967,2.325532,36.608102,0.022,9021.727,0.94,0.98,1.04,1.16,2.12,3.3274,4.46,8.5684
5,ac_operating_mode,146963,146959,61.321967,2.325532,36.608102,0.022,9021.727,0.94,0.98,1.04,1.16,2.12,3.3274,4.46,8.5684
6,comp_current,147312,147308,61.467591,2.319837,36.571465,0.02,9012.087,0.94,0.98,1.04,1.16,2.12,3.339,4.47365,8.5
7,ac_system_ipm_module_temp,147312,147308,61.467591,2.319837,36.571465,0.02,9012.087,0.94,0.98,1.04,1.16,2.12,3.339,4.47365,8.5
8,ac_system_eva_temp,147312,147308,61.467591,2.319837,36.571465,0.02,9012.087,0.94,0.98,1.04,1.16,2.12,3.339,4.47365,8.5
9,tms_working_mode,147327,147323,61.47385,2.319314,36.541027,0.022,9013.027,0.94,0.98,1.04,1.16,2.12,3.325,4.46,8.46
