In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import sys
import platform
import logging
import argparse
import trino
import io
import boto3
from itertools import islice
from datetime import datetime, date, timedelta
import pendulum
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import math



# Lift pandas' display limits so wide/long frames and long cell values are shown in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Make the project's `tasks` directory importable from this notebook.
# The checkout location can be overridden with the NAARNI_REPO_PATH environment
# variable; when it is unset the original hard-coded location is used.
repo_path = os.environ.get(
    'NAARNI_REPO_PATH',
    '/Users/apple/Documents/naarni/repo/dview-naarni-data-platform',
)
tasks_path = os.path.join(repo_path, 'tasks')
if tasks_path not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(tasks_path)

# Import necessary files and its respective functions
from common.db_operations import connect_to_trino, fetch_data_for_day, write_df_to_iceberg,drop_table,execute_query
from common.optimizer_logic import optimize_dataframe_memory

# Import business logic functions
from biz_logic.energy_mileage.energy_mileage_daily_v0 import energy_mileage_stats ,impute_odometer_readings
from biz_logic.energy_consumption.energy_consumption_report import energy_consumption_stats

# Configure basic logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# Print the Python version being used
print(f"Using Python version: {platform.python_version()}")

Using Python version: 3.13.7


In [3]:
# ---- reporting config: TABLE_NAME is the only value meant to be edited ----
TABLE_NAME = "can_parsed_output_100"

# Everything below is derived from TABLE_NAME; do not edit by hand.
REPORT_TABLE = "adhoc.facts_prod.{}".format(TABLE_NAME)
REPORT_S3_LOCATION = "s3a://naarni-data-lake/aqua/warehouse/facts_prod.db/{}/".format(TABLE_NAME)

In [None]:
def _sql_string_literal(value):
    """Render ``value`` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so the value cannot end the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


def fetch_data(start_date, end_date, vehicle_ids):
    """
    Fetch rows from facts_prod.can_parsed_output_100 for the given date range and vehicle IDs.

    Dates are compared against ``timestamp`` converted to the Asia/Kolkata time zone,
    and both bounds are inclusive (SQL ``BETWEEN``).

    Args:
        start_date: Start date in 'YYYY-MM-DD' format
        end_date: End date in 'YYYY-MM-DD' format
        vehicle_ids: Iterable of vehicle IDs; each one is matched against the
            ``id`` column as a quoted string literal

    Returns:
        A single pandas DataFrame holding every column of the matching rows.

    Raises:
        ValueError: If ``vehicle_ids`` is None or empty (the query would
            otherwise contain an invalid ``id in ()`` clause).
    """
    # Materialise once so any iterable works and emptiness can be checked up front.
    vehicle_id_list = [] if vehicle_ids is None else list(vehicle_ids)
    if not vehicle_id_list:
        raise ValueError("vehicle_ids must contain at least one vehicle ID")

    logging.info(f"Fetching raw battery data from {start_date} to {end_date} for vehicles {vehicle_ids}")

    # Format vehicle IDs and dates as escaped SQL string literals
    vehicle_ids_str = ', '.join(_sql_string_literal(vid) for vid in vehicle_id_list)
    start_literal = _sql_string_literal(start_date)
    end_literal = _sql_string_literal(end_date)

    # Query for cpo100 data
    cpo100_query = f"""
    SELECT 
        *
    FROM
        facts_prod.can_parsed_output_100
    WHERE 
        id in ({vehicle_ids_str})
        and date(timestamp AT TIME ZONE 'Asia/Kolkata') between DATE({start_literal}) AND DATE({end_literal})
    """

    # Connect to Trino
    # Alternate endpoint kept for reference:
    # conn = connect_to_trino(host="analytics.internal.naarni.com", port=443, user="admin", catalog="adhoc", schema="default")
    conn = connect_to_trino(host="trino.naarni.internal",port=80,user="admin",catalog="adhoc",schema="default")
    try:
        cur = conn.cursor()
        try:
            # Execute the query and pull the full result set
            cur.execute(cpo100_query)
            cpo100_columns = [desc[0] for desc in cur.description]
            cpo100_rows = cur.fetchall()
        finally:
            # Release the cursor even when execute/fetch fails
            cur.close()
    finally:
        # Release the connection even when the query fails
        conn.close()

    df_cpo100 = pd.DataFrame(cpo100_rows, columns=cpo100_columns)

    logging.info("Done Fetching data.")
    logging.info(f"Retrieved {len(df_cpo100)} cpo100 records from the database.")

    return df_cpo100

In [14]:
# Load the CSV and keep only the columns used below, in this order.
df_raw = pd.read_csv("c2c_can_27Oct2025_ascVer1.csv")
df_raw = df_raw[['id', 'timestamp', 'sequence', 'can_id', 'byte1', 'byte2', 'byte3', 'byte4', 'byte5', 'byte6', 'byte7', 'byte8']]

# Persist the column-trimmed frame (row order is left as loaded).
df_raw.to_csv("c2c_can_27Oct2025_ascVer1_conv.csv", index=False)

# Preview the earliest rows. sort_values returns a new frame, so df_raw itself is
# not reordered; the expression has to be the last one in the cell to be displayed.
df_raw.sort_values(by=['timestamp'],ascending=True).head()

In [15]:
# Preview the first rows of df_raw.
df_raw.head()

Unnamed: 0,id,timestamp,sequence,can_id,byte1,byte2,byte3,byte4,byte5,byte6,byte7,byte8
0,6,1761552899248,42451,217449893,15,1,0,75,0,255,0,249
1,6,1761552899248,42470,419299536,225,3,0,255,255,255,255,255
2,6,1761552899248,42453,418384139,255,255,255,136,125,198,128,125
3,6,1761552899248,42454,419373295,14,125,116,0,125,255,255,125
4,6,1761552899248,42455,419361843,16,239,160,38,0,0,0,64


In [11]:
# Number of distinct can_id values in df_raw.
len(df_raw.can_id.unique())

215

In [16]:
# Smallest and largest timestamp values in df_raw, shown as a (min, max) tuple.
df_raw.timestamp.min(),df_raw.timestamp.max()

(np.int64(1761552600548), np.int64(1761552899248))

In [7]:
def safe_hex_to_int(val):
    """Convert a hex string (like 'FF' or '18F0090B') to an integer, or None on failure.

    Args:
        val: Text to parse as base-16.

    Returns:
        The parsed integer, or None when ``val`` is not valid hex text
        (``ValueError``) or is not something ``int(val, 16)`` accepts at all,
        such as ``None`` or an ``int`` (``TypeError``).
    """
    try:
        return int(val, 16)
    except (TypeError, ValueError):
        # Only the two failures int(val, 16) is documented to raise are treated
        # as "not convertible"; anything else is unexpected and should surface.
        return None

In [8]:
# Convert an ASC log into a CSV of integer fields.
# Relies on `pd` / `re` from the notebook's import cell and on `safe_hex_to_int`
# from the helper cell above, instead of re-importing / re-defining them here.

# === Step 1: Input / Output Paths ===
asc_path = "can_20251027193535.asc"
csv_output_path = "converted_can_20251027193535_int.csv"

# === Step 2: Regular Expression to Parse ASC Lines ===
# Capture groups: (1) decimal timestamp, (2) channel number, (3) CAN ID token
# (hex digits and/or 'x'), (4) a decimal count that is matched but not used,
# (5) whitespace-separated hex data bytes running to the end of the line.
pattern = re.compile(
    r"^\s*(\d+\.\d+)\s+(\d+)\s+([0-9A-Fa-fx]+)\s+\w+\s+\w+\s+(\d+)\s+([0-9A-Fa-f\s]+)$"
)

rows = []
sequence = 1  # 1-based counter over the lines that matched the pattern

with open(asc_path, "r") as f:
    for line in f:
        match = pattern.match(line.strip())
        if not match:
            # Lines that do not look like a data frame are skipped.
            continue

        timestamp = float(match.group(1))
        channel = int(match.group(2))
        # Drop any 'x' characters from the ID token before hex conversion
        # (presumably the extended-ID marker of the ASC format; confirm).
        can_id_hex = match.group(3).replace("x", "").upper()
        data_bytes = match.group(5).strip().split()

        # Normalize byte length to 8: pad short frames with '00', truncate long ones.
        # NOTE(review): padded zeros are indistinguishable from real zero bytes.
        data_bytes = (data_bytes + ['00'] * 8)[:8]

        # Convert each data byte and CAN ID to integer
        can_id_int = safe_hex_to_int(can_id_hex)
        data_bytes_int = [safe_hex_to_int(b) for b in data_bytes]

        rows.append({
            "id": channel,
            "timestamp": timestamp,
            "sequence": sequence,
            "can_id": can_id_int,
            "byte1": data_bytes_int[0],
            "byte2": data_bytes_int[1],
            "byte3": data_bytes_int[2],
            "byte4": data_bytes_int[3],
            "byte5": data_bytes_int[4],
            "byte6": data_bytes_int[5],
            "byte7": data_bytes_int[6],
            "byte8": data_bytes_int[7],
        })
        sequence += 1

# === Step 3: Build DataFrame ===
df = pd.DataFrame(rows)

# === Step 4: Save to CSV ===
df.to_csv(csv_output_path, index=False)
print(f"✅ Integer-converted CSV successfully written to {csv_output_path}")
df.head()

✅ Integer-converted CSV successfully written to converted_can_20251027193535_int.csv


Unnamed: 0,id,timestamp,sequence,can_id,byte1,byte2,byte3,byte4,byte5,byte6,byte7,byte8
0,1,0.0,1,418384139,255,255,255,112,125,116,126,125
1,1,0.0015,2,150892043,0,0,0,0,0,0,0,0
2,1,0.0042,3,419373295,0,125,0,0,125,255,255,125
3,1,0.0048,4,217056000,253,0,100,255,255,255,255,255
4,1,0.0071,5,419299536,225,3,0,255,255,255,255,255


In [12]:
# Number of distinct can_id values in df, the frame parsed from the ASC file.
len(df.can_id.unique())

237

In [47]:
# Count rows per CAN ID, then keep the IDs seen more than 10 but fewer than 50 times.
popular_ids = (
    df_raw["can_id"]
    .value_counts()
    .loc[lambda counts: (counts > 10) & (counts < 50)]
)
print(popular_ids)

can_id
217449383    47
418579448    47
419375945    43
419351104    41
485293835    34
217450921    33
418579192    30
409081843    29
409016307    29
409540595    22
418185199    21
409278451    21
409606131    18
408950771    18
409737203    17
409343987    16
409475059    16
409802739    15
409671667    14
418521843    13
418521587    13
418516211    13
409147379    13
418578936    12
Name: count, dtype: int64


In [26]:
# First ten df_raw rows for CAN ID 418384139 carrying this exact byte4/6/7/8 payload.
df_raw.query(
    "can_id == 418384139 and byte4 == 112 and byte6 == 116 and byte7 == 126 and byte8 == 125"
).head(10)

Unnamed: 0,id,timestamp,sequence,can_id,byte1,byte2,byte3,byte4,byte5,byte6,byte7,byte8
15296,6,1761552774048,8058,418384139,255,255,255,112,125,116,126,125
15303,6,1761552774048,8065,418384139,255,255,255,112,125,116,126,125
15316,6,1761552774048,8078,418384139,255,255,255,112,125,116,126,125
15324,6,1761552774028,8041,418384139,255,255,255,112,125,116,126,125
15329,6,1761552774028,8046,418384139,255,255,255,112,125,116,126,125
17724,6,1761552753328,57089,418384139,255,255,255,112,125,116,126,125
17729,6,1761552753328,57094,418384139,255,255,255,112,125,116,126,125
17734,6,1761552753328,57099,418384139,255,255,255,112,125,116,126,125
17746,6,1761552753308,57069,418384139,255,255,255,112,125,116,126,125
17750,6,1761552753308,57073,418384139,255,255,255,112,125,116,126,125


In [65]:
# Up to twenty df_raw rows for CAN ID 419375945 whose byte8 equals 11.
df_raw.query("can_id == 419375945 and byte8 == 11").head(20)

Unnamed: 0,id,timestamp,sequence,can_id,byte1,byte2,byte3,byte4,byte5,byte6,byte7,byte8
16573,6,1761552764828,761,419375945,2,19,135,2,127,0,0,11


In [63]:
# Same lookup against df (the ASC-derived frame): CAN ID 419375945 with byte8 equal to 11.
df.query("can_id == 419375945 and byte8 == 11").head(20)

Unnamed: 0,id,timestamp,sequence,can_id,byte1,byte2,byte3,byte4,byte5,byte6,byte7,byte8
35857,1,44.9348,35858,419375945,2,19,135,2,128,0,0,11
