In [1]:
#!/usr/bin/env python
# coding: utf-8
from pathlib import Path
import sys

# Locate the project root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains a
# "common" sub-directory. Path.cwd() works in both notebooks and scripts.
current_dir = Path.cwd()

project_root = None
for candidate in (current_dir, *current_dir.parents):
    if (candidate / "common").is_dir():
        project_root = candidate
        break
print(current_dir)

if project_root is None:
    raise FileNotFoundError("Could not find 'common' directory in any parent directory")

# Make the project root the first import location. Any entry already present
# is dropped first, so re-running this cell keeps sys.path free of duplicates
# while still guaranteeing the root takes priority.
project_root_str = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != project_root_str]
sys.path.insert(0, project_root_str)

# Now import as before
from common.db_operations import connect_to_trino, fetch_data_for_day, write_df_to_iceberg
import logging
import platform
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta

/home/notebook/prod/nelson


In [2]:
# Root-logger setup for this notebook: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>" with second-resolution timestamps.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

# Report which interpreter version the kernel is running.
print(f"Using Python version: {platform.python_version()}")

Using Python version: 3.11.13


In [4]:
# Open the notebook-level Trino connection (connect_to_trino comes from
# common.db_operations). The ad-hoc pd.read_sql query cell reads it as `conn`.
# NOTE(review): nothing in this notebook closes this connection — confirm
# whether that is intentional.
conn = connect_to_trino()

2025-09-28 13:16:22 - INFO - 🔌 STEP 1: Connecting to Trino...
2025-09-28 13:16:22 - INFO - ✅ STEP 1: Connected to Trino


In [5]:
# Sanity check: row counts per (id, date) in the GPS table for
# 2025-09-01 .. 2025-09-09. The count column has no alias, so Trino names it
# by position (shown as `_col2` in the output).
#
# Alternative sources for the same check — swap one in for the active query:
# df = pd.read_sql("select id,cast(timestamp as date) as dateval,count(*) from clickhouse.naarni.can_parsed_output_100 where cast(timestamp as date) between DATE('2025-09-01') and DATE('2025-09-09') group by 1,2", conn)
# df = pd.read_sql("select id,cast(timestamp as date) as dateval,count(*) from clickhouse.naarni.can_output_ac where cast(timestamp as date) between DATE('2025-09-01') and DATE('2025-09-09') group by 1,2", conn)
# df = pd.read_sql("select id,date,count(*) from facts_prod.c2c_gps where   date= DATE('2025-09-09') group by 1,2", conn)
gps_daily_counts_sql = (
    "select id,date,count(*) from clickhouse.naarni.c2c_gps "
    "where   date between DATE('2025-09-01') and DATE('2025-09-09') "
    "group by 1,2"
)
df = pd.read_sql(gps_daily_counts_sql, conn)
df.head()

  df = pd.read_sql("select id,date,count(*) from clickhouse.naarni.c2c_gps where   date between DATE('2025-09-01') and DATE('2025-09-09') group by 1,2", conn)


Unnamed: 0,id,date,_col2
0,b'9',2025-09-07,7945
1,b'11',2025-09-09,7063
2,b'7',2025-09-06,28511
3,b'14',2025-09-08,25887
4,b'3',2025-09-08,10499


In [None]:
# ---- report configuration ----
# Destination table name; in this notebook it is only referenced by the
# commented-out write_df_to_iceberg call inside main().
TABLE_NAME = "energy_mileage_report"

# Table name handed to fetch_data_for_day as the data source.
SOURCE_TABLE = "can_parsed_output_100"

# Select-list expressions handed to fetch_data_for_day: the timestamp shifted
# to the Asia/Kolkata zone (aliased IST), followed by every source column.
COLUMNS_TO_FETCH = [
    "at_timezone(\"timestamp\", 'Asia/Kolkata') AS IST",
    "*",
]

In [None]:
# --------------------
# Main execution logic
# --------------------
def main(start_date_str: str = None, end_date_str: str = None):
    """Fetch raw data day by day for the configured vehicle IDs.

    Opens a Trino connection, builds the list of days to process and calls
    ``fetch_data_for_day`` once per day. The processing step
    (``analyze_vehicle_energy_stats``) and the Iceberg write are currently
    disabled, so nothing is transformed or written; the function is
    effectively a raw-data loader for interactive inspection.

    Args:
        start_date_str: First day to process, ISO format (``YYYY-MM-DD``).
        end_date_str: Last day to process (inclusive), ISO format.
            The range is used only when BOTH bounds are given; otherwise the
            job processes yesterday (relative to ``date.today()``) only.

    Returns:
        A ``(df_raw, df_processed)`` tuple. ``df_raw`` is a copy of the frame
        fetched for the LAST day of the range (earlier days are overwritten).
        ``df_processed`` stays empty while processing is disabled. Both are
        empty DataFrames when the connection could not be established.

    Errors raised while the job runs (including malformed date strings) are
    logged at CRITICAL level with their traceback and are not re-raised; the
    connection is closed in every case.
    """
    df_duplicate_raw = pd.DataFrame()
    df_duplicate_processed = pd.DataFrame()

    conn = connect_to_trino()
    if not conn:
        logging.critical("❌ Failed to establish a database connection. Exiting.")
        return df_duplicate_raw, df_duplicate_processed

    # Vehicles to pull; the same list is used for every day, so it is set once.
    # vehicle_ids_for_report = ['3', '16', '18', '19']
    vehicle_ids_for_report = ['3']

    try:
        # Determine the date range to process
        if start_date_str and end_date_str:
            start_date = date.fromisoformat(start_date_str)
            end_date = date.fromisoformat(end_date_str)
            day_count = (end_date - start_date).days + 1  # inclusive range
            date_range = [start_date + timedelta(days=offset) for offset in range(day_count)]
        else:
            # Default to processing yesterday's data
            date_range = [date.today() - timedelta(days=1)]

        for single_date in date_range:
            date_str = single_date.isoformat()
            logging.info(f"▶️ Starting daily report job for {date_str}")

            logging.info("\n--- Processing specific vehicle IDs ---")
            df_raw_specific = fetch_data_for_day(
                conn, date_str, COLUMNS_TO_FETCH, SOURCE_TABLE, vehicle_ids_for_report
            )
            df_duplicate_raw = df_raw_specific.copy()

            if df_raw_specific.empty:
                logging.info("Raw DataFrame is empty. No processing needed.")
                continue

            # Processing is disabled. An explicit empty placeholder keeps the
            # branch below well-defined; previously the name was never bound,
            # so every day with data raised NameError and ended the run.
            # TODO: re-enable once analyze_vehicle_energy_stats is available:
            # df_processed_specific = analyze_vehicle_energy_stats(df_raw_specific)
            df_processed_specific = pd.DataFrame()
            df_duplicate_processed = df_processed_specific.copy()

            if not df_processed_specific.empty:
                # TODO: re-enable the write together with the processing step.
                # NOTE(review): `db_operations` is not imported as a module in
                # this notebook — import it before uncommenting.
                # write_df_to_iceberg(conn, df_processed_specific, TABLE_NAME, db_operations.COLUMN_SCHEMA_MILEAGE)
                logging.info("✅ Processing and write for specific IDs complete.")
            else:
                logging.info("Processed DataFrame is empty. No data to write.")

    except Exception as e:
        # Deliberately best-effort: log (with traceback) and fall through so
        # the caller still receives whatever was fetched before the failure.
        logging.critical(f"❌ A critical error occurred in the main script: {e}", exc_info=True)

    finally:
        logging.info("🔒 STEP 5: Closing Trino connection...")
        conn.close()
        logging.info("✅ STEP 5: Connection closed.")

    return df_duplicate_raw, df_duplicate_processed

In [None]:

# Entry point. Calling main() with no arguments processes yesterday only; the
# two returned frames are kept in notebook-level names so the inspection cells
# that follow can read `global_df_raw`.
if __name__ == "__main__":
    global_df_raw, global_df_processed = main()
    # --- For a one-time manual backfill, uncomment the line below and set your dates ---
    # (as written it discards the return value; assign it as above if the
    #  frames are needed afterwards)
    # main(start_date_str='2025-07-24', end_date_str='2025-09-15')

    # --- For daily automated runs, use the existing call ---
    # main()

In [None]:
# Preview the first rows of the raw frame returned by main().
global_df_raw.head()

In [None]:
# Columns whose names start with "ccs" or "bcl".
cols = global_df_raw.filter(axis='columns', regex='^(ccs|bcl)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names start with "temp" or "low".
cols = global_df_raw.filter(axis='columns', regex='^(temp|low)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "pack1" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(pack1)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "pack2" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(pack2)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names start with "pack" or contain "voltage" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(^pack|voltage)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "pack" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(pack)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names start with "pack_temperature".
cols = global_df_raw.filter(axis='columns', regex='^(pack_temperature)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names start with "mot" (the index is printed first, then the count).
cols = global_df_raw.filter(axis='columns', regex='^(mot)').columns
print(cols, 'Length of col:', len(cols))

In [None]:
# Columns whose names contain "temp" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(temp)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "door" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(door)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "warning", "signal", "alarm" or "stat".
cols = global_df_raw.filter(axis='columns', regex='(warning|signal|alarm|stat)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "enable" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(enable)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "fault" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(fault)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "code" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(code)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "bat" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(bat)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "brake" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(brake)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "air" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(air)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "dc" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(dc)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "pres" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(pres)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "insul" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(insul)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Columns whose names contain "volt" or "cur" anywhere.
cols = global_df_raw.filter(axis='columns', regex='(volt|cur)').columns
print('Length of col:', len(cols), '\n', cols)

In [None]:
# Dump every column name of the raw frame, one per line.
for column_name in global_df_raw.columns:
    print(column_name)

In [None]:
# Earliest and latest values of the IST column in the raw frame, shown as a
# (min, max) tuple. IST is the alias COLUMNS_TO_FETCH gives to the timestamp
# converted to 'Asia/Kolkata' — presumably passed through unchanged by
# fetch_data_for_day; verify against common.db_operations.
global_df_raw.IST.min(),global_df_raw.IST.max()