In [1]:
# Block: Generate Summaries Directly from CSV using Pandas
import pandas as pd
import json
from pathlib import Path
from collections import defaultdict

# --- Configuration ---
# Input CSV to summarize. The path has no directory component, so it is
# resolved relative to the current working directory.
# NOTE(review): an earlier note said this file lives in the parent directory,
# but the path below has no "../" prefix -- confirm the intended location.
CSV_PATH = "CONSOLIDATED_OUTPUT_DATA.csv"
# Output file for the per-column summary (name, dtype, small unique-value lists).
SUMMARY_PATH = "db_summary.json"
# Output file for the mapping of each "Context" value to its distinct "KPI" values.
KPI_MAPPING_PATH = "context_kpi_mapping.json"
# -------------------

def _summarize_columns(df, max_unique=50):
    """Return one summary dict (name, dtype, unique values) per DataFrame column."""
    columns = []
    for col_name in df.columns:
        series = df[col_name]
        column_data = {
            "name": col_name,
            "type": str(series.dtype),  # Pandas dtype rendered as a string
            "unique_values": None,
        }

        # Best effort: a column that cannot be summarized keeps
        # "unique_values": None instead of aborting the whole run.
        try:
            # nunique() ignores NaN. Only low-cardinality columns get their
            # values listed, to keep the JSON file size manageable.
            if series.nunique() <= max_unique:
                column_data["unique_values"] = [
                    str(v) for v in series.unique() if pd.notna(v)
                ]
        except Exception as e:
            print(f"⚠️ Could not process column '{col_name}' for unique values: {e}")

        columns.append(column_data)
    return columns


def _build_context_kpi_map(df):
    """Return {context: [kpi, ...]} built from the distinct, non-null pairs."""
    # Pandas equivalent of 'SELECT DISTINCT "Context", "KPI"' with null rows
    # removed; first-appearance order is preserved.
    pairs = df[["Context", "KPI"]].dropna().drop_duplicates()

    context_kpi_map = defaultdict(list)
    # Series.tolist() yields native Python scalars. Iterating rows directly
    # can yield numpy scalars (e.g. numpy.int64), which json.dump rejects
    # both as dictionary keys and as values.
    for context, kpi in zip(pairs["Context"].tolist(), pairs["KPI"].tolist()):
        context_kpi_map[context].append(kpi)
    return dict(context_kpi_map)


def create_summaries_from_csv(csv_path, summary_path, kpi_mapping_path):
    """
    Reads a CSV file into a Pandas DataFrame and generates a db_summary.json
    and a context_kpi_mapping.json without using an external database.

    Args:
        csv_path: Path of the CSV file to read. The file's stem is used as
            the table name in the summary.
        summary_path: Destination of the JSON column summary. Each column is
            recorded with its name, Pandas dtype and, when it has at most 50
            distinct non-null values, those values as strings.
        kpi_mapping_path: Destination of the JSON mapping from each "Context"
            value to the list of distinct "KPI" values seen with it.

    Returns:
        True when both files were written. False when the CSV is missing or
        unreadable (nothing is written), or when the "Context"/"KPI" columns
        are absent (the summary is still written, the mapping is not).
        Problems are reported on stdout rather than raised.
    """
    print("🚀 Starting data summary generation using Pandas...")
    csv_file = Path(csv_path)
    if not csv_file.exists():
        print(f"❌ Error: CSV file not found at '{csv_path}'")
        return False

    # --- Load Data into Pandas ---
    try:
        # Using low_memory=False can help prevent dtype mixing issues with large files
        df = pd.read_csv(csv_path, low_memory=False)
    except Exception as e:
        print(f"❌ Error reading CSV file: {e}")
        return False
    print(f"✔️ Successfully loaded '{csv_path}' into DataFrame.")

    # === Part 1: Generate Database Summary (db_summary.json) ===
    print("✔️ Generating database summary...")
    db_summary = {
        "tables": [{"name": csv_file.stem, "columns": _summarize_columns(df)}]
    }
    with open(summary_path, 'w', encoding="utf-8") as f:
        json.dump(db_summary, f, indent=4)
    print(f"✔️ Database summary saved to '{summary_path}'")

    # === Part 2: Generate Context-KPI Mapping (context_kpi_mapping.json) ===
    print("✔️ Generating Context-KPI mapping...")
    if "Context" not in df.columns or "KPI" not in df.columns:
        print("❌ Error: 'Context' or 'KPI' column not found in the CSV. Cannot create mapping.")
        return False

    context_kpi_map = _build_context_kpi_map(df)
    with open(kpi_mapping_path, 'w', encoding="utf-8") as f:
        json.dump(context_kpi_map, f, indent=4, sort_keys=True)
    print(f"✔️ Context-KPI mapping saved to '{kpi_mapping_path}'")
    return True


# --- Entry point: build both JSON artifacts from the configured CSV ---
create_summaries_from_csv(
    csv_path=CSV_PATH,
    summary_path=SUMMARY_PATH,
    kpi_mapping_path=KPI_MAPPING_PATH,
)
# Printed unconditionally once the call returns.
print("\n✅ Full data processing is complete!")

🚀 Starting data summary generation using Pandas...
✔️ Successfully loaded 'CONSOLIDATED_OUTPUT_DATA.csv' into DataFrame.
✔️ Generating database summary...
✔️ Database summary saved to 'db_summary.json'
✔️ Generating Context-KPI mapping...
✔️ Context-KPI mapping saved to 'context_kpi_mapping.json'

✅ Full data processing is complete!
