# Handling Excel Files in Python

# =============================================================================================================================================================


**Author:** Luis Paulo Vinatea Barberena  
**Date:** 2025-05-21

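When you need to work with Excel files in Python, you can use the `openpyxl` library, which reads and writes Excel files in the `.xlsx` format.
In this notebook, `openpyxl` is used to inspect each workbook (sheet names and dimensions), while `pandas` loads the sheets into DataFrames. Combined, they are a powerful tool for data manipulation and analysis, especially when dealing with large datasets.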


# ========================================================================================================================================================

## Import Standard Libraries

# ========================================================================================================================================================


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import openpyxl
from IPython.display import display, HTML
import matplotlib.colors as mcolors


# Import project utilities
import sys

sys.path.append("../../")
from src.utils.data_processing import *  # noqa: F403
from src.utils.visualization import *  # noqa: F403

# Visualization settings
plt.style.use("seaborn-v0_8-whitegrid")
%matplotlib inline
sns.set_palette("viridis")
plt.rcParams["figure.figsize"] = (12, 8)
pd.set_option("display.max_columns", None)

# For reproducibility
np.random.seed(42)

# Configuration
XLSX_DATA_PATH = "../../data/raw/*.xlsx"
OUTPUT_DIR = "../../data/processed/sites/df1/"

# Global visualization settings
used_colors = set()
site_color_mapping = {}
available_css4_colors = list(mcolors.CSS4_COLORS.keys())


# =============================================================================

## Data Loading Functions

# =============================================================================


In [2]:
def load_excel_files(data_path: str) -> dict:
    """
    Load every sheet of every Excel workbook matching a glob pattern.

    Each sheet that loads successfully is stored under a sequential key
    ("df1", "df2", ...). Files are visited in sorted path order so the
    numbering is reproducible between runs.

    Args:
        data_path (str): Glob pattern for the Excel files to load

    Returns:
        dict: Mapping of "df<N>" names to the loaded DataFrames

    Raises:
        Exception: Any error raised while opening a workbook is printed and
            then re-raised unchanged.
    """
    dfs = {}

    try:
        # glob returns paths in arbitrary order; sorting keeps the df numbering
        # (and therefore which sheet ends up as "df1") stable.
        for file in sorted(glob.glob(data_path)):
            print(f"Loading data from {file}")

            # Load workbook with openpyxl for inspection
            wb = openpyxl.load_workbook(file, data_only=True, read_only=True)
            try:
                file_basename = os.path.basename(file)

                print(f"\nWorkbook: {file_basename}")
                print(
                    f"Contains {len(wb.sheetnames)} sheets: {', '.join(wb.sheetnames)}"
                )

                # Process each sheet
                for sheet_name in wb.sheetnames:
                    # Only successfully loaded sheets consume a number.
                    df_name = f"df{len(dfs) + 1}"
                    df = _load_sheet_with_header_detection(
                        file, sheet_name, wb[sheet_name]
                    )

                    if df is not None:
                        dfs[df_name] = df
                        print(f"  Loaded {df_name} with shape {df.shape}")
            finally:
                # Release the file handle even if a sheet fails part-way.
                wb.close()

        print(f"\nSuccessfully loaded {len(dfs)} dataframes.")
        return dfs

    except Exception as e:
        print(f"Error loading data: {e}")
        raise


def _load_sheet_with_header_detection(
    file_path: str, sheet_name: str, sheet
) -> pd.DataFrame:
    """
    Load a sheet with automatic header detection.

    Args:
        file_path (str): Path to the Excel file
        sheet_name (str): Name of the sheet
        sheet: Openpyxl sheet object

    Returns:
        pd.DataFrame: Loaded DataFrame or None if sheet is empty
    """
    try:
        # Check if sheet is empty
        if not sheet.max_row or not sheet.max_column:
            print(
                f"  Warning: Sheet '{sheet_name}' appears to be empty or corrupted"
            )
            return None

        print(f"\n  Sheet: '{sheet_name}'")
        print(
            f"  Dimensions: {sheet.max_row} rows x {sheet.max_column} columns"
        )

        # Check for merged cells
        try:
            merged_cells = list(sheet.merged_cells.ranges)
            if merged_cells:
                print(f"  Contains {len(merged_cells)} merged cell ranges")
        except AttributeError:
            print("  Note: Cannot check merged cells in read-only mode")

        # Load initial DataFrame without headers
        df = pd.read_excel(
            file_path,
            sheet_name=sheet_name,
            header=None,
            na_values=["NA", "N/A", ""],
            keep_default_na=True,
        )

        # Detect header row
        header_row = _detect_header_row(df)

        if header_row is not None:
            # Reload with detected header
            df = pd.read_excel(
                file_path,
                sheet_name=sheet_name,
                header=header_row,
                na_values=["NA", "N/A", ""],
                keep_default_na=True,
            )
            print(f"  Detected header at row {header_row + 1}")

        return df

    except Exception as e:
        print(f"  Error loading sheet '{sheet_name}': {e}")
        return None


def _detect_header_row(df: pd.DataFrame) -> int:
    """
    Detect the most likely header row in a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame to analyze

    Returns:
        int: Index of the header row, or None if not found
    """
    for i in range(min(10, len(df))):
        str_count = sum(1 for x in df.iloc[i] if isinstance(x, str))
        if str_count > 0.5 * df.shape[1]:  # More than half are strings
            return i
    return None


# Step 1: Load data
# Print a banner, then load every sheet of every workbook matched by
# XLSX_DATA_PATH into the module-level `dfs` dict used by later steps.
print("\n" + "=" * 60)
print("STEP 1: LOADING DATA")
print("=" * 60)
dfs = load_excel_files(XLSX_DATA_PATH)



STEP 1: LOADING DATA
Loading data from ../../data/raw/kpis_ambientales_2025.xlsx

Workbook: kpis_ambientales_2025.xlsx
Contains 4 sheets: DATOS, Huella 2025, Calculo de reduccion, INDICADORES

  Sheet: 'DATOS'
  Dimensions: 57 rows x 8 columns
  Note: Cannot check merged cells in read-only mode
  Detected header at row 4
  Loaded df1 with shape (53, 8)

  Sheet: 'Huella 2025'
  Dimensions: 21 rows x 19 columns
  Note: Cannot check merged cells in read-only mode
  Detected header at row 6
  Loaded df2 with shape (15, 19)

  Sheet: 'Calculo de reduccion'
  Dimensions: 9 rows x 8 columns
  Note: Cannot check merged cells in read-only mode
  Detected header at row 3
  Loaded df3 with shape (6, 8)

  Sheet: 'INDICADORES'
  Dimensions: 14 rows x 9 columns
  Note: Cannot check merged cells in read-only mode
  Detected header at row 5
  Loaded df4 with shape (9, 9)
Loading data from ../../data/raw/objetivos_y_estrategias_2025.xlsx

Workbook: objetivos_y_estrategias_2025.xlsx
Contains 1 sheets

# =============================================================================

## Data Cleaning Functions

# =============================================================================


In [3]:
def clean_dataframes(dfs: dict) -> dict:
    """
    Run the full cleaning pipeline over every DataFrame.

    Args:
        dfs (dict): Dictionary of DataFrames

    Returns:
        dict: Dictionary of cleaned DataFrames
    """
    print("Starting data cleaning process...")

    # Steps run in this fixed order; each takes and returns the dict.
    pipeline = (
        standardize_headers,
        remove_empty_rows_and_cols,
        handle_nan_values,
        title_case_columns,
    )
    cleaned = dfs
    for step in pipeline:
        cleaned = step(cleaned)

    print("Data cleaning completed!")
    return cleaned


def standardize_headers(dfs: dict) -> dict:
    """
    Repair unnamed columns and translate month headers in every DataFrame.

    Args:
        dfs (dict): Dictionary of DataFrames

    Returns:
        dict: The same dictionary, with each entry replaced by its
        header-standardized DataFrame
    """
    rule = "=" * 50
    print(rule)
    print("STANDARDIZING DATAFRAME HEADERS")
    print(f"{rule}\n")

    for df_name in list(dfs):
        print(f"Processing {df_name}...")

        # Unnamed columns are fixed first, then month names are translated.
        renamed = _fix_unnamed_columns(dfs[df_name])
        dfs[df_name] = _standardize_date_columns(renamed)

    print("\nHeader standardization complete!")
    return dfs


def _fix_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Fix unnamed columns by finding header values in data rows."""
    unnamed_cols = [col for col in df.columns if "Unnamed" in str(col)]

    if len(unnamed_cols) > 0:
        print(f"  Found {len(unnamed_cols)} unnamed columns")

        for i in range(min(5, len(df))):
            row = df.iloc[i]
            str_values = [v for v in row if isinstance(v, str) and pd.notna(v)]

            if len(str_values) >= len(unnamed_cols) * 0.7:
                new_names = {}
                original_cols = df.columns.tolist()

                for j, col in enumerate(original_cols):
                    if (
                        "Unnamed" in str(col)
                        and j < len(row)
                        and pd.notna(row.iloc[j])
                        and isinstance(row.iloc[j], str)
                    ):
                        new_names[col] = str(row.iloc[j]).strip()

                if new_names:
                    print(f"  Found potential header values in row {i + 1}")
                    print(f"  Renaming {len(new_names)} columns")

                    df = df.rename(columns=new_names)
                    df = df.drop(index=i)
                    print(f"  New columns: {list(new_names.values())}")
                    break

    return df


def _standardize_date_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize date column names to Spanish format."""
    date_mappings = {
        "January": "Enero",
        "February": "Febrero",
        "March": "Marzo",
        "April": "Abril",
        "May": "Mayo",
        "June": "Junio",
        "July": "Julio",
        "August": "Agosto",
        "September": "Septiembre",
        "October": "Octubre",
        "November": "Noviembre",
        "December": "Diciembre",
    }

    date_cols = [col for col in df.columns if str(col) in date_mappings]
    if date_cols:
        print(
            f"  Standardizing {len(date_cols)} date-related columns to Spanish format"
        )
        df = df.rename(columns=date_mappings)

    return df


def remove_empty_rows_and_cols(dfs: dict) -> dict:
    """Drop all-NaN rows, then all-NaN columns, from every DataFrame."""
    for df_name in list(dfs):
        before = dfs[df_name]
        without_empty_rows = before.dropna(axis=0, how="all")
        trimmed = without_empty_rows.dropna(axis=1, how="all")
        dfs[df_name] = trimmed
        print(f"Cleaned {df_name}: {before.shape} -> {trimmed.shape}")
    return dfs


def handle_nan_values(dfs: dict) -> dict:
    """Report NaN statistics per DataFrame, then prune sparse columns and rows."""
    divider = "-" * 50

    for df_name in list(dfs):
        frame = dfs[df_name]
        print(f"\n{divider}")
        print(f"NaN analysis for {df_name}:")

        # Share of missing cells, guarding against an empty frame.
        n_rows, n_cols = frame.shape
        total_cells = n_rows * n_cols
        nan_count = frame.isna().sum().sum()
        if total_cells > 0:
            nan_percentage = nan_count / total_cells
        else:
            nan_percentage = 0

        print(
            f"- Total NaN values: {nan_count} ({nan_percentage:.1%} of all cells)"
        )

        # Columns are pruned before rows.
        frame = _remove_high_nan_columns(frame)
        frame = _remove_critical_nan_rows(frame)

        dfs[df_name] = frame
        print(f"- Final shape after analysis: {frame.shape}")

    return dfs


def _remove_high_nan_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Remove columns with >80% NaN values."""
    column_nan_pct = df.isna().mean().sort_values(ascending=False)
    high_nan_cols = column_nan_pct[column_nan_pct > 0.8].index.tolist()

    if high_nan_cols:
        print(f"- Columns with >80% NaNs: {', '.join(high_nan_cols)}")
        df = df.drop(columns=high_nan_cols)

    return df


def _remove_critical_nan_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Remove rows where critical columns are all NaN."""
    critical_cols = [col for col in df.columns if "Unnamed" not in str(col)]

    if not critical_cols and len(df.columns) > 2:
        critical_cols = [df.columns[1], df.columns[2]]

    if critical_cols:
        print(f"- Critical columns: {', '.join(map(str, critical_cols))}")
        rows_to_drop = df[df[critical_cols].isna().all(axis=1)].index
        print(
            f"- Found {len(rows_to_drop)} rows where all critical columns are NaN"
        )
        df = df.drop(index=rows_to_drop)

    return df


def title_case_columns(dfs: dict) -> dict:
    """
    Apply title case to all string column names.

    Non-string labels (numbers, dates, NaN) are kept as they are; the `.str`
    accessor would turn them into NaN in a mixed header or raise on an
    all-numeric one.

    Args:
        dfs (dict): Dictionary of DataFrames; their columns are renamed in place

    Returns:
        dict: The same dictionary
    """
    for df_obj in dfs.values():
        df_obj.columns = df_obj.columns.map(
            lambda label: label.title() if isinstance(label, str) else label
        )
    return dfs


# Step 2: Clean data
# Print a banner, then run the cleaning pipeline and rebind `dfs` to its result.
print("\n" + "=" * 60)
print("STEP 2: CLEANING DATA")
print("=" * 60)
dfs = clean_dataframes(dfs)



STEP 2: CLEANING DATA
Starting data cleaning process...
STANDARDIZING DATAFRAME HEADERS

Processing df1...
  Found 2 unnamed columns
  Standardizing 4 date-related columns to Spanish format
Processing df2...
  Found 7 unnamed columns
Processing df3...
  Found 1 unnamed columns
Processing df4...
  Found 2 unnamed columns
  Found potential header values in row 1
  Renaming 1 columns
  New columns: ['variación porcentual mensual del consumo']
Processing df5...

Header standardization complete!
Cleaned df1: (53, 8) -> (43, 7)
Cleaned df2: (15, 19) -> (14, 10)
Cleaned df3: (6, 8) -> (6, 7)
Cleaned df4: (8, 9) -> (5, 8)
Cleaned df5: (33, 11) -> (33, 11)

--------------------------------------------------
NaN analysis for df1:
- Total NaN values: 82 (27.2% of all cells)
- Columns with >80% NaNs: Unnamed: 3
- Critical columns: STIOS, INDICADOR, Enero, Febrero, Marzo, Abril
- Found 0 rows where all critical columns are NaN
- Final shape after analysis: (43, 6)

--------------------------------

# =============================================================================

## Data Analysis Functions

# =============================================================================


In [4]:
def analyze_dataframes(dfs: dict) -> None:
    """Print a summary report for each DataFrame in the dictionary."""
    print(f"Found {len(dfs)} dataframes in the environment\n")

    for name in dfs:
        _print_single_dataframe_info(name, dfs[name])


def _print_single_dataframe_info(df_name: str, df_obj: pd.DataFrame) -> None:
    """Print shape, missing-value share, dtype counts and a preview of one DataFrame."""
    rule = "=" * 50
    n_rows, n_cols = df_obj.shape

    print(rule)
    print(f"DataFrame: {df_name} with shape {df_obj.shape}")
    print(rule)

    # Size and missing-value share (0 for a frame with no cells).
    total_cells = n_rows * n_cols
    missing_values = df_obj.isna().sum().sum()
    if total_cells > 0:
        missing_pct = missing_values / total_cells
    else:
        missing_pct = 0

    print("\n📊 Basic Information:")
    print(f"  - Rows: {n_rows}")
    print(f"  - Columns: {n_cols}")
    print(
        f"  - Missing values: {missing_values} ({missing_pct:.1%} of all cells)"
    )

    # How many columns there are of each dtype.
    print("\n📋 Data Types:")
    dtype_counts = df_obj.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  - {dtype}: {count} columns")

    # Flag columns whose header was never resolved.
    unnamed_total = sum(1 for label in df_obj.columns if "Unnamed" in str(label))
    if unnamed_total:
        print(f"\n⚠️ Found {unnamed_total} unnamed columns")

    # Extra report for frames that still carry "Unnamed: 1"/"Unnamed: 2".
    _analyze_content_patterns(df_obj)

    print("\n🔍 Data Preview:")
    print(df_obj.head(3))
    print("\n\n")


def _analyze_content_patterns(df: pd.DataFrame) -> None:
    """Analyze content patterns in DataFrame."""
    if "Unnamed: 1" in df.columns and "Unnamed: 2" in df.columns:
        unique_sites = df["Unnamed: 1"].dropna().unique()
        unique_indicators = df["Unnamed: 2"].dropna().unique()

        print("\n📈 Content Analysis:")
        print(f"  - Potential sites: {len(unique_sites)}")
        print(f"  - Potential indicators: {len(unique_indicators)}")

        if len(unique_sites) > 0:
            print("\n🏢 Sample sites:", ", ".join(map(str, unique_sites[:5])))
        if len(unique_indicators) > 0:
            print(
                "\n📏 Sample indicators:",
                ", ".join(map(str, unique_indicators[:5])),
            )


def preview_dataframes_html(dfs: dict) -> None:
    """Render the first ten rows of each DataFrame as a scrollable HTML table."""
    rule = "=" * 50

    for df_name, df_obj in dfs.items():
        print(rule)
        print(f"Previewing {df_name}...")

        first_rows = df_obj.head(10)
        html_table = first_rows.to_html(index=False, max_rows=10)
        # Wrap the table so wide or tall previews scroll instead of overflowing.
        styled_html = f"""
        <div style="overflow-x: auto; max-height: 500px; overflow-y: auto;">
            {html_table}
        </div>
        """
        display(HTML(styled_html))
        print(f"{rule}\n")


def display_datatypes(dfs: dict) -> None:
    """Show a two-column HTML table (column name, dtype) for each DataFrame."""
    rule = "=" * 50

    for df_name, df_obj in dfs.items():
        print(rule)
        print(f"DataFrame: {df_name} with shape {df_obj.shape}")
        print(rule)

        # One row per column of the source frame.
        dtype_df = df_obj.dtypes.to_frame().reset_index()
        dtype_df.columns = ["Column", "Data Type"]
        dtype_html = dtype_df.to_html(index=False)
        display(HTML(dtype_html))
        print(f"{rule}\n")


# Step 3: Analyze data
# Print a banner, then print a per-DataFrame report for the cleaned `dfs`.
print("\n" + "=" * 60)
print("STEP 3: ANALYZING DATA")
print("=" * 60)
analyze_dataframes(dfs)



STEP 3: ANALYZING DATA
Found 5 dataframes in the environment

DataFrame: df1 with shape (43, 6)

📊 Basic Information:
  - Rows: 43
  - Columns: 6
  - Missing values: 46 (17.8% of all cells)

📋 Data Types:
  - object: 6 columns

🔍 Data Preview:
      Stios Indicador  Enero Febrero  Marzo  Abril
0      TSAN      Kw/h  54850   59992  48422  58710
1       GLP       NaN  38136   41690  33650  40797
2  AVIACION       NaN  15178    6500   8244   6612



DataFrame: df2 with shape (12, 9)

📊 Basic Information:
  - Rows: 12
  - Columns: 9
  - Missing values: 3 (2.8% of all cells)

📋 Data Types:
  - float64: 7 columns
  - object: 2 columns

⚠️ Found 5 unnamed columns

📈 Content Analysis:
  - Potential sites: 3
  - Potential indicators: 12

🏢 Sample sites: 1.0, 3.0, 2.0

📏 Sample indicators: Combustibles - Diesel (Remolcadores y Barcazas), Combustibles - Diesel Flota puma , Combustible - Nafta vehiculos Flota Puma , Consumo de Agua terminales , Electricidad (Terminales)

🔍 Data Preview:
   Unname

# =============================================================================

## Specialized Data Processing Functions

# =============================================================================


In [5]:
def process_main_dataframe(dfs: dict, df_key: str = "df1") -> dict:
    """
    Turn the main DataFrame into a dictionary of processed sections.

    Works on a copy of the selected frame: rows whose "Stios" value is
    "TOTAL PY" are removed, the rest is split into sections, and the sections
    are cleaned, post-processed and given a Total column.

    Args:
        dfs (dict): Dictionary of DataFrames
        df_key (str): Key of the main DataFrame to process

    Returns:
        dict: Dictionary of processed section DataFrames

    Raises:
        ValueError: If df_key is not a key of dfs
    """
    if df_key not in dfs:
        raise ValueError(f"DataFrame '{df_key}' not found in dfs")

    working = dfs[df_key].copy()

    # Drop the "TOTAL PY" rows and renumber the remaining rows.
    is_grand_total = working["Stios"].isin(["TOTAL PY"])
    working = working[~is_grand_total].reset_index(drop=True)

    sections = _split_dataframe_by_sections(working)

    # Each stage takes the section dict and returns it.
    stages = (
        _clean_section_data,
        _apply_specialized_processing,
        _add_calculated_columns,
    )
    for stage in stages:
        sections = stage(sections)

    return sections


def _split_dataframe_by_sections(df: pd.DataFrame) -> dict:
    """Split DataFrame into sections using TOTAL rows as markers."""
    site_dfs = {}
    current_section = 0
    current_rows = []

    for index, row in df.iterrows():
        if "TOTAL" in str(row["Stios"]):
            # Save previous section
            if current_rows:
                site_dfs[f"Section_{current_section}"] = pd.DataFrame(
                    current_rows
                ).reset_index(drop=True)
                current_section += 1
            current_rows = [row]
        else:
            current_rows.append(row)

    # Save last section
    if current_rows:
        site_dfs[f"Section_{current_section}"] = pd.DataFrame(
            current_rows
        ).reset_index(drop=True)

    # Handle Section_5 split
    if "Section_5" in site_dfs and len(site_dfs["Section_5"]) > 2:
        last_2_rows = site_dfs["Section_5"].tail(2).copy()
        site_dfs["Section_5"] = (
            site_dfs["Section_5"].iloc[:-2].reset_index(drop=True)
        )
        site_dfs["Section_6"] = last_2_rows.reset_index(drop=True)

    return site_dfs


def _clean_section_data(site_dfs: dict) -> dict:
    """Clean section data by removing header rows and filling NaN values."""
    for i in range(1, 6):
        section_name = f"Section_{i}"
        if section_name in site_dfs and len(site_dfs[section_name]) > 1:
            # Remove first two rows (headers)
            site_dfs[section_name] = (
                site_dfs[section_name].iloc[2:].reset_index(drop=True)
            )

            # Fill NaN values in Indicador column
            if "Indicador" in site_dfs[section_name].columns:
                df = site_dfs[section_name]
                valid_values = df["Indicador"].dropna()
                valid_values = valid_values[
                    ~valid_values.str.lower().eq("indicador")
                ]

                if not valid_values.empty:
                    first_valid = valid_values.iloc[0]
                    df["Indicador"] = df["Indicador"].fillna(first_valid)

    # Remove last row from Section_5
    if "Section_5" in site_dfs and len(site_dfs["Section_5"]) > 0:
        site_dfs["Section_5"] = (
            site_dfs["Section_5"].iloc[:-1].reset_index(drop=True)
        )

    return site_dfs


def _apply_specialized_processing(site_dfs: dict) -> dict:
    """Apply specialized processing to specific sections."""
    # Process Section_1: Replace NaNs with column means (df2 processing)
    if "Section_1" in site_dfs:
        site_dfs["Section_1"] = _replace_zeros_with_means(
            site_dfs["Section_1"]
        )

    # Process Section_2: Replace zeros with column means
    if "Section_2" in site_dfs:
        site_dfs["Section_2"] = _replace_zeros_with_means(
            site_dfs["Section_2"]
        )

    # Process Section_3: Replace missing values with column means
    if "Section_3" in site_dfs:
        site_dfs["Section_3"] = _replace_zeros_with_means(
            site_dfs["Section_3"]
        )

    # Process Section_6: Convert gallons to liters
    if "Section_6" in site_dfs:
        site_dfs["Section_6"] = _convert_gallons_to_liters(
            site_dfs["Section_6"]
        )

    return site_dfs


def _replace_zeros_with_means(df: pd.DataFrame) -> pd.DataFrame:
    """Replace zero values with column means for numeric columns."""
    month_columns = ["Enero", "Febrero", "Marzo", "Abril"]

    for col in month_columns:
        if col in df.columns:
            # Convert to numeric
            df[col] = pd.to_numeric(df[col], errors="coerce")

            # Calculate mean of non-zero values
            non_zero_values = df[col][(df[col] != 0) & (df[col].notna())]

            if len(non_zero_values) > 0:
                col_mean = non_zero_values.mean()

                # Replace zeros and NaNs with mean
                df[col] = df[col].replace(0, col_mean).fillna(col_mean)
                print(
                    f"Replaced zeros and NaNs in {col} with mean: {col_mean:.2f}"
                )

    return df


def _convert_gallons_to_liters(df: pd.DataFrame) -> pd.DataFrame:
    """Convert gallons to liters in the first row if applicable."""
    if len(df) > 0 and "Galones" in str(df.iloc[0]["Indicador"]):
        month_cols = ["Enero", "Febrero", "Marzo", "Abril"]
        for col in month_cols:
            if col in df.columns:
                df.loc[0, col] = df.iloc[0][col] * 3.78541

        df.loc[0, "Indicador"] = "Consumo de Diesel (Litros)"
        print("Converted gallons to liters and renamed indicator")

    return df


def _add_calculated_columns(site_dfs: dict) -> dict:
    """Add calculated Total column to all sections."""
    numeric_columns = ["Enero", "Febrero", "Marzo", "Abril"]

    for section_name, df in site_dfs.items():
        if "Total" not in df.columns:
            # Convert to numeric and calculate totals
            for col in numeric_columns:
                if col in df.columns:
                    df[col] = pd.to_numeric(df[col], errors="coerce")

            df["Total"] = df[numeric_columns].sum(axis=1, skipna=True)
            site_dfs[section_name] = df
            print(f"Added Total column to {section_name}")

    return site_dfs


def save_sections_to_csv(site_dfs: dict, output_dir: str) -> None:
    """Write every section to "<output_dir>/<lower-cased section name>.csv".

    The output directory is created if it does not exist; the DataFrame index
    is not written.
    """
    os.makedirs(output_dir, exist_ok=True)

    for section_name in site_dfs:
        filename = section_name.replace("Section_", "section_").lower()
        filepath = os.path.join(output_dir, filename + ".csv")
        site_dfs[section_name].to_csv(filepath, index=False)
        print(f"Saved {filename} to {filepath}")


# Step 5: Process main DataFrame
# NOTE(review): numbering jumps from Step 3 to Step 5 — no Step 4 is visible here.
# Splits dfs["df1"] (the default key) into processed section DataFrames.
print("\n" + "=" * 60)
print("STEP 5: PROCESSING MAIN DATAFRAME")
print("=" * 60)
site_dfs = process_main_dataframe(dfs)

# Step 6: Save processed data
# Writes one CSV per section into OUTPUT_DIR.
print("\n" + "=" * 60)
print("STEP 6: SAVING PROCESSED DATA")
print("=" * 60)
save_sections_to_csv(site_dfs, OUTPUT_DIR)



STEP 5: PROCESSING MAIN DATAFRAME
Replaced zeros and NaNs in Enero with mean: 193360.00
Replaced zeros and NaNs in Febrero with mean: 960.00
Replaced zeros and NaNs in Marzo with mean: 740.00
Replaced zeros and NaNs in Abril with mean: 630.00
Replaced zeros and NaNs in Enero with mean: 14820.00
Replaced zeros and NaNs in Febrero with mean: 23023.33
Replaced zeros and NaNs in Marzo with mean: 29185.00
Replaced zeros and NaNs in Abril with mean: 16935.00
Replaced zeros and NaNs in Enero with mean: 1498.50
Replaced zeros and NaNs in Febrero with mean: 558.25
Replaced zeros and NaNs in Marzo with mean: 675.25
Replaced zeros and NaNs in Abril with mean: 1052.10
Converted gallons to liters and renamed indicator
Added Total column to Section_0
Added Total column to Section_1
Added Total column to Section_2
Added Total column to Section_3
Added Total column to Section_4
Added Total column to Section_5
Added Total column to Section_6

STEP 6: SAVING PROCESSED DATA
Saved section_0 to ../../data