# Pollutant Data Analysis and Visualization

This Jupyter Notebook performs a comprehensive analysis and visualization of pollutant data from a CSV file. It includes steps for data loading, cleaning, and generating 40 different charts to provide insights into pollutant concentrations and trends.

The charts are designed to be publication-ready with improved aesthetics, clarity, and resolution.

**Author:** codernumber1
**Coding Assistant ID:** 85317

## 1. Setup and Imports
First, we'll import all the necessary libraries. If any of them are not installed, uncomment and run the `pip install` command in the cell below.

In [None]:
# Required third-party libraries (pathlib and random ship with Python and must not be pip-installed):
# !pip install pandas matplotlib seaborn statsmodels scikit-learn

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import random
import numpy as np # For numerical operations like z-score
from statsmodels.tsa.seasonal import seasonal_decompose # For time series decomposition
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf # For autocorrelation plots
from sklearn.preprocessing import StandardScaler # For scaling data for Andrews Curves/Parallel Coordinates

def get_random_coding_assistant_id() -> int:
    """Return a pseudo-random five-digit coding assistant ID.

    The ID is drawn uniformly from the inclusive range 10000-99999 using the
    module-level ``random`` generator, so seeding ``random`` makes it
    reproducible.
    """
    lowest_id, highest_id = 10000, 99999
    assistant_id = random.randint(lowest_id, highest_id)
    return assistant_id

## 2. Data Loading and Cleaning Function
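This function loads the CSV data, converts the 'Date' column to datetime objects (dropping rows whose date cannot be parsed and sorting the rest by date), and adds a 'DayOfWeek' column. It then coerces every remaining column except 'Sheet' and 'Location' to numeric and fills missing numerical values with the mean of their respective columns.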

In [None]:
def load_and_clean_data(file_path: Path) -> pd.DataFrame:
    """
    Load the pollutant data from a CSV file, clean it, and fix its data types.

    Cleaning steps, in order:
      1. If a 'Date' column exists, parse it day-first, drop rows whose date
         cannot be parsed, sort by date and add an ordered categorical
         'DayOfWeek' column (Monday..Sunday).
      2. Coerce every column other than 'Date', 'Sheet', 'Location' and
         'DayOfWeek' to numeric; unparseable entries (e.g. '-') become NaN.
      3. Fill remaining NaNs with the column mean. Columns that contain no
         numeric value at all have no mean and are left as NaN.

    Progress and diagnostics are printed to stdout.

    Args:
        file_path (Path): The path to the CSV file.

    Returns:
        pd.DataFrame: The cleaned DataFrame, or an empty DataFrame if the
        file is missing or cannot be read.
    """
    print(f"Loading data from: {file_path}")
    try:
        df = pd.read_csv(file_path)
        print("Data loaded successfully.")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure the path is correct.")
        return pd.DataFrame()
    except Exception as e:
        # Deliberate best-effort boundary: any read/parse failure is reported
        # and signalled to the caller through an empty DataFrame.
        print(f"Error loading CSV file: {e}")
        return pd.DataFrame()

    print("Initial data types:")
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() would emit a stray "None" line.
    df.info()

    # --- Data Cleaning ---
    if 'Date' in df.columns:
        # errors='coerce' turns unparseable dates into NaT (Not a Time);
        # dayfirst=True because the data is assumed to use DD.MM.YYYY.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
        # Drop rows without a usable date, then sort for time series analysis.
        df = df.dropna(subset=['Date']).sort_values(by='Date')
        print("\n'Date' column converted to datetime and sorted.")

        # Ordered categorical so weekday plots follow calendar order.
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        df['DayOfWeek'] = pd.Categorical(df['Date'].dt.day_name(), categories=day_order, ordered=True)
        print("'DayOfWeek' column added to DataFrame.")
    else:
        print("Warning: 'Date' column not found. Time-series analysis might be limited.")

    # Everything that is not a known metadata column is treated as a pollutant.
    non_pollutant_cols = {'Date', 'Sheet', 'Location', 'DayOfWeek'}
    pollutant_cols = [col for col in df.columns if col not in non_pollutant_cols]

    # errors='coerce' already maps non-numeric markers such as '-' to NaN, so
    # no separate replace step is needed.
    for col in pollutant_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Mean imputation so that downstream numerical operations can proceed.
    print("\nHandling missing values (NaNs) by filling with column mean...")
    for col in pollutant_cols:
        if not df[col].isnull().any():
            continue
        mean_val = df[col].mean()
        if pd.isna(mean_val):
            # No numeric value in the column: there is nothing to impute with.
            print(f"  Warning: '{col}' has no numeric values; leaving it as NaN.")
            continue
        # Direct assignment instead of inplace=True avoids pandas' FutureWarning.
        df[col] = df[col].fillna(mean_val)
        print(f"  Filled NaN in '{col}' with mean: {mean_val:.2f}")

    print("\nData cleaning complete. Final data types:")
    df.info()
    print(f"DataFrame shape after cleaning: {df.shape}")
    return df

## 3. Chart Generation Function
This function generates up to 40 different plots, including histograms, time series, box plots, correlation heatmaps, and more advanced visualizations. Charts whose required columns or data are unavailable are skipped with a console message. Each generated chart is saved as a high-resolution PNG file in the specified output directory.

In [None]:
def generate_charts(df: pd.DataFrame, output_dir: Path):
    """
    Generates 40 different charts to visualize the pollutant data and saves them
    to the specified output directory. Charts are designed for publication.

    Args:
        df (pd.DataFrame): The cleaned DataFrame containing pollutant data.
        output_dir (Path): The directory to save the charts.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"\nGenerating charts and saving to '{output_dir}'...")

    # Set a consistent style for plots for publication quality
    sns.set_style("whitegrid")
    plt.rcParams.update({
        'figure.figsize': (12, 7),  # Slightly larger default figure size
        'font.size': 12,            # Base font size
        'axes.labelsize': 14,       # Axis label font size
        'axes.titlesize': 16,       # Title font size
        'xtick.labelsize': 12,      # X-tick label font size
        'ytick.labelsize': 12,      # Y-tick label font size
        'legend.fontsize': 12,      # Legend font size
        'lines.linewidth': 1.5,     # Line width for line plots
        'axes.edgecolor': '0.15',   # Darker axes for better contrast
        'axes.linewidth': 1.2,      # Thicker axes lines
        'grid.linestyle': '--',     # Dashed gridlines
        'grid.linewidth': 0.7,      # Thinner gridlines
        'savefig.dpi': 300,         # Higher DPI for publication quality
        'savefig.bbox': 'tight'     # Ensures all elements fit in the saved figure
    })

    # Identify available pollutant columns for plotting
    available_pollutants = [
        col for col in ['NO2', 'SO2', 'PM10', 'PM2.5', 'O3_8hrs', 'CO_8hrs_mg_m3', 'NH3', 'Pb']
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]) and not df[col].isnull().all()
    ]
    if not available_pollutants:
        print("No valid numeric pollutant columns with data found for plotting. Skipping chart generation.")
        return

    # Define color palettes for consistency
    palette_hist = sns.color_palette("viridis", n_colors=4)
    palette_ts = sns.color_palette("plasma", n_colors=4)
    palette_categorical = sns.color_palette("pastel")


    # --- Original Charts (1-20) ---

    # --- Chart 1-4: Histograms/KDE for individual pollutants ---
    for i, pollutant in enumerate(available_pollutants[:4]): # Plot first 4 available
        plt.figure()
        sns.histplot(df[pollutant], kde=True, bins=30, color=palette_hist[i], edgecolor='black')
        plt.title(f'Distribution of {pollutant} Concentration in Bhilai', fontsize=16)
        plt.xlabel(f'{pollutant} Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
        plt.ylabel('Frequency', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / f'chart_{i+1}_hist_{pollutant}.png', dpi=300)
        plt.close()
        print(f"Generated chart {i+1}: Histogram for {pollutant}")

    # --- Chart 5: Box plot for all key pollutants ---
    plt.figure(figsize=(14, 8)) # Larger figure for multiple box plots
    sns.boxplot(data=df[available_pollutants], palette='Set2')
    plt.title('Box Plot of Key Pollutant Concentrations in Bhilai', fontsize=16)
    plt.ylabel('Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(output_dir / 'chart_5_boxplot_all_pollutants.png', dpi=300)
    plt.close()
    print("Generated chart 5: Box plot for all pollutants")

    # Ensure 'Date' column is available and has valid data for time series plots
    if 'Date' in df.columns and not df['Date'].empty and pd.api.types.is_datetime64_any_dtype(df['Date']):
        # Create a copy and aggregate by date to handle duplicates for time series
        # ONLY aggregate available_pollutants (numeric columns)
        df_time_series = df.groupby('Date')[available_pollutants].mean().reset_index()
        df_time_series.sort_values(by='Date', inplace=True) # Ensure sorted after aggregation

        # Re-add 'DayOfWeek' to df_time_series as it was lost in groupby.mean()
        # This is needed for charts that specifically use df_time_series AND DayOfWeek
        df_time_series['DayOfWeek'] = df_time_series['Date'].dt.day_name()
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        df_time_series['DayOfWeek'] = pd.Categorical(df_time_series['DayOfWeek'], categories=day_order, ordered=True)


        # --- Chart 6-9: Time series plots for individual pollutants ---
        for i, pollutant in enumerate(available_pollutants[:4]): # Plot first 4 available
            plt.figure(figsize=(14, 7))
            sns.lineplot(x='Date', y=pollutant, data=df_time_series, color=palette_ts[i], linewidth=2)
            plt.title(f'Time Series of {pollutant} Concentration in Bhilai', fontsize=16)
            plt.xlabel('Date', fontsize=14)
            plt.ylabel(f'{pollutant} Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
            plt.xticks(rotation=45, ha='right', fontsize=12)
            plt.yticks(fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig(output_dir / f'chart_{i+6}_timeseries_{pollutant}.png', dpi=300)
            plt.close()
            print(f"Generated chart {i+6}: Time series for {pollutant}")

        # --- Chart 10: Multi-pollutant time series (e.g., top 3) ---
        if len(available_pollutants) >= 3:
            plt.figure(figsize=(16, 9)) # Larger figure for multiple lines
            for i, pollutant in enumerate(available_pollutants[:3]):
                sns.lineplot(x='Date', y=pollutant, data=df_time_series, label=pollutant, alpha=0.8, linewidth=2)
            plt.title('Time Series of Key Pollutant Concentrations in Bhilai', fontsize=18)
            plt.xlabel('Date', fontsize=14)
            plt.ylabel('Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
            plt.legend(title='Pollutant', fontsize=12, title_fontsize=14, loc='upper left', bbox_to_anchor=(1, 1))
            plt.xticks(rotation=45, ha='right', fontsize=12)
            plt.yticks(fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout for legend outside
            plt.savefig(output_dir / 'chart_10_timeseries_multi_pollutant.png', dpi=300)
            plt.close()
            print("Generated chart 10: Multi-pollutant time series")

        # --- Chart 11-12: Monthly Averages ---
        # Changed 'M' to 'ME' as per FutureWarning
        df_monthly = df_time_series.set_index('Date').resample('ME').mean(numeric_only=True)

        if 'PM10' in df_monthly.columns and not df_monthly['PM10'].isnull().all():
            plt.figure(figsize=(14, 7))
            sns.lineplot(x=df_monthly.index, y='PM10', data=df_monthly, marker='o', color='teal', linewidth=2)
            plt.title('Monthly Average of PM10 Concentration in Bhilai', fontsize=16)
            plt.xlabel('Month', fontsize=14)
            plt.ylabel('Average PM10 Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
            plt.xticks(rotation=45, ha='right', fontsize=12)
            plt.yticks(fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig(output_dir / 'chart_11_monthly_avg_pm10.png', dpi=300)
            plt.close()
            print("Generated chart 11: Monthly average PM10")
        else:
            print("Skipping chart 11: PM10 data not sufficient for monthly average.")

        if 'NO2' in df_monthly.columns and not df_monthly['NO2'].isnull().all():
            plt.figure(figsize=(14, 7))
            sns.lineplot(x=df_monthly.index, y='NO2', data=df_monthly, marker='o', color='darkorange', linewidth=2)
            plt.title('Monthly Average of NO2 Concentration in Bhilai', fontsize=16)
            plt.xlabel('Month', fontsize=14)
            plt.ylabel('Average NO2 Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
            plt.xticks(rotation=45, ha='right', fontsize=12)
            plt.yticks(fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig(output_dir / 'chart_12_monthly_avg_no2.png', dpi=300)
            plt.close()
            print("Generated chart 12: Monthly average NO2")
        else:
            print("Skipping chart 12: NO2 data not sufficient for monthly average.")

        # --- Chart 13: Yearly Average (if data spans multiple years) ---
        if not df_time_series['Date'].empty and df_time_series['Date'].dt.year.nunique() > 1:
            # Changed 'Y' to 'YE' as per FutureWarning
            df_yearly = df_time_series.set_index('Date').resample('YE').mean(numeric_only=True)
            if 'PM2.5' in df_yearly.columns and not df_yearly['PM2.5'].isnull().all():
                plt.figure(figsize=(12, 7))
                sns.lineplot(x=df_yearly.index.year, y='PM2.5', data=df_yearly, marker='o', color='forestgreen', linewidth=2)
                plt.title('Yearly Average of PM2.5 Concentration in Bhilai', fontsize=16)
                plt.xlabel('Year', fontsize=14)
                plt.ylabel('Average PM2.5 Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
                plt.xticks(fontsize=12)
                plt.yticks(fontsize=12)
                plt.grid(True, linestyle='--', alpha=0.7)
                plt.tight_layout()
                plt.savefig(output_dir / 'chart_13_yearly_avg_pm25.png', dpi=300)
                plt.close()
                print("Generated chart 13: Yearly average PM2.5")
            else:
                print("Skipping chart 13: PM2.5 data not sufficient for yearly average.")
        else:
            print("Skipping chart 13: Not enough years in data for yearly average PM2.5.")

        # --- Chart 14: Weekly Average (Day of Week) ---
        # Now uses df_time_series which has DayOfWeek
        if 'SO2' in df_time_series.columns and not df_time_series['SO2'].isnull().all():
            plt.figure(figsize=(12, 7))
            sns.boxplot(x='DayOfWeek', y='SO2', data=df_time_series, palette='pastel', hue='DayOfWeek', legend=False)
            plt.title('SO2 Concentration by Day of Week in Bhilai', fontsize=16)
            plt.xlabel('Day of Week', fontsize=14)
            plt.ylabel('SO2 Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
            plt.xticks(rotation=30, ha='right', fontsize=12)
            plt.yticks(fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7, axis='y') # Only y-axis grid
            plt.tight_layout()
            plt.savefig(output_dir / 'chart_14_weekly_avg_so2.png', dpi=300)
            plt.close()
            print("Generated chart 14: SO2 by Day of Week")
        else:
            print("Skipping chart 14: SO2 data not sufficient for weekly average.")

        # --- Chart 15: Rolling Average of PM10 ---
        if 'PM10' in df_time_series.columns and not df_time_series['PM10'].isnull().all():
            # Calculate a 7-day rolling average for PM10
            df_time_series['PM10_Rolling_Avg'] = df_time_series['PM10'].rolling(window=7, min_periods=1).mean()
            plt.figure(figsize=(16, 8))
            sns.lineplot(x='Date', y='PM10', data=df_time_series, label='Daily PM10', alpha=0.6, color='skyblue', linewidth=1.5)
            sns.lineplot(x='Date', y='PM10_Rolling_Avg', data=df_time_series, label='PM10 (7-Day Rolling Avg)', color='red', linewidth=2.5)
            plt.title('PM10 Concentration with 7-Day Rolling Average in Bhilai', fontsize=16)
            plt.xlabel('Date', fontsize=14)
            plt.ylabel('PM10 Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
            plt.legend(title='Data Series', fontsize=12, title_fontsize=14)
            plt.xticks(rotation=45, ha='right', fontsize=12)
            plt.yticks(fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig(output_dir / 'chart_15_rolling_avg_pm10.png', dpi=300)
            plt.close()
            print("Generated chart 15: Rolling average PM10")
        else:
            print("Skipping chart 15: PM10 data not sufficient for rolling average.")
    else:
        print("Skipping time series and temporal charts as 'Date' column is missing, empty, or not datetime type.")


    # --- Chart 16: Correlation Heatmap ---
    numeric_df = df[available_pollutants]
    if not numeric_df.empty and numeric_df.shape[1] > 1: # Ensure there's more than one numeric column to correlate
        plt.figure(figsize=(12, 10))
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, linecolor='black', annot_kws={"size": 10})
        plt.title('Correlation Heatmap of Pollutants in Bhilai', fontsize=16)
        plt.xticks(rotation=45, ha='right', fontsize=12)
        plt.yticks(rotation=0, fontsize=12)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_16_correlation_heatmap.png', dpi=300)
        plt.close()
        print("Generated chart 16: Correlation Heatmap")
    else:
        print("Skipping chart 16: Not enough numeric pollutants for a correlation heatmap.")


    # --- Chart 17: Scatter plot between two highly correlated pollutants (e.g., PM10 vs PM2.5) ---
    if 'PM10' in df.columns and 'PM2.5' in df.columns and not df['PM10'].isnull().all() and not df['PM2.5'].isnull().all():
        plt.figure(figsize=(10, 7))
        sns.scatterplot(x='PM10', y='PM2.5', data=df, alpha=0.7, color='purple', s=50) # Increased marker size
        plt.title('PM10 vs PM2.5 Concentration in Bhilai', fontsize=16)
        plt.xlabel('PM10 Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
        plt.ylabel('PM2.5 Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_17_scatter_pm10_pm25.png', dpi=300)
        plt.close()
        print("Generated chart 17: Scatter plot PM10 vs PM2.5")
    else:
        print("Skipping chart 17: PM10 or PM2.5 columns not found or insufficient data for scatter plot.")

    # --- Chart 18: Pairplot of key pollutants ---
    if len(available_pollutants) >= 2:
        pairplot_cols = available_pollutants[:5] # Consider up to 5 for readability
        if len(pairplot_cols) > 1:
            g = sns.pairplot(df[pairplot_cols], diag_kind='kde', plot_kws={'alpha': 0.6, 's': 30}) # KDE on diagonal, smaller points
            g.fig.suptitle('Pairplot of Key Pollutants in Bhilai', y=1.02, fontsize=18) # Adjust suptitle position
            # Adjusting axis labels for each subplot
            for ax in g.axes.flatten():
                if ax.get_xlabel():
                    ax.set_xlabel(ax.get_xlabel().replace('_8hrs', ' (8hrs)').replace('_mg_m3', ' (mg/m3)') + ' ($\\mu g/m^3$)', fontsize=10)
                if ax.get_ylabel():
                    ax.set_ylabel(ax.get_ylabel().replace('_8hrs', ' (8hrs)').replace('_mg_m3', ' (mg/m3)') + ' ($\\mu g/m^3$)', fontsize=10)
                ax.tick_params(labelsize=8) # Smaller ticks for subplots
            plt.tight_layout(rect=[0, 0.03, 1, 0.98]) # Adjust layout to prevent title overlap
            plt.savefig(output_dir / 'chart_18_pairplot_pollutants.png', dpi=300)
            plt.close()
            print("Generated chart 18: Pairplot of key pollutants")
        else:
            print("Skipping chart 18: Not enough pollutants for a pairplot.")
    else:
        print("Skipping chart 18: Not enough pollutants for a pairplot.")

    # --- Chart 19: Violin Plot for PM10 and PM2.5 (if available) ---
    if 'PM10' in df.columns and 'PM2.5' in df.columns and not df['PM10'].isnull().all() and not df['PM2.5'].isnull().all():
        plt.figure(figsize=(10, 7))
        sns.violinplot(data=df[['PM10', 'PM2.5']], palette='coolwarm', inner='quartile') # Show quartiles
        plt.title('Violin Plot of PM10 and PM2.5 Concentrations in Bhilai', fontsize=16)
        plt.ylabel('Concentration ($\\mu g/m^3$)', fontsize=14) # Added units
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7, axis='y')
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_19_violin_pm.png', dpi=300)
        plt.close()
        print("Generated chart 19: Violin plot for PM10 and PM2.5")
    else:
        print("Skipping chart 19: PM10 or PM2.5 columns not found or insufficient data for violin plot.")

    # --- Chart 20: Count of data points per sheet (if 'Sheet' column exists) ---
    if 'Sheet' in df.columns and not df['Sheet'].empty:
        plt.figure(figsize=(12, 7))
        # Fix for FutureWarning: Pass `hue` explicitly and set `legend=False`
        sns.countplot(y='Sheet', data=df, palette='cividis', order=df['Sheet'].value_counts().index, hue='Sheet', legend=False)
        plt.title('Number of Data Points per Original Sheet in Bhilai Data', fontsize=16)
        plt.xlabel('Count of Data Points', fontsize=14)
        plt.ylabel('Original Sheet Name', fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7, axis='x') # Only x-axis grid
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_20_data_points_per_sheet.png', dpi=300)
        plt.close()
        print("Generated chart 20: Data points per sheet")
    else:
        print("Skipping chart 20: 'Sheet' column not found or is empty in DataFrame.")


    # --- New Charts (21-40) ---

    # --- Chart 21: Rug Plot (for PM10) ---
    if 'PM10' in df.columns and not df['PM10'].isnull().all():
        plt.figure(figsize=(10, 4))
        sns.histplot(df['PM10'], kde=False, bins=30, color='lightgray')
        sns.rugplot(df['PM10'], color='darkblue', height=0.1)
        plt.title('Distribution of PM10 Concentration with Rug Plot in Bhilai', fontsize=16)
        plt.xlabel('PM10 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.ylabel('Frequency', fontsize=14)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_21_rug_plot_pm10.png', dpi=300)
        plt.close()
        print("Generated chart 21: Rug Plot for PM10")
    else:
        print("Skipping chart 21: PM10 data not sufficient for Rug Plot.")

    # --- Chart 22: Strip Plot (PM10 by Day of Week) ---
    if 'PM10' in df.columns and 'DayOfWeek' in df.columns and not df['PM10'].isnull().all() and not df['DayOfWeek'].empty:
        plt.figure(figsize=(12, 7))
        # Fix for FutureWarning: Pass `hue` explicitly and set `legend=False`
        sns.stripplot(x='DayOfWeek', y='PM10', data=df, jitter=0.2, alpha=0.6, palette='viridis', hue='DayOfWeek', legend=False)
        plt.title('PM10 Concentration by Day of Week (Strip Plot) in Bhilai', fontsize=16)
        plt.xlabel('Day of Week', fontsize=14)
        plt.ylabel('PM10 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.xticks(rotation=30, ha='right', fontsize=12)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_22_strip_plot_pm10_dayofweek.png', dpi=300)
        plt.close()
        print("Generated chart 22: Strip Plot for PM10 by Day of Week")
    else:
        print("Skipping chart 22: PM10 or DayOfWeek data not sufficient for Strip Plot.")

    # --- Chart 23: ECDF Plot (for NO2) ---
    if 'NO2' in df.columns and not df['NO2'].isnull().all():
        plt.figure(figsize=(10, 7))
        sns.ecdfplot(df['NO2'], color='darkgreen')
        plt.title('Empirical Cumulative Distribution Function (ECDF) for NO2 in Bhilai', fontsize=16)
        plt.xlabel('NO2 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.ylabel('Cumulative Probability', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_23_ecdf_plot_no2.png', dpi=300)
        plt.close()
        print("Generated chart 23: ECDF Plot for NO2")
    else:
        print("Skipping chart 23: NO2 data not sufficient for ECDF Plot.")

    # --- Chart 24: Hexbin Plot (PM10 vs PM2.5) ---
    if 'PM10' in df.columns and 'PM2.5' in df.columns and not df['PM10'].isnull().all() and not df['PM2.5'].isnull().all():
        plt.figure(figsize=(10, 7))
        plt.hexbin(df['PM10'], df['PM2.5'], gridsize=30, cmap='Blues', mincnt=1)
        plt.colorbar(label='Count in bin')
        plt.title('Hexbin Plot of PM10 vs PM2.5 Concentration in Bhilai', fontsize=16)
        plt.xlabel('PM10 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.ylabel('PM2.5 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_24_hexbin_pm10_pm25.png', dpi=300)
        plt.close()
        print("Generated chart 24: Hexbin Plot for PM10 vs PM2.5")
    else:
        print("Skipping chart 24: PM10 or PM2.5 data not sufficient for Hexbin Plot.")

    # --- Chart 25: 2D KDE Plot (SO2 vs NO2) ---
    if 'SO2' in df.columns and 'NO2' in df.columns and not df['SO2'].isnull().all() and not df['NO2'].isnull().all():
        plt.figure(figsize=(10, 7))
        sns.kdeplot(x=df['SO2'], y=df['NO2'], cmap='Reds', fill=True, cbar=True)
        plt.title('2D Kernel Density Estimate of SO2 vs NO2 in Bhilai', fontsize=16)
        plt.xlabel('SO2 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.ylabel('NO2 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_25_2d_kde_so2_no2.png', dpi=300)
        plt.close()
        print("Generated chart 25: 2D KDE Plot for SO2 vs NO2")
    else:
        print("Skipping chart 25: SO2 or NO2 data not sufficient for 2D KDE Plot.")

    # --- Chart 26: Swarm Plot (SO2 by Day of Week) ---
    if 'SO2' in df.columns and 'DayOfWeek' in df.columns and not df['SO2'].isnull().all() and not df['DayOfWeek'].empty:
        plt.figure(figsize=(12, 7))
        # Fix for FutureWarning: Pass `hue` explicitly and set `legend=False`
        sns.swarmplot(x='DayOfWeek', y='SO2', data=df, palette='cubehelix', s=5, hue='DayOfWeek', legend=False) # s for marker size
        plt.title('SO2 Concentration by Day of Week (Swarm Plot) in Bhilai', fontsize=16)
        plt.xlabel('Day of Week', fontsize=14)
        plt.ylabel('SO2 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.xticks(rotation=30, ha='right', fontsize=12)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_26_swarm_plot_so2_dayofweek.png', dpi=300)
        plt.close()
        print("Generated chart 26: Swarm Plot for SO2 by Day of Week")
    else:
        print("Skipping chart 26: SO2 or DayOfWeek data not sufficient for Swarm Plot.")

    # --- Chart 27: Joint Plot (PM10 vs PM2.5 with KDE) ---
    if 'PM10' in df.columns and 'PM2.5' in df.columns and not df['PM10'].isnull().all() and not df['PM2.5'].isnull().all():
        g = sns.jointplot(x='PM10', y='PM2.5', data=df, kind='kde', cmap='Greens', fill=True, height=8)
        g.set_axis_labels('PM10 Concentration ($\\mu g/m^3$)', 'PM2.5 Concentration ($\\mu g/m^3$)', fontsize=14)
        g.fig.suptitle('Joint Plot of PM10 vs PM2.5 Concentration in Bhilai', y=1.03, fontsize=16) # Adjust title position
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_27_joint_plot_pm10_pm25_kde.png', dpi=300)
        plt.close()
        print("Generated chart 27: Joint Plot for PM10 vs PM2.5 (KDE)")
    else:
        print("Skipping chart 27: PM10 or PM2.5 data not sufficient for Joint Plot.")

    # --- Chart 28: Andrews Curves (for top 5 pollutants) ---
    # Requires scaling the data
    if len(available_pollutants) >= 5:
        andrews_cols = available_pollutants[:5]
        df_andrews = df[andrews_cols].dropna() # Drop NaNs for this plot
        if not df_andrews.empty:
            scaler = StandardScaler()
            df_andrews_scaled = pd.DataFrame(scaler.fit_transform(df_andrews), columns=andrews_cols)
            # Add a dummy class column to satisfy the 'class_column' requirement
            df_andrews_scaled['_dummy_class'] = 'A'

            plt.figure(figsize=(12, 8))
            # Pass the dummy class column
            pd.plotting.andrews_curves(df_andrews_scaled, class_column='_dummy_class', ax=plt.gca())
            plt.title('Andrews Curves of Top 5 Pollutants in Bhilai', fontsize=16)
            plt.xlabel('Function Value', fontsize=14)
            plt.ylabel('Pollutant Value (Scaled)', fontsize=14)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.legend().set_visible(False) # Hide legend for dummy class
            plt.tight_layout()
            plt.savefig(output_dir / 'chart_28_andrews_curves_top5.png', dpi=300)
            plt.close()
            print("Generated chart 28: Andrews Curves for Top 5 Pollutants")
        else:
            print("Skipping chart 28: Insufficient non-null data for Andrews Curves with top 5 pollutants.")
    else:
        print("Skipping chart 28: Not enough pollutants for Andrews Curves.")

    # --- Chart 29: Parallel Coordinates Plot (for top 5 pollutants) ---
    # Requires scaling the data
    if len(available_pollutants) >= 5:
        parallel_cols = available_pollutants[:5]
        df_parallel = df[parallel_cols].dropna() # Drop NaNs for this plot
        if not df_parallel.empty:
            scaler = StandardScaler()
            df_parallel_scaled = pd.DataFrame(scaler.fit_transform(df_parallel), columns=parallel_cols)
            # Add a dummy class column to satisfy the 'class_column' requirement
            df_parallel_scaled['_dummy_class'] = 'A'

            plt.figure(figsize=(14, 8))
            # Pass the dummy class column
            pd.plotting.parallel_coordinates(df_parallel_scaled, class_column='_dummy_class', ax=plt.gca())
            plt.title('Parallel Coordinates Plot of Top 5 Pollutants in Bhilai', fontsize=16)
            plt.xlabel('Pollutant (Scaled)', fontsize=14)
            plt.ylabel('Value (Scaled)', fontsize=14)
            plt.xticks(rotation=45, ha='right', fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.legend().set_visible(False) # Hide legend for dummy class
            plt.tight_layout()
            plt.savefig(output_dir / 'chart_29_parallel_coordinates_top5.png', dpi=300)
            plt.close()
            print("Generated chart 29: Parallel Coordinates Plot for Top 5 Pollutants")
        else:
            print("Skipping chart 29: Insufficient non-null data for Parallel Coordinates Plot with top 5 pollutants.")
    else:
        print("Skipping chart 29: Not enough pollutants for Parallel Coordinates Plot.")

    # --- Chart 30: Seasonal Decomposition Plot (PM10) ---
    if 'PM10' in df_time_series.columns and 'Date' in df_time_series.columns and not df_time_series['PM10'].isnull().all() and not df_time_series['Date'].empty:
        # Use the aggregated df_time_series for this
        ts_pm10 = df_time_series.set_index('Date')['PM10'].asfreq('D').interpolate(method='linear')
        if len(ts_pm10) > 2 * 365: # Need at least two full cycles for annual seasonality
            try:
                # Assuming annual seasonality (period=365) for daily data
                # Adjust period if data frequency is different (e.g., weekly, monthly)
                result = seasonal_decompose(ts_pm10, model='additive', period=365) # Or 'multiplicative'
                fig = result.plot()
                fig.set_size_inches(14, 10)
                fig.suptitle('Seasonal Decomposition of PM10 Concentration in Bhilai', fontsize=16, y=1.02)
                plt.tight_layout(rect=[0, 0.03, 1, 0.98])
                plt.savefig(output_dir / 'chart_30_seasonal_decomposition_pm10.png', dpi=300)
                plt.close()
                print("Generated chart 30: Seasonal Decomposition Plot for PM10")
            except Exception as e:
                print(f"Skipping chart 30: Error during seasonal decomposition for PM10: {e}")
        else:
            print("Skipping chart 30: Not enough data points for meaningful seasonal decomposition (need at least 2 full periods).")
    else:
        print("Skipping chart 30: PM10 or Date data not sufficient for Seasonal Decomposition Plot.")

    # --- Chart 31: Lag Plot (PM10) ---
    if 'PM10' in df_time_series.columns and not df_time_series['PM10'].isnull().all():
        # Use the aggregated df_time_series for this
        plt.figure(figsize=(8, 8))
        pd.plotting.lag_plot(df_time_series['PM10'], lag=1, c='blue', alpha=0.7) # Lag 1 for immediate autocorrelation
        plt.title('Lag Plot of PM10 Concentration (Lag 1) in Bhilai', fontsize=16)
        plt.xlabel('PM10 (t)', fontsize=14)
        plt.ylabel('PM10 (t+1)', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_31_lag_plot_pm10.png', dpi=300)
        plt.close()
        print("Generated chart 31: Lag Plot for PM10")
    else:
        print("Skipping chart 31: PM10 data not sufficient for Lag Plot.")

    # --- Chart 32: Autocorrelation Function (ACF) Plot (PM10) ---
    if 'PM10' in df_time_series.columns and not df_time_series['PM10'].isnull().all():
        # Use the aggregated df_time_series for this
        plt.figure(figsize=(12, 6))
        plot_acf(df_time_series['PM10'].dropna(), lags=30, ax=plt.gca(), title='Autocorrelation Function (ACF) for PM10 in Bhilai')
        plt.xlabel('Lag', fontsize=14)
        plt.ylabel('Autocorrelation', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_32_acf_plot_pm10.png', dpi=300)
        plt.close()
        print("Generated chart 32: ACF Plot for PM10")
    else:
        print("Skipping chart 32: PM10 data not sufficient for ACF Plot.")

    # --- Chart 33: Partial Autocorrelation Function (PACF) Plot (PM10) ---
    if 'PM10' in df_time_series.columns and not df_time_series['PM10'].isnull().all():
        # Use the aggregated df_time_series for this
        plt.figure(figsize=(12, 6))
        plot_pacf(df_time_series['PM10'].dropna(), lags=30, ax=plt.gca(), title='Partial Autocorrelation Function (PACF) for PM10 in Bhilai')
        plt.xlabel('Lag', fontsize=14)
        plt.ylabel('Partial Autocorrelation', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_33_pacf_plot_pm10.png', dpi=300)
        plt.close()
        print("Generated chart 33: PACF Plot for PM10")
    else:
        print("Skipping chart 33: PM10 data not sufficient for PACF Plot.")

    # --- Chart 34: Monthly Average of O3 ---
    if 'O3_8hrs' in df_time_series.columns and 'Date' in df_time_series.columns and not df_time_series['O3_8hrs'].isnull().all() and not df_time_series['Date'].empty:
        df_monthly_o3 = df_time_series.set_index('Date').resample('ME').mean(numeric_only=True) # Changed 'M' to 'ME'
        if 'O3_8hrs' in df_monthly_o3.columns and not df_monthly_o3['O3_8hrs'].isnull().all():
            plt.figure(figsize=(14, 7))
            sns.lineplot(x=df_monthly_o3.index, y='O3_8hrs', data=df_monthly_o3, marker='o', color='darkviolet', linewidth=2)
            plt.title('Monthly Average of O3 (8hrs) Concentration in Bhilai', fontsize=16)
            plt.xlabel('Month', fontsize=14)
            plt.ylabel('Average O3 (8hrs) Concentration ($\\mu g/m^3$)', fontsize=14)
            plt.xticks(rotation=45, ha='right', fontsize=12)
            plt.yticks(fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig(output_dir / 'chart_34_monthly_avg_o3.png', dpi=300)
            plt.close()
            print("Generated chart 34: Monthly Average for O3")
        else:
            print("Skipping chart 34: O3 data not sufficient for monthly average.")
    else:
        print("Skipping chart 34: O3 or Date data not sufficient for monthly average.")

    # --- Chart 35: Distribution of PM2.5 by Location (if multiple locations) ---
    # This chart uses the original df, not df_time_series, as it's about raw distribution
    if 'PM2.5' in df.columns and 'Location' in df.columns and not df['PM2.5'].isnull().all() and df['Location'].nunique() > 1:
        plt.figure(figsize=(10, 7))
        sns.violinplot(x='Location', y='PM2.5', data=df, palette='coolwarm', inner='quartile')
        plt.title('PM2.5 Concentration Distribution by Location', fontsize=16)
        plt.xlabel('Location', fontsize=14)
        plt.ylabel('PM2.5 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7, axis='y')
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_35_pm25_by_location.png', dpi=300)
        plt.close()
        print("Generated chart 35: PM2.5 Distribution by Location")
    else:
        print("Skipping chart 35: PM2.5 or Location data not sufficient, or only one location present.")

    # --- Chart 36: Boxen Plot (for PM10) ---
    if 'PM10' in df.columns and not df['PM10'].isnull().all():
        plt.figure(figsize=(8, 7))
        sns.boxenplot(y=df['PM10'], color='skyblue')
        plt.title('Boxen Plot of PM10 Concentration in Bhilai', fontsize=16)
        plt.ylabel('PM10 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.xticks([]) # No x-ticks needed for single variable
        plt.yticks(fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7, axis='y')
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_36_boxen_plot_pm10.png', dpi=300)
        plt.close()
        print("Generated chart 36: Boxen Plot for PM10")
    else:
        print("Skipping chart 36: PM10 data not sufficient for Boxen Plot.")

    # --- Chart 37: Z-Score Outlier Plot (for NO2) ---
    if 'NO2' in df_time_series.columns and 'Date' in df_time_series.columns and not df_time_series['NO2'].isnull().all() and not df_time_series['Date'].empty:
        # Use the aggregated df_time_series for this to ensure unique dates
        df_temp_zscore = df_time_series.copy()
        # Calculate Z-scores
        df_temp_zscore['NO2_ZScore'] = np.abs(StandardScaler().fit_transform(df_temp_zscore[['NO2']]))
        outlier_threshold = 3 # Common threshold for outliers (3 standard deviations)

        plt.figure(figsize=(12, 7))
        sns.scatterplot(x='Date', y='NO2', data=df_temp_zscore, color='blue', alpha=0.7, label='NO2 Concentration')
        # Highlight outliers
        outliers = df_temp_zscore[df_temp_zscore['NO2_ZScore'] > outlier_threshold]
        if not outliers.empty:
            sns.scatterplot(x='Date', y='NO2', data=outliers, color='red', s=100, marker='X', label=f'Outlier (Z > {outlier_threshold})')
        plt.title(f'NO2 Concentration with Z-Score Outliers (> {outlier_threshold} SD) in Bhilai', fontsize=16)
        plt.xlabel('Date', fontsize=14)
        plt.ylabel('NO2 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.xticks(rotation=45, ha='right', fontsize=12)
        plt.yticks(fontsize=12)
        plt.legend(title='Data Points', fontsize=12, title_fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_dir / 'chart_37_zscore_outlier_no2.png', dpi=300)
        plt.close()
        print("Generated chart 37: Z-Score Outlier Plot for NO2")
    else:
        print("Skipping chart 37: NO2 or Date data not sufficient for Z-Score Outlier Plot.")

    # --- Chart 38: Stacked Bar Chart (Average Pollutant by Sheet) ---
    if 'Sheet' in df.columns and not df['Sheet'].empty and available_pollutants:
        # Aggregate data: calculate mean of pollutants per sheet
        df_agg = df.groupby('Sheet')[available_pollutants].mean().reset_index()
        if not df_agg.empty:
            # Melt the DataFrame for stacked bar plot
            df_melted = df_agg.melt(id_vars='Sheet', var_name='Pollutant', value_name='Average Concentration')

            # Change to horizontal bar chart for better readability of many sheet names
            plt.figure(figsize=(14, max(8, len(df_agg['Sheet']) * 0.5))) # Dynamic height based on number of sheets
            sns.barplot(y='Sheet', x='Average Concentration', hue='Pollutant', data=df_melted, palette='tab20',
                        order=df_agg.sort_values(by=available_pollutants[0] if available_pollutants else 'Sheet', ascending=False)['Sheet']) # Order by first pollutant or sheet name
            plt.title('Average Pollutant Concentration by Original Sheet in Bhilai', fontsize=16)
            plt.xlabel('Average Concentration ($\\mu g/m^3$)', fontsize=14)
            plt.ylabel('Original Sheet Name', fontsize=14)
            plt.xticks(fontsize=12)
            plt.yticks(fontsize=12)
            plt.legend(title='Pollutant', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10, title_fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7, axis='x') # Only x-axis grid for horizontal bars
            plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout for legend
            plt.savefig(output_dir / 'chart_38_stacked_bar_avg_pollutant_by_sheet.png', dpi=300)
            plt.close()
            print("Generated chart 38: Stacked Bar Chart for Average Pollutant by Sheet")
        else:
            print("Skipping chart 38: Aggregated data by Sheet is empty.")
    else:
        print("Skipping chart 38: 'Sheet' column not found, empty, or no available pollutants.")

    # --- Chart 39: Catplot (Boxen plot of NO2 by Day of Week) ---
    # This chart uses the original df, not df_time_series
    if 'NO2' in df.columns and 'DayOfWeek' in df.columns and not df['NO2'].isnull().all() and not df['DayOfWeek'].empty:
        # DayOfWeek is already ordered in load_and_clean_data
        g = sns.catplot(x='DayOfWeek', y='NO2', data=df, kind='boxen', height=7, aspect=1.5, palette='coolwarm')
        g.set_axis_labels('Day of Week', 'NO2 Concentration ($\\mu g/m^3$)', fontsize=14)
        g.set_xticklabels(rotation=30, ha='right', fontsize=12)
        g.set_yticklabels(fontsize=12)
        g.fig.suptitle('NO2 Concentration by Day of Week (Boxen Plot) in Bhilai', y=1.02, fontsize=16)
        plt.grid(True, linestyle='--', alpha=0.7, axis='y')
        plt.tight_layout(rect=[0, 0.03, 1, 0.98])
        plt.savefig(output_dir / 'chart_39_catplot_boxen_no2_dayofweek.png', dpi=300)
        plt.close()
        print("Generated chart 39: Catplot (Boxen plot of NO2 by Day of Week)")
    else:
        print("Skipping chart 39: NO2 or DayOfWeek data not sufficient for Catplot.")

    # --- Chart 40: Scatter plot with Hue (PM10 vs PM2.5 by Day of Week) ---
    # This chart uses the original df, not df_time_series
    if 'PM10' in df.columns and 'PM2.5' in df.columns and 'DayOfWeek' in df.columns and \
       not df['PM10'].isnull().all() and not df['PM2.5'].isnull().all() and not df['DayOfWeek'].empty:
        plt.figure(figsize=(12, 8))
        sns.scatterplot(x='PM10', y='PM2.5', hue='DayOfWeek', data=df, palette='tab10', s=60, alpha=0.7)
        plt.title('PM10 vs PM2.5 Concentration by Day of Week in Bhilai', fontsize=16)
        plt.xlabel('PM10 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.ylabel('PM2.5 Concentration ($\\mu g/m^3$)', fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.legend(title='Day of Week', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10, title_fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout(rect=[0, 0, 0.85, 1])
        plt.savefig(output_dir / 'chart_40_scatter_pm10_pm25_by_dayofweek.png', dpi=300)
        plt.close()
        print("Generated chart 40: Scatter plot with Hue (PM10 vs PM2.5 by Day of Week)")
    else:
        print("Skipping chart 40: PM10, PM2.5 or DayOfWeek data not sufficient for Scatter plot with Hue.")


    print("\nAll 40 requested charts generated and saved with publication-ready quality.")

## 4. Main Execution Block
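This block sets up the file paths and calls the data loading, cleaning, and chart generation functions. You can modify `csv_file_path`, `cleaned_data_output_dir`, and `charts_output_dir` as needed. Note that the two output paths use the placeholder directory `path to files`; replace it with a real location before running.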

In [None]:
def main():
    """
    Run the full pipeline: load and clean the CSV, save the cleaned data,
    and generate the charts.

    The input CSV path is relative, so it is resolved against the current
    working directory. The cleaned CSV and the charts directory both live
    under the placeholder directory "path to files"; edit the paths below
    to point at real locations.

    If the loaded DataFrame is empty, a message is printed and this function
    creates no directories and writes no files.
    """
    # Define the path to your CSV file.
    # If the file is in the same directory as the script, you can just use the filename.
    csv_file_path = Path("consolidated_pollutant_data_bhilai.csv")

    # Define the output directory and file name for the cleaned data CSV.
    cleaned_data_output_dir = Path("path to files/cleaned_data")
    cleaned_csv_filename = cleaned_data_output_dir / "cleaned_pollutant_data_bhilai.csv"

    # Define the output directory for charts ('charts_output' inside 'path to files').
    # The path is handed to generate_charts as-is; it is not created here.
    charts_output_dir = Path("path to files/charts_output")

    # Load and clean the data
    cleaned_df = load_and_clean_data(csv_file_path)

    # Guard clause: bail out before touching the filesystem when there is
    # nothing to save or plot.
    if cleaned_df.empty:
        print("No data to process or cleaned data is empty. Chart generation and CSV saving skipped.")
        return

    # Create the output directory only now that there is data to write, so a
    # failed or empty load does not leave an empty directory tree behind.
    cleaned_data_output_dir.mkdir(parents=True, exist_ok=True)

    # Save the cleaned DataFrame to a new CSV file
    cleaned_df.to_csv(cleaned_csv_filename, index=False)
    print(f"\nCleaned data saved to: {cleaned_csv_filename}")

    # Generate and save the charts
    generate_charts(cleaned_df, charts_output_dir)

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()