# Air Quality Data Exploration

This notebook provides comprehensive exploratory data analysis (EDA) for the air quality prediction project.

## Objectives
1. Load and examine the raw air quality dataset
2. Analyze data quality and completeness
3. Explore city-wise patterns and distributions
4. Identify key insights for feature engineering
5. Generate initial visualizations

## Dataset Information
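- **Source**: India AQI dataset (2023–2025), read from `../data/raw/aqi.csv`. An earlier version of this notebook described Kaggle's "Air Quality Data in India (2015–2020)"; the code below no longer loads that file.
- **Target Cities**: Delhi, Mumbai, Bangalore, Kolkata, Hyderabad, Chennai (the earlier list was Delhi, Bangalore, Kolkata, Hyderabad, Chennai, Visakhapatnam)
- **Features**: `date`, `state`, `area`, `number_of_monitoring_stations`, `prominent_pollutants`, `aqi_value`, `air_quality_status`, `unit`, `note`. The per-pollutant columns of the 2015–2020 dataset (PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, AQI, Date, City) are not present in this file.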


In [1]:
# Install required packages if not already installed
import subprocess
import sys

# pip distribution names whose importable module name is different.
_IMPORT_NAME_OVERRIDES = {
    "scikit-learn": "sklearn",
    "imbalanced-learn": "imblearn",
}


def install_package(package, import_name=None):
    """Install ``package`` with pip unless its module can already be imported.

    The pip distribution name and the importable module name are not always
    the same (``scikit-learn`` is imported as ``sklearn``). Probing the
    import with the pip name would always fail for such packages and trigger
    a reinstall on every run, so the module name is resolved separately.

    Args:
        package: Distribution name passed to ``pip install``.
        import_name: Module name used for the import probe. When omitted, a
            known override is used if one exists; otherwise the pip name
            with ``-`` replaced by ``_``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    if import_name is None:
        import_name = _IMPORT_NAME_OVERRIDES.get(package, package.replace("-", "_"))

    try:
        __import__(import_name)
    except ImportError:
        print(f"Installing {package}...")
        # Use the running interpreter so the package lands in the kernel's environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    else:
        print(f"{package} is already installed")

# Dependencies of this project, listed by their pip distribution names.
required_packages = [
    "pandas", "numpy", "matplotlib", "seaborn", "plotly",
    "scikit-learn", "lightgbm", "optuna", "imbalanced-learn",
    "tqdm", "joblib",
]

# Check each dependency in order, installing any that is reported missing.
for package in required_packages:
    install_package(package)

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import os

# Pandas display: never truncate columns, line width, or cell contents.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# Plot appearance: matplotlib's seaborn-v0_8 style sheet with the "husl" palette.
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Libraries imported successfully


pandas is already installed
numpy is already installed
matplotlib is already installed
seaborn is already installed
plotly is already installed
Installing scikit-learn...
lightgbm is already installed


  from .autonotebook import tqdm as notebook_tqdm


optuna is already installed
Installing imbalanced-learn...
tqdm is already installed
joblib is already installed


## 1. Data Loading and Initial Inspection


In [2]:
# Directory holding the raw India AQI 2023-2025 data.
data_path = "../data/raw/"

# Read the uploaded AQI CSV into a DataFrame.
df = pd.read_csv(f"{data_path}aqi.csv")

print("Successfully loaded: aqi.csv")
print("Dataset shape:", df.shape)
print("Columns:", list(df.columns))

# Cities the analysis focuses on.
# In this dataset they are matched against the 'area' column (there is no 'city'
# column), and each row also carries the 'state'.
target_cities = [
    'Delhi',
    'Mumbai',
    'Bangalore',
    'Kolkata',
    'Hyderabad',
    'Chennai',
]


Successfully loaded: aqi.csv
Dataset shape: (235785, 9)
Columns: ['date', 'state', 'area', 'number_of_monitoring_stations', 'prominent_pollutants', 'aqi_value', 'air_quality_status', 'unit', 'note']


## 2. Data Quality Assessment


In [3]:
# NEW DATASET ANALYSIS - India AQI 2023-2025
print("=" * 60, "INDIA AQI DATASET 2023-2025 ANALYSIS", "=" * 60, sep="\n")

print("Dataset shape:", df.shape)
print("Columns:", list(df.columns))

# Preview the top of the frame.
print("\nFirst 5 rows:", df.head(), sep="\n")

# Which target cities appear verbatim in the 'area' column?
if 'area' in df.columns:
    available_areas = df['area'].unique()
    target_cities_found = [name for name in target_cities if name in available_areas]
    print(
        f"\nTarget cities found: {target_cities_found}",
        f"Total areas in dataset: {len(available_areas)}",
        f"Sample areas: {list(available_areas[:10])}",
        sep="\n",
    )

# States covered by the dataset.
if 'state' in df.columns:
    available_states = df['state'].unique()
    print(
        f"\nTotal states: {len(available_states)}",
        f"States: {list(available_states)}",
        sep="\n",
    )

# Time span covered.
if 'date' in df.columns:
    # NOTE: this overwrites df['date'] in place with parsed datetimes
    # (the raw values are day-month-year strings).
    df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')
    print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")
    print(f"Duration: {(df['date'].max() - df['date'].min()).days} days")

# Summary statistics of the AQI values.
if 'aqi_value' in df.columns:
    print(
        "\nAQI Statistics:",
        f"  Range: {df['aqi_value'].min():.2f} to {df['aqi_value'].max():.2f}",
        f"  Mean: {df['aqi_value'].mean():.2f}",
        f"  Median: {df['aqi_value'].median():.2f}",
        f"  Missing AQI values: {df['aqi_value'].isnull().sum()}",
        sep="\n",
    )

# Ten most frequent values of the prominent-pollutant column.
if 'prominent_pollutants' in df.columns:
    pollutants = df['prominent_pollutants'].value_counts()
    print("\nProminent Pollutants Distribution:", pollutants.head(10), sep="\n")

# Frequency of each air-quality status label.
if 'air_quality_status' in df.columns:
    status_dist = df['air_quality_status'].value_counts()
    print("\nAir Quality Status Distribution:", status_dist, sep="\n")

# How many columns have each dtype, then the columns that contain nulls.
print("\nData types:", df.dtypes.value_counts(), sep="\n")
missing_values = df.isnull().sum()
print("\nMissing values:", missing_values[missing_values > 0], sep="\n")

# Policy relevance analysis
print(
    "\nPolicy Relevance Analysis:",
    "  Dataset structure: State-level AQI data with prominent pollutants",
    "  Policy-controllable pollutants: PM2.5, PM10, NO2, SO2, CO, O3",
    "  Note: This dataset provides AQI values and prominent pollutants per area",
    "  For ACO implementation, we'll need to restructure this data",
    sep="\n",
)


INDIA AQI DATASET 2023-2025 ANALYSIS
Dataset shape: (235785, 9)
Columns: ['date', 'state', 'area', 'number_of_monitoring_stations', 'prominent_pollutants', 'aqi_value', 'air_quality_status', 'unit', 'note']

First 5 rows:
         date           state      area  number_of_monitoring_stations  \
0  30-04-2025     Maharashtra  Amravati                              2   
1  30-04-2025           Bihar    Purnia                              1   
2  30-04-2025  Madhya Pradesh     Katni                              1   
3  30-04-2025    Chhattisgarh   Tumidih                              1   
4  30-04-2025           Assam  Byrnihat                              1   

  prominent_pollutants  aqi_value air_quality_status  \
0                 PM10         78       Satisfactory   
1                   CO         56       Satisfactory   
2                   O3         98       Satisfactory   
3                 PM10        103           Moderate   
4                PM2.5         61       Satisfactory 

In [4]:
# DATA TRANSFORMATION FOR ACO IMPLEMENTATION
# Section banner: a 60-character rule above and below the title.
print("=" * 60, "DATA TRANSFORMATION FOR ACO FEATURE SELECTION", "=" * 60, sep="\n")

def transform_aqi_data_for_aco(df, target_cities=None):
    """
    Transform the AQI dataset into a format suitable for ACO feature selection.

    Steps, in order: parse dates, optionally filter areas, derive per-pollutant
    proxy columns, add calendar and season features, add simulated weather
    columns, add per-area lags (1-3) and rolling means (3 and 7 rows), add two
    ratio features, and finally drop incomplete rows.

    Only rows that are incomplete in the columns the model actually uses are
    dropped: 'date', 'area', 'prominent_pollutants', 'aqi_value' and every
    column created here. NaNs in other pass-through columns (for example
    'note' or 'unit') no longer remove rows; a blanket ``dropna()`` would
    discard every row whenever such a column is empty.

    Args:
        df: Original AQI dataset. Must contain 'date' ('%d-%m-%Y' strings or
            already-parsed datetimes), 'area', 'prominent_pollutants' and
            'aqi_value'. The input frame is not modified.
        target_cities: Optional list of 'area' values to keep. When falsy,
            all areas are kept.

    Returns:
        Transformed dataframe sorted by area and date. The first rows of each
        area are absent because their lag / 7-row rolling features are
        undefined.

    Notes:
        * The '<pollutant>_concentration' columns are proxies, not measured
          concentrations: the AQI value where the pollutant is the prominent
          one, 0.3 * AQI otherwise.
        * Weather columns are random draws, not observations. The function
          calls ``np.random.seed(42)``, which resets NumPy's global RNG state.
    """
    pollutant_features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3']

    def get_season(month):
        """Map a month number to a season label."""
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Autumn'

    # Work on a copy so the caller's frame is left untouched.
    df_transformed = df.copy()
    df_transformed['date'] = pd.to_datetime(df_transformed['date'], format='%d-%m-%Y')

    # Filter for target cities if specified; copy so that the column
    # assignments below never write into a slice of another frame.
    if target_cities:
        keep = df_transformed['area'].isin(target_cities)
        df_transformed = df_transformed[keep].copy()

    # Proxy pollutant columns: AQI for the prominent pollutant, 30% of AQI otherwise.
    # NOTE(review): matching is by exact equality, so a combined entry would match
    # no single pollutant — confirm how 'prominent_pollutants' is encoded.
    aqi = df_transformed['aqi_value'].astype(float)
    prominent = df_transformed['prominent_pollutants']
    for pollutant in pollutant_features:
        df_transformed[f'{pollutant}_concentration'] = np.where(
            prominent == pollutant, aqi, aqi * 0.3
        )

    # Calendar features.
    date_parts = df_transformed['date'].dt
    df_transformed['year'] = date_parts.year
    df_transformed['month'] = date_parts.month
    df_transformed['day'] = date_parts.day
    df_transformed['day_of_week'] = date_parts.dayofweek
    df_transformed['day_of_year'] = date_parts.dayofyear
    df_transformed['season'] = df_transformed['month'].apply(get_season)

    # Simulated weather features (placeholders until real weather data is joined).
    # Drawn before sorting, in the same order as before, so the values are unchanged.
    np.random.seed(42)  # For reproducibility
    n_rows = len(df_transformed)
    df_transformed['temperature'] = np.random.normal(25, 5, n_rows)
    df_transformed['humidity'] = np.random.normal(65, 15, n_rows)
    df_transformed['wind_speed'] = np.random.normal(10, 3, n_rows)
    df_transformed['pressure'] = np.random.normal(1013, 10, n_rows)

    # Lag features: value of the same area 1, 2 and 3 rows earlier.
    df_transformed = df_transformed.sort_values(['area', 'date'])
    for pollutant in pollutant_features:
        per_area = df_transformed.groupby('area')[f'{pollutant}_concentration']
        for lag in (1, 2, 3):
            df_transformed[f'{pollutant}_lag{lag}'] = per_area.shift(lag)

    # Rolling means over the last 3 and 7 rows of each area (current row included).
    for pollutant in pollutant_features:
        per_area = df_transformed.groupby('area')[f'{pollutant}_concentration']
        for window in (3, 7):
            rolled = per_area.rolling(window).mean()
            # Drop the 'area' index level so the result aligns on the row index.
            df_transformed[f'{pollutant}_avg{window}'] = rolled.reset_index(level=0, drop=True)

    # Ratio features; the small constant avoids division by zero.
    df_transformed['PM_ratio'] = df_transformed['PM2.5_concentration'] / (df_transformed['PM10_concentration'] + 1e-6)
    df_transformed['NOx_ratio'] = df_transformed['NO2_concentration'] / (df_transformed['CO_concentration'] + 1e-6)

    # Drop rows that are incomplete in the key columns or in any engineered
    # feature, ignoring NaNs in pass-through metadata columns.
    engineered = [col for col in df_transformed.columns if col not in df.columns]
    required = ['date', 'area', 'prominent_pollutants', 'aqi_value'] + engineered
    df_transformed = df_transformed.dropna(subset=required)

    return df_transformed

# Transform the data
print("Transforming dataset for ACO implementation...")
df_aco = transform_aqi_data_for_aco(df, target_cities)

print(f"Transformed dataset shape: {df_aco.shape}")
print(f"Transformed dataset columns: {list(df_aco.columns)}")

# Display sample of transformed data
print("\nSample of transformed data:")
print(df_aco[['area', 'date', 'aqi_value', 'PM2.5_concentration', 'PM10_concentration', 'temperature', 'humidity']].head())

# Check feature categories.
# Pollutant-derived columns are matched by prefix ("CO_", "O3_", ...) rather than
# by substring, so an unrelated column that merely contains those letters is not counted.
pollutant_prefixes = tuple(f"{pollutant}_" for pollutant in ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3'])
policy_controllable = [col for col in df_aco.columns if col.startswith(pollutant_prefixes)]
weather_features = [col for col in df_aco.columns if col in ['temperature', 'humidity', 'wind_speed', 'pressure']]
temporal_features = [col for col in df_aco.columns if col in ['year', 'month', 'day', 'day_of_week', 'day_of_year', 'season']]

print(f"\nFeature Categories:")
print(f"  Policy-controllable features: {len(policy_controllable)}")
print(f"  Weather features: {len(weather_features)}")
print(f"  Temporal features: {len(temporal_features)}")
print(f"  Total features: {len(df_aco.columns)}")

# Save transformed data — but never write an empty file, which downstream steps
# would otherwise read as if it were valid input.
output_file = '../data/processed/aqi_transformed_for_aco.csv'
if df_aco.empty:
    missing_targets = sorted(set(target_cities) - set(df['area'].unique()))
    print("\nWARNING: transformed dataset is empty; nothing was saved.")
    print(f"  Target cities not present in 'area': {missing_targets}")
    print("  Otherwise, check whether rows are being removed by the NaN filtering step.")
else:
    # Make sure the output directory exists before writing.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    df_aco.to_csv(output_file, index=False)
    print(f"\nTransformed data saved to: {output_file}")


DATA TRANSFORMATION FOR ACO FEATURE SELECTION
Transforming dataset for ACO implementation...
Transformed dataset shape: (0, 57)
Transformed dataset columns: ['date', 'state', 'area', 'number_of_monitoring_stations', 'prominent_pollutants', 'aqi_value', 'air_quality_status', 'unit', 'note', 'PM2.5_concentration', 'PM10_concentration', 'NO2_concentration', 'SO2_concentration', 'CO_concentration', 'O3_concentration', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'season', 'temperature', 'humidity', 'wind_speed', 'pressure', 'PM2.5_lag1', 'PM2.5_lag2', 'PM2.5_lag3', 'PM10_lag1', 'PM10_lag2', 'PM10_lag3', 'NO2_lag1', 'NO2_lag2', 'NO2_lag3', 'SO2_lag1', 'SO2_lag2', 'SO2_lag3', 'CO_lag1', 'CO_lag2', 'CO_lag3', 'O3_lag1', 'O3_lag2', 'O3_lag3', 'PM2.5_avg3', 'PM2.5_avg7', 'PM10_avg3', 'PM10_avg7', 'NO2_avg3', 'NO2_avg7', 'SO2_avg3', 'SO2_avg7', 'CO_avg3', 'CO_avg7', 'O3_avg3', 'O3_avg7', 'PM_ratio', 'NOx_ratio']

Sample of transformed data:
Empty DataFrame
Columns: [area, date, aqi_valu

In [5]:
# Display basic information about the loaded dataset.
# Column names are looked up under both the older schema ('City', 'AQI') and the
# schema of the currently loaded file ('area', 'aqi_value'), so the checks below
# run for whichever is present.
if not df.empty:
    print(" DATASET OVERVIEW")
    print("=" * 50)

    # Basic info
    print(f"Dataset Shape: {df.shape}")
    print(f"Number of Rows: {df.shape[0]:,}")
    print(f"Number of Columns: {df.shape[1]}")

    # Display first few rows
    print("\n First 5 rows:")
    print(df.head())

    # Display column information.
    # df.info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None").
    print("\n Column Information:")
    df.info()

    # Check for target cities
    city_col = next((col for col in ('City', 'area') if col in df.columns), None)
    if city_col is not None:
        available_cities = df[city_col].unique()
        print(f"\n Available Cities: {len(available_cities)}")
        print("Cities in dataset:", available_cities[:10])  # Show first 10 cities

        # Check if our target cities are available
        target_cities_found = [city for city in target_cities if city in available_cities]
        print(f"\n Target Cities Found: {target_cities_found}")

        if len(target_cities_found) < len(target_cities):
            missing_cities = [city for city in target_cities if city not in available_cities]
            print(f" Missing Target Cities: {missing_cities}")

    # Check for AQI column
    aqi_col = next((col for col in ('AQI', 'aqi_value') if col in df.columns), None)
    if aqi_col is not None:
        print(f"\n AQI Column Found!")
        print(f"AQI Range: {df[aqi_col].min():.2f} to {df[aqi_col].max():.2f}")
        print(f"Missing AQI values: {df[aqi_col].isnull().sum()}")
    else:
        print(f"\n AQI column not found. Available columns: {list(df.columns)}")

else:
    print(" No data loaded. Please check the file paths and formats.")


 DATASET OVERVIEW
Dataset Shape: (235785, 9)
Number of Rows: 235,785
Number of Columns: 9

 First 5 rows:
        date           state      area  number_of_monitoring_stations  \
0 2025-04-30     Maharashtra  Amravati                              2   
1 2025-04-30           Bihar    Purnia                              1   
2 2025-04-30  Madhya Pradesh     Katni                              1   
3 2025-04-30    Chhattisgarh   Tumidih                              1   
4 2025-04-30           Assam  Byrnihat                              1   

  prominent_pollutants  aqi_value air_quality_status  \
0                 PM10         78       Satisfactory   
1                   CO         56       Satisfactory   
2                   O3         98       Satisfactory   
3                 PM10        103           Moderate   
4                PM2.5         61       Satisfactory   

                                                                     unit  \
0  number_of_monitoring_stations in Absol

In [6]:
# Data quality assessment helper.
# TODO: run assess_data_quality(df) on the loaded dataset; this cell currently only defines the function and prints an example.

def assess_data_quality(df):
    """
    Summarise the basic quality characteristics of a DataFrame.

    Args:
        df (pd.DataFrame): Input dataset

    Returns:
        dict: Metrics keyed by
            'shape' (rows, columns),
            'columns' (list of column names),
            'data_types' (column -> dtype),
            'missing_values' (column -> null count),
            'missing_percentage' (column -> null count as a percentage of rows),
            'duplicate_rows' (number of rows flagged by ``df.duplicated()``),
            'memory_usage' (total bytes, measured with ``deep=True``).
    """
    # Count nulls once and reuse for both the absolute and the relative figure.
    null_counts = df.isnull().sum()
    null_percent = null_counts / len(df) * 100

    return dict(
        shape=df.shape,
        columns=list(df.columns),
        data_types=df.dtypes.to_dict(),
        missing_values=null_counts.to_dict(),
        missing_percentage=null_percent.to_dict(),
        duplicate_rows=df.duplicated().sum(),
        memory_usage=df.memory_usage(deep=True).sum(),
    )

# Placeholder: the assessment itself is not run in this cell.
print(
    "Data quality assessment function defined",
    "Will be executed once actual dataset is loaded",
    sep="\n",
)

# Illustrative example of the structure of the returned metrics (made-up numbers).
print("\nExample output structure:")
example_metrics = dict(
    shape=(100000, 14),
    columns=[
        'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO',
        'SO2', 'O3', 'Benzene', 'Toluene', 'AQI', 'Date', 'City',
    ],
    missing_values={'PM2.5': 1500, 'PM10': 1200, 'AQI': 0, 'City': 0},
    duplicate_rows=50,
)
print(example_metrics)


Data quality assessment function defined
Will be executed once actual dataset is loaded

Example output structure:
{'shape': (100000, 14), 'columns': ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'AQI', 'Date', 'City'], 'missing_values': {'PM2.5': 1500, 'PM10': 1200, 'AQI': 0, 'City': 0}, 'duplicate_rows': 50}
