# Air Quality Data Exploration

This notebook provides comprehensive exploratory data analysis (EDA) for the air quality prediction project.

## Objectives
1. Load and examine the raw air quality dataset
2. Analyze data quality and completeness
3. Explore city-wise patterns and distributions
4. Identify key insights for feature engineering
5. Generate initial visualizations

## Dataset Information
- **Source**: Kaggle "Air Quality Data in India (2015–2020)"
- **Target Cities**: Delhi, Bangalore, Kolkata, Hyderabad, Chennai, Visakhapatnam
- **Features**: PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, AQI, Date, City (the raw `city_day.csv` additionally contains `Xylene` and `AQI_Bucket`)


In [5]:
# Install required packages if not already installed
import subprocess
import sys

# pip distribution names whose importable module name is different.
_PIP_TO_IMPORT_NAME = {
    "scikit-learn": "sklearn",
    "imbalanced-learn": "imblearn",
}


def install_package(package, import_name=None):
    """Install ``package`` with pip unless its module can already be imported.

    The import check must use the *module* name, which is not always the pip
    distribution name (``scikit-learn`` is imported as ``sklearn``). Checking
    the pip name directly makes such packages look missing on every run.

    Args:
        package (str): Name passed to ``pip install``.
        import_name (str, optional): Module name used for the import check.
            When omitted, a known alias for ``package`` is used if there is
            one; otherwise ``package`` with dashes replaced by underscores
            (a dash can never appear in a module name).

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    import importlib

    if import_name is None:
        import_name = _PIP_TO_IMPORT_NAME.get(package, package.replace("-", "_"))

    try:
        importlib.import_module(import_name)
        print(f"{package} is already installed")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the running interpreter see the freshly installed package.
        importlib.invalidate_caches()

# Dependencies of this notebook, listed by their pip names
required_packages = [
    "pandas", "numpy", "matplotlib", "seaborn", "plotly",
    "scikit-learn", "lightgbm", "optuna", "imbalanced-learn",
    "tqdm", "joblib",
]

# Check each dependency in turn; install_package decides whether pip is needed
for package in required_packages:
    install_package(package)

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import os

# Plot appearance: apply the style sheet, then select the colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# Silence every warning for the rest of the session
warnings.filterwarnings(action='ignore')

# Remove pandas display limits on column count, total width and cell width
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")


pandas is already installed
numpy is already installed
matplotlib is already installed
seaborn is already installed
plotly is already installed
Installing scikit-learn...
lightgbm is already installed
optuna is already installed
Installing imbalanced-learn...
tqdm is already installed
joblib is already installed
Libraries imported successfully!
Current working directory: d:\Nandana\MTECH\PREDICTIVE ANALYSIS\AirqualityPrediction\air_quality_prediction\notebooks


## 1. Data Loading and Initial Inspection


In [6]:
# Define data path and load the dataset
data_path = "../data/raw/"


def load_city_day_dataset(data_dir, files):
    """Load the daily city-level dataset, falling back to another tabular file.

    Preference order:
      1. ``city_day.csv`` (read as CSV)
      2. ``city_day`` (extensionless file, read as Excel)
      3. the alphabetically first regular file in ``files`` that ends in
         ``.csv``, ``.xlsx`` or ``.xls``, or has no extension at all

    Args:
        data_dir (str): Directory that contains the raw files.
        files (list[str]): Names found in ``data_dir``.

    Returns:
        pd.DataFrame: The loaded data, or an empty frame when nothing in
        ``files`` qualifies. Errors raised by pandas while reading are
        propagated to the caller.
    """
    if 'city_day.csv' in files:
        loaded = pd.read_csv(os.path.join(data_dir, 'city_day.csv'))
        print("✅ Loaded city_day.csv successfully!")
        return loaded

    if 'city_day' in files:
        # If it's an Excel file without extension
        loaded = pd.read_excel(os.path.join(data_dir, 'city_day'))
        print("✅ Loaded city_day Excel file successfully!")
        return loaded

    print("⚠️ city_day file not found, checking other available files...")
    # os.listdir order is arbitrary, so sort to make the fallback choice
    # reproducible; the isfile check keeps extensionless sub-directories out.
    candidates = sorted(
        name for name in files
        if os.path.isfile(os.path.join(data_dir, name))
        and (name.endswith(('.csv', '.xlsx', '.xls')) or '.' not in name)
    )
    if not candidates:
        print("❌ No suitable data files found")
        return pd.DataFrame()

    first_file = candidates[0]
    fallback_path = os.path.join(data_dir, first_file)
    if first_file.endswith('.csv'):
        loaded = pd.read_csv(fallback_path)
    else:
        loaded = pd.read_excel(fallback_path)
    print(f"✅ Loaded {first_file} as fallback")
    return loaded


# Check if data directory exists and list files
# (isdir rather than exists: a plain file at this path cannot be listed)
if os.path.isdir(data_path):
    files = os.listdir(data_path)
    print(f"Files found in data directory: {files}")

    # Load the main dataset - city_day.csv (daily city-level data)
    try:
        df = load_city_day_dataset(data_path, files)

        print(f"Dataset shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

    except Exception as e:
        # Best effort: keep the notebook running with an empty frame so the
        # following cells can report that nothing was loaded.
        print(f"❌ Error loading dataset: {e}")
        df = pd.DataFrame()

else:
    print(f"❌ Data directory not found: {data_path}")
    df = pd.DataFrame()

# Cities the analysis focuses on
# NOTE(review): confirm each name matches the spelling used in df['City']
# (the dataset may label Bangalore as 'Bengaluru') - TODO verify
target_cities = [
    'Delhi',
    'Bangalore',
    'Kolkata',
    'Hyderabad',
    'Chennai',
    'Visakhapatnam',
]
print(f"\nTarget cities for analysis: {target_cities}")


Files found in data directory: ['city_day.csv', 'city_hour.csv', 'stations.csv', 'station_day.csv', 'station_hour.csv']
✅ Loaded city_day.csv successfully!
Dataset shape: (29531, 16)
Columns: ['City', 'Date', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket']

Target cities for analysis: ['Delhi', 'Bangalore', 'Kolkata', 'Hyderabad', 'Chennai', 'Visakhapatnam']


## 2. Data Quality Assessment


In [7]:
# Display basic information about the loaded dataset
if df.empty:
    print("❌ No data loaded. Please check the file paths and formats.")
else:
    print("📊 DATASET OVERVIEW")
    print("=" * 50)

    # Basic info
    print(f"Dataset Shape: {df.shape}")
    print(f"Number of Rows: {df.shape[0]:,}")
    print(f"Number of Columns: {df.shape[1]}")

    # Display first few rows
    print("\n📋 First 5 rows:")
    print(df.head())

    # Display column information.
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would add a stray "None" line.
    print("\n📝 Column Information:")
    df.info()

    # Check for target cities
    if 'City' in df.columns:
        available_cities = df['City'].unique()
        print(f"\n🏙️ Available Cities: {len(available_cities)}")
        print("Cities in dataset:", available_cities[:10])  # Show first 10 cities

        # Check if our target cities are available
        target_cities_found = [city for city in target_cities if city in available_cities]
        print(f"\n🎯 Target Cities Found: {target_cities_found}")

        if len(target_cities_found) < len(target_cities):
            missing_cities = [city for city in target_cities if city not in available_cities]
            print(f"⚠️ Missing Target Cities: {missing_cities}")

    # Check for AQI column
    if 'AQI' in df.columns:
        print("\n🌬️ AQI Column Found!")
        print(f"AQI Range: {df['AQI'].min():.2f} to {df['AQI'].max():.2f}")
        print(f"Missing AQI values: {df['AQI'].isnull().sum()}")
    else:
        print(f"\n⚠️ AQI column not found. Available columns: {list(df.columns)}")


📊 DATASET OVERVIEW
Dataset Shape: (29531, 16)
Number of Rows: 29,531
Number of Columns: 16

📋 First 5 rows:
        City        Date  PM2.5  PM10     NO    NO2    NOx  NH3     CO    SO2  \
0  Ahmedabad  2015-01-01    NaN   NaN   0.92  18.22  17.15  NaN   0.92  27.64   
1  Ahmedabad  2015-01-02    NaN   NaN   0.97  15.69  16.46  NaN   0.97  24.55   
2  Ahmedabad  2015-01-03    NaN   NaN  17.40  19.30  29.70  NaN  17.40  29.07   
3  Ahmedabad  2015-01-04    NaN   NaN   1.70  18.48  17.97  NaN   1.70  18.59   
4  Ahmedabad  2015-01-05    NaN   NaN  22.10  21.42  37.76  NaN  22.10  39.33   

       O3  Benzene  Toluene  Xylene  AQI AQI_Bucket  
0  133.36     0.00     0.02    0.00  NaN        NaN  
1   34.06     3.68     5.50    3.77  NaN        NaN  
2   30.70     6.80    16.40    2.25  NaN        NaN  
3   36.08     4.43    10.14    1.00  NaN        NaN  
4   39.31     7.01    18.89    2.78  NaN        NaN  

📝 Column Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 en

In [8]:
# Data quality assessment helper
# NOTE: this cell only defines the function and prints an illustrative example; it does not run it on `df`

def assess_data_quality(df):
    """
    Summarise the structure and completeness of a DataFrame.

    Args:
        df (pd.DataFrame): Dataset to inspect

    Returns:
        dict: Shape, column names, dtypes, per-column missing counts and
            missing percentages, number of duplicated rows, and total
            deep memory usage
    """
    # Count the nulls once and derive the percentage from the same series
    null_counts = df.isnull().sum()
    null_percent = null_counts / len(df) * 100

    quality_metrics = {}
    quality_metrics['shape'] = df.shape
    quality_metrics['columns'] = list(df.columns)
    quality_metrics['data_types'] = df.dtypes.to_dict()
    quality_metrics['missing_values'] = null_counts.to_dict()
    quality_metrics['missing_percentage'] = null_percent.to_dict()
    quality_metrics['duplicate_rows'] = df.duplicated().sum()
    quality_metrics['memory_usage'] = df.memory_usage(deep=True).sum()
    return quality_metrics

# Status messages: the helper exists but has not been applied yet
print("Data quality assessment function defined")
print("Will be executed once actual dataset is loaded")

# Illustrative (made-up) values showing the shape of the result
print("\nExample output structure:")
example_metrics = dict(
    shape=(100000, 14),
    columns=[
        'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO',
        'SO2', 'O3', 'Benzene', 'Toluene', 'AQI', 'Date', 'City',
    ],
    missing_values={'PM2.5': 1500, 'PM10': 1200, 'AQI': 0, 'City': 0},
    duplicate_rows=50,
)
print(example_metrics)


Data quality assessment function defined
Will be executed once actual dataset is loaded

Example output structure:
{'shape': (100000, 14), 'columns': ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'AQI', 'Date', 'City'], 'missing_values': {'PM2.5': 1500, 'PM10': 1200, 'AQI': 0, 'City': 0}, 'duplicate_rows': 50}
