# 🌿 Environmental Impact Analysis

> **PM Accelerator Mission**: "By making industry-leading tools and education available to individuals from all backgrounds, we level the playing field for future PM leaders."

---

## Objectives
1. **Air Quality Analysis**: Examine PM2.5, Ozone, NO2, CO, SO2 distributions
2. **Weather-Pollution Correlation**: Analyze relationships between weather and air quality
3. **Regional Patterns**: Study pollution patterns across different regions

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries loaded!")

✅ Libraries loaded!


In [2]:
# Load CLEANED data
df = pd.read_csv("../data/weather_cleaned.csv", parse_dates=['last_updated'])

# Air quality columns
aq_cols = [col for col in df.columns if 'air_quality' in col.lower()]
print(f"🌿 Air Quality Columns Found: {aq_cols}")

🌿 Air Quality Columns Found: ['air_quality_Carbon_Monoxide', 'air_quality_Ozone', 'air_quality_Nitrogen_dioxide', 'air_quality_Sulphur_dioxide', 'air_quality_PM2.5', 'air_quality_PM10', 'air_quality_us-epa-index', 'air_quality_gb-defra-index']


## 1. Air Quality Overview

In [3]:
# Rename columns for clarity
aq_rename = {
    'air_quality_Carbon_Monoxide': 'CO',
    'air_quality_Ozone': 'Ozone',
    'air_quality_Nitrogen_dioxide': 'NO2',
    'air_quality_Sulphur_dioxide': 'SO2',
    'air_quality_PM2.5': 'PM2.5',
    'air_quality_PM10': 'PM10'
}

# Apply renaming for columns that exist
for old_name, new_name in aq_rename.items():
    if old_name in df.columns:
        df[new_name] = df[old_name]

# Available pollutants
pollutants = [v for v in aq_rename.values() if v in df.columns]
print(f"📊 Analyzing pollutants: {pollutants}")

📊 Analyzing pollutants: ['CO', 'Ozone', 'NO2', 'SO2', 'PM2.5', 'PM10']


In [4]:
# Air quality statistics
if pollutants:
    aq_stats = df[pollutants].describe().round(2)
    print("📊 Air Quality Statistics:")
    display(aq_stats)

📊 Air Quality Statistics:


Unnamed: 0,CO,Ozone,NO2,SO2,PM2.5,PM10
count,114203.0,114203.0,114203.0,114203.0,114203.0,114203.0
mean,490.46,60.14,15.48,10.87,25.0,50.23
std,805.35,31.59,24.92,38.22,38.77,154.87
min,-9999.0,0.0,0.0,-9999.0,0.17,-1848.15
25%,220.15,40.0,1.48,0.95,7.22,10.25
50%,310.8,57.0,5.3,2.4,14.5,20.75
75%,485.85,76.0,17.94,8.75,28.49,43.1
max,38879.4,480.7,427.7,521.33,1614.1,6037.29


In [5]:
# Distribution of each pollutant
if pollutants:
    fig = make_subplots(rows=2, cols=3, subplot_titles=pollutants[:6])
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD']
    
    for i, pol in enumerate(pollutants[:6]):
        row = i // 3 + 1
        col = i % 3 + 1
        fig.add_trace(
            go.Histogram(x=df[pol].dropna(), name=pol, marker_color=colors[i]),
            row=row, col=col
        )
    
    fig.update_layout(
        title='🌿 Air Pollutant Distributions',
        template='plotly_dark',
        height=500,
        showlegend=False
    )
    fig.show()

## 2. Weather-Pollution Correlation

In [6]:
# Correlation between weather and pollutants
weather_cols = ['temperature_celsius', 'humidity', 'pressure_mb', 'wind_kph', 'cloud', 'uv_index']
weather_cols = [c for c in weather_cols if c in df.columns]

if pollutants and weather_cols:
    corr_data = df[weather_cols + pollutants].corr()
    
    # Extract weather vs pollutant correlations
    weather_aq_corr = corr_data.loc[weather_cols, pollutants]
    
    fig = px.imshow(weather_aq_corr,
                    labels=dict(color='Correlation'),
                    title='🔗 Weather vs Air Quality Correlation',
                    color_continuous_scale='RdBu_r',
                    zmin=-1, zmax=1)
    fig.update_layout(template='plotly_dark', height=400)
    fig.show()

In [7]:
# PM2.5 vs Temperature
if 'PM2.5' in df.columns:
    fig = px.scatter(df.sample(min(10000, len(df))), 
                     x='temperature_celsius', y='PM2.5',
                     opacity=0.3,
                     title='🌡️ Temperature vs PM2.5',
                     trendline='ols')
    fig.update_layout(template='plotly_dark', height=500)
    fig.show()

In [8]:
# Wind speed impact on air quality
if 'PM2.5' in df.columns and 'wind_kph' in df.columns:
    # Bin wind speed
    df['wind_category'] = pd.cut(df['wind_kph'], 
                                  bins=[0, 5, 15, 30, 100],
                                  labels=['Calm (0-5)', 'Light (5-15)', 'Moderate (15-30)', 'Strong (30+)'])
    
    wind_pm = df.groupby('wind_category')['PM2.5'].mean().reset_index()
    
    fig = px.bar(wind_pm, x='wind_category', y='PM2.5',
                 title='💨 Wind Speed Impact on PM2.5 Levels',
                 labels={'PM2.5': 'Average PM2.5', 'wind_category': 'Wind Category'})
    fig.update_layout(template='plotly_dark', height=400)
    fig.show()

## 3. Regional Pollution Patterns

In [9]:
# Average pollutant levels by country (top 20)
if 'PM2.5' in df.columns:
    country_pm25 = df.groupby('country')['PM2.5'].agg(['mean', 'count'])
    country_pm25 = country_pm25[country_pm25['count'] >= 100]  # Min samples
    country_pm25 = country_pm25.sort_values('mean', ascending=False).head(20)
    
    fig = go.Figure(data=[
        go.Bar(
            x=country_pm25['mean'].values,
            y=country_pm25.index,
            orientation='h',
            marker_color='#FF6B6B'
        )
    ])
    
    fig.update_layout(
        title='🌍 Top 20 Countries by Average PM2.5 Levels',
        xaxis_title='Average PM2.5 (μg/m³)',
        yaxis_title='Country',
        template='plotly_dark',
        height=600
    )
    fig.show()

In [10]:
# Monthly pollution patterns
if 'PM2.5' in df.columns:
    df['month'] = df['last_updated'].dt.month
    monthly_pollution = df.groupby('month')[pollutants].mean()
    
    fig = go.Figure()
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
    
    for i, pol in enumerate(pollutants[:5]):
        fig.add_trace(go.Scatter(
            x=monthly_pollution.index,
            y=monthly_pollution[pol] / monthly_pollution[pol].max(),  # Normalize
            name=pol,
            mode='lines+markers',
            line=dict(color=colors[i])
        ))
    
    fig.update_layout(
        title='📅 Seasonal Pollution Patterns (Normalized)',
        xaxis_title='Month',
        yaxis_title='Normalized Level',
        template='plotly_dark',
        height=500
    )
    fig.update_xaxes(tickmode='array', tickvals=list(range(1, 13)),
                     ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    fig.show()

## 4. Environmental Health Insights

In [11]:
print("="*60)
print("🌿 ENVIRONMENTAL IMPACT - KEY INSIGHTS")
print("="*60)

if pollutants:
    print(f"\n📊 Air Quality Overview:")
    for pol in pollutants:
        mean_val = df[pol].mean()
        max_val = df[pol].max()
        print(f"   • {pol}: Mean = {mean_val:.2f}, Max = {max_val:.2f}")

if 'PM2.5' in df.columns:
    print(f"\n🔗 Key Correlations with PM2.5:")
    for col in weather_cols:
        corr = df['PM2.5'].corr(df[col])
        print(f"   • {col}: {corr:.3f}")

print("\n" + "="*60)

🌿 ENVIRONMENTAL IMPACT - KEY INSIGHTS

📊 Air Quality Overview:
   • CO: Mean = 490.46, Max = 38879.40
   • Ozone: Mean = 60.14, Max = 480.70
   • NO2: Mean = 15.48, Max = 427.70
   • SO2: Mean = 10.87, Max = 521.33
   • PM2.5: Mean = 25.00, Max = 1614.10
   • PM10: Mean = 50.23, Max = 6037.29

🔗 Key Correlations with PM2.5:
   • temperature_celsius: 0.045
   • humidity: -0.196
   • pressure_mb: 0.001
   • wind_kph: -0.039
   • cloud: -0.175
   • uv_index: 0.036

