In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import datetime as dt

# Read data

In [2]:
# Load the air-quality records from the CSV file (relative path) into a DataFrame
data = pd.read_csv(filepath_or_buffer='../data/global_air_quality_data_10000.csv')

In [3]:
# Preview the first five rows (DataFrame.head defaults to n=5)
data.head()

Unnamed: 0,City,Country,Date,PM2.5,PM10,NO2,SO2,CO,O3,Temperature,Humidity,Wind Speed
0,Bangkok,Thailand,2023-03-19,86.57,25.19,99.88,30.63,4.46,36.29,17.67,59.35,13.76
1,Istanbul,Turkey,2023-02-16,50.63,97.39,48.14,8.71,3.4,144.16,3.46,67.51,6.36
2,Rio de Janeiro,Brazil,2023-11-13,130.21,57.22,98.51,9.92,0.12,179.31,25.29,29.3,12.87
3,Mumbai,India,2023-03-16,119.7,130.52,10.96,33.03,7.74,38.65,23.15,99.97,7.71
4,Paris,France,2023-04-04,55.2,36.62,76.85,21.85,2.0,67.09,16.02,90.28,14.16


# **Question-Answer Analysis**

## **Question 1:** What are the cities with the highest levels of PM2.5, and how do these values compare globally?

### **Benefit:** PM2.5 is a key indicator of air quality and poses significant health risks. Identifying hotspots can help understand which cities need targeted interventions.

In [4]:
# Mean PM2.5 per city, returned as a flat frame with 'City' and 'PM2.5' columns
pm25_by_city = data.groupby('City', as_index=False)['PM2.5'].mean()

# Order cities from highest to lowest mean PM2.5 and keep the first ten
top_cities_pm25 = pm25_by_city.sort_values('PM2.5', ascending=False).iloc[:10]

# Bar chart of the ten cities; bar colour encodes the same PM2.5 value
fig = px.bar(
    top_cities_pm25,
    x='City',
    y='PM2.5',
    color='PM2.5',
    color_continuous_scale='Reds',
    title='Top 10 Cities with the Highest Average PM2.5 Levels',
    labels={'City': 'City', 'PM2.5': 'Average PM2.5 Level (µg/m³)'},
)

# Axis titles and tick rotation are set explicitly after the figure is built
fig.update_layout(
    template='plotly',
    xaxis={'tickangle': -45, 'title': 'City'},
    yaxis={'title': 'PM2.5 Levels (µg/m³)'},
)

fig.show()

### **Answer:**
Based on the chart, we can make the following observations:
1. Dubai ranks first:
- Dubai has the highest PM2.5 levels, nearing 80 µg/m³, which makes it the most polluted city in this analysis in terms of fine particulate matter. PM2.5 levels above 35 µg/m³ are considered harmful to human health, so residents of Dubai may face several health problems as a result.

2. Other High PM2.5 Cities:
- Cities like Sydney, Mumbai, Tokyo, Mexico City, and Beijing follow closely behind Dubai. Their PM2.5 values are still very high, ranging between 78.5 and 79.5 µg/m³, which indicates serious air quality issues across these cities.

3. Global Comparisons:
- While the chart indicates a small range (78–80 µg/m³) among the top 10 cities, it highlights how air quality is seriously poor across different regions globally, spanning cities in Asia (Mumbai, Tokyo, Beijing), Australia (Sydney), North America (Mexico City, New York, Toronto), and Europe (Moscow, Istanbul).

4. Some Insights related to geographical location:

- Middle East: Dubai's pollution levels may be influenced by industrial emissions and desert dust.

- Asia: Cities like Mumbai and Beijing are well-known for heavy industry, vehicular emissions, and dense populations, contributing to severe air quality issues.

- Sydney's inclusion may reflect seasonal bushfires or unique local conditions.

5. Global Health Effects:
- According to the World Health Organization's (WHO) guidelines, annual average concentrations of PM2.5 should not exceed 5 µg/m³; therefore, residents in these cities may be at high risk of respiratory and cardiovascular illnesses.

## **Question 2:** How does air quality (e.g., PM2.5, NO2, and SO2 levels) vary across seasons or months?
### **Benefit:** Analyzing trends over time can uncover seasonal patterns, such as higher pollution during winter months due to heating or weather-related phenomena.

In [5]:
# Step 1: Parse 'Date' as datetime, then derive the calendar month and its season
data['Date'] = pd.to_datetime(data['Date'])
data['Month'] = data['Date'].dt.month

# The same month-to-season assignment (December-February = Winter) is applied to every row
months_by_season = {
    'Winter': (12, 1, 2),
    'Spring': (3, 4, 5),
    'Summer': (6, 7, 8),
    'Autumn': (9, 10, 11),
}
data['Season'] = data['Month'].map(
    {month: season for season, months in months_by_season.items() for month in months}
)

# Pollutants compared in both the seasonal and the monthly view
trend_pollutants = ['PM2.5', 'PM10', 'NO2', 'SO2']

# Step 2: Average each pollutant within every season
seasonal_pollution = data.groupby('Season', as_index=False)[trend_pollutants].mean()

# Step 3: Grouped bar chart, one bar per pollutant within each season
fig_season = px.bar(
    seasonal_pollution,
    x='Season',
    y=trend_pollutants,
    barmode='group',
    title='Average Air Quality Levels Across Seasons',
    labels={'value': 'Average Level (µg/m³)', 'Season': 'Season'},
)
fig_season.update_layout(template='plotly')
fig_season.show()

# Step 4: Average each pollutant within every calendar month
monthly_pollution = data.groupby('Month', as_index=False)[trend_pollutants].mean()

# Step 5: Line chart of the monthly means, one line per pollutant
fig_month = px.line(
    monthly_pollution,
    x='Month',
    y=trend_pollutants,
    title='Monthly Variation of Air Quality Levels',
    labels={'value': 'Average Level (µg/m³)', 'Month': 'Month'},
)

# Show month abbreviations instead of the numeric values 1-12 on the x axis
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
fig_month.update_layout(
    template='plotly',
    xaxis={'tickmode': 'array', 'tickvals': list(range(1, 13)), 'ticktext': month_names},
)
fig_month.show()

### **Answer:**
Based on the charts, we can make the following observations:

1. PM10 Levels are the highest across the year:

- PM10 consistently has the highest average levels in all four seasons (Autumn, Spring, Summer, and Winter), with values slightly exceeding 100 µg/m³.

- This suggests that coarse particulate matter (PM10) is a severe problem year-round, possibly caused by dust, construction activities, and natural sources such as pollen or desert sand.

2. PM2.5 Levels remain steady across the seasons:

- PM2.5 levels remain steady at approximately 75–78 µg/m³ throughout all seasons.

- This is a health concern, as PM2.5 has greater health impacts due to its ability to penetrate deep into the lungs.

- Sources like vehicle emissions, industrial activities, and burning of fuels may remain constant year-round.

3. NO2 (Nitrogen Dioxide) Levels vary slightly year-round:

- NO2 levels range between 50–54 µg/m³ and are relatively lower compared to PM10 and PM2.5.

- Slight seasonal variations might be attributed to changes in traffic emissions, heating systems (higher in winter), and industrial activities.

4.  SO2 (Sulfur Dioxide) Levels are the lowest:

- SO2 has the lowest levels among all pollutants, with average values under 26 µg/m³ across seasons.

- This could indicate technological advances in industry and wider use of clean and renewable fuels, as SO2 mostly comes from coal and industrial emissions.

## **Question 3:** How often does a city (e.g. Bangkok) experience different air quality categories?
### **Benefit:**
- Public Health Awareness: Understanding how often poor air quality occurs helps in assessing health risks to the population. If Bad or Very Bad air quality levels are frequent, it indicates a need for stronger public health warnings and interventions.

- Forecasting: By recognizing patterns of pollution, the analysis helps forecast pollution levels. For example, if Bad air quality is more common during certain seasons or conditions, proactive measures (like advisories or changes in local activities) can be put in place.

In [9]:
# Step 1: Define a function to categorize air quality based on PM2.5 and PM10 levels
def categorize_air_quality(row):
    """Return the air-quality category for one record.

    Each pollutant is rated on its own scale and the worse of the two ratings
    is returned, so a record is never better than its worst pollutant.

    Inclusive upper bounds per category:
        Good:  PM2.5 <= 25,   PM10 <= 50
        Fair:  PM2.5 <= 37.5, PM10 <= 75
        Bad:   PM2.5 <= 50,   PM10 <= 100
        Very Bad: anything above the 'Bad' bound.

    Parameters
    ----------
    row : mapping-like
        Object supporting ``row['PM2.5']`` and ``row['PM10']`` (for example a
        DataFrame row passed by ``DataFrame.apply(..., axis=1)`` or a dict).

    Returns
    -------
    str
        One of 'Good', 'Fair', 'Bad', 'Very Bad'.
    """
    labels = ('Good', 'Fair', 'Bad', 'Very Bad')
    pm25_bounds = (25, 37.5, 50)
    pm10_bounds = (50, 75, 100)

    def _level(value, bounds):
        # Index of the first band whose upper bound contains the value.
        for level, upper in enumerate(bounds):
            if value <= upper:
                return level
        # Above every bound; NaN also lands here because all comparisons are False.
        return len(bounds)

    # Previously both pollutants had to fall in the *same* band, so mixed cases
    # (e.g. PM2.5 = 20 with PM10 = 60) were reported as 'Very Bad'. Taking the
    # worse of the two per-pollutant ratings classifies those rows correctly.
    worst = max(_level(row['PM2.5'], pm25_bounds), _level(row['PM10'], pm10_bounds))
    return labels[worst]

# Step 2: Label every record with its air-quality category (row-wise apply)
data['Air Quality'] = data.apply(categorize_air_quality, axis=1)

# Step 3: Keep only the Bangkok records
is_bangkok = data['City'] == 'Bangkok'
bangkok_data = data.loc[is_bangkok]

# Step 4: Histogram of how many Bangkok records fall into each category
fig_bangkok = px.histogram(
    bangkok_data,
    x='Air Quality',
    color='Air Quality',
    color_discrete_sequence=px.colors.sequential.Viridis,
    title='Air Quality Categories in Bangkok',
    labels={'Air Quality': 'Air Quality Category', 'count': 'Count of Days'},
)
fig_bangkok.update_layout(template='plotly')
fig_bangkok.show()

# Step 5: Count "Very Bad" records per city and keep the five largest counts
very_bad_records = data.loc[data['Air Quality'] == 'Very Bad']
top_very_bad_cities = very_bad_records.groupby('City', as_index=False).size()
top_5_very_bad_cities = top_very_bad_cities.sort_values('size', ascending=False).iloc[:5]

# Step 6: Bar chart of those five cities; bar colour encodes the same count
fig_top_very_bad = px.bar(
    top_5_very_bad_cities,
    x='City',
    y='size',
    color='size',
    color_continuous_scale='Reds',
    title='Top 5 Cities with the Highest Very Bad Air Quality Days',
    labels={'size': 'Count of Very Bad Air Quality Days', 'City': 'City'},
)
fig_top_very_bad.update_layout(template='plotly')
fig_top_very_bad.show()


## **Question 4:** Which pollutants contribute the most to overall Air Pollution level in each city?
### **Benefit:**
- Identifying Major Contributors: It helps determine which pollutants (e.g., PM2.5, PM10, NO2, SO2, CO, O3) are the primary drivers of poor air quality in different cities.

- By pinpointing the dominant pollutants, policymakers and environmental agencies can design targeted solutions, such as stricter emission controls for industries or traffic regulations.

In [7]:
# Pollutant columns compared for every city
pollutant_columns = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3']

# Per-city mean of each pollutant, with 'City' kept as a regular column
city_pollutants = data.groupby('City', as_index=False)[pollutant_columns].mean()

# Reshape to long format: one row per (City, Pollutant) pair for plotting
city_pollutants_melted = pd.melt(
    city_pollutants,
    id_vars='City',
    value_vars=pollutant_columns,
    var_name='Pollutant',
    value_name='Average Level',
)

# Grouped bar chart: one bar per pollutant within each city
fig_pollutants = px.bar(
    city_pollutants_melted,
    x='City',
    y='Average Level',
    color='Pollutant',
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Set1,
    title='Average Pollutant Contribution to Air Pollution Level in Each City',
    labels={'Average Level': 'Average Pollutant Level (µg/m³)', 'City': 'City'},
)
fig_pollutants.update_layout(template='plotly')
fig_pollutants.show()