**Install and import required libraries**

In [None]:
!pip install pandas matplotlib seaborn plotly scikit-learn statsmodels

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm



In [None]:
# Load one year-to-date PM2.5 CSV per year (2020-2024). The paths live under
# /content/drive/MyDrive (presumably a mounted Google Drive -- confirm the
# mount exists before running this cell).
Data_20, Data_21, Data_22, Data_23, Data_24 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/datasets/Dhaka_PM2.5_2020_YTD.csv',
        '/content/drive/MyDrive/datasets/Dhaka_PM2.5_2021_YTD.csv',
        '/content/drive/MyDrive/datasets/Dhaka_PM2.5_2022_YTD.csv',
        '/content/drive/MyDrive/datasets/Dhaka_PM2.5_2023_YTD.csv',
        '/content/drive/MyDrive/datasets/Dhaka_PM2.5_2024_YTD.csv',
    )
)



In [None]:
# Tag each yearly frame with the calendar year it covers, so rows remain
# attributable to a year once the frames are combined.
for yearly_frame, year_label in (
    (Data_20, 2020),
    (Data_21, 2021),
    (Data_22, 2022),
    (Data_23, 2023),
    (Data_24, 2024),
):
    yearly_frame['Year'] = year_label

**Combine all datasets into one**

In [None]:
# Stack the five yearly frames into one frame with a fresh 0..n-1 index, then
# mirror the hour column (0 <-> 23, 1 <-> 22, ...). Per the original author's
# note, the hour values in the source files appear to be reversed.
combined_data = (
    pd.concat([Data_20, Data_21, Data_22, Data_23, Data_24], ignore_index=True)
    .assign(Hour=lambda frame: 23 - frame['Hour'])
)

**Basic statistics**

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for numeric columns.
summary_stats = combined_data.describe()
print(summary_stats)

# Number of missing (null) entries in each column.
missing_per_column = combined_data.isnull().sum()
print(missing_per_column)

               Year         Month           Day          Hour  NowCast Conc.  \
count  42793.000000  42793.000000  42793.000000  42793.000000   42793.000000   
mean    2022.019606      6.513332     15.660692     11.496927      94.260323   
std        1.420072      3.473482      8.779572      6.924023      88.952068   
min     2020.000000      1.000000      1.000000      0.000000    -999.000000   
25%     2021.000000      3.000000      8.000000      5.000000      38.600000   
50%     2022.000000      7.000000     16.000000     11.000000      69.900000   
75%     2023.000000     10.000000     23.000000     17.000000     134.600000   
max     2024.000000     12.000000     31.000000     23.000000     844.900000   

                AQI     Raw Conc.  
count  42793.000000  42793.000000  
mean     163.240623     89.119622  
std       91.506064    120.239665  
min     -999.000000   -999.000000  
25%      109.000000     37.000000  
50%      159.000000     69.000000  
75%      194.000000    134.

**Handling Negative values**

In [None]:
# Collect the numeric columns, then flag each one that holds at least one
# value below zero (True = column contains a negative).
numeric_columns = combined_data.select_dtypes(include=[np.number]).columns
negative_values = (combined_data[numeric_columns] < 0).any()

print("Columns with negative values:")
columns_with_negatives = negative_values.index[negative_values.to_numpy()].tolist()
print(columns_with_negatives)

Columns with negative values:
['NowCast Conc.', 'AQI', 'Raw Conc.']


In [None]:
# Replace negative 'Raw Conc.' readings with the mean of the non-negative ones.
# The describe() output above reports a minimum of -999 for this column --
# presumably a missing-data placeholder rather than a real measurement
# (TODO confirm against the data source).
raw_conc = combined_data['Raw Conc.']
mean_raw_conc = raw_conc[raw_conc >= 0].mean()

# Series.mask swaps the flagged entries in one vectorised step instead of
# calling a Python lambda once per row; the resulting values are the same.
combined_data['Raw Conc.'] = raw_conc.mask(raw_conc < 0, mean_raw_conc)

# Verify that negative values have been replaced
print("Negative values in 'Raw Conc.' after replacement:")
print((combined_data['Raw Conc.'] < 0).sum())

Negative values in 'Raw Conc.' after replacement:
0


In [None]:
# For every numeric column that contains negatives, replace those entries with
# the mean of the column's non-negative values.
for col in numeric_columns:
    column = combined_data[col]
    is_negative = column < 0
    if not is_negative.any():
        # Nothing to repair in this column.
        continue
    col_mean = column[column >= 0].mean()  # mean computed without the negatives
    # Vectorised replacement; avoids a per-element Python lambda via apply().
    combined_data[col] = column.mask(is_negative, col_mean)

# Verify that all negative values have been replaced
print("Negative values in all numeric columns after replacement:")
print((combined_data[numeric_columns] < 0).sum().sum())

Negative values in all numeric columns after replacement:
0


**Distribution of PM2.5 concentrations**

In [None]:
# Histogram of the 'Raw Conc.' readings across all years (nbins=50).
fig = px.histogram(
    data_frame=combined_data,
    x='Raw Conc.',
    nbins=50,
    title='Distribution of PM2.5 Concentrations',
)
fig.show()

**Group by Year and Month, then calculate the mean PM2.5 concentration**

In [None]:
# Mean 'Raw Conc.' for every (Year, Month) pair, flattened back to columns.
monthly_avg = combined_data.groupby(['Year', 'Month'])['Raw Conc.'].mean().reset_index()

# Plot monthly trends for each year (one line per year).
fig = px.line(monthly_avg, x='Month', y='Raw Conc.', color='Year',
              title='Monthly Average PM2.5 Concentrations (2020-2024)')
fig.show()

# Identify the month with the highest and lowest PM2.5 for each year.
for year in monthly_avg['Year'].unique():
    year_data = monthly_avg[monthly_avg['Year'] == year]
    max_month = year_data.loc[year_data['Raw Conc.'].idxmax()]
    min_month = year_data.loc[year_data['Raw Conc.'].idxmin()]
    # A single row pulled out with .loc is a Series with one common dtype, so
    # the integer month is upcast to float next to 'Raw Conc.'. Cast it back so
    # the report reads "Month 1" instead of "Month 1.0".
    print(f"Year {year}:")
    print(f"  Highest PM2.5 in Month {int(max_month['Month'])} with {max_month['Raw Conc.']:.2f} µg/m³")
    print(f"  Lowest PM2.5 in Month {int(min_month['Month'])} with {min_month['Raw Conc.']:.2f} µg/m³")

Year 2020:
  Highest PM2.5 in Month 1.0 with 179.48 µg/m³
  Lowest PM2.5 in Month 8.0 with 27.98 µg/m³
Year 2021:
  Highest PM2.5 in Month 1.0 with 210.47 µg/m³
  Lowest PM2.5 in Month 7.0 with 29.81 µg/m³
Year 2022:
  Highest PM2.5 in Month 12.0 with 194.81 µg/m³
  Lowest PM2.5 in Month 7.0 with 39.42 µg/m³
Year 2023:
  Highest PM2.5 in Month 1.0 with 234.07 µg/m³
  Lowest PM2.5 in Month 7.0 with 32.25 µg/m³
Year 2024:
  Highest PM2.5 in Month 1.0 with 208.34 µg/m³
  Lowest PM2.5 in Month 7.0 with 32.84 µg/m³


**Plot hourly trends for each year**

In [None]:
# Mean 'Raw Conc.' for every (Year, Hour) pair, flattened back to columns.
hourly_avg = (
    combined_data
    .groupby(['Year', 'Hour'])['Raw Conc.']
    .mean()
    .reset_index()
)

# One line per year, hour of day on the x-axis.
fig = px.line(
    hourly_avg,
    x='Hour',
    y='Raw Conc.',
    color='Year',
    title='Hourly Average PM2.5 Concentrations (2020-2024)',
)
fig.show()

**Seasonal PM2.5 concentrations**

In [None]:
# Map a month number to one of the six seasons used in this analysis.
def get_season(month):
    """Return the season label for a month number.

    Args:
        month: Month number; values equal to 1-12 are recognised.

    Returns:
        The season label string, or None when `month` matches no season.
    """
    seasons_by_months = (
        ((4, 5), 'Grishmo (Summer)'),
        ((6, 7, 8), 'Barsha (Rainy)'),
        ((9, 10), 'Sharat (Autumn)'),
        ((11,), 'Hemanto (Late Autumn)'),
        ((12, 1), 'Sheet (Winter)'),
        ((2, 3), 'Bashonto (Spring)'),
    )
    for months, season_label in seasons_by_months:
        if month in months:
            return season_label
    # No season lists this month value.
    return None

# Label every row with its season, derived element-wise from the month number.
combined_data['Season'] = combined_data['Month'].map(get_season)

In [None]:
# Seasons in the order the year moves through them (same labels and sequence
# as produced by get_season). groupby sorts labels alphabetically, which would
# make the line plot join seasons in a meaningless order.
SEASON_ORDER = [
    'Grishmo (Summer)',
    'Barsha (Rainy)',
    'Sharat (Autumn)',
    'Hemanto (Late Autumn)',
    'Sheet (Winter)',
    'Bashonto (Spring)',
]
season_rank = {label: position for position, label in enumerate(SEASON_ORDER)}

# Mean 'Raw Conc.' for every (Year, Season) pair.
seasonal_avg = combined_data.groupby(['Year', 'Season'])['Raw Conc.'].mean().reset_index()

# Sort rows by year and seasonal position: px.line joins points in row order,
# so the data itself must be in seasonal order, not only the axis.
seasonal_avg = (
    seasonal_avg
    .assign(_season_rank=seasonal_avg['Season'].map(season_rank))
    .sort_values(['Year', '_season_rank'])
    .drop(columns='_season_rank')
    .reset_index(drop=True)
)

# Plot seasonal trends for each year; category_orders pins the x-axis order.
fig = px.line(seasonal_avg, x='Season', y='Raw Conc.', color='Year',
              category_orders={'Season': SEASON_ORDER},
              title='Seasonal Average PM2.5 Concentrations (2020-2024)')
fig.show()

**Yearly PM2.5 concentrations**

In [None]:
# Mean 'Raw Conc.' per year, flattened back to a two-column frame.
yearly_avg = (
    combined_data
    .groupby('Year')['Raw Conc.']
    .mean()
    .reset_index()
)

# One bar per year.
fig = px.bar(
    yearly_avg,
    x='Year',
    y='Raw Conc.',
    title='Yearly Average PM2.5 Concentrations',
)
fig.show()

In [None]:
# Count how many rows fall into each AQI category, per year.
yearly_aqi = (
    combined_data
    .groupby(['Year', 'AQI Category'])
    .size()
    .reset_index(name='Count')
)

# Bar chart of those counts, coloured by AQI category.
fig = px.bar(
    yearly_aqi,
    x='Year',
    y='Count',
    color='AQI Category',
    title='Yearly AQI Categories (2020-2024)',
)
fig.show()

**Line plot of PM2.5 concentrations over time**

In [None]:
# Every reading plotted against its 'Date (LT)' value, one line per year.
# NOTE(review): 'Date (LT)' is used exactly as loaded from the CSVs; confirm it
# is parsed/ordered as intended before relying on the shape of the lines.
fig = px.line(
    data_frame=combined_data,
    x='Date (LT)',
    y='Raw Conc.',
    color='Year',
    title='PM2.5 Concentrations Over Time (2020-2024)',
)
fig.show()

**Scatter plot of PM2.5 vs AQI**

In [None]:
# Scatter of concentration against AQI, one colour per year.
# 'Year' is numeric, and px.scatter maps numeric colour columns onto a
# continuous colour bar. Casting to string (on a copy, so combined_data keeps
# its numeric Year) gives one discrete legend entry per year instead.
scatter_data = combined_data.assign(Year=combined_data['Year'].astype(str))
fig = px.scatter(scatter_data, x='Raw Conc.', y='AQI', color='Year',
                 title='PM2.5 vs AQI (2020-2024)')
fig.show()

**Calculate rolling averages for PM2.5 concentrations**

In [None]:
# The data has one row per hourly reading (the 'Hour' column runs 0-23), so a
# 30-day window must span 30 * 24 rows. The previous window=30 averaged only
# 30 hours while the chart title says "30-Day".
# NOTE(review): this counts rows, so it assumes few missing hours and rows in
# time order within each year; a time-based window on a parsed timestamp would
# be more exact -- TODO confirm the 'Date (LT)' format before switching.
HOURS_PER_DAY = 24
ROLLING_WINDOW_DAYS = 30
rolling_window_rows = ROLLING_WINDOW_DAYS * HOURS_PER_DAY

# Rolling mean computed separately within each year. With the default
# min_periods, the first (rolling_window_rows - 1) rows of each year are NaN.
combined_data['Rolling Avg'] = (
    combined_data
    .groupby('Year')['Raw Conc.']
    .transform(lambda yearly: yearly.rolling(window=rolling_window_rows).mean())
)

# Plot rolling averages
fig = px.line(combined_data, x='Date (LT)', y='Rolling Avg', color='Year',
              title='30-Day Rolling Average PM2.5 Concentrations (2020-2024)')
fig.show()

**Conclusion**


1. Impact of COVID-19 (2020):
  The year 2020 had better air quality compared to other years, primarily due to the COVID-19 lockdowns. Reduced industrial activity, vehicular emissions, and human movement significantly lowered PM2.5 concentrations during this period.

2. Monthly Trends:

  January is consistently the month with the highest PM2.5 concentrations across most years (except 2022, where December had the highest levels). This is likely due to winter conditions, such as temperature inversions and increased use of biomass for heating, which trap pollutants close to the ground.

  July and August (the rainy season) show the lowest PM2.5 concentrations due to rainfall, which helps clear pollutants from the air.

3. Hourly Trends:

  Morning hours (5 AM to 10 AM) generally have better air quality, likely due to lower human activity and cooler temperatures.

  Afternoon hours (especially between 12 PM and 6 PM) are the most polluted, likely due to increased vehicular traffic and industrial activity. (Higher afternoon temperatures generally enhance pollutant dispersion, so temperature alone would tend to lower, not raise, concentrations.)

4. Seasonal Trends:

  The rainy season has the lowest PM2.5 concentrations due to frequent rainfall, which washes away pollutants.

  The winter season has the highest PM2.5 concentrations, making it the most hazardous time of the year for air quality in Dhaka. This is due to factors like temperature inversions, increased use of biomass for heating, and reduced wind speeds.

**Overall Air Quality in Dhaka:**

Dhaka's air quality is consistently poor, with PM2.5 concentrations often exceeding safe limits. The air quality is frequently in the "Unhealthy" to "Very Unhealthy" range, and occasionally reaches "Hazardous" levels.

It is rare for Dhaka to experience "Good" air quality, and such instances are typically short-lived and occur during the rainy season.

**Additional Insights**
1. Yearly Comparison:

  2020 stands out as the year with the best air quality due to the COVID-19 lockdowns.

  2022 shows a deviation from the trend, with December having the highest PM2.5 concentrations instead of January. This could be due to specific meteorological conditions or increased pollution sources during that period.

2. Long-Term Trends:

  Over the years, there is no significant improvement in air quality, indicating that pollution control measures have not been effective enough to combat the rising levels of PM2.5.

3. Health Implications:

  The winter season poses the greatest health risk due to extremely high PM2.5 levels. Prolonged exposure to such air quality can lead to severe respiratory and cardiovascular diseases.

**Final Thoughts**

*Dhaka's air quality is a critical issue that requires immediate attention. While seasonal and hourly variations exist, the overall trend indicates that the city's air quality remains dangerously poor for most of the year. The rainy season provides temporary relief, but the winter season is particularly hazardous. Policymakers and stakeholders must implement strict pollution control measures to address this growing public health crisis.*

