In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

### Data taken from - Mandir Marg, Delhi air quality monitoring station 

In [26]:
df = pd.read_csv('mandir-marg, delhi-air-quality.csv', skipinitialspace=True)


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3605 entries, 0 to 3604
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    3605 non-null   object 
 1   pm25    3512 non-null   float64
 2   pm10    3542 non-null   float64
 3   o3      3515 non-null   float64
 4   no2     3525 non-null   float64
 5   so2     3412 non-null   float64
 6   co      3495 non-null   float64
dtypes: float64(6), object(1)
memory usage: 197.3+ KB


In [28]:
len(df)

3605

In [29]:
df.head()

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co
0,2024/2/1,152.0,99.0,2.0,19.0,1.0,6.0
1,2024/2/2,180.0,101.0,2.0,20.0,1.0,5.0
2,2024/2/3,167.0,144.0,2.0,21.0,1.0,22.0
3,2024/2/4,213.0,91.0,4.0,17.0,1.0,6.0
4,2024/2/5,166.0,83.0,3.0,19.0,1.0,10.0


In [30]:
df['pm25'].max()

999.0

In [31]:
df.columns

Index(['date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co'], dtype='object')

In [32]:
from datetime import date
from datetime import datetime

df['date']= pd.to_datetime(df['date'])

In [33]:
start_date = '2020-06-01'
end_date = '2023-06-02'

df = df[(df['date']<=end_date) & (df['date']>=start_date)]

df.head()

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co
231,2023-04-01,104.0,73.0,89.0,24.0,2.0,9.0
232,2023-04-02,107.0,92.0,48.0,30.0,6.0,14.0
233,2023-04-03,138.0,67.0,43.0,48.0,3.0,8.0
234,2023-04-04,98.0,83.0,57.0,45.0,3.0,8.0
235,2023-04-05,108.0,90.0,48.0,48.0,3.0,7.0


In [34]:
len(df)

1097

In [35]:
df = df.sort_values(by='date')

## Handling missing values using Exponentially Weighted Moving Average 

In [36]:
df['pm25'] = df['pm25'].ewm(halflife=5).mean()
df['pm10'] = df['pm10'].ewm(halflife=5).mean()
df['o3'] = df['o3'].ewm(halflife=5).mean()
df['no2'] = df['no2'].ewm(halflife=5).mean()
df['so2'] = df['so2'].ewm(halflife=5).mean()
df['co'] = df['co'].ewm(halflife=5).mean()

In [37]:
df.isnull().sum()

date    0
pm25    0
pm10    0
o3      0
no2     0
so2     0
co      0
dtype: int64

# Calculating AQI (Air Quality Index)
reference:https://www.kaggle.com/code/rohanrao/calculating-aqi-air-quality-index-tutorial

In [38]:
def get_PM25_subindex(x):
    if x <= 30:
        return x * 50 / 30
    elif x <= 60:
        return 50 + (x - 30) * 50 / 30
    elif x <= 90:
        return 100 + (x - 60) * 100 / 30
    elif x <= 120:
        return 200 + (x - 90) * 100 / 30
    elif x <= 250:
        return 300 + (x - 120) * 100 / 130
    elif x > 250:
        return 400 + (x - 250) * 100 / 130
    else:
        return 0

df["PM2.5_SubIndex"] = df["pm25"].apply(lambda x: get_PM25_subindex(x))

def get_PM10_subindex(x):
    if x <= 50:
        return x
    elif x <= 100:
        return x
    elif x <= 250:
        return 100 + (x - 100) * 100 / 150
    elif x <= 350:
        return 200 + (x - 250)
    elif x <= 430:
        return 300 + (x - 350) * 100 / 80
    elif x > 430:
        return 400 + (x - 430) * 100 / 80
    else:
        return 0

df["PM10_SubIndex"] = df["pm10"].apply(lambda x: get_PM10_subindex(x))

def get_SO2_subindex(x):
    if x <= 40:
        return x * 50 / 40
    elif x <= 80:
        return 50 + (x - 40) * 50 / 40
    elif x <= 380:
        return 100 + (x - 80) * 100 / 300
    elif x <= 800:
        return 200 + (x - 380) * 100 / 420
    elif x <= 1600:
        return 300 + (x - 800) * 100 / 800
    elif x > 1600:
        return 400 + (x - 1600) * 100 / 800
    else:
        return 0

df["SO2_SubIndex"] = df["so2"].apply(lambda x: get_SO2_subindex(x))

def get_NOx_subindex(x):
    if x <= 40:
        return x * 50 / 40
    elif x <= 80:
        return 50 + (x - 40) * 50 / 40
    elif x <= 180:
        return 100 + (x - 80) * 100 / 100
    elif x <= 280:
        return 200 + (x - 180) * 100 / 100
    elif x <= 400:
        return 300 + (x - 280) * 100 / 120
    elif x > 400:
        return 400 + (x - 400) * 100 / 120
    else:
        return 0

df["NOx_SubIndex"] = df["no2"].apply(lambda x: get_NOx_subindex(x))

def get_CO_subindex(x):
    if x <= 1:
        return x * 50 / 1
    elif x <= 2:
        return 50 + (x - 1) * 50 / 1
    elif x <= 10:
        return 100 + (x - 2) * 100 / 8
    elif x <= 17:
        return 200 + (x - 10) * 100 / 7
    elif x <= 34:
        return 300 + (x - 17) * 100 / 17
    elif x > 34:
        return 400 + (x - 34) * 100 / 17
    else:
        return 0

df["CO_SubIndex"] = df["co"].apply(lambda x: get_CO_subindex(x))

def get_O3_subindex(x):
    if x <= 50:
        return x * 50 / 50
    elif x <= 100:
        return 50 + (x - 50) * 50 / 50
    elif x <= 168:
        return 100 + (x - 100) * 100 / 68
    elif x <= 208:
        return 200 + (x - 168) * 100 / 40
    elif x <= 748:
        return 300 + (x - 208) * 100 / 539
    elif x > 748:
        return 400 + (x - 400) * 100 / 539
    else:
        return 0

df["O3_SubIndex"] = df["o3"].apply(lambda x: get_O3_subindex(x))

def get_AQI_bucket(x):
    if x <= 50:
        return "Good"
    elif x <= 100:
        return "Satisfactory"
    elif x <= 200:
        return "Moderate"
    elif x <= 300:
        return "Poor"
    elif x <= 400:
        return "Very Poor"
    elif x > 400:
        return "Severe"
    else:
        return np.NaN

df["Checks"] = (df["PM2.5_SubIndex"] > 0).astype(int) + \
                (df["PM10_SubIndex"] > 0).astype(int) + \
                (df["SO2_SubIndex"] > 0).astype(int) + \
                (df["NOx_SubIndex"] > 0).astype(int) + \
                (df["CO_SubIndex"] > 0).astype(int) + \
                (df["O3_SubIndex"] > 0).astype(int)

df["AQI_calculated"] = round(df[["PM2.5_SubIndex", "PM10_SubIndex", "SO2_SubIndex", "NOx_SubIndex",
                                  "CO_SubIndex", "O3_SubIndex"]].max(axis = 1))
df.loc[df["PM2.5_SubIndex"] + df["PM10_SubIndex"] <= 0, "AQI_calculated"] = np.NaN
df.loc[df.Checks < 3, "AQI_calculated"] = np.NaN

df["AQI_bucket_calculated"] = df["AQI_calculated"].apply(lambda x: get_AQI_bucket(x))
df[~df.AQI_calculated.isna()].head(13)


Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,PM2.5_SubIndex,PM10_SubIndex,SO2_SubIndex,NOx_SubIndex,CO_SubIndex,O3_SubIndex,Checks,AQI_calculated,AQI_bucket_calculated
1380,2020-06-01,95.0,46.0,4.0,10.0,19.0,4.0,216.666667,46.0,23.75,12.5,125.0,4.0,6,217.0,Poor
1381,2020-06-02,96.069204,50.811418,4.534602,8.930796,21.138408,4.0,220.23068,50.811418,26.42301,11.163495,125.0,4.534602,6,220.0,Poor
1382,2020-06-03,109.358916,55.829123,3.570292,8.957125,19.183458,4.0,264.529719,55.829123,23.979322,11.196407,125.0,3.570292,6,265.0,Poor
1383,2020-06-04,117.765155,54.056363,3.396854,9.274286,17.91118,4.0,292.550518,54.056363,22.388975,11.592857,125.0,3.396854,6,293.0,Poor
1384,2020-06-05,114.978069,56.112962,3.553008,8.944374,15.604085,3.741101,283.260229,56.112962,19.505107,11.180468,121.763764,3.553008,6,283.0,Poor
1385,2020-06-06,108.794001,60.213131,2.967793,9.186351,16.611742,3.800447,262.646671,60.213131,20.764678,11.482939,122.505593,2.967793,6,263.0,Poor
1386,2020-06-07,106.127355,58.501273,3.182935,8.522222,17.526384,3.84204,253.75785,58.501273,21.90798,10.652777,123.025501,3.182935,6,254.0,Poor
1387,2020-06-08,109.193516,59.370304,3.533942,8.22817,16.265665,3.872554,263.978386,59.370304,20.332082,10.285213,123.40692,3.533942,6,264.0,Poor
1388,2020-06-09,116.603993,59.666258,3.98178,8.005134,15.854219,3.895698,288.679976,59.666258,19.817774,10.006417,123.696224,3.98178,6,289.0,Poor
1389,2020-06-10,122.885916,60.586858,4.502722,7.659049,15.016385,4.0863,302.219936,60.586858,18.770481,9.573812,126.078745,4.502722,6,302.0,Very Poor


In [39]:
df.columns

Index(['date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'PM2.5_SubIndex',
       'PM10_SubIndex', 'SO2_SubIndex', 'NOx_SubIndex', 'CO_SubIndex',
       'O3_SubIndex', 'Checks', 'AQI_calculated', 'AQI_bucket_calculated'],
      dtype='object')

In [40]:
df.drop(['PM2.5_SubIndex',
       'PM10_SubIndex', 'SO2_SubIndex', 'NOx_SubIndex', 'CO_SubIndex',
       'O3_SubIndex', 'Checks','AQI_bucket_calculated'],axis=1,inplace =True)

In [41]:
df.columns

Index(['date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'AQI_calculated'], dtype='object')

In [42]:
df.head(10)

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,AQI_calculated
1380,2020-06-01,95.0,46.0,4.0,10.0,19.0,4.0,217.0
1381,2020-06-02,96.069204,50.811418,4.534602,8.930796,21.138408,4.0,220.0
1382,2020-06-03,109.358916,55.829123,3.570292,8.957125,19.183458,4.0,265.0
1383,2020-06-04,117.765155,54.056363,3.396854,9.274286,17.91118,4.0,293.0
1384,2020-06-05,114.978069,56.112962,3.553008,8.944374,15.604085,3.741101,283.0
1385,2020-06-06,108.794001,60.213131,2.967793,9.186351,16.611742,3.800447,263.0
1386,2020-06-07,106.127355,58.501273,3.182935,8.522222,17.526384,3.84204,254.0
1387,2020-06-08,109.193516,59.370304,3.533942,8.22817,16.265665,3.872554,264.0
1388,2020-06-09,116.603993,59.666258,3.98178,8.005134,15.854219,3.895698,289.0
1389,2020-06-10,122.885916,60.586858,4.502722,7.659049,15.016385,4.0863,302.0


In [21]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

df.head()

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,AQI_calculated,year,month,day
1019,2021-06-01,106.0,74.0,14.0,30.0,4.0,7.0,253.0,2021,6,1
1020,2021-06-02,122.572661,78.811418,22.019029,30.534602,4.534602,7.534602,302.0,2021,6,2
1021,2021-06-03,137.192662,80.02454,22.392248,30.331208,4.711667,7.331208,313.0,2021,6,3
1022,2021-06-04,140.175281,78.800592,21.056472,30.230481,4.495234,7.230481,316.0,2021,6,4
1023,2021-06-05,138.835407,77.816623,20.265155,29.653012,4.625917,7.17081,314.0,2021,6,5


In [22]:
df.drop('date',axis=1, inplace=True)

In [23]:
df.head()

Unnamed: 0,pm25,pm10,o3,no2,so2,co,AQI_calculated,year,month,day
1019,106.0,74.0,14.0,30.0,4.0,7.0,253.0,2021,6,1
1020,122.572661,78.811418,22.019029,30.534602,4.534602,7.534602,302.0,2021,6,2
1021,137.192662,80.02454,22.392248,30.331208,4.711667,7.331208,313.0,2021,6,3
1022,140.175281,78.800592,21.056472,30.230481,4.495234,7.230481,316.0,2021,6,4
1023,138.835407,77.816623,20.265155,29.653012,4.625917,7.17081,314.0,2021,6,5


In [231]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
df[['pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'AQI_calculated']] = scaler.fit_transform(df[['pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'AQI_calculated']])

In [43]:
df.head()

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,AQI_calculated
1380,2020-06-01,95.0,46.0,4.0,10.0,19.0,4.0,217.0
1381,2020-06-02,96.069204,50.811418,4.534602,8.930796,21.138408,4.0,220.0
1382,2020-06-03,109.358916,55.829123,3.570292,8.957125,19.183458,4.0,265.0
1383,2020-06-04,117.765155,54.056363,3.396854,9.274286,17.91118,4.0,293.0
1384,2020-06-05,114.978069,56.112962,3.553008,8.944374,15.604085,3.741101,283.0


In [44]:
df.to_csv('AQI.csv')

In [52]:
first_row = df.iloc[0]

In [55]:
first_row

date              2020-06-01 00:00:00
pm25                             95.0
pm10                             46.0
o3                                4.0
no2                              10.0
so2                              19.0
co                                4.0
AQI_calculated                  217.0
Name: 1380, dtype: object