In [60]:
import numpy as np
import pandas as pd

In [61]:
# Load the raw pollutant readings; `date` is left as a plain string column here.
df = pd.read_csv('delhi_aqi.csv')

In [62]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,date,co,no,no2,o3,so2,pm2_5,pm10,nh3
0,2020-11-25 01:00:00,2616.88,2.18,70.6,13.59,38.62,364.61,411.73,28.63
1,2020-11-25 02:00:00,3631.59,23.25,89.11,0.33,54.36,420.96,486.21,41.04
2,2020-11-25 03:00:00,4539.49,52.75,100.08,1.11,68.67,463.68,541.95,49.14
3,2020-11-25 04:00:00,4539.49,50.96,111.04,6.44,78.2,454.81,534.0,48.13
4,2020-11-25 05:00:00,4379.27,42.92,117.9,17.17,87.74,448.14,529.19,46.61


In [63]:
# (rows, columns) of the loaded frame.
df.shape

(18776, 9)

In [64]:
# Number of missing values in each column.
df.isnull().sum()

date     0
co       0
no       0
no2      0
o3       0
so2      0
pm2_5    0
pm10     0
nh3      0
dtype: int64

In [65]:
# Per-column dtype and non-null count (shows whether `date` was parsed or kept as object).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18776 entries, 0 to 18775
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    18776 non-null  object 
 1   co      18776 non-null  float64
 2   no      18776 non-null  float64
 3   no2     18776 non-null  float64
 4   o3      18776 non-null  float64
 5   so2     18776 non-null  float64
 6   pm2_5   18776 non-null  float64
 7   pm10    18776 non-null  float64
 8   nh3     18776 non-null  float64
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


In [66]:
# Scale CO down by 1000 so it sits on the same scale as the 1/2/10/17/34
# breakpoints in get_CO_subindex (presumably µg/m³ -> mg/m³ — confirm units).
# NOTE: the column is overwritten in place, so re-running this cell divides again.
df['co'] = df['co'] / 1000

In [67]:
# Trailing 24-row means; a window needs at least 16 readings, otherwise the mean is NaN.
# NOTE(review): windows count rows, so "24hr" assumes one row per hour with no gaps — confirm.
daily_inputs = {
    "PM10_24hr_avg": df["pm10"],
    "PM2.5_24hr_avg": df["pm2_5"],
    "SO2_24hr_avg": df["so2"],
    "NOx_24hr_avg": df["no"] + df["no2"],  # NOx is taken as NO + NO2
    "NH3_24hr_avg": df["nh3"],
}
for avg_col, readings in daily_inputs.items():
    df[avg_col] = readings.rolling(window=24, min_periods=16).mean()

# Trailing 8-row maxima for CO and O3; a single reading in the window is enough.
for max_col, raw_col in (("CO_8hr_max", "co"), ("O3_8hr_max", "o3")):
    df[max_col] = df[raw_col].rolling(window=8, min_periods=1).max()

In [68]:
def get_PM25_subindex(x):
    """Convert a PM2.5 concentration into its AQI sub-index.

    The mapping is piecewise linear through the breakpoints
    (concentration -> sub-index) 30 -> 50, 60 -> 100, 90 -> 200,
    120 -> 300 and 250 -> 400; above 250 the last slope is continued.

    Returns 0 when ``x`` fails every comparison (e.g. NaN).
    """
    if x <= 30:
        return x * 50 / 30
    if x <= 60:
        return 50 + (x - 30) * 50 / 30
    if x <= 90:
        return 100 + (x - 60) * 100 / 30
    if x <= 120:
        return 200 + (x - 90) * 100 / 30
    if x <= 250:
        return 300 + (x - 120) * 100 / 130
    if x > 250:
        return 400 + (x - 250) * 100 / 130
    # Only reachable when every comparison above is false, i.e. x is NaN.
    return 0

# Sub-index from the 24-hour PM2.5 mean; rows whose mean is NaN get 0.
df["PM2.5_SubIndex"] = df["PM2.5_24hr_avg"].apply(get_PM25_subindex)

In [69]:
def get_PM10_subindex(x):
    """Convert a PM10 concentration into its AQI sub-index.

    Up to 100 the sub-index equals the concentration itself. Beyond that the
    mapping is piecewise linear through (concentration -> sub-index)
    250 -> 200, 350 -> 300 and 430 -> 400; above 430 the last slope is continued.

    Returns 0 when ``x`` fails every comparison (e.g. NaN).
    """
    # The 0-50 and 50-100 bands both map the concentration to itself.
    if x <= 100:
        return x
    if x <= 250:
        return 100 + (x - 100) * 100 / 150
    if x <= 350:
        return 200 + (x - 250)
    if x <= 430:
        return 300 + (x - 350) * 100 / 80
    if x > 430:
        return 400 + (x - 430) * 100 / 80
    # Only reachable when every comparison above is false, i.e. x is NaN.
    return 0

# Sub-index from the 24-hour PM10 mean; rows whose mean is NaN get 0.
df["PM10_SubIndex"] = df["PM10_24hr_avg"].apply(get_PM10_subindex)

In [70]:
def get_NOx_subindex(x):
    """Convert a NOx concentration into its AQI sub-index.

    The mapping is piecewise linear through the breakpoints
    (concentration -> sub-index) 40 -> 50, 80 -> 100, 180 -> 200,
    280 -> 300 and 400 -> 400; above 400 the last slope is continued.

    Returns 0 when ``x`` fails every comparison (e.g. NaN).
    """
    if x <= 40:
        return x * 50 / 40
    if x <= 80:
        return 50 + (x - 40) * 50 / 40
    if x <= 180:
        return 100 + (x - 80) * 100 / 100
    if x <= 280:
        return 200 + (x - 180) * 100 / 100
    if x <= 400:
        return 300 + (x - 280) * 100 / 120
    if x > 400:
        return 400 + (x - 400) * 100 / 120
    # Only reachable when every comparison above is false, i.e. x is NaN.
    return 0

# Sub-index from the 24-hour NOx mean; rows whose mean is NaN get 0.
df["NOx_SubIndex"] = df["NOx_24hr_avg"].apply(get_NOx_subindex)

In [71]:
def get_SO2_subindex(x):
    """Convert an SO2 concentration into its AQI sub-index.

    The mapping is piecewise linear through the breakpoints
    (concentration -> sub-index) 40 -> 50, 80 -> 100, 380 -> 200,
    800 -> 300 and 1600 -> 400; above 1600 the last slope is continued.

    Returns 0 when ``x`` fails every comparison (e.g. NaN).
    """
    if x <= 40:
        return x * 50 / 40
    if x <= 80:
        return 50 + (x - 40) * 50 / 40
    if x <= 380:
        return 100 + (x - 80) * 100 / 300
    if x <= 800:
        return 200 + (x - 380) * 100 / 420
    if x <= 1600:
        return 300 + (x - 800) * 100 / 800
    if x > 1600:
        return 400 + (x - 1600) * 100 / 800
    # Only reachable when every comparison above is false, i.e. x is NaN.
    return 0

# Sub-index from the 24-hour SO2 mean; rows whose mean is NaN get 0.
df["SO2_SubIndex"] = df["SO2_24hr_avg"].apply(get_SO2_subindex)

In [72]:
def get_NH3_subindex(x):
    """Convert an NH3 concentration into its AQI sub-index.

    The mapping is piecewise linear through the breakpoints
    (concentration -> sub-index) 200 -> 50, 400 -> 100, 800 -> 200,
    1200 -> 300 and 1800 -> 400; above 1800 the last slope is continued.

    Returns 0 when ``x`` fails every comparison (e.g. NaN).
    """
    if x <= 200:
        return x * 50 / 200
    if x <= 400:
        return 50 + (x - 200) * 50 / 200
    if x <= 800:
        return 100 + (x - 400) * 100 / 400
    if x <= 1200:
        return 200 + (x - 800) * 100 / 400
    if x <= 1800:
        return 300 + (x - 1200) * 100 / 600
    if x > 1800:
        return 400 + (x - 1800) * 100 / 600
    # Only reachable when every comparison above is false, i.e. x is NaN.
    return 0

# Sub-index from the 24-hour NH3 mean; rows whose mean is NaN get 0.
df["NH3_SubIndex"] = df["NH3_24hr_avg"].apply(get_NH3_subindex)

In [73]:
def get_CO_subindex(x):
    """Convert a CO concentration into its AQI sub-index.

    The mapping is piecewise linear through the breakpoints
    (concentration -> sub-index) 1 -> 50, 2 -> 100, 10 -> 200,
    17 -> 300 and 34 -> 400; above 34 the last slope is continued.

    Returns 0 when ``x`` fails every comparison (e.g. NaN).
    """
    if x <= 1:
        return x * 50 / 1
    if x <= 2:
        return 50 + (x - 1) * 50 / 1
    if x <= 10:
        return 100 + (x - 2) * 100 / 8
    if x <= 17:
        return 200 + (x - 10) * 100 / 7
    if x <= 34:
        return 300 + (x - 17) * 100 / 17
    if x > 34:
        return 400 + (x - 34) * 100 / 17
    # Only reachable when every comparison above is false, i.e. x is NaN.
    return 0

# Sub-index from the 8-hour CO maximum; rows whose maximum is NaN get 0.
df["CO_SubIndex"] = df["CO_8hr_max"].apply(get_CO_subindex)

In [74]:
def get_O3_subindex(x):
    """Convert an O3 concentration into its AQI sub-index.

    The mapping is piecewise linear through the breakpoints
    (concentration -> sub-index) 50 -> 50, 100 -> 100, 168 -> 200,
    208 -> 300 and 748 -> 400; above 748 the last slope is continued.

    Returns 0 when ``x`` fails every comparison (e.g. NaN).
    """
    if x <= 50:
        return x * 50 / 50
    elif x <= 100:
        return 50 + (x - 50) * 50 / 50
    elif x <= 168:
        return 100 + (x - 100) * 100 / 68
    elif x <= 208:
        return 200 + (x - 168) * 100 / 40
    elif x <= 748:
        # Band width is 748 - 208 = 540, so x == 748 lands exactly on 400.
        return 300 + (x - 208) * 100 / 540
    elif x > 748:
        # Extrapolate from the 748 breakpoint so the curve stays continuous there
        # (measuring the excess from 400 made the sub-index jump at 748).
        return 400 + (x - 748) * 100 / 540
    else:
        # Only reachable when every comparison above is false, i.e. x is NaN.
        return 0

# Sub-index from the 8-hour O3 maximum; rows whose maximum is NaN get 0.
df["O3_SubIndex"] = df["O3_8hr_max"].apply(get_O3_subindex)

In [75]:
def get_AQI_bucket(x):
    """Return the category label for an AQI value.

    Upper bounds are inclusive: <=50 "Good", <=100 "Satisfactory",
    <=200 "Moderate", <=300 "Poor", <=400 "Very Poor", anything larger "Severe".

    Returns NaN when ``x`` fails every comparison (e.g. the AQI itself is NaN).
    """
    if x <= 50:
        return "Good"
    elif x <= 100:
        return "Satisfactory"
    elif x <= 200:
        return "Moderate"
    elif x <= 300:
        return "Poor"
    elif x <= 400:
        return "Very Poor"
    elif x > 400:
        return "Severe"
    else:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        # and raises AttributeError there.
        return np.nan

# Count, per row, how many of the seven pollutant sub-indices are positive.
# (The get_*_subindex helpers return 0 when their input is NaN.)
df["Checks"] = (df["PM2.5_SubIndex"] > 0).astype(int) + \
                (df["PM10_SubIndex"] > 0).astype(int) + \
                (df["SO2_SubIndex"] > 0).astype(int) + \
                (df["NOx_SubIndex"] > 0).astype(int) + \
                (df["NH3_SubIndex"] > 0).astype(int) + \
                (df["CO_SubIndex"] > 0).astype(int) + \
                (df["O3_SubIndex"] > 0).astype(int)

# The AQI is the largest sub-index in the row, rounded to the nearest integer.
df["AQI_calculated"] = df[["PM2.5_SubIndex", "PM10_SubIndex", "SO2_SubIndex", "NOx_SubIndex",
                           "NH3_SubIndex", "CO_SubIndex", "O3_SubIndex"]].max(axis=1).round()

# Blank the AQI when the PM2.5 and PM10 sub-indices sum to zero or less
# (neither is positive), or when fewer than three sub-indices are positive.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.loc[df["PM2.5_SubIndex"] + df["PM10_SubIndex"] <= 0, "AQI_calculated"] = np.nan
df.loc[df["Checks"] < 3, "AQI_calculated"] = np.nan

df["AQI_bucket_calculated"] = df["AQI_calculated"].apply(get_AQI_bucket)
df[df["AQI_calculated"].notna()].head(13)

Unnamed: 0,date,co,no,no2,o3,so2,pm2_5,pm10,nh3,PM10_24hr_avg,...,PM2.5_SubIndex,PM10_SubIndex,NOx_SubIndex,SO2_SubIndex,NH3_SubIndex,CO_SubIndex,O3_SubIndex,Checks,AQI_calculated,AQI_bucket_calculated
15,2020-11-25 16:00:00,4.05884,19.67,98.71,0.08,59.61,387.52,490.96,42.05,413.994375,...,479.652885,379.992969,142.235625,107.546667,8.071719,125.7355,106.514706,7,480.0,Severe
16,2020-11-25 17:00:00,3.89862,18.11,91.85,0.11,52.93,394.51,498.87,31.92,418.987059,...,481.506335,386.233824,141.513529,106.571961,8.066324,125.7355,106.514706,7,482.0,Severe
17,2020-11-25 18:00:00,3.41797,13.19,83.63,0.31,42.92,369.0,461.24,20.52,421.334444,...,482.063675,389.168056,140.141667,105.520185,7.903194,125.7355,86.55,7,482.0,Severe
18,2020-11-25 19:00:00,2.8038,7.15,74.71,0.89,33.85,327.49,400.17,12.16,420.220526,...,480.881781,387.775658,138.126842,104.42,7.647237,125.7355,50.78,7,481.0,Severe
19,2020-11-25 20:00:00,2.45667,4.75,65.8,1.4,30.99,303.81,365.32,9.63,417.4755,...,478.907308,384.344375,135.748,103.382167,7.38525,125.7355,19.67,7,479.0,Severe
20,2020-11-25 21:00:00,2.26975,4.25,58.26,1.39,30.52,285.16,340.62,8.74,413.815714,...,476.437729,379.769643,133.212857,102.435714,7.137619,125.7355,3.0,7,476.0,Severe
21,2020-11-25 22:00:00,2.16293,5.42,53.47,0.83,30.99,272.18,320.68,7.92,409.582273,...,473.738811,374.477841,130.743636,101.582424,6.903182,125.7355,1.4,7,474.0,Severe
22,2020-11-25 23:00:00,2.13623,8.16,49.35,0.31,31.47,264.01,305.06,7.09,405.037826,...,471.001338,368.797283,128.42913,100.81029,6.680109,125.7355,1.4,7,471.0,Severe
23,2020-11-26 00:00:00,2.18964,13.19,46.61,0.07,33.38,260.05,296.45,7.28,400.513333,...,468.365064,363.141667,126.402917,100.129028,6.477604,123.73275,1.4,7,468.0,Severe
24,2020-11-26 01:00:00,2.48337,23.92,45.93,0.01,39.1,268.08,305.4,10.26,396.082917,...,465.271154,357.603646,126.280833,100.135694,6.28625,117.724625,1.4,7,465.0,Severe


In [76]:
# How many rows fall into each AQI category, ignoring rows without a computed AQI.
df.loc[df["AQI_calculated"].notna(), "AQI_bucket_calculated"].value_counts()

AQI_bucket_calculated
Severe          6773
Very Poor       6741
Poor            2412
Moderate        2071
Satisfactory     759
Good               5
Name: count, dtype: int64

In [77]:
# Keep only the timestamp and the computed AQI for export.
df1 = df[['date', 'AQI_calculated']]


In [83]:

# Discard rows without a computed AQI and renumber the remaining rows from 0.
df1 = df1.dropna(subset=['AQI_calculated']).reset_index(drop=True)

In [84]:
# Rows remaining after dropping NaN AQI values (e.g. the leading rows that lack
# the 16 readings required by the 24-row rolling means).
df1.shape

(18761, 2)

In [80]:
# Write the (date, AQI_calculated) table to disk without the integer index column.
df1.to_csv('final_dataset.csv', index=False)