Feature Engineering

In [4]:

from pathlib import Path

import pandas as pd
# Read the raw AQI table and turn the 'Date' strings into datetimes.
aqi_data = (
    pd.read_csv('../dataset/data_date.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# Country-then-date ordered copy, printed for a quick look at the data.
sorted_aqi_by_country = aqi_data.sort_values(by=['Country', 'Date'])
print(sorted_aqi_by_country)

            Date  Country                          Status  AQI Value
0     2022-07-21  Albania                            Good         14
142   2022-07-21  Albania                            Good         17
284   2022-07-22  Albania                            Good         15
425   2022-07-22  Albania                            Good         15
566   2022-07-22  Albania                            Good         20
...          ...      ...                             ...        ...
14552 2024-08-15   Zambia                       Unhealthy        174
14681 2024-08-22   Zambia  Unhealthy for Sensitive Groups        129
14809 2024-08-29   Zambia                       Unhealthy        153
15194 2024-09-19   Zambia                       Unhealthy        170
15568 2024-10-10   Zambia  Unhealthy for Sensitive Groups        138

[19737 rows x 4 columns]


In [6]:
# Calendar features derived from the observation date.
# dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday.
aqi_data = aqi_data.assign(
    dayofweek=lambda frame: frame['Date'].dt.dayofweek,
    month=lambda frame: frame['Date'].dt.month,
    is_weekend=lambda frame: frame['dayofweek'].ge(5),
)

print(aqi_data)


            Date                                            Country  \
0     2022-07-21                                            Albania   
1     2022-07-21                                            Algeria   
2     2022-07-21                                            Andorra   
3     2022-07-21                                             Angola   
4     2022-07-21                                          Argentina   
...          ...                                                ...   
19732 2025-06-19  United Kingdom of Great Britain and Northern I...   
19733 2025-06-19                           United States of America   
19734 2025-06-19                                         Uzbekistan   
19735 2025-06-19                                            Vatican   
19736 2025-06-19                                            Vietnam   

                               Status  AQI Value  dayofweek  month  is_weekend  
0                                Good         14          3      7

In [8]:
# lag features (for memory of past AQI values)
# shift() looks back by row position inside each country, so the rows are put
# in date order first instead of trusting the row order of the CSV file.
# Assigning the result back aligns on the index labels, which leaves the row
# order of aqi_data itself untouched.
# NOTE: a lag of k means "k observations earlier", not "k days earlier" - the
# data has several readings on some dates and gaps between others.
aqi_in_date_order = aqi_data.sort_values(['Country', 'Date']).groupby('Country')['AQI Value']
aqi_data['AQI_lag1'] = aqi_in_date_order.shift(1)
aqi_data['AQI_lag3'] = aqi_in_date_order.shift(3)
aqi_data['AQI_lag7'] = aqi_in_date_order.shift(7)

In [9]:
# rolling stats to capture any short-term trends
# The windows run over each country's rows in date order; grouping a sorted
# view makes that explicit instead of relying on the row order of the CSV.
# shift(1) excludes the current row from its own window, so each statistic is
# built only from earlier observations. Results are assigned back by index
# label, which leaves the row order of aqi_data unchanged.
aqi_history_by_country = aqi_data.sort_values(['Country', 'Date']).groupby('Country')['AQI Value']
aqi_data['AQI_roll3'] = aqi_history_by_country.transform(lambda x: x.shift(1).rolling(3).mean())
aqi_data['AQI_roll7'] = aqi_history_by_country.transform(lambda x: x.shift(1).rolling(7).mean())
aqi_data['AQI_std3'] = aqi_history_by_country.transform(lambda x: x.shift(1).rolling(3).std())

In [10]:
# One-hot encode 'Country': the original column is replaced by indicator
# columns named 'Country_<value>', appended after the remaining columns.
# drop_first=True leaves out the indicator for the first category.
aqi_data = pd.concat(
    [
        aqi_data.drop(columns=['Country']),
        pd.get_dummies(aqi_data['Country'], prefix='Country', drop_first=True),
    ],
    axis=1,
)

In [11]:
# Keep only the rows in which every column has a value. The lag and rolling
# features are NaN where a country has too little history, so those rows go.
aqi_data = aqi_data.loc[aqi_data.notna().all(axis=1)]

In [13]:
# save derived data
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
features_path = Path('../dataset/processed/aqi_features.csv')
features_path.parent.mkdir(parents=True, exist_ok=True)
aqi_data.to_csv(features_path, index=False)