In [2]:
# ---------------------------
# STEP 1: IMPORT LIBRARIES
# ---------------------------
import pandas as pd
import numpy as np
import os

# ---------------------------
# STEP 2: LOAD TRAINING DATA
# ---------------------------
# Location of the raw training CSV, relative to the notebook's working directory.
train_path = "../data/raw/training/concatenated_dataset_Aug_2021_to_July_2024.csv"

# Read the entire file into a DataFrame using pandas' default parsing options.
df_train = pd.read_csv(filepath_or_buffer=train_path)

# Confirm the load and report (rows, columns) before previewing the first rows.
print("Training Data Loaded!")
print("Shape:", df_train.shape)
df_train.head(n=5)

Training Data Loaded!
Shape: (123134, 18)


Unnamed: 0,datetime,main_aqi,components_co,components_no,components_no2,components_o3,components_so2,components_pm2_5,components_pm10,components_nh3,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,surface_pressure,wind_speed_10m,wind_direction_10m,shortwave_radiation
0,8/24/2021 0:00,5,1014.71,0.0,17.14,35.76,6.44,59.59,66.81,5.95,28.1,80.0,24.3,0.0,976.6,6.2,170.0,0
1,8/24/2021 1:00,5,1054.76,0.01,19.36,32.19,6.68,66.96,74.88,6.08,27.8,83.0,24.7,0.0,976.4,6.5,180.0,0
2,8/24/2021 2:00,5,1295.09,1.4,30.16,22.89,8.7,79.88,89.9,7.73,27.4,87.0,25.0,0.0,976.2,5.4,180.0,0
3,8/24/2021 3:00,5,1682.28,9.16,41.13,28.25,12.99,98.91,111.25,9.75,27.1,89.0,25.2,0.0,976.2,5.8,184.0,0
4,8/24/2021 4:00,5,1321.79,4.81,32.56,91.55,27.9,89.59,97.95,7.28,26.9,91.0,25.2,0.0,976.1,6.1,183.0,0


In [3]:
# Per-column count of missing entries (isnull is the pandas alias of isna).
print("\nMissing Values Per Column:")
print(df_train.isnull().sum())

# Number of rows that exactly repeat an earlier row; the first occurrence
# of each repeated row is not counted.
duplicate_count = df_train.duplicated(keep='first').sum()
print("\nTotal Duplicates:", duplicate_count)


Missing Values Per Column:
datetime                0
main_aqi                0
components_co           0
components_no           0
components_no2          0
components_o3           0
components_so2          0
components_pm2_5        0
components_pm10         0
components_nh3          0
temperature_2m          0
relative_humidity_2m    0
dew_point_2m            0
precipitation           0
surface_pressure        0
wind_speed_10m          0
wind_direction_10m      0
shortwave_radiation     0
dtype: int64

Total Duplicates: 0


In [4]:
# Convert datetime column.
# errors='coerce' replaces every value that cannot be parsed with NaT instead
# of raising, so a bad timestamp would otherwise slip through unnoticed.
df_train['datetime'] = pd.to_datetime(df_train['datetime'], errors='coerce')

# Count the NaT entries left after conversion. These are either values that
# failed to parse or values that were already missing in the column.
unparsed_count = int(df_train['datetime'].isna().sum())

# Check conversion: only claim success when nothing was coerced to NaT.
if unparsed_count:
    print(f"\nWARNING: {unparsed_count} datetime value(s) are NaT after conversion "
          "(unparseable or missing). Downstream time features will be missing for these rows.")
    print("\nDatetime Conversion Completed With Errors. Sample:")
else:
    print("\nDatetime Conversion Successful. Sample:")
df_train['datetime'].head()


Datetime Conversion Successful. Sample:


0   2021-08-24 00:00:00
1   2021-08-24 01:00:00
2   2021-08-24 02:00:00
3   2021-08-24 03:00:00
4   2021-08-24 04:00:00
Name: datetime, dtype: datetime64[ns]

In [5]:
# Derive calendar features from the parsed 'datetime' column.
# The .dt accessor is looked up once and reused for every component.
datetime_parts = df_train['datetime'].dt

df_train['year'] = datetime_parts.year
df_train['month'] = datetime_parts.month
df_train['day'] = datetime_parts.day
df_train['hour'] = datetime_parts.hour
# weekday is the pandas alias of dayofweek (Monday=0 ... Sunday=6).
df_train['day_of_week'] = datetime_parts.weekday

# Flag Saturday (5) and Sunday (6) as 1, every other value as 0.
weekend_mask = df_train['day_of_week'].isin([5, 6])
df_train['is_weekend'] = weekend_mask.astype(int)

# Optional: seasons (good for AQI patterns)
def get_season(month):
    """Map a calendar month number to a season label.

    Parameters
    ----------
    month : int or float
        Month number, expected in the range 1-12. Float values that compare
        equal to an integer month (e.g. 8.0) are accepted.

    Returns
    -------
    str or None
        "winter" for 12, 1, 2; "spring" for 3, 4, 5; "summer" for 6, 7, 8;
        "autumn" for 9, 10, 11. None for any other input (NaN, 0, 13, ...),
        so invalid or missing months are not silently labelled "autumn".
    """
    if month in (12, 1, 2):
        return "winter"
    elif month in (3, 4, 5):
        return "spring"
    elif month in (6, 7, 8):
        return "summer"
    elif month in (9, 10, 11):
        return "autumn"
    # Not a valid month (e.g. NaN produced from a NaT timestamp): report it
    # as missing rather than guessing a season.
    return None

# Label every row with its season by passing each month value through get_season.
df_train['season'] = df_train['month'].map(get_season)

In [7]:
# Output folder (created if it does not exist yet).
output_path = "../data/cleaned/"
os.makedirs(output_path, exist_ok=True)

# Build the file path from output_path so the CSV always lands in the
# directory created above, even if output_path is changed later.
save_path = os.path.join(output_path, "concatenated_training_cleaned.csv")

# index=False keeps the DataFrame's row index out of the CSV.
df_train.to_csv(save_path, index=False)

print("\nTraining data cleaned & saved at:")
print(save_path)


Training data cleaned & saved at:
../data/cleaned/concatenated_training_cleaned.csv


In [8]:
# Preview the first five rows, now including the derived time features.
df_train.head(n=5)

Unnamed: 0,datetime,main_aqi,components_co,components_no,components_no2,components_o3,components_so2,components_pm2_5,components_pm10,components_nh3,...,wind_speed_10m,wind_direction_10m,shortwave_radiation,year,month,day,hour,day_of_week,is_weekend,season
0,2021-08-24 00:00:00,5,1014.71,0.0,17.14,35.76,6.44,59.59,66.81,5.95,...,6.2,170.0,0,2021,8,24,0,1,0,summer
1,2021-08-24 01:00:00,5,1054.76,0.01,19.36,32.19,6.68,66.96,74.88,6.08,...,6.5,180.0,0,2021,8,24,1,1,0,summer
2,2021-08-24 02:00:00,5,1295.09,1.4,30.16,22.89,8.7,79.88,89.9,7.73,...,5.4,180.0,0,2021,8,24,2,1,0,summer
3,2021-08-24 03:00:00,5,1682.28,9.16,41.13,28.25,12.99,98.91,111.25,9.75,...,5.8,184.0,0,2021,8,24,3,1,0,summer
4,2021-08-24 04:00:00,5,1321.79,4.81,32.56,91.55,27.9,89.59,97.95,7.28,...,6.1,183.0,0,2021,8,24,4,1,0,summer
