<h1><b>Data Collation</b></h1>

# Import Libraries

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys
from pathlib import Path

src_path = Path().resolve().parents[1] / "src"
sys.path.append(str(src_path))

from tools.CONFIG import CONFIG

import warnings
warnings.filterwarnings(action='ignore')

# Define Datapoints

In [20]:
try:
    ISPU_df = pd.read_csv(
        CONFIG['DATAPOINT']['ISPU'],
        parse_dates=['tanggal']
    )

    cuaca_harian_df = pd.read_csv(
        CONFIG['DATAPOINT']['cuaca-harian'],
        parse_dates=['time']
    )

    jumlah_penduduk_df = pd.read_csv(
        CONFIG['DATAPOINT']['jumlah-penduduk']
    )

    kualitas_air_sungai_df = pd.read_csv(
        CONFIG['DATAPOINT']['kualitas-air-sungai']
    )

    libur_nasional_df = pd.read_csv(
        CONFIG['DATAPOINT']['libur-nasional'],
        parse_dates=['tanggal']
    )

    NDVI_df = pd.read_csv(
        CONFIG['DATAPOINT']['NDVI'],
        parse_dates=['tanggal']
    )

except FileNotFoundError as e:
    raise RuntimeError(f"❌ Error loading data: {e}")


# Collating Features

In [21]:
station_map = {
    "DKI1 (Bunderan HI)": "DKI1",
    "DKI2 (Kelapa Gading)": "DKI2",
    "DKI3 (Jagakarsa)": "DKI3",
    "DKI4 (Lubang Buaya)": "DKI4",
    "DKI5 (Kebon Jeruk)": "DKI5",
}

ISPU_df["stasiun_id"] = ISPU_df["stasiun"].map(station_map)

In [22]:
collated_df.head()

Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,stasiun_id,is_holiday_nasional,is_weekend,temperature_2m_mean,relative_humidity_2m_mean,wind_speed_10m_mean,precipitation_sum
0,201001,2010-01-01,DKI5 (Kebon Jeruk),,,,,,,0.0,,TIDAK ADA DATA,,1,0,26.48,82.2,10.54,4.48
1,201001,2010-01-01,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,CO,SEDANG,,1,0,26.48,82.2,10.54,4.48
2,201001,2010-01-01,DKI4 (Lubang Buaya),,,,,,,0.0,,TIDAK ADA DATA,,1,0,26.48,82.2,10.54,4.48
3,201001,2010-01-01,DKI2 (Kelapa Gading),,,,,,,0.0,,TIDAK ADA DATA,,1,0,26.48,82.2,10.54,4.48
4,201001,2010-01-01,DKI3 (Jagakarsa),,,,,,,0.0,,TIDAK ADA DATA,,1,0,26.48,82.2,10.54,4.48


In [23]:
print(ISPU_df.columns.to_list())
print(len(ISPU_df.columns.to_list()))

['periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori', 'stasiun_id']
13


# Merge Libur Nasional

In [24]:
ISPU_df = ISPU_df.merge(
    libur_nasional_df[["tanggal", "is_holiday_nasional", "is_weekend"]],
    on="tanggal",
    how="left"
)

ISPU_df[["is_holiday_nasional", "is_weekend"]] = (
    ISPU_df[["is_holiday_nasional", "is_weekend"]]
    .fillna(0)
    .astype("int8")
)


# Prepare Cuaca

In [25]:
cuaca_harian_df = cuaca_harian_df.rename(columns={
    "time": "tanggal",
    "temperature_2m_mean (°C)": "temperature_2m_mean",
    "relative_humidity_2m_mean (%)": "relative_humidity_2m_mean",
    "wind_speed_10m_mean (km/h)": "wind_speed_10m_mean",
    "precipitation_sum (mm)": "precipitation_sum"
})

cuaca_cols = [
    "tanggal",
    "temperature_2m_mean",
    "relative_humidity_2m_mean",
    "wind_speed_10m_mean",
    "precipitation_sum"
]

cuaca_harian_df = cuaca_harian_df[cuaca_cols]

cuaca_harian_df = (
    cuaca_harian_df
    .groupby("tanggal", as_index=False)
    .mean()
)


# Merge Data

In [26]:
collated_df = ISPU_df.merge(
    cuaca_harian_df,
    on="tanggal",
    how="left"
)


In [27]:
print(collated_df.isna().sum().sort_values(ascending=False))
print(collated_df.shape)


stasiun_id                   5799
pm_duakomalima               4398
pm_sepuluh                   1842
nitrogen_dioksida            1533
ozon                         1469
sulfur_dioksida              1463
karbon_monoksida             1457
parameter_pencemar_kritis    1408
max                            10
is_weekend                      0
wind_speed_10m_mean             0
relative_humidity_2m_mean       0
temperature_2m_mean             0
periode_data                    0
is_holiday_nasional             0
kategori                        0
tanggal                         0
stasiun                         0
precipitation_sum               0
dtype: int64
(8920, 19)


In [34]:
collated_df.to_csv(
    "data_collation_reset_v3.csv",
    index=False
)
