# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

import warnings
warnings.filterwarnings("ignore")


# Load Dataset

In [40]:
# Location of the input CSV, kept in one named constant so it is easy to spot
# and change when the notebook is run on another machine.
RAW_CSV_PATH = "/Users/paulinadevinawijaya/Downloads/ARKAVIDIA 10.0 - DSC/data/main_csv/data_collation_reset_v3.csv"

df = pd.read_csv(RAW_CSV_PATH)
df.head()

Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,stasiun_id,is_holiday_nasional,is_weekend,temperature_2m_mean,relative_humidity_2m_mean,wind_speed_10m_mean,precipitation_sum
0,201001,2010-01-01,DKI5 (Kebon Jeruk),,,,,,,0.0,,TIDAK ADA DATA,DKI5,1,0,26.48,82.2,10.54,4.48
1,201001,2010-01-01,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,CO,SEDANG,DKI1,1,0,26.48,82.2,10.54,4.48
2,201001,2010-01-01,DKI4 (Lubang Buaya),,,,,,,0.0,,TIDAK ADA DATA,DKI4,1,0,26.48,82.2,10.54,4.48
3,201001,2010-01-01,DKI2 (Kelapa Gading),,,,,,,0.0,,TIDAK ADA DATA,DKI2,1,0,26.48,82.2,10.54,4.48
4,201001,2010-01-01,DKI3 (Jagakarsa),,,,,,,0.0,,TIDAK ADA DATA,DKI3,1,0,26.48,82.2,10.54,4.48


# Basic Clean

In [41]:
# Parse the date column; values that cannot be parsed become NaT instead of raising.
df["tanggal"] = pd.to_datetime(df["tanggal"], errors="coerce")

# Keep only rows with a valid date, order them by station then date, and keep
# the first row of every (station, date) pair.
# NOTE(review): in the rows displayed below, the month in `periode_data` does
# not always match `tanggal` (e.g. 201105 next to 2011-01-05) - possibly a
# day/month swap upstream; verify against the raw source.
# NOTE(review): `stasiun` appears both as "DKI1" and as "DKI1 (Bunderan HI)" in
# the displayed rows; these are treated as different stations here - confirm.
df = df.loc[df["tanggal"].notna()]
df = df.sort_values(["stasiun", "tanggal"])
df = df.drop_duplicates(subset=["stasiun", "tanggal"])
df = df.reset_index(drop=True)

# Number of missing dates left after the filter above.
print(df["tanggal"].isna().sum())
df.head(10)

0


Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,stasiun_id,is_holiday_nasional,is_weekend,temperature_2m_mean,relative_humidity_2m_mean,wind_speed_10m_mean,precipitation_sum
0,201105,2011-01-05,DKI1,71.0,,18.0,54.0,88.0,31.0,88.0,O3,SEDANG,,0,0,25.36,86.4,5.48,6.82
1,201101,2011-02-01,DKI1,35.0,,15.0,15.0,47.0,11.0,47.0,O3,BAIK,,0,0,25.14,88.6,8.86,16.02
2,201101,2011-03-01,DKI1,41.0,,16.0,35.0,31.0,15.0,41.0,PM10,BAIK,,0,0,24.66,85.6,13.9,6.34
3,201105,2011-03-05,DKI1,69.0,,22.0,31.0,69.0,28.0,69.0,PM10,SEDANG,,1,1,25.34,85.4,9.68,6.4
4,201104,2011-09-04,DKI1,50.0,,21.0,33.0,44.0,14.0,50.0,PM10,BAIK,,0,1,27.32,70.4,6.56,0.0
5,201110,2011-10-10,DKI1,83.0,,16.0,34.0,107.0,19.0,107.0,O3,TIDAK SEHAT,,0,0,26.86,80.2,4.1,3.86
6,201202,2012-02-02,DKI1,54.0,,52.0,34.0,74.0,17.0,74.0,O3,SEDANG,,0,0,25.48,88.0,5.9,22.96
7,201201,2012-03-01,DKI1,65.0,,15.0,42.0,86.0,22.0,86.0,O3,SEDANG,,0,0,25.26,89.4,5.54,17.0
8,201202,2012-03-02,DKI1,55.0,,31.0,44.0,59.0,14.0,59.0,O3,SEDANG,,0,0,26.16,84.2,8.26,6.66
9,201203,2012-03-03,DKI1,56.0,,25.0,51.0,76.0,17.0,76.0,O3,SEDANG,,0,1,26.72,80.6,9.24,2.1


# Basic Time Features

In [42]:
# Calendar features derived from the parsed date column.
df["dayofweek"] = df["tanggal"].dt.weekday  # Monday=0 ... Sunday=6
# Saturday (5) and Sunday (6) count as weekend; this replaces the
# `is_weekend` column that came with the CSV.
df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)
df["month"] = df["tanggal"].dt.month

# Defining the Numeric Core

In [43]:
# Pollutant columns, named as they appear in the input table.
base_pollutants = [
    "pm_sepuluh", "pm_duakomalima", "sulfur_dioksida",
    "karbon_monoksida", "ozon", "nitrogen_dioksida",
]

# Weather columns, named as they appear in the input table.
cuaca_cols = [
    "temperature_2m_mean", "relative_humidity_2m_mean",
    "wind_speed_10m_mean", "precipitation_sum",
]


# Lag Features

In [44]:
# PM2.5 values from 1, 3 and 7 rows earlier within the same station.
# NOTE(review): shift() moves by rows, not by calendar days; the displayed data
# has gaps between dates, so "lag 1" is the previous available observation,
# not necessarily the previous day - confirm this is intended.
pm25_by_station = df.groupby("stasiun")["pm_duakomalima"]
for lag in (1, 3, 7):
    df[f"pm25_lag_{lag}"] = pm25_by_station.shift(lag)

# Rolling Feature (Noise Reduction)

In [45]:
# Mean of the three previous PM2.5 observations of the same station.
# The shift (to exclude the current row) and the rolling window are both
# applied inside transform(), one station at a time. groupby(...).shift()
# returns an ordinary, ungrouped Series, so calling .rolling() on that result
# would let a window mix the last rows of one station with the first rows of
# the next.
df["pm25_roll_3"] = (
    df.groupby("stasiun")["pm_duakomalima"]
      .transform(lambda station_pm25: station_pm25.shift(1).rolling(3).mean())
)


In [46]:
# Show every column name currently in the frame.
print(list(df.columns))


['periode_data', 'tanggal', 'stasiun', 'pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'max', 'parameter_pencemar_kritis', 'kategori', 'stasiun_id', 'is_holiday_nasional', 'is_weekend', 'temperature_2m_mean', 'relative_humidity_2m_mean', 'wind_speed_10m_mean', 'precipitation_sum', 'dayofweek', 'month', 'pm25_lag_1', 'pm25_lag_3', 'pm25_lag_7', 'pm25_roll_3']


# Handle Missing Data

In [47]:
# All numeric columns to impute: pollutants, weather and the PM2.5 history features.
num_cols = [
    *base_pollutants,
    *cuaca_cols,
    "pm25_lag_1", "pm25_lag_3", "pm25_lag_7", "pm25_roll_3",
]


In [48]:
# Map weather column names that carry a unit suffix to the bare names used in
# the rest of the notebook. Names not present in the frame are left untouched
# by rename(); the column list printed earlier already shows the bare names.
unit_suffixed_to_bare = {
    "temperature_2m_mean (°C)": "temperature_2m_mean",
    "relative_humidity_2m_mean (%)": "relative_humidity_2m_mean",
    "wind_speed_10m_mean (km/h)": "wind_speed_10m_mean",
    "precipitation_sum (mm)": "precipitation_sum",
}
df = df.rename(columns=unit_suffixed_to_bare)


In [49]:
def _fill_with_own_median(values):
    """Fill the missing entries of one column of one station with that column's median."""
    return values.fillna(values.median())


# Per-station median imputation. A column that is entirely missing for a
# station has a NaN median and therefore stays NaN.
df[num_cols] = df.groupby("stasiun")[num_cols].transform(_fill_with_own_median)


In [50]:
# Missing numeric values that remain after imputation, then the frame size.
remaining_nan_per_col = df[num_cols].isna().sum()
print(remaining_nan_per_col.sum())
print(df.shape)


19735
(8817, 25)


In [51]:
# Columns that must be complete for a row to be kept.
core_features = [
    *base_pollutants,
    *cuaca_cols,
    "pm25_lag_1", "pm25_lag_3", "pm25_lag_7", "pm25_roll_3",
]


In [52]:
# Keep only rows in which every core feature is present, and report how many
# rows were removed.
before = len(df)

has_all_core_features = df[core_features].notna().all(axis=1)
df = df.loc[has_all_core_features]

after = len(df)

print(f"Dropped rows: {before - after}")
print("New shape:", df.shape)


Dropped rows: 3947
New shape: (4870, 25)


In [53]:
# Confirm that no core feature is missing any more.
core_nan_total = df[core_features].isna().sum().sum()
print("Total NaN (numeric):", core_nan_total)
print("Shape:", df.shape)


Total NaN (numeric): 0
Shape: (4870, 25)


In [54]:
# Same core feature list as before; rows with a missing core feature are
# dropped again (none are expected at this point) and the index is renumbered.
core_features = [
    *base_pollutants,
    *cuaca_cols,
    "pm25_lag_1", "pm25_lag_3", "pm25_lag_7", "pm25_roll_3",
]

print("NaN before:", df[core_features].isna().sum().sum())

df = df.dropna(subset=core_features)
df = df.reset_index(drop=True)

print("NaN after:", df[core_features].isna().sum().sum())
print("Shape:", df.shape)


NaN before: 0
NaN after: 0
Shape: (4870, 25)


# Encoding

In [55]:
# One encoder per categorical column, kept so the integer codes can be mapped
# back to labels later.
le_stasiun = LabelEncoder()
le_target = LabelEncoder()

# Writes `stasiun_enc` and `kategori_enc` next to the original columns.
for source_col, encoder in (("stasiun", le_stasiun), ("kategori", le_target)):
    df[f"{source_col}_enc"] = encoder.fit_transform(df[source_col])


# Outlier Handling

In [56]:
def iqr_cap(df, cols, factor=1.5):
    """Return a copy of ``df`` with the given columns clipped to IQR fences.

    For each column in ``cols`` the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, where Q1 and Q3 are the column's 25th and 75th
    percentiles. Values outside the fences are replaced by the nearest fence.
    The input frame is not modified.
    """
    capped = df.copy()
    for name in cols:
        q1, q3 = capped[name].quantile([0.25, 0.75])
        spread = q3 - q1
        capped[name] = capped[name].clip(q1 - factor * spread, q3 + factor * spread)
    return capped

# Continuous columns whose extreme values are clipped to the IQR fences.
continuous_cols = [
    *base_pollutants,
    *cuaca_cols,
    "pm25_lag_1", "pm25_lag_3", "pm25_lag_7", "pm25_roll_3",
]

df = iqr_cap(df, continuous_cols)


# Scaling (Normalization)

In [57]:
# Scale the continuous columns on a copy, so `df` keeps the unscaled values.
scaler = RobustScaler()

df_scaled = df.copy()
scaled_values = scaler.fit_transform(df_scaled[continuous_cols])
df_scaled[continuous_cols] = scaled_values

# Encoder

In [58]:
# Encode the station label on the scaled frame with a separate encoder.
le_station = LabelEncoder()
station_codes = le_station.fit_transform(df_scaled["stasiun"])
df_scaled["stasiun_enc"] = station_codes

# Feature Engineering

## Pollution Intensity Ratio

In [59]:
# PM2.5 / PM10 ratio, computed from the unscaled values held in `df`.
# The scaled columns in `df_scaled` are centred on the median, so a scaled
# PM10 value is close to zero for typical rows and negative for half of them;
# dividing by it (even with the 1e-6 guard) gives exploding or sign-flipped
# values. `df_scaled` was created as a copy of `df`, so both frames share the
# same index and the result aligns row by row.
df_scaled["pm25_pm10_ratio"] = (
    df["pm_duakomalima"] /
    (df["pm_sepuluh"] + 1e-6)
)


## Weather Pollution Interaction

In [60]:
# Product of the scaled PM2.5 and scaled temperature columns.
df_scaled["pm25_temp_interaction"] = (
    df_scaled["pm_duakomalima"] *
    df_scaled["temperature_2m_mean"]
)

# PM2.5 divided by (wind speed + 1), computed from the unscaled values in `df`.
# The "+ 1" only keeps the denominator positive for a non-negative wind speed;
# the scaled wind column is centred on its median and can equal -1 or less,
# which would give a division by zero or a sign flip. `df_scaled` is a copy of
# `df`, so the indexes match and the result aligns row by row.
df_scaled["pm25_wind_inverse"] = (
    df["pm_duakomalima"] /
    (df["wind_speed_10m_mean"] + 1)
)

## Temporal Cyclical Encoding

In [61]:
# Encode month (period 12) and day of week (period 7) as sine/cosine pairs.
for source_col, period, prefix in (("month", 12, "month"), ("dayofweek", 7, "dow")):
    angle = 2 * np.pi * df_scaled[source_col] / period
    df_scaled[f"{prefix}_sin"] = np.sin(angle)
    df_scaled[f"{prefix}_cos"] = np.cos(angle)


In [62]:
# Missing values and size of the scaled, feature-engineered frame.
scaled_nan_total = df_scaled.isna().sum().sum()
print("Final NaN count:", scaled_nan_total)
print("Final shape:", df_scaled.shape)


Final NaN count: 4955
Final shape: (4870, 34)


In [63]:
# Missing-value count per column of the scaled frame, largest first;
# only columns that actually have missing values are displayed.
nan_cols = df_scaled.isna().sum().sort_values(ascending=False)

nan_cols.loc[nan_cols > 0]


stasiun_id                   4870
parameter_pencemar_kritis      75
max                            10
dtype: int64

# Handling Missing Values

In [64]:
# Remove `stasiun_id` (ignored if the column is absent) and label rows that
# have no critical pollutant recorded with an explicit category.
df = (
    df.drop(columns=["stasiun_id"], errors="ignore")
      .assign(
          parameter_pencemar_kritis=lambda frame: frame["parameter_pencemar_kritis"].fillna("TIDAK_DOMINAN")
      )
)

In [65]:
# Missing values and size of the unscaled frame after the fixes above.
df_nan_total = df.isna().sum().sum()
print("Final NaN count:", df_nan_total)
print("Final shape:", df.shape)


Final NaN count: 10
Final shape: (4870, 26)


In [67]:
# Columns of `df` that still contain missing values.
nan_per_col = df.isna().sum()
nan_per_col[nan_per_col > 0]

max    10
dtype: int64

In [68]:
def _fill_with_station_median(values):
    """Fill missing entries of one station's values with that station's median."""
    return values.fillna(values.median())


# First fill missing `max` values with the median of the same station, then
# fall back to the overall median for anything still missing.
df["max"] = df.groupby("stasiun")["max"].transform(_fill_with_station_median)

overall_max_median = df["max"].median()
df["max"] = df["max"].fillna(overall_max_median)


In [69]:
# Total missing values, followed by the per-column counts that are non-zero.
nan_per_col = df.isna().sum()
print("Final NaN count:", nan_per_col.sum())
nan_per_col[nan_per_col > 0]


Final NaN count: 0


Series([], dtype: int64)

In [70]:
# Final size and missing-value total of the frame that gets saved.
final_nan_total = df.isna().sum().sum()
print("Final shape:", df.shape)
print("Final NaN count:", final_nan_total)


Final shape: (4870, 26)
Final NaN count: 0


In [71]:
# Renumber the rows 0..n-1 and discard the old index before saving.
df = df.reset_index(drop=True)


In [None]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): this saves `df`, not `df_scaled`, so the scaled values and the
# engineered columns (ratio, interactions, sine/cosine encodings) added to
# `df_scaled` are not in the file - confirm that this is intended.
SAVE_PATH = "data_collation_catboost_ready.csv"

df.to_csv(SAVE_PATH, index=False)

print(f"Dataset saved to {SAVE_PATH}")

✅ Dataset saved to data_collation_catboost_ready.csv
