In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

PATH = Path("merged_libur_cuaca_ispu_ndvi.csv")

df = pd.read_csv(PATH, sep=";")

In [2]:
# parse tanggal
df["tanggal"] = pd.to_datetime(df["tanggal"], dayfirst=True)

# sort
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

# drop kolom
DROP_COLS = ["max", "parameter_pencemar_kritis", "time", "id", "stasiun"]
df = df.drop(columns=DROP_COLS, errors="ignore")

# label mapping
LABEL_MAP = {"BAIK": 0, "SEDANG": 1, "TIDAK SEHAT": 2}
df = df[df["kategori"].notna()].copy()
df["target"] = df["kategori"].map(LABEL_MAP).astype(int)


# REINDEX 
def reindex_daily(g):
    idx = pd.date_range(g["tanggal"].min(), g["tanggal"].max(), freq="D")
    g = g.set_index("tanggal").reindex(idx)
    g["lokasi_clean"] = g["lokasi_clean"].iloc[0]
    return g.reset_index().rename(columns={"index": "tanggal"})

df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)


# PM2.5 HANDLING
df["pm25_missing"] = df["pm_duakomalima"].isna().astype(int)

median_pm25 = (
    df[df["tanggal"] >= "2021-01-01"]
    .groupby("lokasi_clean")["pm_duakomalima"]
    .median()
)

df["pm_duakomalima"] = df["pm_duakomalima"].fillna(
    df["lokasi_clean"].map(median_pm25)
)


# buang baris tanpa target (penting untuk training)
df = df[df["target"].notna()].copy()


# TIME FEATURES 
df["month"] = df["tanggal"].dt.month
df["dayofyear"] = df["tanggal"].dt.dayofyear
df["dayofweek"] = df["tanggal"].dt.dayofweek

df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
df["month_cos"] = np.cos(2*np.pi*df["month"]/12)

df["doy_sin"] = np.sin(2*np.pi*df["dayofyear"]/365)
df["doy_cos"] = np.cos(2*np.pi*df["dayofyear"]/365)

df["dow_sin"] = np.sin(2*np.pi*df["dayofweek"]/7)
df["dow_cos"] = np.cos(2*np.pi*df["dayofweek"]/7)

df.head()


  df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)


Unnamed: 0,tanggal,periode_data,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,kategori,temperature_2m_max (°C),...,pm25_missing,month,dayofyear,dayofweek,month_sin,month_cos,doy_sin,doy_cos,dow_sin,dow_cos
0,2010-01-01,201001.0,60.0,73.0,4.0,73.0,27.0,14.0,SEDANG,29.4,...,1,1,1,4,0.5,0.866025,0.017213,0.999852,-0.433884,-0.900969
1,2010-01-02,201001.0,32.0,73.0,2.0,16.0,33.0,9.0,BAIK,30.8,...,1,1,2,5,0.5,0.866025,0.034422,0.999407,-0.974928,-0.222521
2,2010-01-03,201001.0,27.0,73.0,2.0,19.0,20.0,9.0,BAIK,30.4,...,1,1,3,6,0.5,0.866025,0.05162,0.998667,-0.781831,0.62349
3,2010-01-04,201001.0,22.0,73.0,2.0,16.0,15.0,6.0,BAIK,30.3,...,1,1,4,0,0.5,0.866025,0.068802,0.99763,0.0,1.0
4,2010-01-05,201001.0,25.0,73.0,2.0,17.0,15.0,8.0,BAIK,29.9,...,1,1,5,1,0.5,0.866025,0.085965,0.996298,0.781831,0.62349


In [3]:
# fitur lag dan rolling

POLLUTANT_COLS = [
    "pm_sepuluh",
    "pm_duakomalima",
    "ozon",
    "nitrogen_dioksida",
    "sulfur_dioksida",
    "karbon_monoksida",
]

WEATHER_COLS = [
    "temperature_2m_mean (°C)",
    "relative_humidity_2m_mean (%)",
    "wind_speed_10m_mean (km/h)",
    "precipitation_sum (mm)",
    "cloud_cover_mean (%)",
    "surface_pressure_mean (hPa)",
]

LAG_FEATURES = POLLUTANT_COLS + WEATHER_COLS

LAGS = [1, 2, 3]
ROLL_WINDOWS = [3, 7]
def create_temporal_features(g):
    g = g.sort_values("tanggal")
    for col in LAG_FEATURES:
        for lag in LAGS:
            g[f"{col}_lag_{lag}"] = g[col].shift(lag)


    for col in POLLUTANT_COLS:
        for w in ROLL_WINDOWS:
            g[f"{col}_roll_mean_{w}"] = g[col].shift(1).rolling(w).mean()
            g[f"{col}_roll_std_{w}"]  = g[col].shift(1).rolling(w).std()

    return g

df = df.groupby("lokasi_clean", group_keys=False).apply(create_temporal_features)
lag_cols = [c for c in df.columns if "lag_" in c or "roll_" in c]

for col in lag_cols:
    df[f"{col}_isnan"] = df[col].isna().astype(int)


df.head()


  df = df.groupby("lokasi_clean", group_keys=False).apply(create_temporal_features)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isn

Unnamed: 0,tanggal,periode_data,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,kategori,temperature_2m_max (°C),...,nitrogen_dioksida_roll_mean_7_isnan,nitrogen_dioksida_roll_std_7_isnan,sulfur_dioksida_roll_mean_3_isnan,sulfur_dioksida_roll_std_3_isnan,sulfur_dioksida_roll_mean_7_isnan,sulfur_dioksida_roll_std_7_isnan,karbon_monoksida_roll_mean_3_isnan,karbon_monoksida_roll_std_3_isnan,karbon_monoksida_roll_mean_7_isnan,karbon_monoksida_roll_std_7_isnan
0,2010-01-01,201001.0,60.0,73.0,4.0,73.0,27.0,14.0,SEDANG,29.4,...,1,1,1,1,1,1,1,1,1,1
1,2010-01-02,201001.0,32.0,73.0,2.0,16.0,33.0,9.0,BAIK,30.8,...,1,1,1,1,1,1,1,1,1,1
2,2010-01-03,201001.0,27.0,73.0,2.0,19.0,20.0,9.0,BAIK,30.4,...,1,1,1,1,1,1,1,1,1,1
3,2010-01-04,201001.0,22.0,73.0,2.0,16.0,15.0,6.0,BAIK,30.3,...,1,1,0,0,1,1,0,0,1,1
4,2010-01-05,201001.0,25.0,73.0,2.0,17.0,15.0,8.0,BAIK,29.9,...,1,1,0,0,1,1,0,0,1,1


In [4]:
# ===============================
# DROP BARIS TANPA TARGET
# ===============================
df_model = df.dropna(subset=["target"]).reset_index(drop=True)


# ===============================
# ENCODE LOKASI (WAJIB SEBELUM FEATURES)
# ===============================
from sklearn.preprocessing import LabelEncoder

le_loc = LabelEncoder()
df_model["lokasi_enc"] = le_loc.fit_transform(df_model["lokasi_clean"])


# ===============================
# DEFINISI TARGET & FEATURES
# ===============================
TARGET = "target"

FEATURES = [
    # ===== lag polutan =====
    *[f"{col}_lag_{lag}" for col in POLLUTANT_COLS for lag in [1,2,3]],

    # ===== rolling polutan =====
    *[f"{col}_roll_mean_{w}" for col in POLLUTANT_COLS for w in [3,7]],
    *[f"{col}_roll_std_{w}"  for col in POLLUTANT_COLS for w in [3,7]],

    # ===== lag cuaca =====
    *[f"{col}_lag_{lag}" for col in WEATHER_COLS for lag in [1,2,3]],

    # ===== time features =====
    "month","dayofyear","dayofweek",
    "month_sin","month_cos",
    "doy_sin","doy_cos",
    "dow_sin","dow_cos",

    # ===== lokasi =====
    "lokasi_enc",
]



# ===============================
# SPLIT X, y
# ===============================
X = df_model[FEATURES]
y = df_model[TARGET]

groups = df_model["lokasi_enc"]      # pakai yang sudah encoded
time_index = df_model["tanggal"]


In [5]:
import lightgbm as lgb
import numpy as np
from sklearn.metrics import f1_score

LGB_PARAMS = {
    "objective": "multiclass",      # WAJIB
    "num_class": 3,                 # jumlah kelas
    "metric": "multi_logloss",      # untuk early stopping
    "learning_rate": 0.03,
    "num_leaves": 63,
    "max_depth": -1,
    "min_data_in_leaf": 40,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 0.5,
    "lambda_l2": 0.5,
    "min_gain_to_split": 0.01,
    "max_bin": 255,
    "verbosity": -1,
    "seed": 42,
}



In [6]:
scores = []
models = []
N_FOLDS = 3
DATE_COL = "tanggal"

unique_dates = np.sort(df_model[DATE_COL].unique())
fold_sizes = np.linspace(0.6, 0.9, N_FOLDS)
cutoffs = [unique_dates[int(len(unique_dates) * q)] for q in fold_sizes]

for fold, cutoff in enumerate(cutoffs, 1):

    train_idx = df_model["tanggal"] < cutoff
    valid_idx = df_model["tanggal"] >= cutoff

    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    model = lgb.train(
        LGB_PARAMS,
        train_data,
        num_boost_round=2000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )

    preds = model.predict(X_valid)
    preds_label = np.argmax(preds, axis=1)

    score = f1_score(y_valid, preds_label, average="macro")
    scores.append(score)
    models.append(model)

    print(f"Fold {fold} | cutoff={np.datetime_as_string(cutoff, unit='D')} | Macro-F1={score:.4f}")

print("\nCV Macro-F1 mean:", np.mean(scores))
print("CV Macro-F1 std :", np.std(scores))


Fold 1 | cutoff=2019-05-09 | Macro-F1=0.4708
Fold 2 | cutoff=2021-10-05 | Macro-F1=0.5193
Fold 3 | cutoff=2024-02-10 | Macro-F1=0.5140

CV Macro-F1 mean: 0.501352922015494
CV Macro-F1 std : 0.021723109569842223


### Forecasting

In [7]:
train_data = lgb.Dataset(X, label=y)

final_model = lgb.train(
    LGB_PARAMS,
    train_data,
    num_boost_round=int(np.mean([m.best_iteration for m in models]))
)

In [8]:
sub = pd.read_csv("sample_submission.csv")

sub["tanggal"] = pd.to_datetime(sub["id"].str.split("_").str[0])
sub["lokasi_clean"] = sub["id"].str.split("_").str[1]
sub = sub.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)
sub["lokasi_enc"] = le_loc.transform(sub["lokasi_clean"])

history = df_model.copy()
history = history.sort_values(["lokasi_clean", "tanggal"])
feature_template = pd.Series(0.0, index=FEATURES)

def build_features_from_history(hist, current_row):
    loc = current_row["lokasi_clean"]
    tgl = current_row["tanggal"]

    h = hist[hist["lokasi_clean"] == loc].sort_values("tanggal")

    feat = {}

    # ===== lag polutan =====
    for col in POLLUTANT_COLS:
        for lag in [1, 2, 3]:
            feat[f"{col}_lag_{lag}"] = h[col].iloc[-lag]

    # ===== rolling =====
    for col in POLLUTANT_COLS:
        feat[f"{col}_roll_mean_3"] = h[col].iloc[-3:].mean()
        feat[f"{col}_roll_mean_7"] = h[col].iloc[-7:].mean()
        feat[f"{col}_roll_std_7"]  = h[col].iloc[-7:].std()

    # ===== lag cuaca =====
    for col in WEATHER_COLS:
        for lag in [1, 2, 3]:
            feat[f"{col}_lag_{lag}"] = h[col].iloc[-lag]


    # ===== time features =====
    feat["month"] = tgl.month
    feat["dayofyear"] = tgl.dayofyear
    feat["dayofweek"] = tgl.dayofweek

    feat["month_sin"] = np.sin(2*np.pi*feat["month"]/12)
    feat["month_cos"] = np.cos(2*np.pi*feat["month"]/12)

    feat["doy_sin"] = np.sin(2*np.pi*feat["dayofyear"]/365)
    feat["doy_cos"] = np.cos(2*np.pi*feat["dayofyear"]/365)

    feat["dow_sin"] = np.sin(2*np.pi*feat["dayofweek"]/7)
    feat["dow_cos"] = np.cos(2*np.pi*feat["dayofweek"]/7)

    feat["lokasi_enc"] = le_loc.transform([loc])[0]

    # ===== pastikan struktur sama dengan training =====
    feat_series = feature_template.copy()
    feat_series.update(feat)

    return feat_series




In [9]:
preds = []

for _, row in sub.iterrows():

    # ===== buat fitur dari history =====
    feat = build_features_from_history(history, row)
    X_pred = feat[FEATURES].values.reshape(1, -1)


    # ===== prediksi =====
    prob = final_model.predict(X_pred)
    label = int(np.argmax(prob))
    preds.append(label)

    # ===== update history untuk step berikutnya =====
    new_row = row.copy()
    new_row["target"] = label

    last_hist = history[history["lokasi_clean"] == row["lokasi_clean"]].iloc[-1]

    # persistence polutan (standar kompetisi)
    for col in POLLUTANT_COLS:
        series = history[history["lokasi_clean"] == row["lokasi_clean"]][col]

        last = series.iloc[-1]

        if len(series) >= 7:
            trend = series.iloc[-7:].diff().mean()
            noise = series.iloc[-7:].std() * 0.1
        else:
            trend = 0
            noise = 0

        # forecast = last + trend + noise kecil
        new_row[col] = last + trend + noise



    # penting: tambahkan kolom tanggal & lokasi agar lag berikutnya benar
    new_row["tanggal"] = row["tanggal"]
    new_row["lokasi_clean"] = row["lokasi_clean"]

    history = pd.concat([history, pd.DataFrame([new_row])], ignore_index=True)


In [10]:
INV_LABEL_MAP = {0: "BAIK", 1: "SEDANG", 2: "TIDAK SEHAT"}

sub["kategori"] = [INV_LABEL_MAP[p] for p in preds]

sub[["id", "kategori"]].to_csv("submission_10_100.csv", index=False)
