In [43]:
import pandas as pd
import numpy as np
from pathlib import Path

PATH = Path("merged_libur_cuaca_ispu_ndvi.csv")

df = pd.read_csv(PATH, sep=";")

In [44]:
# parse tanggal
df["tanggal"] = pd.to_datetime(df["tanggal"], dayfirst=True)

# sort
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

# drop kolom
DROP_COLS = ["max", "parameter_pencemar_kritis", "time", "id", "stasiun"]
df = df.drop(columns=DROP_COLS, errors="ignore")

# label mapping
LABEL_MAP = {"BAIK": 0, "SEDANG": 1, "TIDAK SEHAT": 2}
df = df[df["kategori"].notna()].copy()
df["target"] = df["kategori"].map(LABEL_MAP).astype(int)


# REINDEX 
def reindex_daily(g):
    idx = pd.date_range(g["tanggal"].min(), g["tanggal"].max(), freq="D")
    g = g.set_index("tanggal").reindex(idx)
    g["lokasi_clean"] = g["lokasi_clean"].iloc[0]
    return g.reset_index().rename(columns={"index": "tanggal"})

df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)


# PM2.5 HANDLING
df["pm25_missing"] = df["pm_duakomalima"].isna().astype(int)

median_pm25 = (
    df[df["tanggal"] >= "2021-01-01"]
    .groupby("lokasi_clean")["pm_duakomalima"]
    .median()
)

df["pm_duakomalima"] = df["pm_duakomalima"].fillna(
    df["lokasi_clean"].map(median_pm25)
)


# buang baris tanpa target (penting untuk training)
df = df[df["target"].notna()].copy()


# TIME FEATURES 
df["month"] = df["tanggal"].dt.month
df["dayofyear"] = df["tanggal"].dt.dayofyear
df["dayofweek"] = df["tanggal"].dt.dayofweek

df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
df["month_cos"] = np.cos(2*np.pi*df["month"]/12)

df["doy_sin"] = np.sin(2*np.pi*df["dayofyear"]/365)
df["doy_cos"] = np.cos(2*np.pi*df["dayofyear"]/365)

df["dow_sin"] = np.sin(2*np.pi*df["dayofweek"]/7)
df["dow_cos"] = np.cos(2*np.pi*df["dayofweek"]/7)

df.head()


  df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)


Unnamed: 0,tanggal,periode_data,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,kategori,temperature_2m_max (°C),...,pm25_missing,month,dayofyear,dayofweek,month_sin,month_cos,doy_sin,doy_cos,dow_sin,dow_cos
0,2010-01-01,201001.0,60.0,73.0,4.0,73.0,27.0,14.0,SEDANG,29.4,...,1,1,1,4,0.5,0.866025,0.017213,0.999852,-0.433884,-0.900969
1,2010-01-02,201001.0,32.0,73.0,2.0,16.0,33.0,9.0,BAIK,30.8,...,1,1,2,5,0.5,0.866025,0.034422,0.999407,-0.974928,-0.222521
2,2010-01-03,201001.0,27.0,73.0,2.0,19.0,20.0,9.0,BAIK,30.4,...,1,1,3,6,0.5,0.866025,0.05162,0.998667,-0.781831,0.62349
3,2010-01-04,201001.0,22.0,73.0,2.0,16.0,15.0,6.0,BAIK,30.3,...,1,1,4,0,0.5,0.866025,0.068802,0.99763,0.0,1.0
4,2010-01-05,201001.0,25.0,73.0,2.0,17.0,15.0,8.0,BAIK,29.9,...,1,1,5,1,0.5,0.866025,0.085965,0.996298,0.781831,0.62349


In [45]:
# fitur lag dan rolling

POLLUTANT_COLS = [
    "pm_sepuluh",
    "pm_duakomalima",
    "ozon",
    "nitrogen_dioksida",
    "sulfur_dioksida",
    "karbon_monoksida",
]

WEATHER_COLS = [
    "temperature_2m_mean (°C)",
    "relative_humidity_2m_mean (%)",
    "wind_speed_10m_mean (km/h)",
    "precipitation_sum (mm)",
    "cloud_cover_mean (%)",
    "surface_pressure_mean (hPa)",
]

LAG_FEATURES = POLLUTANT_COLS + WEATHER_COLS

LAGS = [1, 2, 3]
ROLL_WINDOWS = [3, 7]
def create_temporal_features(g):
    g = g.sort_values("tanggal")

    # ===== LAG untuk POLUTAN + CUACA =====
    for col in POLLUTANT_COLS + WEATHER_COLS:
        for lag in LAGS:
            g[f"{col}_lag_{lag}"] = g[col].shift(lag)

    # ===== ROLLING hanya untuk polutan =====
    for col in POLLUTANT_COLS:
        for w in ROLL_WINDOWS:
            g[f"{col}_roll_mean_{w}"] = g[col].shift(1).rolling(w).mean()
            g[f"{col}_roll_std_{w}"]  = g[col].shift(1).rolling(w).std()

    return g


df = df.groupby("lokasi_clean", group_keys=False).apply(create_temporal_features)
lag_cols = [c for c in df.columns if "lag_" in c or "roll_" in c]

for col in lag_cols:
    df[f"{col}_isnan"] = df[col].isna().astype(int)


df.head()


  df = df.groupby("lokasi_clean", group_keys=False).apply(create_temporal_features)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isn

Unnamed: 0,tanggal,periode_data,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,kategori,temperature_2m_max (°C),...,nitrogen_dioksida_roll_mean_7_isnan,nitrogen_dioksida_roll_std_7_isnan,sulfur_dioksida_roll_mean_3_isnan,sulfur_dioksida_roll_std_3_isnan,sulfur_dioksida_roll_mean_7_isnan,sulfur_dioksida_roll_std_7_isnan,karbon_monoksida_roll_mean_3_isnan,karbon_monoksida_roll_std_3_isnan,karbon_monoksida_roll_mean_7_isnan,karbon_monoksida_roll_std_7_isnan
0,2010-01-01,201001.0,60.0,73.0,4.0,73.0,27.0,14.0,SEDANG,29.4,...,1,1,1,1,1,1,1,1,1,1
1,2010-01-02,201001.0,32.0,73.0,2.0,16.0,33.0,9.0,BAIK,30.8,...,1,1,1,1,1,1,1,1,1,1
2,2010-01-03,201001.0,27.0,73.0,2.0,19.0,20.0,9.0,BAIK,30.4,...,1,1,1,1,1,1,1,1,1,1
3,2010-01-04,201001.0,22.0,73.0,2.0,16.0,15.0,6.0,BAIK,30.3,...,1,1,0,0,1,1,0,0,1,1
4,2010-01-05,201001.0,25.0,73.0,2.0,17.0,15.0,8.0,BAIK,29.9,...,1,1,0,0,1,1,0,0,1,1


In [46]:
from sklearn.preprocessing import LabelEncoder
df_model = df.dropna(subset=["target"]).reset_index(drop=True)

H = 30

rows = []

for loc, g in df_model.groupby("lokasi_clean"):
    g = g.sort_values("tanggal").reset_index(drop=True)

    # precompute lag POLUTAN + CUACA
    for col in POLLUTANT_COLS + WEATHER_COLS:
        g[f"{col}_lag_1"] = g[col].shift(0)
        g[f"{col}_lag_2"] = g[col].shift(1)
        g[f"{col}_lag_3"] = g[col].shift(2)

    for h in range(1, H + 1):
        g_future = g.shift(-h)

        temp = pd.DataFrame({
            "lokasi_clean": loc,
            "tanggal": g["tanggal"],
            "horizon": h,
            "month": g["tanggal"].dt.month,
            "dayofyear": g["tanggal"].dt.dayofyear,
            "dayofweek": g["tanggal"].dt.dayofweek,
        })

        # fitur lag
        for col in POLLUTANT_COLS + WEATHER_COLS:
            temp[f"{col}_lag_1"] = g[f"{col}_lag_1"]
            temp[f"{col}_lag_2"] = g[f"{col}_lag_2"]
            temp[f"{col}_lag_3"] = g[f"{col}_lag_3"]

        # target masa depan
        for col in POLLUTANT_COLS:
            temp[f"target_{col}"] = g_future[col]

        rows.append(temp)

df_forecast = pd.concat(rows, ignore_index=True).dropna().reset_index(drop=True)


le_loc = LabelEncoder()

# FIT dari data training asli
df_model["lokasi_enc"] = le_loc.fit_transform(df_model["lokasi_clean"])

# BARU transform ke df_forecast
df_forecast["lokasi_enc"] = le_loc.transform(df_forecast["lokasi_clean"])



In [47]:
import lightgbm as lgb

pollutant_models = {}

FEATURES_F = (
    [f"{col}_lag_{l}" for col in POLLUTANT_COLS for l in LAGS] +
    [f"{col}_lag_{l}" for col in WEATHER_COLS for l in LAGS] +
    ["month", "dayofyear", "dayofweek", "lokasi_enc", "horizon"]
)


for col in POLLUTANT_COLS:

    y_col = f"target_{col}"

    Xf = df_forecast[FEATURES_F]
    yf = df_forecast[y_col]

    train_data = lgb.Dataset(Xf, label=yf)

    params = {
        "objective": "regression",
        "metric": "l2",
        "learning_rate": 0.05,
        "num_leaves": 64,
        "verbosity": -1,
        "seed": 42,
    }

    model = lgb.train(params, train_data, num_boost_round=500)
    pollutant_models[col] = model


In [48]:
# ============================================
# TIME SERIES CV UNTUK OOF FORECAST POLUTAN
# ============================================

SPLITS = ["2022-01-01", "2023-01-01", "2024-01-01"]

oof_parts = []

for split_date in SPLITS:

    train_df = df_model[df_model["tanggal"] < split_date].copy()
    valid_df = df_model[df_model["tanggal"] >= split_date].copy()

    fold_rows = []

    for loc in valid_df["lokasi_clean"].unique():

        hist = (
            train_df[train_df["lokasi_clean"] == loc]
            .sort_values("tanggal")
            .iloc[-3:][POLLUTANT_COLS]
            .values.tolist()
        )

        weather_hist = (
            train_df[train_df["lokasi_clean"] == loc]
            .sort_values("tanggal")
            .iloc[-7:][WEATHER_COLS]
        )
        weather_future = weather_hist.mean().values

        future_rows = valid_df[valid_df["lokasi_clean"] == loc].sort_values("tanggal")

        last_date = train_df[train_df["lokasi_clean"] == loc]["tanggal"].max()

        for _, row in future_rows.iterrows():

            horizon = (row["tanggal"] - last_date).days

            feat = {
                "horizon": horizon,
                "month": row["month"],
                "dayofyear": row["dayofyear"],
                "dayofweek": row["dayofweek"],
                "lokasi_enc": le_loc.transform([loc])[0],
            }

            for i, col in enumerate(POLLUTANT_COLS):
                feat[f"{col}_lag_1"] = hist[-1][i]
                feat[f"{col}_lag_2"] = hist[-2][i]
                feat[f"{col}_lag_3"] = hist[-3][i]

            for i, col in enumerate(WEATHER_COLS):
                feat[f"{col}_lag_1"] = weather_future[i]
                feat[f"{col}_lag_2"] = weather_future[i]
                feat[f"{col}_lag_3"] = weather_future[i]

            Xf = pd.DataFrame([feat])

            preds = []
            for col in POLLUTANT_COLS:
                p = pollutant_models[col].predict(Xf)[0]
                preds.append(p)

            hist.append(preds)
            hist.pop(0)

            row_out = row.copy()
            for i, col in enumerate(POLLUTANT_COLS):
                row_out[col] = preds[i]

            fold_rows.append(row_out)

    oof_parts.append(pd.DataFrame(fold_rows))

df_oof = pd.concat(oof_parts).reset_index(drop=True)


In [49]:
df_oof["lokasi_enc"] = le_loc.transform(df_oof["lokasi_clean"])


In [50]:
ISPU_PARAMS = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "None",
    "learning_rate": 0.03,
    "num_leaves": 63,
    "min_data_in_leaf": 30,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
    "verbosity": -1,
    "seed": 42,
    "class_weight": "balanced"
}


In [51]:
def add_ispu_interactions(df):
    df = df.copy()

    df["pm_ratio"] = df["pm_duakomalima"] / (df["pm_sepuluh"] + 1e-3)

    df["gas_sum"] = (
        df["ozon"] +
        df["nitrogen_dioksida"] +
        df["sulfur_dioksida"]
    )

    df["pm25_ozon"] = df["pm_duakomalima"] * df["ozon"]
    df["pm10_no2"] = df["pm_sepuluh"] * df["nitrogen_dioksida"]

    return df


In [52]:

# ============================================
# TRAIN ISPU DARI DISTRIBUSI OOF (WAJIB)
# ============================================

train_ispu_df = pd.concat([
    df_model[df_model["tanggal"] < "2022-01-01"],  # bagian awal asli
    df_oof                                          # bagian forecast
]).reset_index(drop=True)

train_ispu_df = add_ispu_interactions(train_ispu_df)
df_oof = add_ispu_interactions(df_oof)


ISPU_FEATURES = [
    *POLLUTANT_COLS,
    "pm_ratio",
    "gas_sum",
    "pm25_ozon",
    "pm10_no2",
    "month", "dayofyear", "dayofweek", "lokasi_enc"
]


X_ispu = train_ispu_df[ISPU_FEATURES]
y_ispu = train_ispu_df["target"]

train_data = lgb.Dataset(X_ispu, label=y_ispu)

ispu_model = lgb.train(ISPU_PARAMS, train_data, num_boost_round=700)


In [53]:
from sklearn.metrics import f1_score
import numpy as np

X_val = df_oof[ISPU_FEATURES]
y_val = df_oof["target"]

probs = ispu_model.predict(X_val)

best_t = [0.33, 0.33, 0.33]

for c in range(3):
    best_score = 0
    for t in np.linspace(0.05, 0.9, 60):

        pred = probs.argmax(1).copy()
        pred[probs[:, c] > t] = c

        score = f1_score(y_val, pred, average="macro")

        if score > best_score:
            best_score = score
            best_t[c] = t


print("Best threshold:", best_t)


Best threshold: [0.42457627118644065, 0.5110169491525424, 0.42457627118644065]


In [54]:
sub = pd.read_csv("sample_submission.csv")

sub["tanggal"] = pd.to_datetime(sub["id"].str.split("_").str[0])
sub["lokasi_clean"] = sub["id"].str.split("_").str[1]
sub = sub.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

sub["lokasi_enc"] = le_loc.transform(sub["lokasi_clean"])
last_dates = (
    df_model.groupby("lokasi_clean")["tanggal"].max().to_dict()
)


In [55]:
forecast_rows = []

for loc in sub["lokasi_clean"].unique():

    # ===== riwayat 3 hari terakhir =====
    hist = (
        df_model[df_model["lokasi_clean"] == loc]
        .sort_values("tanggal")
        .iloc[-3:][POLLUTANT_COLS]
        .values.tolist()
    )
    # ===== riwayat cuaca 7 hari terakhir → untuk proxy masa depan =====
    weather_hist = (
        df_model[df_model["lokasi_clean"] == loc]
        .sort_values("tanggal")
        .iloc[-7:][WEATHER_COLS]
    )

# pakai rata-rata sebagai cuaca masa depan (climatology cepat)
    weather_future = weather_hist.mean().values


    future_rows = (
        sub[sub["lokasi_clean"] == loc]
        .sort_values("tanggal")
    )

    for _, row in future_rows.iterrows():

        tgl = row["tanggal"]

        # ===== fitur forecast =====
        feat = {
            "horizon": (row["tanggal"] - last_dates[loc]).days,
            "month": tgl.month,
            "dayofyear": tgl.dayofyear,
            "dayofweek": tgl.dayofweek,
            "lokasi_enc": row["lokasi_enc"],
        }

        # ===== lag dari history rolling =====
        for i, col in enumerate(POLLUTANT_COLS):
            feat[f"{col}_lag_1"] = hist[-1][i]
            feat[f"{col}_lag_2"] = hist[-2][i]
            feat[f"{col}_lag_3"] = hist[-3][i]
            
        for i, col in enumerate(WEATHER_COLS):
            feat[f"{col}_lag_1"] = weather_future[i]
            feat[f"{col}_lag_2"] = weather_future[i]
            feat[f"{col}_lag_3"] = weather_future[i]

        Xf = pd.DataFrame([feat])

        # ===== lag CUACA (pakai nilai rata-rata masa depan) =====


        # ===== prediksi semua polutan =====
        new_vals = []
        preds_pol = {}

        for i, col in enumerate(POLLUTANT_COLS):
            pred = pollutant_models[col].predict(Xf)[0]
            new_vals.append(pred)
            preds_pol[col] = pred

        # ===== update history (rolling window) =====
        hist.append(new_vals)
        hist.pop(0)

        # ===== simpan =====
        feat.update(preds_pol)
        feat["id"] = row["id"]

        forecast_rows.append(feat)

df_future = pd.DataFrame(forecast_rows)
df_future = add_ispu_interactions(df_future)

In [56]:


sub = pd.read_csv("sample_submission.csv")

sub["tanggal"] = pd.to_datetime(sub["id"].str.split("_").str[0])
sub["lokasi_clean"] = sub["id"].str.split("_").str[1]

sub = sub.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

# encode lokasi (PASTI sama dengan df_model)
sub["lokasi_enc"] = le_loc.transform(sub["lokasi_clean"])




In [57]:
results = []

for _, row in df_future.iterrows():

    feat_ispu = {col: row[col] for col in POLLUTANT_COLS}

    feat_ispu["pm_ratio"] = row["pm_duakomalima"] / (row["pm_sepuluh"] + 1e-3)
    feat_ispu["gas_sum"] = row["ozon"] + row["nitrogen_dioksida"] + row["sulfur_dioksida"]
    feat_ispu["pm25_ozon"] = row["pm_duakomalima"] * row["ozon"]
    feat_ispu["pm10_no2"] = row["pm_sepuluh"] * row["nitrogen_dioksida"]


    feat_ispu.update({
        "month": row["month"],
        "dayofyear": row["dayofyear"],
        "dayofweek": row["dayofweek"],
        "lokasi_enc": row["lokasi_enc"],   
    })

    Xi = pd.DataFrame([feat_ispu])

    prob = ispu_model.predict(Xi)
    pred = prob.argmax(1)

    for c, t in enumerate(best_t):
        pred[prob[:, c] > t] = c

    results.append(int(pred[0]))


In [58]:
INV_LABEL_MAP = {0: "BAIK", 1: "SEDANG", 2: "TIDAK SEHAT"}

sub_final = pd.read_csv("sample_submission.csv")

pred_df = df_future[["id"]].copy()
pred_df["kategori"] = [INV_LABEL_MAP[i] for i in results]


sub_final = sub_final.merge(pred_df, on="id", how="left")

sub_final.to_csv("submission_step6.csv", index=False)

print("✅ submission_step6.csv siap upload")


✅ submission_step6.csv siap upload


In [59]:
import pandas as pd

df = pd.read_csv("submission_step6.csv", sep=";")  
df.to_csv("submission_step_6.csv", index=False)     


In [60]:
df_future[POLLUTANT_COLS].describe()


Unnamed: 0,pm_sepuluh,pm_duakomalima,ozon,nitrogen_dioksida,sulfur_dioksida,karbon_monoksida
count,455.0,455.0,455.0,455.0,455.0,455.0
mean,46.488784,47.334042,64.271536,28.54716,17.27377,23.783079
std,11.056087,4.098559,6.628635,3.269486,3.271835,1.646593
min,30.975903,39.45868,34.718069,22.162419,9.486155,15.448885
25%,37.566295,44.617876,59.233533,26.307729,15.674436,22.674737
50%,43.637282,47.397322,64.830733,27.784053,16.62711,23.978223
75%,56.628306,49.806336,69.266844,29.60815,17.935507,24.950018
max,68.134252,61.750067,75.974955,38.52064,41.998066,28.192965


In [61]:
pd.Series(results).value_counts()


1    452
2      2
0      1
Name: count, dtype: int64