In [1]:
# =========================================================
# FINAL FIXED ISPU PIPELINE — REAL POLLUTANT FORECAST → MAX
# =========================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

# Pollutant columns that are forecast individually further down.
POLLUTANTS = ["pm_sepuluh", "pm_duakomalima", "ozon"]

# Substrings used to pick the exogenous (weather / vegetation) columns by name.
WEATHER_KEYWORDS = ("hujan", "angin", "suhu", "lembab", "pressure", "radiasi", "ndvi")

# Load the semicolon-separated dataset, parse day-first dates and order the
# rows chronologically within each location.
df = pd.read_csv("merged_libur_cuaca_ispu_ndvi.csv", sep=";")
df["tanggal"] = pd.to_datetime(df["tanggal"], dayfirst=True)
df = df.sort_values(by=["lokasi_clean", "tanggal"]).reset_index(drop=True)

# Every column whose lower-cased name contains one of the keywords above.
WEATHER = [
    name
    for name in df.columns
    if any(keyword in name.lower() for keyword in WEATHER_KEYWORDS)
]

# Keep only rows where all three pollutant readings are present.
df = df.dropna(subset=POLLUTANTS)

# ===============================
# REINDEX
# ===============================
def reindex_daily(g):
    """Expand one location's rows onto a gap-free daily calendar.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single location; must contain ``tanggal`` (datetime) and
        ``lokasi_clean``.

    Returns
    -------
    pandas.DataFrame
        One row per day between the first and last ``tanggal`` of ``g``.
        Days that were missing are inserted with NaN in every column except
        ``lokasi_clean``, which is filled with the location of the first day.
    """
    dates = g["tanggal"]
    full_days = pd.date_range(start=dates.min(), end=dates.max(), freq="D")

    daily = g.set_index("tanggal").reindex(full_days)
    # The first calendar day always exists in the input, so its location
    # label is never NaN and can be broadcast to the inserted rows.
    location = daily["lokasi_clean"].iloc[0]
    daily["lokasi_clean"] = location

    daily = daily.reset_index()
    return daily.rename(columns={"index": "tanggal"})

# Put every location on a gap-free daily calendar.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# reindex_daily reads g["lokasi_clean"], and recent pandas releases deprecate
# passing the grouping column to the applied function, so the explicit loop
# keeps this step working regardless of the pandas version.
daily_parts = [reindex_daily(g) for _, g in df.groupby("lokasi_clean")]
if daily_parts:
    df = pd.concat(daily_parts, ignore_index=True)
df = df.sort_values(["lokasi_clean","tanggal"]).reset_index(drop=True)

# ===============================
# TIME FEATURES
# ===============================
# Cyclical encodings so that December/January and day 365/day 1 are neighbours.
df["month_sin"] = np.sin(2*np.pi*df["tanggal"].dt.month/12)
df["month_cos"] = np.cos(2*np.pi*df["tanggal"].dt.month/12)
df["doy_sin"]   = np.sin(2*np.pi*df["tanggal"].dt.dayofyear/365)
df["doy_cos"]   = np.cos(2*np.pi*df["tanggal"].dt.dayofyear/365)

TIME_FEATS = ["month_sin","month_cos","doy_sin","doy_cos"]

# ===============================
# LAG BUILDER
# ===============================
def build_lag(g, col):
    """Return a copy of ``g`` with lag and rolling features of ``col`` added.

    Added columns, in this order:
    ``{col}_lag_1``, ``{col}_lag_2``, ``{col}_lag_3``, ``{col}_lag_7``, then
    for each window in (3, 7) ``{col}_roll_mean_{w}`` and ``{col}_roll_std_{w}``.

    The rolling statistics are computed on the series shifted by one row, so
    the value of the current row never contributes to its own features. The
    standard deviation is the pandas default (sample, ``ddof=1``).

    Shifts are row based; the caller is expected to pass rows already ordered
    in time.
    """
    out = g.copy()
    series = out[col]

    for step in (1, 2, 3, 7):
        out[f"{col}_lag_{step}"] = series.shift(step)

    # Statistics over the `window` rows strictly before the current one.
    previous = series.shift(1)
    for window in (3, 7):
        roller = previous.rolling(window)
        out[f"{col}_roll_mean_{window}"] = roller.mean()
        out[f"{col}_roll_std_{window}"] = roller.std()

    return out

# ===============================
# TRAIN
# ===============================
# One LightGBM regressor per (location, pollutant).
# models[loc][pol] -> (booster, feature list); ranges[loc][pol] -> (min, max)
models = {}
ranges = {}

for loc, g in df.groupby("lokasi_clean"):

    loc_models = models[loc] = {}
    loc_ranges = ranges[loc] = {}

    for pol in POLLUTANTS:

        # Drop every row that still has a NaN in any column after the lag
        # features are added.
        g_feat = build_lag(g, pol).dropna()

        # Lag/rolling columns of this pollutant, then calendar and weather.
        lag_cols = [
            c for c in g_feat.columns
            if pol in c and ("lag" in c or "roll" in c)
        ]
        FEATS = lag_cols + TIME_FEATS + WEATHER

        X, y = g_feat[FEATS], g_feat[pol]

        train_params = {
            "objective":"regression",
            "learning_rate":0.03,
            "num_leaves":63,
            "min_data_in_leaf":20,
            "feature_fraction":0.8,
            "bagging_fraction":0.8,
            "bagging_freq":1,
            "verbosity":-1,
            "seed":SEED,
        }
        train_set = lgb.Dataset(X,label=y)
        model = lgb.train(train_params, train_set, num_boost_round=500)

        loc_models[pol] = (model, FEATS)

        # Observed range of the target; used later to keep the recursive
        # forecast inside values seen during training.
        loc_ranges[pol] = (y.min(), y.max())

# ===============================
# LOAD SUBMISSION
# ===============================
sub = pd.read_csv("sample_submission.csv")

# The id is parsed as "<date>_<location>". Split on the first underscore only,
# so a location name that itself contains "_" is kept whole instead of being
# truncated at its first underscore.
id_parts = sub["id"].str.split("_", n=1)
sub["tanggal"] = pd.to_datetime(id_parts.str[0])
sub["lokasi_clean"] = id_parts.str[1]

# Same ordering as the forecast loop (location, then date), so predictions
# appended in loop order line up with the rows of `sub`.
sub = sub.sort_values(["lokasi_clean","tanggal"]).reset_index(drop=True)

# ===============================
# FORECAST
# ===============================
# Recursive day-by-day forecast: each pollutant is predicted from its own
# history, the prediction is pushed back into that history, and the daily
# category is derived from the largest of the three predicted values.
results = []

for loc, gsub in sub.groupby("lokasi_clean"):

    # The last 30 rows of this location seed the lag / rolling windows.
    hist = df[df["lokasi_clean"]==loc].sort_values("tanggal").iloc[-30:]
    hist_dict = {pol: hist[pol].tolist() for pol in POLLUTANTS}

    # Future weather is not available: the last known value is reused for
    # every forecast day. It is constant per location, so compute it once.
    last_weather = {w: hist[w].iloc[-1] for w in WEATHER}

    for _, row in gsub.iterrows():
        tgl = row["tanggal"]

        # Calendar and weather features are identical for all pollutants of
        # the same day, so they are built once per day.
        shared_feat = {
            "month_sin": np.sin(2*np.pi*tgl.month/12),
            "month_cos": np.cos(2*np.pi*tgl.month/12),
            "doy_sin": np.sin(2*np.pi*tgl.dayofyear/365),
            "doy_cos": np.cos(2*np.pi*tgl.dayofyear/365),
        }
        shared_feat.update(last_weather)

        preds = []

        for pol in POLLUTANTS:
            model, FEATS = models[loc][pol]
            series = hist_dict[pol]

            feat = dict(shared_feat)

            for l in [1,2,3,7]:
                feat[f"{pol}_lag_{l}"] = series[-l]

            for r in [3,7]:
                vals = series[-r:]
                feat[f"{pol}_roll_mean_{r}"] = np.mean(vals)
                # ddof=1 matches pandas rolling().std(), which build_lag uses
                # for the training features; numpy's default (ddof=0) would
                # feed the model a systematically smaller value.
                feat[f"{pol}_roll_std_{r}"]  = np.std(vals, ddof=1)

            # Select the columns in the exact order used for training.
            Xi = pd.DataFrame([feat])[FEATS]
            pred = model.predict(Xi)[0]

            # Keep the recursion inside the range seen during training.
            lo, hi = ranges[loc][pol]
            pred = np.clip(pred, lo, hi)

            # Slide the window: newest prediction in, oldest value out.
            series.append(pred)
            series.pop(0)

            preds.append(pred)

        # The daily value is the maximum over the pollutant forecasts.
        max_pred = max(preds)

        # Map the value to the category label.
        if max_pred <= 50:
            kategori = "BAIK"
        elif max_pred <= 100:
            kategori = "SEDANG"
        else:
            kategori = "TIDAK SEHAT"

        results.append(kategori)

# ===============================
# SAVE
# ===============================
# Attach the predicted categories and write the submission file.
sub["kategori"] = results

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written.
submission_path = "submission_F1.csv"
sub[["id","kategori"]].to_csv(submission_path, index=False)

print(f"âœ… FINAL FIXED SUBMISSION GENERATED â†’ {submission_path}")


âœ… FINAL FIXED SUBMISSION GENERATED â†’ submission_FIXED.csv


In [49]:
# Modelling frame: only rows with a known `target`.
# NOTE(review): `target` is not created in any cell visible here; presumably
# it comes from an earlier cell of the session. TODO confirm.
df_model = df[df["target"].notna()].reset_index(drop=True)

def make_target_lag(g):
    """Add lagged versions of ``target`` to one location's rows.

    The rows are sorted by ``tanggal`` first. Three columns are added:
    ``target_lag1`` and ``target_lag2`` (the target one and two rows earlier)
    and ``target_roll7`` (mean of the seven rows before the current one).
    Lags are row based, not calendar based.

    Returns a new frame; the input is not modified.
    """
    ordered = g.sort_values("tanggal")
    target = ordered["target"]
    return ordered.assign(
        target_lag1=target.shift(1),
        target_lag2=target.shift(2),
        target_roll7=target.shift(1).rolling(7).mean(),
    )

# Add the target lags location by location.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# pandas deprecates passing the grouping column to the applied function, and
# without it the result would lose `lokasi_clean`, which later cells need.
# Rows come out grouped by location with their original index labels; the
# date-mask split below does not depend on row order.
target_lag_parts = [make_target_lag(g) for _, g in df_model.groupby("lokasi_clean")]
df_model = pd.concat(target_lag_parts) if target_lag_parts else make_target_lag(df_model)


# Chronological hold-out: everything from this date onward is validation.
SPLIT_DATE = "2024-01-01"

train_df = df_model[df_model["tanggal"] < SPLIT_DATE].copy()
valid_df = df_model[df_model["tanggal"] >= SPLIT_DATE].copy()

def make_lag_features(g):
    """Add 1-, 2- and 3-row lags of every pollutant column to one location.

    The rows are sorted by ``tanggal`` first. For each column in the
    module-level ``POLLUTANT_COLS`` the columns ``{col}_lag1``, ``{col}_lag2``
    and ``{col}_lag3`` are added. Lags are row based, not calendar based.

    Returns the sorted frame with the new columns.
    """
    ordered = g.sort_values("tanggal")
    for col in POLLUTANT_COLS:
        source = ordered[col]
        for step in (1, 2, 3):
            ordered[f"{col}_lag{step}"] = source.shift(step)
    return ordered

def _apply_per_location(frame, func):
    """Run ``func`` on each location's rows of ``frame`` and stack the results.

    Explicit iteration replaces groupby(...).apply(...), whose habit of passing
    the grouping column to ``func`` is deprecated in pandas; iterating keeps
    ``lokasi_clean`` in the result on every pandas version. An empty frame is
    passed to ``func`` directly so the new columns still exist.
    """
    parts = [func(g) for _, g in frame.groupby("lokasi_clean")]
    return pd.concat(parts) if parts else func(frame)


# NOTE(review): lags are computed after the split, so the first three
# validation rows of each location have NaN lags rather than values taken
# from the end of the training period. Confirm this is intended.
train_df = _apply_per_location(train_df, make_lag_features)
valid_df = _apply_per_location(valid_df, make_lag_features)





  df_model = df_model.groupby("lokasi_clean", group_keys=False).apply(make_target_lag)
  train_df = train_df.groupby("lokasi_clean", group_keys=False).apply(make_lag_features)
  valid_df = valid_df.groupby("lokasi_clean", group_keys=False).apply(make_lag_features)


In [50]:
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

le_loc = LabelEncoder()

# Fit on every location in df_model (train_df and valid_df are both slices of
# it). Fitting on train_df alone makes transform() raise for a location that
# only appears in the validation period. When both periods contain the same
# locations the resulting codes are unchanged.
le_loc.fit(df_model["lokasi_clean"])

train_df["lokasi_enc"] = le_loc.transform(train_df["lokasi_clean"])
valid_df["lokasi_enc"] = le_loc.transform(valid_df["lokasi_clean"])

# df_model needs the encoded location too: it is read again at inference time.
df_model["lokasi_enc"] = le_loc.transform(df_model["lokasi_clean"])

# Feature order for the pollutant forecasters: all lag-1 columns, all lag-2,
# all lag-3, then calendar fields and the encoded location.
# NOTE(review): `month`, `dayofyear` and `dayofweek` are not created in the
# cells visible here; presumably an earlier cell adds them. TODO confirm.
FEATURES_F = (
    [f"{c}_lag1" for c in POLLUTANT_COLS] +
    [f"{c}_lag2" for c in POLLUTANT_COLS] +
    [f"{c}_lag3" for c in POLLUTANT_COLS] +
    ["month", "dayofyear", "dayofweek", "lokasi_enc"]
)

# One regressor per pollutant, shared across locations (the location enters
# through the `lokasi_enc` feature). pollutant_models[col] -> booster
pollutant_models = {}

# The feature matrix is the same for every pollutant; only the label changes.
X = train_df[FEATURES_F]

pollutant_params = {
    "objective": "regression",
    "metric": "l2",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "verbosity": -1,
    "seed": 42,
}

for col in POLLUTANT_COLS:
    y = train_df[col]
    dtrain = lgb.Dataset(X, label=y)

    # A fresh copy of the parameters is handed to every training run.
    model = lgb.train(dict(pollutant_params), dtrain, num_boost_round=300)

    pollutant_models[col] = model


In [51]:
# Cyclical calendar columns, in the order:
# month_sin, month_cos, doy_sin, doy_cos, dow_sin, dow_cos
TEMPORAL_COLS = [
    f"{period}_{wave}"
    for period in ("month", "doy", "dow")
    for wave in ("sin", "cos")
]

# Feature list of the category classifier: pollutant values, weather,
# calendar, lagged target and encoded location, in that order.
ISPU_FEATURES = [
    *POLLUTANT_COLS,
    *WEATHER_COLS,
    *TEMPORAL_COLS,
    "target_lag1",
    "target_lag2",
    "target_roll7",
    "lokasi_enc",
]



In [None]:
# =====================================================
# DIRECT MULTI-HORIZON ISPU — FINAL CLEAN VERSION
# =====================================================
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

# Number of steps ahead covered by the direct model.
HORIZON = 7

# Classifier inputs: the base features plus the step-ahead index.
FEATURES_DIRECT = [*ISPU_FEATURES, "horizon"]

# -------------------------------
# BUILD DATASET DIRECT
# -------------------------------
def build_direct(df, horizon, features=None):
    """Build the training table for the direct multi-horizon classifier.

    For every location and every base row that has at least ``horizon`` later
    rows, one output row is produced per step ``h`` in ``1..horizon``. It
    holds the base row's feature values, ``horizon = h`` and
    ``target`` = the target found ``h`` rows after the base row.

    Steps are counted in rows after sorting by ``tanggal``, not in calendar
    days, so a gap in the dates makes step ``h`` span more than ``h`` days.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lokasi_clean``, ``tanggal``, ``target`` and every
        feature column.
    horizon : int
        Number of steps ahead to generate per base row.
    features : list of str, optional
        Feature columns copied from the base row. Defaults to the module-level
        ``ISPU_FEATURES``.

    Returns
    -------
    pandas.DataFrame
        Columns ``features + ["horizon", "target"]``. When no location has
        more than ``horizon`` rows the frame is empty but keeps these columns.
    """
    if features is None:
        features = ISPU_FEATURES
    features = list(features)

    rows = []

    for _, g in df.groupby("lokasi_clean"):
        g = g.sort_values("tanggal").reset_index(drop=True)

        # Pull the features and targets out of the frame once per location
        # instead of re-slicing a row for every (base row, step) pair.
        base_records = g[features].to_dict("records")
        targets = g["target"].tolist()

        for i in range(len(g) - horizon):
            base = base_records[i]

            for h in range(1, horizon + 1):
                feat = dict(base)
                feat["horizon"] = h
                feat["target"] = targets[i + h]
                rows.append(feat)

    if not rows:
        # Keep the schema so column selection downstream still works.
        return pd.DataFrame(columns=[*features, "horizon", "target"])

    return pd.DataFrame(rows)


# Expand both splits into (base row, horizon) samples.
train_direct = build_direct(train_df, HORIZON)
valid_direct = build_direct(valid_df, HORIZON)

X_train, y_train = train_direct[FEATURES_DIRECT], train_direct["target"]
X_valid, y_valid = valid_direct[FEATURES_DIRECT], valid_direct["target"]

# -------------------------------
# CLASS WEIGHT
# -------------------------------
# Balanced weights for the three classes, turned into one weight per
# training row.
classes = np.array([0, 1, 2])
cw = compute_class_weight("balanced", classes=classes, y=y_train)
cw_dict = {label: weight for label, weight in zip(classes, cw)}
sample_weights = y_train.map(cw_dict).values

# -------------------------------
# TRAIN MODEL
# -------------------------------
dtrain = lgb.Dataset(X_train, label=y_train, weight=sample_weights)
dvalid = lgb.Dataset(X_valid, label=y_valid)

params = dict(
    objective="multiclass",
    num_class=3,
    learning_rate=0.03,
    num_leaves=63,
    min_data_in_leaf=40,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=1,
    verbosity=-1,
    seed=42,
)

# log_evaluation(0) keeps the per-iteration validation log silent.
silent_logging = lgb.log_evaluation(0)
direct_model = lgb.train(
    params,
    dtrain,
    num_boost_round=700,
    valid_sets=[dvalid],
    callbacks=[silent_logging],
)

# -------------------------------
# VALIDATION
# -------------------------------
# predict() returns one probability per class; the label is the arg-max.
probs = direct_model.predict(X_valid)
preds = probs.argmax(axis=1)

macro_f1 = f1_score(y_valid, preds, average="macro")
print("Direct Macro-F1:", round(macro_f1, 4))


# =====================================================
# INFERENCE SUBMISSION (FINAL FIXED)
# =====================================================
sub = pd.read_csv("sample_submission.csv")

# The id is parsed as "<date>_<location>". Split on the first underscore only,
# so a location name that itself contains "_" is kept whole.
id_parts = sub["id"].str.split("_", n=1)
sub["tanggal"] = pd.to_datetime(id_parts.str[0])
sub["lokasi_clean"] = id_parts.str[1]

# The inference loop appends predictions location by location, each in date
# order, and the list is later assigned to `sub` positionally. Sorting `sub`
# the same way guarantees every prediction lands on its own id even when the
# sample file is not already ordered like that.
sub = sub.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

results = []

# The clipping ceiling of each pollutant does not depend on the row being
# predicted, so it is computed once instead of once per forecast step.
pollutant_caps = {col: train_df[col].quantile(0.995) for col in POLLUTANT_COLS}

for loc in sub["lokasi_clean"].unique():

    loc_hist = (
        df_model[df_model["lokasi_clean"] == loc]
        .sort_values("tanggal")
    )

    # Last three rows of pollutant values: the seed of the recursive lags.
    hist_pol = loc_hist.iloc[-3:][POLLUTANT_COLS].values.tolist()

    # Most recent row: the source of every feature that is not overwritten
    # below (weather, lagged target, encoded location).
    last_row = loc_hist.iloc[-1]

    # Constant per location, so encoded once.
    loc_enc = le_loc.transform([loc])[0]

    future = sub[sub["lokasi_clean"] == loc].sort_values("tanggal")

    for h, (_, row) in enumerate(future.iterrows(), start=1):
        tgl = row["tanggal"]

        # -------------------------
        # 1. FORECAST POLLUTANTS
        # -------------------------
        feat_f = {
            "month": tgl.month,
            "dayofyear": tgl.dayofyear,
            "dayofweek": tgl.dayofweek,
            "lokasi_enc": loc_enc,
        }

        for i, col in enumerate(POLLUTANT_COLS):
            feat_f[f"{col}_lag1"] = hist_pol[-1][i]
            feat_f[f"{col}_lag2"] = hist_pol[-2][i]
            feat_f[f"{col}_lag3"] = hist_pol[-3][i]

        # Booster.predict reads DataFrame columns by position and, by default,
        # does not check their names. The dict above is filled in a different
        # order than FEATURES_F (the order used for training), so the columns
        # must be put back into training order before predicting.
        Xf = pd.DataFrame([feat_f])[FEATURES_F]

        new_pol = []
        for col in POLLUTANT_COLS:
            p = pollutant_models[col].predict(Xf)[0]
            p = np.clip(p, 0, pollutant_caps[col])
            new_pol.append(p)

        # Slide the lag window: newest forecast in, oldest row out.
        hist_pol.append(new_pol)
        hist_pol.pop(0)

        # -------------------------
        # 2. BUILD DIRECT FEATURES
        # -------------------------
        feat = last_row[ISPU_FEATURES].to_dict()

        feat["month_sin"] = np.sin(2*np.pi*tgl.month/12)
        feat["month_cos"] = np.cos(2*np.pi*tgl.month/12)

        feat["doy_sin"] = np.sin(2*np.pi*tgl.dayofyear/365)
        feat["doy_cos"] = np.cos(2*np.pi*tgl.dayofyear/365)

        feat["dow_sin"] = np.sin(2*np.pi*tgl.dayofweek/7)
        feat["dow_cos"] = np.cos(2*np.pi*tgl.dayofweek/7)

        # NOTE(review): build_direct pairs the pollutant values of the base
        # row with a target `h` rows later, whereas here the pollutant slots
        # hold the forecast for the target day itself. Confirm that this
        # difference between training and inference is intended.
        for i, col in enumerate(POLLUTANT_COLS):
            feat[col] = new_pol[i]

        # Steps beyond the trained range reuse the largest trained horizon.
        feat["horizon"] = min(h, HORIZON)

        Xi = pd.DataFrame([feat])[FEATURES_DIRECT]

        # -------------------------
        # 3. PREDICT ISPU
        # -------------------------
        prob = direct_model.predict(Xi)[0]
        pred = int(np.argmax(prob))

        results.append(pred)

# -------------------------------
# SAVE SUBMISSION
# -------------------------------
# Class index -> category label written to the submission file.
INV_LABEL_MAP = {0: "BAIK", 1: "SEDANG", 2: "TIDAK SEHAT"}

# `results` holds one class index per row of `sub`, in row order.
sub["kategori"] = list(map(INV_LABEL_MAP.__getitem__, results))

submission_view = sub.loc[:, ["id", "kategori"]]
submission_view.to_csv("submission_direct.csv", index=False)

print("âœ… submission_direct.csv siap upload")




Direct Macro-F1: 0.5298
âœ… submission_direct.csv siap upload


: 