In [48]:
import pandas as pd
import numpy as np
from pathlib import Path

PATH = Path("merged_libur_cuaca_ispu_ndvi.csv")

df = pd.read_csv(PATH, sep=";")
# parse tanggal
df["tanggal"] = pd.to_datetime(df["tanggal"], dayfirst=True)

# sort
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

# drop kolom
DROP_COLS = ["max", "parameter_pencemar_kritis", "time", "id", "stasiun"]
df = df.drop(columns=DROP_COLS, errors="ignore")

# label mapping
LABEL_MAP = {"BAIK": 0, "SEDANG": 1, "TIDAK SEHAT": 2}
df = df[df["kategori"].notna()].copy()
df["target"] = df["kategori"].map(LABEL_MAP).astype(int)


# REINDEX 
def reindex_daily(g):
    idx = pd.date_range(g["tanggal"].min(), g["tanggal"].max(), freq="D")
    g = g.set_index("tanggal").reindex(idx)
    g["lokasi_clean"] = g["lokasi_clean"].iloc[0]
    return g.reset_index().rename(columns={"index": "tanggal"})

df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)


# PM2.5 HANDLING
df["pm25_missing"] = df["pm_duakomalima"].isna().astype(int)

median_pm25 = (
    df[df["tanggal"] >= "2021-01-01"]
    .groupby("lokasi_clean")["pm_duakomalima"]
    .median()
)

df["pm_duakomalima"] = df["pm_duakomalima"].fillna(
    df["lokasi_clean"].map(median_pm25)
)




# TIME FEATURES 
df["month"] = df["tanggal"].dt.month
df["dayofyear"] = df["tanggal"].dt.dayofyear
df["dayofweek"] = df["tanggal"].dt.dayofweek

df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
df["month_cos"] = np.cos(2*np.pi*df["month"]/12)

df["doy_sin"] = np.sin(2*np.pi*df["dayofyear"]/365)
df["doy_cos"] = np.cos(2*np.pi*df["dayofyear"]/365)

df["dow_sin"] = np.sin(2*np.pi*df["dayofweek"]/7)
df["dow_cos"] = np.cos(2*np.pi*df["dayofweek"]/7)

# fitur lag dan rolling

POLLUTANT_COLS = [
    "pm_sepuluh",
    "pm_duakomalima",
    "ozon",
    "nitrogen_dioksida",
    "sulfur_dioksida",
    "karbon_monoksida",
]

WEATHER_COLS = [
    "temperature_2m_mean (°C)",
    "relative_humidity_2m_mean (%)",
    "wind_speed_10m_mean (km/h)",
    "precipitation_sum (mm)",
    "cloud_cover_mean (%)",
    "surface_pressure_mean (hPa)",
]

LAG_FEATURES = POLLUTANT_COLS + WEATHER_COLS

LAGS = [1, 2, 3]
ROLL_WINDOWS = [3, 7]
def create_temporal_features(g):
    g = g.sort_values("tanggal")
    for col in LAG_FEATURES:
        for lag in LAGS:
            g[f"{col}_lag_{lag}"] = g[col].shift(lag)


    for col in POLLUTANT_COLS:
        for w in ROLL_WINDOWS:
            g[f"{col}_roll_mean_{w}"] = g[col].shift(1).rolling(w).mean()
            g[f"{col}_roll_std_{w}"]  = g[col].shift(1).rolling(w).std()

    return g

df = df.groupby("lokasi_clean", group_keys=False).apply(create_temporal_features)
lag_cols = [c for c in df.columns if "lag_" in c or "roll_" in c]

for col in lag_cols:
    df[f"{col}_isnan"] = df[col].isna().astype(int)




  df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)
  df = df.groupby("lokasi_clean", group_keys=False).apply(create_temporal_features)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astype(int)
  df[f"{col}_isnan"] = df[col].isna().astyp

In [49]:
df_model = df.dropna(subset=["target"]).reset_index(drop=True)

def make_target_lag(g):
    g = g.sort_values("tanggal")
    g["target_lag1"] = g["target"].shift(1)
    g["target_lag2"] = g["target"].shift(2)
    g["target_roll7"] = g["target"].shift(1).rolling(7).mean()
    return g

df_model = df_model.groupby("lokasi_clean", group_keys=False).apply(make_target_lag)


SPLIT_DATE = "2024-01-01"

train_df = df_model[df_model["tanggal"] < SPLIT_DATE].copy()
valid_df = df_model[df_model["tanggal"] >= SPLIT_DATE].copy()

def make_lag_features(g):
    g = g.sort_values("tanggal")
    for col in POLLUTANT_COLS:
        g[f"{col}_lag1"] = g[col].shift(1)
        g[f"{col}_lag2"] = g[col].shift(2)
        g[f"{col}_lag3"] = g[col].shift(3)
    return g

train_df = train_df.groupby("lokasi_clean", group_keys=False).apply(make_lag_features)
valid_df = valid_df.groupby("lokasi_clean", group_keys=False).apply(make_lag_features)





  df_model = df_model.groupby("lokasi_clean", group_keys=False).apply(make_target_lag)
  train_df = train_df.groupby("lokasi_clean", group_keys=False).apply(make_lag_features)
  valid_df = valid_df.groupby("lokasi_clean", group_keys=False).apply(make_lag_features)


In [50]:
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

le_loc = LabelEncoder()

train_df["lokasi_enc"] = le_loc.fit_transform(train_df["lokasi_clean"])
valid_df["lokasi_enc"] = le_loc.transform(valid_df["lokasi_clean"])

# >>> TAMBAHAN WAJIB
df_model["lokasi_enc"] = le_loc.transform(df_model["lokasi_clean"])

FEATURES_F = (
    [f"{c}_lag1" for c in POLLUTANT_COLS] +
    [f"{c}_lag2" for c in POLLUTANT_COLS] +
    [f"{c}_lag3" for c in POLLUTANT_COLS] +
    ["month", "dayofyear", "dayofweek", "lokasi_enc"]
)

pollutant_models = {}

for col in POLLUTANT_COLS:
    X = train_df[FEATURES_F]
    y = train_df[col]

    dtrain = lgb.Dataset(X, label=y)

    model = lgb.train(
        {
            "objective": "regression",
            "metric": "l2",
            "learning_rate": 0.05,
            "num_leaves": 64,
            "verbosity": -1,
            "seed": 42,
        },
        dtrain,
        num_boost_round=300
    )

    pollutant_models[col] = model


In [51]:
TEMPORAL_COLS = [
    "month_sin","month_cos",
    "doy_sin","doy_cos",
    "dow_sin","dow_cos",
]

ISPU_FEATURES = (
    POLLUTANT_COLS +
    WEATHER_COLS +
    TEMPORAL_COLS +
    ["target_lag1", "target_lag2", "target_roll7"] +
    ["lokasi_enc"]
)



In [52]:
# =====================================================
# DIRECT MULTI-HORIZON ISPU — FINAL CLEAN VERSION
# =====================================================
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

HORIZON = 7
FEATURES_DIRECT = ISPU_FEATURES + ["horizon"]

# -------------------------------
# BUILD DATASET DIRECT
# -------------------------------
def build_direct(df, horizon):
    rows = []

    for loc, g in df.groupby("lokasi_clean"):
        g = g.sort_values("tanggal").reset_index(drop=True)

        for i in range(len(g) - horizon):
            base = g.iloc[i]

            for h in range(1, horizon + 1):
                target = g.iloc[i + h]["target"]

                feat = base[ISPU_FEATURES].to_dict()
                feat["horizon"] = h
                feat["target"] = target

                rows.append(feat)

    return pd.DataFrame(rows)


train_direct = build_direct(train_df, HORIZON)
valid_direct = build_direct(valid_df, HORIZON)

X_train = train_direct[FEATURES_DIRECT]
y_train = train_direct["target"]

X_valid = valid_direct[FEATURES_DIRECT]
y_valid = valid_direct["target"]

# -------------------------------
# CLASS WEIGHT
# -------------------------------
classes = np.array([0, 1, 2])
cw = compute_class_weight("balanced", classes=classes, y=y_train)
cw_dict = dict(zip(classes, cw))
sample_weights = y_train.map(cw_dict).values

# -------------------------------
# TRAIN MODEL
# -------------------------------
dtrain = lgb.Dataset(X_train, label=y_train, weight=sample_weights)
dvalid = lgb.Dataset(X_valid, label=y_valid)

params = {
    "objective": "multiclass",
    "num_class": 3,
    "learning_rate": 0.03,
    "num_leaves": 63,
    "min_data_in_leaf": 40,
    "feature_fraction": 0.85,
    "bagging_fraction": 0.85,
    "bagging_freq": 1,
    "verbosity": -1,
    "seed": 42,
}

direct_model = lgb.train(
    params,
    dtrain,
    num_boost_round=700,
    valid_sets=[dvalid],
    callbacks=[lgb.log_evaluation(0)]  # silent
)


# -------------------------------
# VALIDASI
# -------------------------------
probs = direct_model.predict(X_valid)
preds = np.argmax(probs, axis=1)

print("Direct Macro-F1:", round(f1_score(y_valid, preds, average="macro"), 4))


# =====================================================
# INFERENCE SUBMISSION (FINAL FIXED)
# =====================================================
sub = pd.read_csv("sample_submission.csv")

sub["tanggal"] = pd.to_datetime(sub["id"].str.split("_").str[0])
sub["lokasi_clean"] = sub["id"].str.split("_").str[1]

results = []

for loc in sub["lokasi_clean"].unique():

    # ===== history polutan =====
    hist_pol = (
        df_model[df_model["lokasi_clean"] == loc]
        .sort_values("tanggal")
        .iloc[-3:][POLLUTANT_COLS]
        .values.tolist()
    )

    # ===== last feature row =====
    last_row = (
        df_model[df_model["lokasi_clean"] == loc]
        .sort_values("tanggal")
        .iloc[-1]
    )

    future = sub[sub["lokasi_clean"] == loc].sort_values("tanggal")

    for h, (_, row) in enumerate(future.iterrows(), start=1):

        # -------------------------
        # 1. FORECAST POLLUTAN
        # -------------------------
        feat_f = {
            "month": row["tanggal"].month,
            "dayofyear": row["tanggal"].dayofyear,
            "dayofweek": row["tanggal"].dayofweek,
            "lokasi_enc": le_loc.transform([loc])[0],
        }

        for i, col in enumerate(POLLUTANT_COLS):
            feat_f[f"{col}_lag1"] = hist_pol[-1][i]
            feat_f[f"{col}_lag2"] = hist_pol[-2][i]
            feat_f[f"{col}_lag3"] = hist_pol[-3][i]

        Xf = pd.DataFrame([feat_f])

        new_pol = []
        for col in POLLUTANT_COLS:
            p = pollutant_models[col].predict(Xf)[0]
            p = np.clip(p, 0, train_df[col].quantile(0.995))
            new_pol.append(p)

        hist_pol.append(new_pol)
        hist_pol.pop(0)

        # -------------------------
        # 2. BUILD DIRECT FEATURES
        # -------------------------
        feat = last_row[ISPU_FEATURES].to_dict()

        tgl = row["tanggal"]

        feat["month_sin"] = np.sin(2*np.pi*tgl.month/12)
        feat["month_cos"] = np.cos(2*np.pi*tgl.month/12)

        feat["doy_sin"] = np.sin(2*np.pi*tgl.dayofyear/365)
        feat["doy_cos"] = np.cos(2*np.pi*tgl.dayofyear/365)

        feat["dow_sin"] = np.sin(2*np.pi*tgl.dayofweek/7)
        feat["dow_cos"] = np.cos(2*np.pi*tgl.dayofweek/7)


        for i, col in enumerate(POLLUTANT_COLS):
            feat[col] = new_pol[i]

        feat["horizon"] = min(h, HORIZON)

        Xi = pd.DataFrame([feat])[FEATURES_DIRECT]

        # -------------------------
        # 3. PREDICT ISPU
        # -------------------------
        prob = direct_model.predict(Xi)[0]
        pred = int(np.argmax(prob))

        results.append(pred)

# -------------------------------
# SAVE SUBMISSION
# -------------------------------
INV_LABEL_MAP = {0: "BAIK", 1: "SEDANG", 2: "TIDAK SEHAT"}

sub["kategori"] = [INV_LABEL_MAP[p] for p in results]
sub[["id", "kategori"]].to_csv("submission_direct.csv", index=False)

print("✅ submission_direct.csv siap upload")




Direct Macro-F1: 0.5298
✅ submission_direct.csv siap upload
