In [46]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



# Tokens passed to pd.read_csv(na_values=...) so they are parsed as missing values.
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Locate the first file matching ``name`` at or above a starting directory.

    The search runs a recursive glob inside ``start``; if nothing matches it
    repeats the recursive glob in each parent directory in turn, up to the
    filesystem root.

    Parameters
    ----------
    name : str
        File name (or glob pattern) handed to ``Path.rglob``.
    start : str | pathlib.Path | None
        Directory to begin from. ``None`` (the default) means the current
        working directory *at call time*.

    Returns
    -------
    pathlib.Path | None
        The first match yielded by ``rglob``, or ``None`` if no ancestor
        contains a match.
    """
    # A ``start=Path.cwd()`` default would be evaluated once, when the function
    # is defined, and silently ignore any later change of working directory.
    if start is None:
        start = Path.cwd()
    start = Path(start)

    for ancestor in (start, *start.parents):
        # rglob is lazy: stop at the first hit instead of listing every match.
        match = next(ancestor.rglob(name), None)
        if match is not None:
            return match
    return None


def find_files(file_map):
    """Resolve every filename in ``file_map`` through ``find_file``.

    ``file_map`` maps an arbitrary key to a filename. The result maps the same
    keys to the located paths; entries whose file could not be found are left
    out and a warning line is printed for each of them.
    """
    located = {}
    for label, fname in file_map.items():
        hit = find_file(fname)
        if not hit:
            print(f"[WARNING] File not found: {fname}")
            continue
        located[label] = hit
    return located

# Locate the project's helper module so it can be imported from this notebook.
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# Put the directory that holds script_eda.py on sys.path — only once, so that
# re-running this cell does not keep appending duplicate entries.
eda_script_dir = str(eda_script_path.parent)
if eda_script_dir not in sys.path:
    sys.path.append(eda_script_dir)

# With the directory on sys.path the helper module can now be imported.
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column


In [47]:
# Locate the merged weather / NDVI / ISPU dataset by name via find_file.
path = find_file("merged_cuaca_ndvi_ispu.csv")

# Stop early with an explicit error instead of passing None to read_csv.
if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

# Placeholder tokens listed in NA_VALUES are parsed as missing values.
df = pd.read_csv(path, na_values=NA_VALUES)

# df.head()

In [48]:

# Fail fast when the loaded frame lacks a column that later cells rely on.
REQUIRED_COLS = ["tanggal", "lokasi_clean", "kategori"]
missing = [name for name in REQUIRED_COLS if name not in df.columns]
assert not missing, f"Missing columns: {missing}"

# Parse the date column so date arithmetic and chronological sorting work.
df["tanggal"] = pd.to_datetime(df["tanggal"])


In [49]:
# Rows without a category cannot be mapped to a target label, so drop them.
df = df.dropna(subset=["kategori"])


In [50]:
# Fold the two more severe categories into "TIDAK SEHAT"; LABEL_MAP further
# down only defines the three classes BAIK / SEDANG / TIDAK SEHAT.
df["kategori"] = df["kategori"].replace({
    "SANGAT TIDAK SEHAT": "TIDAK SEHAT",
    "BERBAHAYA": "TIDAK SEHAT"
})


In [51]:
# Order rows chronologically within each location (needed by the shift-based
# features) and reset to a 0..n-1 index: the CV cell uses df's index labels as
# positions into NumPy arrays.
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)


In [52]:
# Date of the previous record at the same location (NaT for a location's first row).
prev_obs_date = df.groupby("lokasi_clean")["tanggal"].shift(1)
df["prev_tanggal"] = prev_obs_date

# Whole days elapsed since that previous record.
elapsed = df["tanggal"] - df["prev_tanggal"]
df["delta_days"] = elapsed.dt.days


In [53]:
# Summary statistics of the gap (in days) between consecutive records per location.
display(df["delta_days"].describe())


count    15252.000000
mean         1.744820
std          6.412906
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max        339.000000
Name: delta_days, dtype: float64

In [54]:
# Integer class codes for the three target categories, plus the reverse
# lookup used to turn predicted codes back into category names.
LABEL_MAP = {
    category: code
    for code, category in enumerate(("BAIK", "SEDANG", "TIDAK SEHAT"))
}
INV_LABEL_MAP = dict(zip(LABEL_MAP.values(), LABEL_MAP.keys()))

# Encoded target column; categories missing from LABEL_MAP become NaN.
df["y"] = df["kategori"].map(LABEL_MAP)


In [55]:
# LABEL_MAP_PARAM = {
#     "PM10": 0,
#     "SO2": 1,
#     "CO": 2,
#     "O3": 3,
#     "NO2": 4,
# }
# INV_LABEL_MAP_PARAM = {v: k for k, v in LABEL_MAP_PARAM.items()}

# df["y_param"] = df["parameter_pencemar_kritis"].map(LABEL_MAP_PARAM)

In [56]:
# df[["parameter_pencemar_kritis", "y_param"]].head()
# df["y_param"].value_counts(dropna=False)


In [57]:
# Source columns from which the lag-1 and 7-row rolling features are derived.
# They are not fed to the model directly (see FEATURES).
BASE_FEATURES = [
    "pm_sepuluh", "sulfur_dioksida", "karbon_monoksida", "ozon", "nitrogen_dioksida",
    "temperature_2m_mean (°C)",
    "relative_humidity_2m_mean (%)",
    "precipitation_sum (mm)",
    "wind_speed_10m_mean (km/h)",
    "cloud_cover_mean (%)",
    "ndvi",
]

# Columns that are used as model inputs as-is.
META_FEATURES = ["delta_days"]

In [58]:
# Preview the first rows, including the prev_tanggal / delta_days / y columns added above.
df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,...,wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,prev_tanggal,delta_days,y
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,...,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,NaT,,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,...,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,2010-01-01,1.0,0
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,...,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,2010-01-02,1.0,0
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,,2.0,16.0,15.0,6.0,22.0,...,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,2010-01-03,1.0,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,,2.0,17.0,15.0,8.0,25.0,...,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,2010-01-04,1.0,0


In [59]:

# Lag-1 features: the previous record's value at the same location, kept only
# when that record is at most 2 days old; otherwise the feature is NaN.
LAG_FEATURES = []

lag_is_recent = df["delta_days"] <= 2

for col in BASE_FEATURES:
    lag_col = f"{col}_lag_1"
    previous_value = df.groupby("lokasi_clean")[col].shift(1)
    df[lag_col] = previous_value.where(lag_is_recent)
    LAG_FEATURES.append(lag_col)


In [60]:
# 7-row rolling mean of each base feature, computed from *previous* rows only
# (shift(1) excludes the current row) and strictly within one location.
ROLL7_FEATURES = []

for col in BASE_FEATURES:
    roll_col = f"{col}_roll7"
    # Both the shift and the rolling window must run inside the group.
    # `groupby(...).shift(1)` alone returns an ordinary Series, so chaining
    # `.rolling(...)` after it slides the window over the whole frame and the
    # first rows of each location would average in the previous location's tail.
    df[roll_col] = (
        df.groupby("lokasi_clean")[col]
          .transform(lambda s: s.shift(1).rolling(7, min_periods=3).mean())
    )
    ROLL7_FEATURES.append(roll_col)


In [61]:
# ROLL14_FEATURES = []

# for col in BASE_FEATURES:
#     roll_col = f"{col}_roll14"
#     df[roll_col] = (
#         df.groupby("lokasi_clean")[col]
#           .shift(1)
#           .rolling(14, min_periods=5)
#           .mean()
#     )
#     ROLL14_FEATURES.append(roll_col)


In [62]:
# Final ordered list of model input columns; the same list is used to align
# the inference-time feature frame.
FEATURES = (
     META_FEATURES
    + LAG_FEATURES
    + ROLL7_FEATURES
    # + ROLL14_FEATURES
)


In [63]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import numpy as np

N_SPLITS = 5

# One shared hyper-parameter set; a fresh classifier is built from it per fold.
CV_LGBM_PARAMS = {
    "objective": "multiclass",
    "num_class": 3,
    "n_estimators": 700,
    "learning_rate": 0.03,
    "num_leaves": 31,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "class_weight": "balanced",
    "random_state": 42,
}

# Out-of-fold predictions (-1 = row never validated) and a flag marking the
# rows that received a prediction.
oof = np.full(len(df), -1)
mask = np.zeros(len(df), dtype=bool)

# Time-series cross-validation is run separately for every location.
for loc, dloc in df.groupby("lokasi_clean"):
    dloc = dloc.sort_values("tanggal")
    idx = dloc.index.values  # df index labels, used as positions into oof / mask
    X = dloc[FEATURES].values
    y = dloc["y"].values

    tscv = TimeSeriesSplit(n_splits=N_SPLITS, gap=7)
    for tr, va in tscv.split(X):
        model = LGBMClassifier(**CV_LGBM_PARAMS)
        model.fit(X[tr], y[tr])
        pred = model.predict(X[va])

        validated_rows = idx[va]
        oof[validated_rows] = pred
        mask[validated_rows] = True

# Score only the rows that were part of some validation fold.
cv = f1_score(df.loc[mask, "y"], oof[mask], average="macro")
print("CV Macro-F1:", cv)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1775
[LightGBM] [Info] Number of data points in the train set: 471, number of used features: 23
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2902
[LightGBM] [Info] Number of data points in the train set: 948, number of used features: 23
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000571 seconds

In [64]:
# Final classifier: same hyper-parameters as the per-fold models in the CV cell.
final_model = LGBMClassifier(
    objective="multiclass",
    num_class=3,
    n_estimators=700,
    learning_rate=0.03,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42
)

# Fit on every row of df (all locations together), unlike the per-location CV folds.
final_model.fit(df[FEATURES], df["y"])


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4321
[LightGBM] [Info] Number of data points in the train set: 15257, number of used features: 23
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [65]:
# Locate the submission template that lists the ids to predict.
path = find_file("sample_submission.csv")

if path is None:
    # Name the file that is actually missing; the previous message was copied
    # from the merged-dataset cell and pointed at the wrong file.
    raise FileNotFoundError("❌ File sample_submission.csv tidak ditemukan")

sub = pd.read_csv(path, na_values=NA_VALUES)
sub.head()

Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


In [66]:


# Each id has the form "<date>_<location>"; split once and reuse both parts.
id_parts = sub["id"].str.split("_")
sub["tanggal"] = pd.to_datetime(id_parts.str[0])
sub["lokasi_clean"] = id_parts.str[1]

sub.head()

Unnamed: 0,id,category,tanggal,lokasi_clean
0,2025-09-01_DKI1,,2025-09-01,DKI1
1,2025-09-01_DKI2,,2025-09-01,DKI2
2,2025-09-01_DKI3,,2025-09-01,DKI3
3,2025-09-01_DKI4,,2025-09-01,DKI4
4,2025-09-01_DKI5,,2025-09-01,DKI5


In [67]:
# History frame consumed (and extended) by the prediction loop; starts as a copy of df.
df_hist = df.copy()


In [68]:
def build_test_features(df_hist, tanggal, lokasi, base_features=None, max_lag_gap_days=2):
    """Build the single-row feature frame for one (date, location) to predict.

    The features mirror the ones built for training:

    * ``delta_days``      – days between ``tanggal`` and the latest earlier
      record at the location.
    * ``<col>_lag_1``     – value of ``col`` in that latest record, or NaN when
      the record is more than ``max_lag_gap_days`` days old.
    * ``<col>_roll7``     – mean of ``col`` over the last 7 earlier records,
      or NaN when fewer than 3 of them are non-null.

    Parameters
    ----------
    df_hist : pandas.DataFrame
        History with at least ``lokasi_clean``, ``tanggal`` and the base
        feature columns.
    tanggal : datetime-like
        Date to build features for; only strictly earlier records are used.
    lokasi : str
        Value of ``lokasi_clean`` to filter on.
    base_features : list[str] | None
        Columns to derive lag / rolling features from. Defaults to the
        notebook-level ``BASE_FEATURES``.
    max_lag_gap_days : int
        Largest gap (in days) for which the lag-1 value is still used.

    Returns
    -------
    (pandas.DataFrame | None, pandas.DataFrame)
        The one-row feature frame (``None`` when fewer than two earlier records
        exist) and the filtered, date-sorted history for the location.
    """
    if base_features is None:
        base_features = BASE_FEATURES
    tanggal = pd.Timestamp(tanggal)

    hist = df_hist[
        (df_hist["lokasi_clean"] == lokasi) &
        (df_hist["tanggal"] < tanggal)
    ].sort_values("tanggal")

    if len(hist) < 2:
        return None, hist

    last = hist.iloc[-1]

    # META: gap between the row being predicted and its previous record, which
    # is how delta_days is defined on the training frame (not the gap between
    # the two most recent history rows).
    delta_days = (tanggal - last["tanggal"]).days
    row = {"delta_days": delta_days}

    # LAG1: column names must match the training columns ("_lag_1"), otherwise
    # aligning to the training feature list replaces every lag with NaN.
    # Stale values are blanked exactly as in training.
    lag_is_recent = delta_days <= max_lag_gap_days
    for col in base_features:
        row[f"{col}_lag_1"] = last[col] if lag_is_recent else np.nan

    # ROLL7: like rolling(7, min_periods=3), require 3 non-null observations
    # rather than 3 rows.
    for col in base_features:
        vals = hist[col].iloc[-7:]
        row[f"{col}_roll7"] = vals.mean() if vals.count() >= 3 else np.nan

    return pd.DataFrame([row]), hist


In [70]:
# Predicted category name for every submission row, in the order of `sub`.
preds = []

for _, r in sub.iterrows():

    # Features are built from df_hist, which grows with earlier predictions
    # made in this same loop.
    X_test, hist = build_test_features(
        df_hist,
        r["tanggal"],
        r["lokasi_clean"]
    )

    # ===== fallback when there is not enough history =====
    if X_test is None:
        if len(hist) == 0:
            preds.append("SEDANG")  # global safe fallback
        else:
            # Repeat the label of the only earlier record for this location.
            last_label = hist["y"].iloc[-1]
            preds.append(INV_LABEL_MAP[last_label])
        continue

    # ===== normal prediction =====
    # Align to the training column list; any column build_test_features did
    # not produce is filled with NaN.
    X_test = X_test.reindex(columns=FEATURES, fill_value=np.nan)
    y_hat = final_model.predict(X_test)[0]

    preds.append(INV_LABEL_MAP[y_hat])

    # Append the prediction to the history so later dates can see it. Only
    # tanggal / lokasi_clean / y are set here, so every other column of the
    # appended row is NaN.
    new_row = {
        "tanggal": r["tanggal"],
        "lokasi_clean": r["lokasi_clean"],
        "y": y_hat
    }
    df_hist = pd.concat([df_hist, pd.DataFrame([new_row])], ignore_index=True)



In [71]:
# Attach the predictions (same order as the rows of `sub`) and write only the
# id / category columns to the submission file.
sub["category"] = preds
sub[["id", "category"]].to_csv("submission.csv", index=False)

print("✅ submission.csv berhasil dibuat")


✅ submission.csv berhasil dibuat
