In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import LabelEncoder



NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Locate ``name`` by searching ``start`` and then each of its ancestors.

    Every candidate directory is searched recursively with ``Path.rglob``,
    nearest directory first, so a match close to ``start`` wins over one
    found higher up the tree.

    Parameters
    ----------
    name : str
        File name (or glob pattern) handed to ``Path.rglob``.
    start : str or pathlib.Path, optional
        Directory to begin from. Defaults to the current working directory,
        resolved when the function is called rather than when it is defined.

    Returns
    -------
    pathlib.Path or None
        The first match found, or ``None`` when no ancestor contains one.
    """
    start = Path.cwd() if start is None else Path(start)
    for ancestor in (start, *start.parents):
        # Only the first hit is used, so stop the recursive walk as soon as
        # one is produced instead of collecting every match.
        match = next(ancestor.rglob(name), None)
        if match is not None:
            return match
    return None


def find_files(file_map):
    """Resolve several files at once via ``find_file``.

    Parameters
    ----------
    file_map : dict
        Mapping of an arbitrary key to the file name to look for.

    Returns
    -------
    dict
        ``{key: path}`` for every file that was located. Files that could not
        be found are reported with a printed warning and left out.
    """
    located = {}
    for key, filename in file_map.items():
        hit = find_file(filename)
        if not hit:
            print(f"[WARNING] File not found: {filename}")
            continue
        located[key] = hit
    return located

# Locate the project's EDA helper module with find_file.
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# Make the module's directory importable. Guarded so that re-running this
# cell does not keep appending the same entry to sys.path.
eda_script_dir = str(eda_script_path.parent)
if eda_script_dir not in sys.path:
    sys.path.append(eda_script_dir)

# The helpers can be imported now that their directory is on sys.path.
from script_eda import (
    evaluate_dataset,
    extract_column_schema,
    find_internal_duplicate_columns,
    extract_single_schema,
    cek_value_data_column,
)








In [None]:
# Load the merged dataset, treating the placeholder strings in NA_VALUES as missing.
path = find_file("merged_cuaca_ndvi_ispu.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

# The train/validation windows below are selected by comparing `tanggal`
# against ISO date strings. Parse the column to datetime so those comparisons
# are chronological instead of lexicographic string comparisons, and so an
# unparseable date fails loudly here rather than silently skewing the splits.
# NOTE(review): assumes pandas can infer the date format — confirm against the CSV.
df["tanggal"] = pd.to_datetime(df["tanggal"])

df.head()

In [None]:
# Name of the label column to predict.
TARGET = "kategori"

# Columns removed from the feature matrix.
DROP_COLS = [
    TARGET,
    "max",
    "parameter_pencemar_kritis",

    # identifier / non-feature
    "id",
    "tanggal",
    "periode_data",
    "time",
    "stasiun",
    "lokasi",
    "lokasi_clean",
]

# errors="ignore" skips any listed column that is absent from df.
X = df.drop(columns=DROP_COLS, errors="ignore")
y = df[TARGET]

# Encode the class labels as integers. `y_enc` and `le` are used by the
# training, tuning and export cells further down, so they must be defined
# here for the notebook to survive Restart & Run All.
# Kept as a Series sharing df's index so the boolean date masks built from df
# select matching rows.
# NOTE(review): rows with a missing target are not filtered here — confirm
# that `kategori` has no NaN values.
le = LabelEncoder()
y_enc = pd.Series(le.fit_transform(y), index=y.index, name=TARGET)




In [None]:
# Evaluation windows: name -> ((train_start, train_end), (val_start, val_end)).
# Every training range starts on 2010-01-01 and ends the day before its
# validation range begins.
WINDOWS = {
    "W1": (
        ("2010-01-01", "2022-12-31"),
        ("2023-01-01", "2023-06-30"),
    ),
    "W2": (
        ("2010-01-01", "2023-06-30"),
        ("2023-07-01", "2023-12-31"),
    ),
    "W3": (
        ("2010-01-01", "2023-12-31"),
        ("2024-01-01", "2024-12-31"),
    ),
    "W4": (
        ("2010-01-01", "2024-12-31"),
        ("2025-01-01", "2025-12-31"),
    ),
}


#### Set up training


In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, confusion_matrix

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


In [None]:
import os
import random
import numpy as np

# Single seed shared by every stochastic component in this notebook.
SEED = 42

# Seed NumPy's legacy global RNG and Python's built-in RNG.
np.random.seed(SEED)
random.seed(SEED)

# Exported for hash-seed reproducibility.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# presumably only affects child processes — confirm if hash order matters.
os.environ["PYTHONHASHSEED"] = str(SEED)


In [None]:
# Baseline candidates compared across the evaluation windows.
# Both use the shared SEED (instead of a repeated literal) so changing the
# seed in one place affects every model in the notebook.
MODELS = {
    "LightGBM": LGBMClassifier(
        objective="multiclass",
        class_weight="balanced",
        n_estimators=500,
        learning_rate=0.05,
        random_state=SEED,
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        eval_metric="mlogloss",
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=SEED,
    ),
}

In [None]:
# Fit every candidate model on each window's training range and score it on
# the window's validation range. One record per (window, model) is collected.
results = []

for w_name, (train_rng, val_rng) in WINDOWS.items():
    print(f"\n================ {w_name} ================")

    # Both bounds of each range are inclusive.
    train_mask = df["tanggal"].between(train_rng[0], train_rng[1])
    val_mask = df["tanggal"].between(val_rng[0], val_rng[1])

    X_train, X_val = X[train_mask], X[val_mask]
    y_train, y_val = y_enc[train_mask], y_enc[val_mask]

    print(f"Train size: {X_train.shape}, Val size: {X_val.shape}")

    # A window whose date range has no rows cannot be fit or scored; report
    # it and move on so the remaining windows are still evaluated.
    if len(X_train) == 0 or len(X_val) == 0:
        print(f"[WARNING] {w_name}: empty train/validation split, window skipped")
        continue

    for model_name, model in MODELS.items():
        print(f"\n--- {model_name} ---")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        macro_f1 = f1_score(y_val, y_pred, average="macro")
        print(f"Macro-F1: {macro_f1:.4f}")

        # Confusion matrix
        cm = confusion_matrix(y_val, y_pred)
        print("Confusion Matrix:")
        print(cm)

        # Collapse check: fewer distinct predicted classes than classes
        # actually present in the validation labels.
        if len(np.unique(y_pred)) < len(np.unique(y_val)):
            print("⚠️ WARNING: model collapse ke kelas mayoritas")

        results.append({
            "window": w_name,
            "model": model_name,
            "macro_f1": macro_f1
        })


In [None]:
# One row per (window, model) evaluation collected by the comparison loop.
results_df = pd.DataFrame(results)

# Mean and standard deviation of macro-F1 per model, best mean first.
summary = results_df.groupby("model")["macro_f1"].agg(["mean", "std"])
summary = summary.sort_values("mean", ascending=False)

print("\n===== MODEL COMPARISON SUMMARY =====")
print(summary)


In [None]:
# Materialise the W3 split as notebook-level variables.
train_rng, val_rng = WINDOWS["W3"]

# Inclusive on both ends, same as chaining >= and <=.
train_mask = df["tanggal"].between(*train_rng)
val_mask = df["tanggal"].between(*val_rng)

X_train = X[train_mask]
X_val = X[val_mask]
y_train = y_enc[train_mask]
y_val = y_enc[val_mask]

print("W3 Train:", X_train.shape)
print("W3 Val  :", X_val.shape)

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
import numpy as np

def objective(trial):
    """Optuna objective: macro-F1 of a LightGBM model on the current split.

    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from the notebook's
    global namespace, so the desired split must be materialised before the
    study is run.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Validation macro-F1, reduced by 0.05 when the model predicts fewer
        distinct classes than are present in ``y_val``.
    """
    params = {
        "objective": "multiclass",
        "class_weight": "balanced",
        "n_estimators": 500,

        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.08),
        "num_leaves": trial.suggest_int("num_leaves", 16, 64),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 30, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 10),

        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
        # without it the sampled fraction has no effect. It is sampled through
        # the trial so it is part of study.best_params and is carried over to
        # any model later built from those params.
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),

        "random_state": SEED,
        "n_jobs": 1
    }

    model = LGBMClassifier(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    # Macro-F1 is the primary metric.
    macro_f1 = f1_score(y_val, y_pred, average="macro")

    # Collapse penalty: applied when fewer classes are predicted than exist
    # in the validation labels.
    n_pred_class = len(np.unique(y_pred))
    n_true_class = len(np.unique(y_val))

    if n_pred_class < n_true_class:
        macro_f1 -= 0.05  # small but deliberate penalty

    return macro_f1


In [None]:
from tqdm.auto import tqdm

# Number of Optuna trials to run.
N_TRIALS = 50


def tqdm_callback(study, trial):
    """Advance the progress bar by one step after each finished trial."""
    pbar.update(1)


# Seeded TPE sampler; the study maximises the objective's return value.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED)
)

# The context manager closes the progress bar even when a trial raises or the
# run is interrupted, so no stale bar is left behind in the notebook.
with tqdm(total=N_TRIALS, desc="Optuna Tuning (W3)") as pbar:
    study.optimize(
        objective,
        n_trials=N_TRIALS,
        callbacks=[tqdm_callback]
    )

print("Best Macro-F1 (W3):", study.best_value)
print("Best Params:", study.best_params)


In [None]:
# Rows used to fit the final model: 2010-01-01 through 2024-12-31, inclusive.
final_train_mask = df["tanggal"].between("2010-01-01", "2024-12-31")

X_final = X[final_train_mask]
y_final = y_enc[final_train_mask]

print("Final training size:", X_final.shape)


In [None]:
# Hyperparameters selected by the Optuna study.
best_params = study.best_params

# Fixed settings first, tuned values unpacked last; keyword order does not
# affect the resulting estimator.
final_model = LGBMClassifier(
    objective="multiclass",
    class_weight="balanced",
    n_estimators=500,
    random_state=SEED,
    n_jobs=1,
    **best_params,
)

final_model.fit(X_final, y_final)


In [None]:
from sklearn.metrics import f1_score

# Re-evaluate the tuned hyperparameters on every window: fit a fresh model on
# each training range and record its validation macro-F1.
f1_scores = {}

for w_name, (train_rng, val_rng) in WINDOWS.items():

    # Both bounds of each range are inclusive.
    train_mask = df["tanggal"].between(train_rng[0], train_rng[1])
    val_mask = df["tanggal"].between(val_rng[0], val_rng[1])

    X_train, X_val = X[train_mask], X[val_mask]
    y_train, y_val = y_enc[train_mask], y_enc[val_mask]

    # A window without rows cannot be fit or scored; skip it and leave it
    # out of the average instead of aborting the whole evaluation.
    if len(X_train) == 0 or len(X_val) == 0:
        print(f"[WARNING] {w_name}: empty train/validation split, window skipped")
        continue

    model = LGBMClassifier(
        **best_params,
        objective="multiclass",
        class_weight="balanced",
        n_estimators=500,
        random_state=SEED,
        n_jobs=1
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    f1 = f1_score(y_val, y_pred, average="macro")
    f1_scores[w_name] = f1

    print(f"{w_name} Macro-F1: {f1:.4f}")

# Averaging an empty list would yield NaN with a runtime warning.
if f1_scores:
    print("\nAverage Macro-F1:", np.mean(list(f1_scores.values())))
else:
    print("\nAverage Macro-F1: n/a (no window contained data)")


In [None]:
import joblib

# Persist the fitted model together with the label encoder; both files are
# written to the current working directory.
joblib.dump(value=final_model, filename="lgbm_ispu_model.pkl")
joblib.dump(value=le, filename="label_encoder.pkl")

print("✅ Model dan LabelEncoder berhasil disimpan")
