In [70]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column

path = find_file("short_CLEANED.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)
df.head()

Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,id,stasiun_clean,tanggal_raw,delta_hari
0,202101,2021-01-01,DKI1 (Bunderan HI),38.0,53.0,29.0,6.0,31.0,13.0,53.0,PM25,SEDANG,2021-01-01_DKI1,DKI1,2021-01-01,84.0
1,202101,2021-01-02,DKI1 (Bunderan HI),27.0,46.0,27.0,7.0,47.0,7.0,47.0,O3,BAIK,2021-01-02_DKI1,DKI1,2021-01-02,1.0
2,202101,2021-01-03,DKI1 (Bunderan HI),44.0,58.0,25.0,7.0,40.0,13.0,58.0,PM25,SEDANG,2021-01-03_DKI1,DKI1,2021-01-03,1.0
3,202101,2021-01-04,DKI1 (Bunderan HI),30.0,48.0,24.0,4.0,32.0,7.0,48.0,PM25,BAIK,2021-01-04_DKI1,DKI1,2021-01-04,1.0
4,202101,2021-01-05,DKI1 (Bunderan HI),38.0,53.0,24.0,6.0,31.0,9.0,53.0,PM25,SEDANG,2021-01-05_DKI1,DKI1,2021-01-05,1.0


In [71]:
df["tanggal"] = pd.to_datetime(df["tanggal"], errors="coerce")


In [72]:
df = df.sort_values(["stasiun_clean", "tanggal"]).reset_index(drop=True)


In [73]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["kategori_enc"] = le.fit_transform(df["kategori"])
N_CLASSES = len(le.classes_)

print("Classes:", le.classes_)


Classes: ['BAIK' 'SEDANG' 'TIDAK SEHAT']


In [74]:
import numpy as np

df["bulan"] = df["tanggal"].dt.month
df["sin_bulan"] = np.sin(2 * np.pi * df["bulan"] / 12)
df["cos_bulan"] = np.cos(2 * np.pi * df["bulan"] / 12)


In [75]:
FEATURES = [
    "pm_duakomalima",
    "pm_sepuluh",
    "sulfur_dioksida",
    "karbon_monoksida",
    "ozon",
    "nitrogen_dioksida",
    "delta_hari",
    "sin_bulan",
    "cos_bulan",
]

TARGET = "kategori_enc"
WINDOW = 7
GAP_THRESHOLD = 7


In [76]:
from sklearn.preprocessing import StandardScaler
split_date = pd.Timestamp("2025-08-01")

df_scaled = df.copy()
scalers = {}

for st in df["stasiun_clean"].unique():
    mask_st = df["stasiun_clean"] == st
    mask_train_time = df["tanggal"] < split_date

    scaler = StandardScaler()

    train_idx = mask_st & mask_train_time
    all_idx = mask_st

    scaler.fit(df.loc[train_idx, FEATURES])
    df_scaled.loc[all_idx, FEATURES] = scaler.transform(df.loc[all_idx, FEATURES])

    scalers[st] = scaler


In [77]:
def build_sequences_multistation(df, features, target, window, gap_threshold):
    X_list, y_list, date_list = [], [], []

    for st in df["stasiun_clean"].unique():

        sub = df[df["stasiun_clean"] == st] \
              .sort_values("tanggal") \
              .reset_index(drop=True)

        for i in range(len(sub) - window):

            delta_window = sub["delta_hari"].iloc[i+1:i+window+1]

            # skip kalau gap besar
            if (delta_window > gap_threshold).any():
                continue

            X_seq = sub[features].iloc[i:i+window].values
            y_val = sub[target].iloc[i+window]
            y_date = sub["tanggal"].iloc[i+window]

            X_list.append(X_seq)
            y_list.append(y_val)
            date_list.append(y_date)

    return (
        np.array(X_list),
        np.array(y_list),
        np.array(date_list)
    )


In [78]:
X, y, seq_dates = build_sequences_multistation(
    df_scaled,
    FEATURES,
    TARGET,
    WINDOW,
    GAP_THRESHOLD
)

print(X.shape, y.shape, seq_dates.shape)


(6840, 7, 9) (6840,) (6840,)


In [79]:
split_date = pd.Timestamp("2025-08-01")

train_mask = seq_dates < split_date
valid_mask = seq_dates >= split_date


In [80]:
X_train, y_train = X[train_mask], y[train_mask]
X_valid, y_valid = X[valid_mask], y[valid_mask]

print("Train:", X_train.shape, "Valid:", X_valid.shape)


Train: (6687, 7, 9) Valid: (153, 7, 9)


In [81]:
import tensorflow as tf
from tensorflow.keras import layers, models

N_FEATURES = X_train.shape[2]
N_CLASSES = len(le.classes_)

model = models.Sequential([
    layers.Input(shape=(WINDOW, N_FEATURES)),

    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),

    layers.LSTM(32),
    layers.Dropout(0.2),

    layers.Dense(32, activation="relu"),
    layers.Dense(N_CLASSES, activation="softmax")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


ModuleNotFoundError: No module named 'tensorflow.keras'

In [None]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        patience=5,
        restore_best_weights=True
    )
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)


In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_valid).argmax(axis=1)

print(classification_report(y_valid, y_pred, target_names=le.classes_))


In [None]:
last_windows = {}

for st in df_scaled["stasiun_clean"].unique():
    sub = df_scaled[df_scaled["stasiun_clean"] == st].sort_values("tanggal")

    last_window = sub.iloc[-WINDOW:][FEATURES].values
    last_date = sub["tanggal"].iloc[-1]

    last_windows[st] = {
        "window": last_window,
        "date": last_date
    }


In [None]:
from datetime import timedelta

def recursive_forecast(model, last_windows, start_date, end_date):
    results = []

    for st, info in last_windows.items():

        window = info["window"].copy()
        current_date = start_date

        while current_date <= end_date:

            # prediksi kategori
            pred = model.predict(window[np.newaxis, ...], verbose=0)
            pred_class = pred.argmax(axis=1)[0]
            pred_label = le.inverse_transform([pred_class])[0]

            results.append({
                "id": f"{current_date.date()}_{st}",
                "kategori": pred_label
            })

            # ===== update window =====
            # fitur masa depan TIDAK ada → pakai pendekatan sederhana:
            # geser window & isi baris baru dengan nol kecuali fitur waktu

            new_row = np.zeros(window.shape[1])

            # update fitur waktu musiman
            bulan = current_date.month
            new_row[FEATURES.index("sin_bulan")] = np.sin(2*np.pi*bulan/12)
            new_row[FEATURES.index("cos_bulan")] = np.cos(2*np.pi*bulan/12)

            # delta hari = 1
            new_row[FEATURES.index("delta_hari")] = 1

            # geser window
            window = np.vstack([window[1:], new_row])

            current_date += timedelta(days=1)

    return pd.DataFrame(results)


In [None]:
forecast_df = recursive_forecast(
    model,
    last_windows,
    start_date=pd.Timestamp("2025-09-01"),
    end_date=pd.Timestamp("2025-11-30")
)

forecast_df.head()
