In [1]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column


In [2]:
path = find_file("merged_libur_cuaca_ispu_ndvi.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

# df.head()

In [3]:
# make sure tanggal is datetime
df["tanggal"] = pd.to_datetime(df["tanggal"], errors="coerce")

# sort once globally (we still sort per location later)
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

print(df.shape)
display(df.head())


(15257, 41)


Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,...,wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,is_libur
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,...,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,...,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,1
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,...,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,1
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,,2.0,16.0,15.0,6.0,22.0,...,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,,2.0,17.0,15.0,8.0,25.0,...,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,0


In [4]:
KATEGORI_MAP = {
    "BAIK": "BAIK",
    "SEDANG": "SEDANG",
    "TIDAK SEHAT": "TIDAK SEHAT",
    "TIDAK BAIK": "TIDAK SEHAT",      # normalize
    "SANGAT TIDAK SEHAT": "SANGAT TIDAK SEHAT",
    "TIDAK ADA DATA": np.nan
}

df["kategori"] = df["kategori"].map(KATEGORI_MAP)


In [5]:
LABEL_MAP = {
    "BAIK": 0,
    "SEDANG": 1,
    "TIDAK SEHAT": 2,
    "SANGAT TIDAK SEHAT": 3
}

INV_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}

df["kategori_enc"] = df["kategori"].map(LABEL_MAP)
df["kategori_valid"] = df["kategori_enc"].notna()

df["kategori"].value_counts(dropna=False)


kategori
SEDANG                10343
TIDAK SEHAT            2424
BAIK                   2286
SANGAT TIDAK SEHAT      203
NaN                       1
Name: count, dtype: int64

In [6]:
FEATURES = [
    # pollutants
    "pm_sepuluh",
    "pm_duakomalima",
    "sulfur_dioksida",
    "karbon_monoksida",
    "ozon",
    "nitrogen_dioksida",

    # weather (exact column names)
    "temperature_2m_mean (°C)",
    "relative_humidity_2m_mean (%)",
    "wind_speed_10m_mean (km/h)",
    "precipitation_sum (mm)",
    "surface_pressure_mean (hPa)"
]


In [7]:
import random
import tensorflow as tf

SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)


In [8]:
def build_sequences(X, y, valid_mask, lookback):
    X_seq, y_seq = [], []

    for i in range(lookback, len(X)):
        if not valid_mask.iloc[i]:
            continue
        X_seq.append(X.iloc[i-lookback:i].values)
        y_seq.append(y.iloc[i])

    return np.array(X_seq), np.array(y_seq)


In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report


In [None]:
def train_lstm_per_location(
    df_loc,
    location_name,
    lookback=14,
    min_samples=60
):
    df_loc = df_loc.sort_values("tanggal").reset_index(drop=True)

    # ===== time encoding =====
    df_loc["day_of_year"] = df_loc["tanggal"].dt.dayofyear
    df_loc["doy_sin"] = np.sin(2 * np.pi * df_loc["day_of_year"] / 365)
    df_loc["doy_cos"] = np.cos(2 * np.pi * df_loc["day_of_year"] / 365)

    feats = FEATURES + ["doy_sin", "doy_cos"]

    # ===== numeric + missing handling =====
    df_loc[feats] = df_loc[feats].astype(float)
    df_loc[feats] = df_loc[feats].fillna(method="ffill").fillna(method="bfill")

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(df_loc[feats]),
        columns=feats
    )

    # ===== sequences =====
    X_seq, y_seq = build_sequences(
        X_scaled,
        df_loc["kategori_enc"],
        df_loc["kategori_valid"],
        lookback
    )

    if len(y_seq) < min_samples:
        print(f"⚠️ Skip {location_name}: insufficient data ({len(y_seq)})")
        return None

    # ===== time split =====
    split = int(len(X_seq) * 0.8)
    X_train, X_val = X_seq[:split], X_seq[split:]
    y_train, y_val = y_seq[:split], y_seq[split:]

    # ===== class weights =====
    classes = np.unique(y_train)
    weights = compute_class_weight(
        class_weight="balanced",
        classes=classes,
        y=y_train
    )
    class_weight = dict(zip(classes, weights))

    # ===== model =====
    model = Sequential([
        LSTM(64, input_shape=(lookback, X_seq.shape[2])),
        Dropout(0.3),
        Dense(len(LABEL_MAP), activation="softmax")
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    # ===== training =====
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=40,
        batch_size=32,
        class_weight=class_weight,
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
        verbose=0
    )

    # ===== evaluation =====
    y_pred = np.argmax(model.predict(X_val, verbose=0), axis=1)
    labels_present = sorted(np.unique(np.concatenate([y_val, y_pred])))

    report = classification_report(
        y_val,
        y_pred,
        labels=labels_present,
        target_names=[INV_LABEL_MAP[i] for i in labels_present],
        output_dict=True,
        zero_division=0
    )


    return report

for col in df.columns:
    print(repr(col))


'tanggal'
'periode_data'
'stasiun'
'pm_sepuluh'
'pm_duakomalima'
'sulfur_dioksida'
'karbon_monoksida'
'ozon'
'nitrogen_dioksida'
'max'
'parameter_pencemar_kritis'
'kategori'
'id'
'time'
'temperature_2m_max (°C)'
'temperature_2m_min (°C)'
'precipitation_sum (mm)'
'precipitation_hours (h)'
'wind_speed_10m_max (km/h)'
'wind_direction_10m_dominant (°)'
'shortwave_radiation_sum (MJ/m²)'
'temperature_2m_mean (°C)'
'relative_humidity_2m_mean (%)'
'cloud_cover_mean (%)'
'surface_pressure_mean (hPa)'
'wind_gusts_10m_max (km/h)'
'winddirection_10m_dominant (°)'
'relative_humidity_2m_max (%)'
'relative_humidity_2m_min (%)'
'cloud_cover_max (%)'
'cloud_cover_min (%)'
'wind_gusts_10m_mean (km/h)'
'wind_speed_10m_mean (km/h)'
'wind_gusts_10m_min (km/h)'
'wind_speed_10m_min (km/h)'
'surface_pressure_max (hPa)'
'surface_pressure_min (hPa)'
'lokasi'
'lokasi_clean'
'ndvi'
'is_libur'
'kategori_enc'
'kategori_valid'


In [11]:
results = {}

locations = df["lokasi_clean"].dropna().unique()

for loc in locations:
    print(f"Training LSTM for lokasi_clean: {loc}")
    df_loc = df[df["lokasi_clean"] == loc].copy()

    rep = train_lstm_per_location(df_loc, loc)

    if rep is not None:
        results[loc] = rep


Training LSTM for lokasi_clean: DKI1


  df_loc[feats] = df_loc[feats].fillna(method="ffill").fillna(method="bfill")
  super().__init__(**kwargs)


ValueError: Number of classes, 3, does not match size of target_names, 4. Try specifying the labels parameter

In [None]:
rows = []

for loc, rep in results.items():
    rows.append({
        "lokasi_clean": loc,
        "f1_weighted": rep["weighted avg"]["f1-score"],
        "f1_macro": rep["macro avg"]["f1-score"],
        "f1_BAIK": rep["BAIK"]["f1-score"],
        "f1_SEDANG": rep["SEDANG"]["f1-score"],
        "f1_TIDAK_SEHAT": rep["TIDAK SEHAT"]["f1-score"]
    })

summary_df = pd.DataFrame(rows)
summary_df.sort_values("f1_weighted", ascending=False)


In [None]:
summary_df.mean(numeric_only=True)
