In [167]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



# Placeholder strings that are handed to pd.read_csv via `na_values` when the
# merged CSV is loaded, so they are parsed as missing values.
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Locate the first file matching ``name`` under ``start`` or one of its ancestors.

    ``start`` is searched recursively first; if nothing matches, each parent
    directory is searched recursively in turn, nearest parent first.

    Args:
        name: File name or glob pattern handed to ``Path.rglob``.
        start: Directory to begin from (``str`` or ``Path``). Defaults to the
            current working directory at call time.

    Returns:
        The first matching ``Path``, or ``None`` if no searched directory
        contains a match.
    """
    # Resolve the default lazily: a ``Path.cwd()`` default argument is evaluated
    # once when the ``def`` runs and goes stale if the working directory changes.
    start = Path.cwd() if start is None else Path(start)
    for ancestor in (start, *start.parents):
        # Stop at the first hit instead of materialising the whole recursive walk.
        match = next(ancestor.rglob(name), None)
        if match is not None:
            return match
    return None


def find_files(file_map):
    """Resolve every filename in ``file_map`` to a path using ``find_file``.

    Args:
        file_map: Mapping of an arbitrary key to the filename to look up.

    Returns:
        Dict with the same keys mapped to whatever ``find_file`` returned.
        Keys whose lookup came back falsy are omitted and a warning is printed.
    """
    resolved = {}
    for label in file_map:
        wanted = file_map[label]
        location = find_file(wanted)
        if not location:
            print(f"[WARNING] File not found: {wanted}")
            continue
        resolved[label] = location
    return resolved

# Locate the EDA helper module; find_file returns None when nothing matched.
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# Make the directory that holds script_eda.py importable. The membership guard
# keeps sys.path from growing by one duplicate entry on every re-run of the cell.
eda_script_dir = str(eda_script_path.parent)
if eda_script_dir not in sys.path:
    sys.path.append(eda_script_dir)

# This import only resolves once the directory above is on sys.path.
from script_eda import (
    evaluate_dataset,
    extract_column_schema,
    find_internal_duplicate_columns,
    extract_single_schema,
    cek_value_data_column,
)


In [168]:
# Locate the merged dataset by file name using find_file.
path = find_file("merged_cuaca_ndvi_ispu.csv")

# find_file returns None when no match was found, so fail loudly here.
if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

# NA_VALUES lists the placeholder strings to parse as missing values.
df = pd.read_csv(path, na_values=NA_VALUES)

# df.head()

In [169]:

# Columns that the following cells read directly.
REQUIRED_COLS = ["tanggal", "lokasi_clean", "kategori"]

# Collect the absent columns (in REQUIRED_COLS order) and fail with the full list.
missing = list(filter(lambda c: c not in df.columns, REQUIRED_COLS))
assert not missing, f"Missing columns: {missing}"

# Parse the date column into datetime values.
df["tanggal"] = pd.to_datetime(df["tanggal"])


In [170]:
# `kategori` is later mapped to the target `y`; drop rows where it is missing.
df = df.dropna(subset=["kategori"])


In [171]:
# Fold the two categories above "TIDAK SEHAT" into "TIDAK SEHAT" itself.
df["kategori"] = df["kategori"].replace(
    to_replace=["SANGAT TIDAK SEHAT", "BERBAHAYA"],
    value="TIDAK SEHAT",
)


In [172]:
# Order rows by location, then by date, and renumber the index from 0.
df = df.sort_values(by=["lokasi_clean", "tanggal"], ignore_index=True)


In [173]:
# Date of the previous row within the same location (missing for a location's first row).
df["prev_tanggal"] = df.groupby("lokasi_clean")["tanggal"].shift(periods=1)
# Whole days between each row and the previous row of its location.
df["delta_days"] = df["tanggal"].sub(df["prev_tanggal"]).dt.days


In [174]:
# Summary statistics of the day gap between consecutive rows of the same location.
display(df["delta_days"].describe())


count    15252.000000
mean         1.744820
std          6.412906
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max        339.000000
Name: delta_days, dtype: float64

In [175]:
# Integer class id assigned to each `kategori` value.
LABEL_MAP = {
    "BAIK": 0,
    "SEDANG": 1,
    "TIDAK SEHAT": 2
}
# Reverse lookup: class id -> category name.
INV_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}

# Series.map silently turns any category missing from LABEL_MAP into NaN, which
# would make `y` float and leak NaN labels into training. Fail fast instead.
unmapped = sorted(set(df["kategori"].dropna().unique()) - set(LABEL_MAP), key=str)
if unmapped:
    raise ValueError(f"Unmapped kategori values: {unmapped}")

df["y"] = df["kategori"].map(LABEL_MAP)


In [176]:
# LABEL_MAP_PARAM = {
#     "PM10": 0,
#     "SO2": 1,
#     "CO": 2,
#     "O3": 3,
#     "NO2": 4,
# }
# INV_LABEL_MAP_PARAM = {v: k for k, v in LABEL_MAP_PARAM.items()}

# df["y_param"] = df["parameter_pencemar_kritis"].map(LABEL_MAP_PARAM)

In [177]:
# df[["parameter_pencemar_kritis", "y_param"]].head()
# df["y_param"].value_counts(dropna=False)


In [178]:
# Per-row measurement columns; lag and rolling features are derived from each of these.
BASE_FEATURES = [
    "pm_sepuluh", "sulfur_dioksida", "karbon_monoksida", "ozon", "nitrogen_dioksida",
    "temperature_2m_mean (°C)",
    "relative_humidity_2m_mean (%)",
    "precipitation_sum (mm)",
    "wind_speed_10m_mean (km/h)",
    "cloud_cover_mean (%)",
    "ndvi",
]

# Derived bookkeeping columns: delta_days is the day gap to the previous row of the same location.
META_FEATURES = ["delta_days"]

In [179]:
# Preview the first rows, including the derived prev_tanggal / delta_days / y columns.
df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,prev_tanggal,delta_days,y
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,NaT,,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,2010-01-01,1.0,0
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,2010-01-02,1.0,0
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,2010-01-03,1.0,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,2010-01-04,1.0,0


In [180]:

# One "safe" lag-1 column per base feature: the previous row's value within the
# same location, kept only when delta_days <= 2 and set to NaN otherwise.
LAG_FEATURES = [f"{col}_lag_1_safe" for col in BASE_FEATURES]

for col, lag_col in zip(BASE_FEATURES, LAG_FEATURES):
    previous_value = df.groupby("lokasi_clean")[col].shift(1)
    recent_enough = df["delta_days"] <= 2
    df[lag_col] = np.where(recent_enough, previous_value, np.nan)


In [181]:
# Rolling mean over the 7 rows *before* the current one (at least 3 non-missing
# values required), computed separately for every location.
# NOTE: the window counts rows, not calendar days, so date gaps are not accounted for.
ROLL7_FEATURES = []

for col in BASE_FEATURES:
    roll_col = f"{col}_roll7"
    # Shift and roll *inside* each group. `groupby(...).shift(1)` returns a plain
    # Series, so chaining `.rolling()` onto it would slide the window across
    # location boundaries and mix the tail of one location into the next.
    df[roll_col] = df.groupby("lokasi_clean")[col].transform(
        lambda s: s.shift(1).rolling(7, min_periods=3).mean()
    )
    ROLL7_FEATURES.append(roll_col)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the location names, then store each row's integer location id.
le_loc = LabelEncoder().fit(df["lokasi_clean"])
df["loc_id"] = le_loc.transform(df["lokasi_clean"])

In [None]:
# Number of rows in each input window.
SEQ_LEN = 14

# Model input columns, in order: base measurements, gap metadata,
# 7-row rolling means, and the encoded location id.
FEATURES = [*BASE_FEATURES, *META_FEATURES, *ROLL7_FEATURES, "loc_id"]


In [184]:
# Rows dated before SPLIT_DATE go to training; rows on or after it go to validation.
SPLIT_DATE = "2024-12-31"

train_df = df.loc[df["tanggal"] < SPLIT_DATE]
valid_df = df.loc[df["tanggal"] >= SPLIT_DATE]

# Row-wise (non-sequence) feature matrix and target for each split.
X_train, y_train = train_df[FEATURES], train_df["y"]
X_valid, y_valid = valid_df[FEATURES], valid_df["y"]


In [None]:

def build_sequences(df, seq_len, features, target="y",
                    group_col="lokasi_clean", time_col="tanggal"):
    """Cut a long-format frame into fixed-length windows, one group at a time.

    Within every group the rows are ordered by ``time_col``. Each window holds
    the ``seq_len`` rows *preceding* row ``i`` and is paired with the target
    value of row ``i``. Windows never span two groups.

    Args:
        df: Frame containing ``group_col``, ``time_col``, ``features`` and ``target``.
        seq_len: Number of rows per window.
        features: List of column names that form the last axis of ``X``.
        target: Column providing the label of each window.
        group_col: Column that separates independent series.
        time_col: Column used to order rows inside a group.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_windows, seq_len, len(features))``
        and ``y`` has shape ``(n_windows,)``. When no group is longer than
        ``seq_len`` both arrays are empty but keep those ranks.
    """
    X, y = [], []

    for _, g in df.groupby(group_col):
        g = g.sort_values(time_col)

        vals = g[features].values
        labels = g[target].values

        # Row i is the prediction target; rows [i - seq_len, i) are its inputs.
        for i in range(seq_len, len(g)):
            X.append(vals[i - seq_len:i])
            y.append(labels[i])

    if not X:
        # np.array([]) would collapse to shape (0,), breaking downstream code
        # that reads X.shape[1] / X.shape[2]; keep the 3-D layout instead.
        return np.empty((0, seq_len, len(features))), np.empty((0,))

    return np.array(X), np.array(y)


In [None]:
# Window each split on its own, replacing the row-wise X/y with sequence arrays.
X_train, y_train = build_sequences(df=train_df, seq_len=SEQ_LEN, features=FEATURES)
X_valid, y_valid = build_sequences(df=valid_df, seq_len=SEQ_LEN, features=FEATURES)

print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
n_features = X_train.shape[-1]

# StandardScaler expects 2-D input, so collapse (windows, timesteps) into rows.
X_train_2d = X_train.reshape(-1, n_features)
X_valid_2d = X_valid.reshape(-1, n_features)

# Fit on training rows only so validation statistics never reach the scaler.
scaler.fit(X_train_2d)

X_train = scaler.transform(X_train_2d).reshape(X_train.shape)
X_valid = scaler.transform(X_valid_2d).reshape(X_valid.shape)

# StandardScaler ignores NaN when fitting but passes it through on transform,
# and the feature set does contain NaN (first-row delta_days, rolling warm-up,
# missing measurements). A single NaN turns the LSTM output and loss into NaN,
# so impute it here: after standardisation 0.0 is the training mean.
X_train = np.where(np.isnan(X_train), 0.0, X_train)
X_valid = np.where(np.isnan(X_valid), 0.0, X_valid)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model

# Seed Python, NumPy and TensorFlow so weight initialisation and dropout are repeatable.
tf.keras.utils.set_random_seed(42)

SEQ_LEN = X_train.shape[1]
N_FEATURES = X_train.shape[2]

inputs = layers.Input(shape=(SEQ_LEN, N_FEATURES))

# No Masking layer on purpose: Masking compares inputs to mask_value by equality
# and NaN never equals NaN, so Masking(mask_value=np.nan) masks nothing and lets
# NaN flow into the LSTM. Missing values must be imputed before reaching the model.
x = inputs

# BiLSTM 1
# NOTE: recurrent_dropout > 0 prevents the fused cuDNN LSTM kernel from being used.
x = layers.Bidirectional(
    layers.LSTM(
        64,
        return_sequences=True,
        dropout=0.2,
        recurrent_dropout=0.2
    )
)(x)

x = layers.LayerNormalization()(x)

# BiLSTM 2 (smaller)
x = layers.Bidirectional(
    layers.LSTM(
        32,
        return_sequences=True,
        dropout=0.2,
        recurrent_dropout=0.2
    )
)(x)

# ===== Simple attention =====
# One score per timestep, normalised over the time axis.
score = layers.Dense(1, activation="tanh")(x)
weights = layers.Softmax(axis=1)(score)
# Attention-weighted sum over time, expressed with Keras layers only: calling
# raw tf.* ops on symbolic Keras tensors is rejected by Keras 3.
# Dot over axis 1 yields (batch, 1, units); Flatten drops the singleton axis.
context = layers.Flatten()(layers.Dot(axes=1)([weights, x]))

# Dense head
x = layers.Dense(64, activation="relu")(context)
x = layers.Dropout(0.3)(x)

# Three output units, one per class id in the target.
outputs = layers.Dense(3, activation="softmax")(x)

model = Model(inputs, outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# "balanced" weighting computed from the label distribution of the training windows.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# Mapping of class id -> weight, in the form expected by model.fit(class_weight=...).
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}
print(class_weight_dict)


In [None]:
# Stop once val_loss has not improved for 8 epochs and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=8,
    restore_best_weights=True,
)
# Halve the learning rate after 4 epochs without improvement, down to 1e-5 at most.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    factor=0.5,
    patience=4,
    min_lr=1e-5,
)
callbacks = [early_stopping, reduce_lr]


In [None]:
# Train with per-class weighting; the validation set is evaluated after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=60,
    validation_data=(X_valid, y_valid),
    class_weight=class_weight_dict,
    callbacks=callbacks,
    verbose=1,
)


In [None]:
from sklearn.metrics import f1_score, classification_report
import numpy as np

# Predicted class = index of the largest softmax output for each validation window.
y_pred = model.predict(X_valid).argmax(axis=1)

macro_f1 = f1_score(y_valid, y_pred, average="macro")
print("Macro F1:", macro_f1)
print(classification_report(y_valid, y_pred))
