In [171]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column


In [172]:
path = find_file("merged_cuaca_ndvi_ispu.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,cloud_cover_min (%),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,91.0,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,81.0,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,17.0,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,99.0,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023


In [173]:

REQUIRED_COLS = ["tanggal", "lokasi_clean", "kategori"]
missing = [c for c in REQUIRED_COLS if c not in df.columns]
assert len(missing) == 0, f"Missing columns: {missing}"
df["tanggal"] = pd.to_datetime(df["tanggal"])


In [174]:
df = df.dropna(subset=["kategori"])


In [175]:
df["kategori"] = df["kategori"].replace({
    "SANGAT TIDAK SEHAT": "TIDAK SEHAT",
    "BERBAHAYA": "TIDAK SEHAT"
})


In [176]:
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)


In [177]:
df["prev_tanggal"] = df.groupby("lokasi_clean")["tanggal"].shift(1)
df["delta_days"] = (df["tanggal"] - df["prev_tanggal"]).dt.days


In [178]:
display(df["delta_days"].describe())


count    15252.000000
mean         1.744820
std          6.412906
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max        339.000000
Name: delta_days, dtype: float64

In [179]:
LABEL_MAP = {
    "BAIK": 0,
    "SEDANG": 1,
    "TIDAK SEHAT": 2
}
INV_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}

df["y"] = df["kategori"].map(LABEL_MAP)


In [180]:
BASE_FEATURES = [
    "temperature_2m_mean (°C)",
    "relative_humidity_2m_mean (%)",
    "precipitation_sum (mm)",
    "wind_speed_10m_mean (km/h)",
    "cloud_cover_mean (%)",
    "ndvi",
]

META_FEATURES = ["delta_days"]

In [181]:
df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,prev_tanggal,delta_days,y
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,NaT,,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,2010-01-01,1.0,0
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,2010-01-02,1.0,0
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,2010-01-03,1.0,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,2010-01-04,1.0,0


In [182]:

LAG_FEATURES = []

for col in BASE_FEATURES:
    lag_col = f"{col}_lag_1_safe"
    df[lag_col] = np.where(
        df["delta_days"] <= 2,
        df.groupby("lokasi_clean")[col].shift(1),
        np.nan
    )
    LAG_FEATURES.append(lag_col)


In [183]:
ROLL7_FEATURES = []

for col in BASE_FEATURES:
    roll_col = f"{col}_roll7"
    df[roll_col] = (
        df.groupby("lokasi_clean")[col]
          .shift(1)
          .rolling(7, min_periods=3)
          .mean()
    )
    ROLL7_FEATURES.append(roll_col)


In [184]:
# ROLL14_FEATURES = []

# for col in BASE_FEATURES:
#     roll_col = f"{col}_roll14"
#     df[roll_col] = (
#         df.groupby("lokasi_clean")[col]
#           .shift(1)
#           .rolling(14, min_periods=5)
#           .mean()
#     )
#     ROLL14_FEATURES.append(roll_col)


In [185]:
FEATURES = (
    BASE_FEATURES
    + META_FEATURES
    + LAG_FEATURES
    + ROLL7_FEATURES
    # + ROLL14_FEATURES
)


In [186]:
SPLIT_DATE = "2024-12-31"

train_df = df[df["tanggal"] < SPLIT_DATE]
valid_df = df[df["tanggal"] >= SPLIT_DATE]

X_train = train_df[FEATURES]
y_train = train_df["y"]

X_valid = valid_df[FEATURES]
y_valid = valid_df["y"]


In [187]:
model = LGBMClassifier(
    objective="multiclass",
    num_class=3,
    n_estimators=500,
    learning_rate=0.05,
    class_weight = {
        0: 1.0,
        1: 0.7,
        2: 3.0
}



)


model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3292
[LightGBM] [Info] Number of data points in the train set: 14049, number of used features: 19
[LightGBM] [Info] Start training from score -2.056873
[LightGBM] [Info] Start training from score -0.889469
[LightGBM] [Info] Start training from score -0.773765


In [188]:
from sklearn.metrics import f1_score, classification_report

y_pred = model.predict(X_valid)

print("Macro F1:", f1_score(y_valid, y_pred, average="macro"))
print(classification_report(
    y_valid,
    y_pred,
    target_names=LABEL_MAP.keys()
))


Macro F1: 0.4412636101501894
              precision    recall  f1-score   support

        BAIK       0.63      0.31      0.41       218
      SEDANG       0.72      0.80      0.76       849
 TIDAK SEHAT       0.14      0.16      0.15       141

    accuracy                           0.64      1208
   macro avg       0.50      0.42      0.44      1208
weighted avg       0.64      0.64      0.63      1208

