In [1]:
# CELL C01 ‚Äî Imports
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display
import joblib
from prophet import Prophet
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report


Importing plotly failed. Interactive plots will not work.


In [2]:
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("‚ùå script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column


In [3]:
path_main_data = find_file("merged_libur_cuaca_ispu_ndvi.csv")

if path_main_data is None:
    raise FileNotFoundError("‚ùå File merged tidak ditemukan")

df = pd.read_csv(path_main_data, na_values=NA_VALUES)

df["tanggal"] = pd.to_datetime(df["tanggal"])
df = df.sort_values(["lokasi_clean", "tanggal"])
df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,...,wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,is_libur
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,...,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,...,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,1
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,...,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,1
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,,2.0,16.0,15.0,6.0,22.0,...,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,,2.0,17.0,15.0,8.0,25.0,...,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,0


In [4]:
# CELL C03 ‚Äî Minimal columns

base_cols = [
    "tanggal",
    "lokasi_clean",
    "max",
    "kategori",
    "temperature_2m_mean (¬∞C)",
    "relative_humidity_2m_mean (%)",
    "wind_speed_10m_mean (km/h)",
    "precipitation_sum (mm)",
    "ndvi",
    "is_libur"
]

df = df[base_cols].dropna(subset=["max", "kategori"])


In [5]:
# CELL C04 ‚Äî Label encoding

label_map = {
    "BAIK": 0,
    "SEDANG": 1,
    "TIDAK SEHAT": 2,
    "SANGAT TIDAK SEHAT": 3,
    "BERBAHAYA": 4
}

inv_label_map = {v: k for k, v in label_map.items()}

df["kategori_enc"] = df["kategori"].map(label_map)


In [6]:
def add_time_features(df_loc):
    df_loc = df_loc.copy()

    # =========================
    # EXPLICIT TIME / TREND
    # =========================
    df_loc["t"] = np.arange(len(df_loc))  # global trend index
    df_loc["day_of_year"] = df_loc["tanggal"].dt.dayofyear
    df_loc["month"] = df_loc["tanggal"].dt.month

    # =========================
    # NUMERIC LAGS (TARGET-RELATED BUT SAFE)
    # =========================
    for lag in [7, 30, 90]:
        df_loc[f"max_lag_{lag}"] = df_loc["max"].shift(lag)

    # =========================
    # ROLLING STATISTICS
    # =========================
    df_loc["max_roll_mean_7"] = df_loc["max"].rolling(7).mean()
    df_loc["max_roll_mean_30"] = df_loc["max"].rolling(30).mean()
    df_loc["max_roll_mean_90"] = df_loc["max"].rolling(90).mean()

    df_loc["max_roll_std_30"] = df_loc["max"].rolling(30).std()
    df_loc["max_roll_std_90"] = df_loc["max"].rolling(90).std()

    return df_loc


In [7]:
# CELL C06 ‚Äî Time split

TRAIN_END = pd.Timestamp("2024-12-01")
TEST_START = pd.Timestamp("2025-01-01")
TEST_END   = pd.Timestamp("2025-08-31")



In [8]:
# CELL C07 ‚Äî Train per lokasi_clean (CLEAN & FINAL)

results = []
models = {}

LABELS = list(label_map.values())
TARGET_NAMES = list(label_map.keys())

for lokasi in sorted(df["lokasi_clean"].dropna().unique()):
    print(f"\nüìç Processing lokasi: {lokasi}")

    df_loc = df[df["lokasi_clean"] == lokasi].copy()
    df_loc = add_time_features(df_loc)
    df_loc = df_loc.dropna()

    train = df_loc[df_loc["tanggal"] <= TRAIN_END]
    test = df_loc[
        (df_loc["tanggal"] >= TEST_START) &
        (df_loc["tanggal"] <= TEST_END)
    ]

    print(
        f"{lokasi} | after dropna: {len(df_loc)}"
        f" | train: {len(train)}"
        f" | test: {len(test)}"
    )

    if len(train) < 300:
        print("‚ö†Ô∏è Skip: not enough training data")
        continue

    if len(test) == 0:
        print("‚ö†Ô∏è Skip: empty test window")
        continue

    X_train = train.drop(
        columns=[
            "tanggal",
            "lokasi_clean",
            "kategori",
            "kategori_enc",
            "max"
        ],
        errors="ignore"
    )
    y_train = train["kategori_enc"]

    X_test = test.drop(
        columns=[
            "tanggal",
            "lokasi_clean",
            "kategori",
            "kategori_enc",
            "max"
        ],
        errors="ignore"
    )
    y_test = test["kategori_enc"]

    model = XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softmax",
        num_class=len(label_map),
        random_state=42,
        eval_metric="mlogloss",
        n_jobs=-1
    )

    model.fit(X_train, y_train)
    models[lokasi] = model

    y_pred = model.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="macro")

    print("F1-score (macro):", round(f1, 4))
    print(
        classification_report(
            y_test,
            y_pred,
            labels=LABELS,
            target_names=TARGET_NAMES,
            zero_division=0
        )
    )

    results.append({
        "lokasi_clean": lokasi,
        "f1_macro": f1,
        "train_rows": len(train),
        "test_rows": len(test)
    })



üìç Processing lokasi: DKI1
DKI1 | after dropna: 2772 | train: 2500 | test: 242
F1-score (macro): 0.51
                    precision    recall  f1-score   support

              BAIK       0.50      0.49      0.49        35
            SEDANG       0.81      0.89      0.84       183
       TIDAK SEHAT       0.43      0.12      0.19        24
SANGAT TIDAK SEHAT       0.00      0.00      0.00         0
         BERBAHAYA       0.00      0.00      0.00         0

          accuracy                           0.75       242
         macro avg       0.35      0.30      0.31       242
      weighted avg       0.72      0.75      0.73       242


üìç Processing lokasi: DKI2
DKI2 | after dropna: 3026 | train: 2755 | test: 241
F1-score (macro): 0.4058
                    precision    recall  f1-score   support

              BAIK       0.67      0.12      0.21        48
            SEDANG       0.71      0.96      0.82       167
       TIDAK SEHAT       0.50      0.12      0.19        26
SANG

In [9]:
path_sample = find_file("sample_submission.csv")

if path_sample is None:
    raise FileNotFoundError("‚ùå File merged tidak ditemukan")

ds = pd.read_csv(path_sample, na_values=NA_VALUES)


In [10]:
# CELL S02 ‚Äî Parse tanggal & lokasi_clean

ds["tanggal"] = pd.to_datetime(ds["id"].str[:10])
ds["lokasi_clean"] = ds["id"].str.split("_").str[1]

ds = ds.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)
ds.head()


Unnamed: 0,id,category,tanggal,lokasi_clean
0,2025-09-01_DKI1,,2025-09-01,DKI1
1,2025-09-02_DKI1,,2025-09-02,DKI1
2,2025-09-03_DKI1,,2025-09-03,DKI1
3,2025-09-04_DKI1,,2025-09-04,DKI1
4,2025-09-05_DKI1,,2025-09-05,DKI1


In [11]:
# CELL S03 ‚Äî Check submission range

print("Tanggal min:", ds["tanggal"].min())
print("Tanggal max:", ds["tanggal"].max())
print("Lokasi:", ds["lokasi_clean"].unique())


Tanggal min: 2025-09-01 00:00:00
Tanggal max: 2025-11-30 00:00:00
Lokasi: ['DKI1' 'DKI2' 'DKI3' 'DKI4' 'DKI5']


In [12]:
# CELL S04 ‚Äî Prepare historical seed per lokasi (FINAL)

histories = {}

for lokasi in models.keys():
    hist = df[df["lokasi_clean"] == lokasi].copy()
    hist = hist.sort_values("tanggal")

    # feature engineering (same as training)
    hist = add_time_features(hist)

    # drop NaN from lags / rolling
    hist = hist.dropna()

    histories[lokasi] = hist

    print(
        lokasi,
        "| history rows:", len(hist),
        "| last date:", hist["tanggal"].max()
    )


DKI1 | history rows: 2772 | last date: 2025-08-31 00:00:00
DKI2 | history rows: 3026 | last date: 2025-08-31 00:00:00
DKI3 | history rows: 2857 | last date: 2025-08-31 00:00:00
DKI4 | history rows: 3269 | last date: 2025-08-31 00:00:00
DKI5 | history rows: 2882 | last date: 2025-08-31 00:00:00


In [13]:
# CELL S05 ‚Äî Autoregressive prediction for submission (FINAL)

submission_rows = []

DROP_COLS = [
    "tanggal",
    "lokasi_clean",
    "kategori",
    "kategori_enc",
    "max"   # üî• WAJIB dibuang (konsisten dengan training)
]

for lokasi, model in models.items():
    print(f"\nüöÄ Forecasting lokasi: {lokasi}")

    hist = histories[lokasi].copy()
    ds_loc = ds[ds["lokasi_clean"] == lokasi]

    for _, row in ds_loc.iterrows():
        # ambil baris terakhir sebagai input
        last_row = hist.iloc[-1:].copy()

        # === MODEL INPUT (HARUS SAMA DENGAN TRAINING)
        X = last_row.drop(columns=DROP_COLS, errors="ignore")

        # predict
        pred_enc = model.predict(X)[0]
        pred_cat = inv_label_map[pred_enc]

        submission_rows.append({
            "id": row["id"],
            "category": pred_cat
        })

        # === UPDATE HISTORY (BOLEH SIMPAN max, kategori)
        new_row = last_row.copy()
        new_row["tanggal"] = row["tanggal"]
        new_row["kategori_enc"] = pred_enc
        new_row["kategori"] = pred_cat

        # NOTE: max TIDAK di-update karena kita TIDAK pakai max autoregressively
        hist = pd.concat([hist, new_row], ignore_index=True)



üöÄ Forecasting lokasi: DKI1

üöÄ Forecasting lokasi: DKI2

üöÄ Forecasting lokasi: DKI3

üöÄ Forecasting lokasi: DKI4

üöÄ Forecasting lokasi: DKI5


In [14]:
# CELL S06 ‚Äî Build forecast.csv

forecast_df = pd.DataFrame(submission_rows)

forecast_df = forecast_df.sort_values("id").reset_index(drop=True)

forecast_df.head(), forecast_df.shape


(                id category
 0  2025-09-01_DKI1   SEDANG
 1  2025-09-01_DKI2   SEDANG
 2  2025-09-01_DKI3   SEDANG
 3  2025-09-01_DKI4   SEDANG
 4  2025-09-01_DKI5   SEDANG,
 (455, 2))

In [15]:
# CELL S07 ‚Äî Validate format

assert forecast_df.shape[0] == ds.shape[0]
assert forecast_df["id"].is_unique
assert forecast_df["category"].isna().sum() == 0

print("‚úÖ forecast.csv VALID")


‚úÖ forecast.csv VALID


In [16]:
# CELL S08 ‚Äî Save forecast.csv

forecast_df.to_csv("forecast.csv", index=False)
print("üì¶ forecast.csv saved")


üì¶ forecast.csv saved


In [17]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error


In [18]:
# CELL F01 ‚Äî Train per lokasi (numeric max)

models_reg = {}

for lokasi in sorted(df["lokasi_clean"].dropna().unique()):
    print(f"\nüìç Training REG model: {lokasi}")

    df_loc = df[df["lokasi_clean"] == lokasi].copy()
    df_loc = add_time_features(df_loc)
    df_loc = df_loc.dropna()

    train = df_loc[df_loc["tanggal"] <= TRAIN_END]

    if len(train) < 300:
        print("‚ö†Ô∏è Skip")
        continue

    X_train = train.drop(
        columns=["tanggal", "lokasi_clean", "kategori", "kategori_enc", "max"],
        errors="ignore"
    )
    y_train = train["max"]

    model = XGBRegressor(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)
    models_reg[lokasi] = model

    print("‚úì trained")



üìç Training REG model: DKI1
‚úì trained

üìç Training REG model: DKI2
‚úì trained

üìç Training REG model: DKI3
‚úì trained

üìç Training REG model: DKI4
‚úì trained

üìç Training REG model: DKI5
‚úì trained


In [19]:
# CELL F02 ‚Äî Forecast numeric max

numeric_preds = []

for lokasi, model in models_reg.items():
    hist = df[df["lokasi_clean"] == lokasi].copy()
    hist = hist.sort_values("tanggal")
    hist = add_time_features(hist)
    hist = hist.dropna()

    ds_loc = ds[ds["lokasi_clean"] == lokasi]

    for _, row in ds_loc.iterrows():
        last = hist.iloc[-1:].copy()

        X = last.drop(
            columns=["tanggal", "lokasi_clean", "kategori", "kategori_enc", "max"],
            errors="ignore"
        )

        yhat = model.predict(X)[0]

        numeric_preds.append({
            "id": row["id"],
            "lokasi_clean": lokasi,
            "tanggal": row["tanggal"],
            "max_pred": yhat
        })

        new = last.copy()
        new["tanggal"] = row["tanggal"]
        new["max"] = yhat

        hist = pd.concat([hist, new], ignore_index=True)


In [20]:
# CELL F03 ‚Äî ISPU mapping

def max_to_kategori(x):
    if x <= 50:
        return "BAIK"
    elif x <= 100:
        return "SEDANG"
    elif x <= 200:
        return "TIDAK SEHAT"
    else:
        return "TIDAK SEHAT"


In [21]:
pred_df = pd.DataFrame(numeric_preds)
pred_df["category"] = pred_df["max_pred"].apply(max_to_kategori)


In [22]:
forecast_df = pred_df[["id", "category"]].sort_values("id")
forecast_df.to_csv("forecast_reg.csv", index=False)

forecast_df["category"].value_counts()


category
SEDANG    455
Name: count, dtype: int64

In [23]:
df["delta_max"] = (
    df.groupby("lokasi_clean")["max"].diff()
)


In [24]:
models_delta = {}

for lokasi in sorted(df["lokasi_clean"].dropna().unique()):
    print(f"\nüìç Training DELTA model: {lokasi}")

    df_loc = df[df["lokasi_clean"] == lokasi].copy()
    df_loc = add_time_features(df_loc)
    df_loc = df_loc.dropna()

    train = df_loc[df_loc["tanggal"] <= TRAIN_END]

    X_train = train.drop(
        columns=[
            "tanggal",
            "lokasi_clean",
            "kategori",
            "kategori_enc",
            "max",
            "delta_max"
        ],
        errors="ignore"
    )
    y_train = train["delta_max"]

    model = XGBRegressor(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)
    models_delta[lokasi] = model

    print("‚úì trained")



üìç Training DELTA model: DKI1
‚úì trained

üìç Training DELTA model: DKI2
‚úì trained

üìç Training DELTA model: DKI3
‚úì trained

üìç Training DELTA model: DKI4
‚úì trained

üìç Training DELTA model: DKI5
‚úì trained


In [25]:
delta_preds = []

for lokasi, model in models_delta.items():
    hist = df[df["lokasi_clean"] == lokasi].copy()
    hist = hist.sort_values("tanggal")
    hist = add_time_features(hist)
    hist = hist.dropna()

    last_max = hist.iloc[-1]["max"]
    ds_loc = ds[ds["lokasi_clean"] == lokasi]

    for _, row in ds_loc.iterrows():
        last = hist.iloc[-1:].copy()

        X = last.drop(
            columns=[
                "tanggal",
                "lokasi_clean",
                "kategori",
                "kategori_enc",
                "max",
                "delta_max"
            ],
            errors="ignore"
        )

        delta = model.predict(X)[0]
        next_max = last_max + delta

        delta_preds.append({
            "id": row["id"],
            "max_pred": next_max
        })

        # update history
        new = last.copy()
        new["tanggal"] = row["tanggal"]
        new["max"] = next_max
        new["delta_max"] = delta

        hist = pd.concat([hist, new], ignore_index=True)
        last_max = next_max


In [26]:
pred_df = pd.DataFrame(delta_preds)
pred_df["category"] = pred_df["max_pred"].apply(max_to_kategori)

forecast_df = pred_df[["id", "category"]].sort_values("id")
forecast_df.to_csv("forecast_delta.csv", index=False)

forecast_df["category"].value_counts()


category
BAIK           260
SEDANG         117
TIDAK SEHAT     78
Name: count, dtype: int64

In [32]:
pred_df = pd.DataFrame(delta_preds)
pred_df["category"] = pred_df["max_pred"].apply(max_to_kategori)

forecast_df = pred_df[["id", "category"]].sort_values("id")
forecast_df.to_csv("forecast_delta.csv", index=False)

forecast_df["category"].value_counts()


category
BAIK           260
SEDANG         117
TIDAK SEHAT     78
Name: count, dtype: int64

In [33]:
# CELL P01 ‚Äî Prepare base data for prior

df_prior = df.copy()

df_prior["month"] = df_prior["tanggal"].dt.month

df_prior[["lokasi_clean", "tanggal", "month", "kategori"]].head()


Unnamed: 0,lokasi_clean,tanggal,month,kategori
0,DKI1,2010-01-01,1,SEDANG
1,DKI1,2010-01-02,1,BAIK
2,DKI1,2010-01-03,1,BAIK
3,DKI1,2010-01-04,1,BAIK
4,DKI1,2010-01-05,1,BAIK


In [34]:
# CELL P02 ‚Äî Restrict to training window

df_prior = df_prior[df_prior["tanggal"] <= TRAIN_END]

print(
    "Rows used for prior:",
    len(df_prior),
    "| last date:",
    df_prior["tanggal"].max()
)


Rows used for prior: 13903 | last date: 2024-12-01 00:00:00


In [35]:
# CELL P03 ‚Äî Raw category counts

prior_counts = (
    df_prior
    .groupby(["lokasi_clean", "month", "kategori"])
    .size()
    .reset_index(name="count")
)

prior_counts.head(10)


Unnamed: 0,lokasi_clean,month,kategori,count
0,DKI1,1,BAIK,102
1,DKI1,1,SEDANG,131
2,DKI1,1,TIDAK SEHAT,9
3,DKI1,2,BAIK,91
4,DKI1,2,SEDANG,116
5,DKI1,2,TIDAK SEHAT,2
6,DKI1,3,BAIK,55
7,DKI1,3,SEDANG,161
8,DKI1,3,TIDAK SEHAT,6
9,DKI1,4,BAIK,55


In [36]:
# CELL P04 ‚Äî Pivot table

prior_pivot = (
    prior_counts
    .pivot_table(
        index=["lokasi_clean", "month"],
        columns="kategori",
        values="count",
        fill_value=0
    )
)

prior_pivot.head()


Unnamed: 0_level_0,kategori,BAIK,BERBAHAYA,SANGAT TIDAK SEHAT,SEDANG,TIDAK SEHAT
lokasi_clean,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DKI1,1,102.0,0.0,0.0,131.0,9.0
DKI1,2,91.0,0.0,0.0,116.0,2.0
DKI1,3,55.0,0.0,0.0,161.0,6.0
DKI1,4,55.0,0.0,0.0,164.0,6.0
DKI1,5,11.0,0.0,0.0,203.0,7.0


In [37]:
# CELL P05 ‚Äî Laplace smoothing

SMOOTH = 1  # jangan 0, jangan gede

prior_smooth = prior_pivot + SMOOTH

prior_smooth.head()


Unnamed: 0_level_0,kategori,BAIK,BERBAHAYA,SANGAT TIDAK SEHAT,SEDANG,TIDAK SEHAT
lokasi_clean,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DKI1,1,103.0,1.0,1.0,132.0,10.0
DKI1,2,92.0,1.0,1.0,117.0,3.0
DKI1,3,56.0,1.0,1.0,162.0,7.0
DKI1,4,56.0,1.0,1.0,165.0,7.0
DKI1,5,12.0,1.0,1.0,204.0,8.0


In [38]:
# CELL P06 ‚Äî Normalize to probabilities

prior_prob = prior_smooth.div(
    prior_smooth.sum(axis=1),
    axis=0
)

prior_prob.head()


Unnamed: 0_level_0,kategori,BAIK,BERBAHAYA,SANGAT TIDAK SEHAT,SEDANG,TIDAK SEHAT
lokasi_clean,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DKI1,1,0.417004,0.004049,0.004049,0.534413,0.040486
DKI1,2,0.429907,0.004673,0.004673,0.546729,0.014019
DKI1,3,0.246696,0.004405,0.004405,0.713656,0.030837
DKI1,4,0.243478,0.004348,0.004348,0.717391,0.030435
DKI1,5,0.053097,0.004425,0.004425,0.902655,0.035398


In [39]:
# CELL P07 ‚Äî Inspect critical months

prior_prob.loc[("DKI1", 9)]
prior_prob.loc[("DKI1", 11)]
prior_prob.loc[("DKI4", 11)]


kategori
BAIK                  0.076642
BERBAHAYA             0.003650
SANGAT TIDAK SEHAT    0.003650
SEDANG                0.697080
TIDAK SEHAT           0.218978
Name: (DKI4, 11), dtype: float64

In [27]:
# hybrid_preds = []

# ALPHA = 0.7   # bobot delta (tune 0.6‚Äì0.8)

# for lokasi in models_delta.keys():
#     print(f"üöÄ Hybrid forecasting: {lokasi}")

#     model_delta = models_delta[lokasi]
#     model_level = models_reg[lokasi]

#     hist = df[df["lokasi_clean"] == lokasi].copy()
#     hist = hist.sort_values("tanggal")
#     hist = add_time_features(hist)
#     hist = hist.dropna()

#     last_max = hist.iloc[-1]["max"]
#     ds_loc = ds[ds["lokasi_clean"] == lokasi]

#     for _, row in ds_loc.iterrows():
#         last = hist.iloc[-1:].copy()

#         X = last.drop(
#             columns=[
#                 "tanggal",
#                 "lokasi_clean",
#                 "kategori",
#                 "kategori_enc",
#                 "max",
#                 "delta_max"
#             ],
#             errors="ignore"
#         )

#         # --- predict delta
#         delta = model_delta.predict(X)[0]

#         # --- predict level
#         level_pred = model_level.predict(X)[0]

#         # --- HYBRID COMBINATION
#         next_max = (last_max + delta) + 0.1 * (level_pred - last_max)


#         hybrid_preds.append({
#             "id": row["id"],
#             "max_pred": next_max
#         })

#         # --- update history
#         new = last.copy()
#         new["tanggal"] = row["tanggal"]
#         new["max"] = next_max
#         new["delta_max"] = delta

#         hist = pd.concat([hist, new], ignore_index=True)
#         last_max = next_max


In [28]:
# pred_df = pd.DataFrame(hybrid_preds)

# pred_df["category"] = pred_df["max_pred"].apply(max_to_kategori)

# forecast_df = pred_df[["id", "category"]].sort_values("id")
# forecast_df.to_csv("forecast_hybrid.csv", index=False)

# forecast_df["category"].value_counts()


In [None]:
# ds["date"] = ds["id"].str[:10]
# ds["lok"] = ds["id"].str.split("_").str[1]

# ds.groupby(["date", "lok"]).size().head(20)


date        lok 
2025-09-01  DKI1    1
            DKI2    1
            DKI3    1
            DKI4    1
            DKI5    1
2025-09-02  DKI1    1
            DKI2    1
            DKI3    1
            DKI4    1
            DKI5    1
2025-09-03  DKI1    1
            DKI2    1
            DKI3    1
            DKI4    1
            DKI5    1
2025-09-04  DKI1    1
            DKI2    1
            DKI3    1
            DKI4    1
            DKI5    1
dtype: int64

In [None]:
# ds["date"].value_counts().sort_index()


date
2025-09-01    5
2025-09-02    5
2025-09-03    5
2025-09-04    5
2025-09-05    5
             ..
2025-11-26    5
2025-11-27    5
2025-11-28    5
2025-11-29    5
2025-11-30    5
Name: count, Length: 91, dtype: int64

In [None]:
# df["month"] = df["tanggal"].dt.month

# df.groupby(["lokasi_clean", "month", "kategori"]).size().unstack(fill_value=0)


Unnamed: 0_level_0,kategori,BAIK,BERBAHAYA,SANGAT TIDAK SEHAT,SEDANG,TIDAK SEHAT
lokasi_clean,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DKI1,1,114,0,0,150,9
DKI1,2,100,0,0,135,2
DKI1,3,64,0,0,183,6
DKI1,4,60,0,0,189,6
DKI1,5,11,0,0,233,8
DKI1,6,15,0,0,211,13
DKI1,7,22,0,0,205,25
DKI1,8,25,0,0,207,16
DKI1,9,31,0,0,173,8
DKI1,10,17,0,0,182,14
