In [68]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
import sys
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor


NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("‚ùå script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column

# CELL - 2 "load data"
path = find_file("merged_libur_cuaca_ispu_ndvi.csv")

if path is None:
    raise FileNotFoundError("‚ùå File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

df.head()



Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,...,wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,is_libur
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,...,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,...,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,1
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,...,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,1
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,,2.0,16.0,15.0,6.0,22.0,...,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,,2.0,17.0,15.0,8.0,25.0,...,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,0


In [69]:
# CELL 2 - load data

path = find_file("merged_libur_cuaca_ispu_ndvi.csv")
if path is None:
    raise FileNotFoundError("‚ùå File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES, parse_dates=["tanggal"])

df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

df[["tanggal", "lokasi_clean", "max"]].head()


Unnamed: 0,tanggal,lokasi_clean,max
0,2010-01-01,DKI1,73.0
1,2010-01-02,DKI1,33.0
2,2010-01-03,DKI1,27.0
3,2010-01-04,DKI1,22.0
4,2010-01-05,DKI1,25.0


In [70]:
# CELL 3 - minimal modeling dataframe

df_model = df[
    ["tanggal", "lokasi_clean", "max", "is_libur"]
].copy()

df_model.isna().sum()


tanggal         0
lokasi_clean    0
max             1
is_libur        0
dtype: int64

In [71]:
# CELL 4 - calendar features

df_model["month"] = df_model["tanggal"].dt.month
df_model["month_sin"] = np.sin(2 * np.pi * df_model["month"] / 12)
df_model["month_cos"] = np.cos(2 * np.pi * df_model["month"] / 12)

df_model.head()


Unnamed: 0,tanggal,lokasi_clean,max,is_libur,month,month_sin,month_cos
0,2010-01-01,DKI1,73.0,1,1,0.5,0.866025
1,2010-01-02,DKI1,33.0,1,1,0.5,0.866025
2,2010-01-03,DKI1,27.0,1,1,0.5,0.866025
3,2010-01-04,DKI1,22.0,0,1,0.5,0.866025
4,2010-01-05,DKI1,25.0,0,1,0.5,0.866025


In [72]:
# CELL 5 - lag & rolling features

LAGS = [1, 7, 30, 365]

for lag in LAGS:
    df_model[f"lag_{lag}"] = (
        df_model
        .groupby("lokasi_clean")["max"]
        .shift(lag)
    )

df_model["roll_7"] = (
    df_model
    .groupby("lokasi_clean")["max"]
    .shift(1)
    .rolling(7)
    .mean()
)

df_model["roll_30"] = (
    df_model
    .groupby("lokasi_clean")["max"]
    .shift(1)
    .rolling(30)
    .mean()
)

df_model = df_model.dropna().reset_index(drop=True)

df_model.head()


Unnamed: 0,tanggal,lokasi_clean,max,is_libur,month,month_sin,month_cos,lag_1,lag_7,lag_30,lag_365,roll_7,roll_30
0,2011-04-09,DKI1,50.0,1,4,0.866025,-0.5,49.0,38.0,34.0,73.0,39.714286,38.4
1,2011-04-22,DKI1,125.0,1,4,0.866025,-0.5,50.0,34.0,25.0,33.0,41.428571,38.933333
2,2011-04-24,DKI1,83.0,1,4,0.866025,-0.5,125.0,36.0,25.0,27.0,54.428571,42.266667
3,2011-04-25,DKI1,70.0,0,4,0.866025,-0.5,83.0,33.0,25.0,22.0,61.142857,44.2
4,2011-04-27,DKI1,71.0,0,4,0.866025,-0.5,70.0,47.0,28.0,25.0,66.428571,45.7


In [73]:
# CELL 6 - features & target

FEATURES = [
    "lag_1", "lag_7", "lag_30", "lag_365",
    "roll_7", "roll_30",
    "month_sin", "month_cos",
    "is_libur"
]

TARGET = "max"

# sanity check
missing = set(FEATURES) - set(df_model.columns)
assert len(missing) == 0, f"Missing features: {missing}"


In [74]:
# CELL 7 - rolling train / validation split

train_df = df_model[
    (df_model["tanggal"] >= "2010-01-01") &
    (df_model["tanggal"] <= "2024-06-30")
]

val_df = df_model[
    (df_model["tanggal"] >= "2024-07-01") &
    (df_model["tanggal"] <= "2024-12-31")
]

len(train_df), len(val_df)


(11279, 918)

In [75]:
# CELL 8 - train per lokasi_clean (FIXED metrics, sklearn-compatible)

models = {}

for lokasi in df_model["lokasi_clean"].unique():
    print(f"\n=== TRAIN {lokasi} ===")

    train_l = train_df[train_df["lokasi_clean"] == lokasi]
    val_l   = val_df[val_df["lokasi_clean"] == lokasi]

    X_train = train_l[FEATURES]
    y_train = train_l[TARGET]

    X_val = val_l[FEATURES]
    y_val = val_l[TARGET]

    model = XGBRegressor(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=SEED,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_val)

    # üîß FIX: manual RMSE (works on all sklearn versions)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val, preds)

    print(f"RMSE: {rmse:.2f}")
    print(f"MAE : {mae:.2f}")

    models[lokasi] = model



=== TRAIN DKI1 ===
RMSE: 14.76
MAE : 11.00

=== TRAIN DKI2 ===
RMSE: 12.24
MAE : 9.13

=== TRAIN DKI3 ===
RMSE: 13.20
MAE : 10.56

=== TRAIN DKI4 ===
RMSE: 17.15
MAE : 11.25

=== TRAIN DKI5 ===
RMSE: 15.88
MAE : 12.01


In [76]:
# CELL 9 - future date frame

future_dates = pd.date_range(
    start="2025-09-01",
    end="2025-11-30",
    freq="D"
)

df_future = pd.MultiIndex.from_product(
    [df_model["lokasi_clean"].unique(), future_dates],
    names=["lokasi_clean", "tanggal"]
).to_frame(index=False)

df_future.head()


Unnamed: 0,lokasi_clean,tanggal
0,DKI1,2025-09-01
1,DKI1,2025-09-02
2,DKI1,2025-09-03
3,DKI1,2025-09-04
4,DKI1,2025-09-05


In [77]:
# CELL 10 - calendar & libur for future

libur = df[["tanggal", "is_libur"]].drop_duplicates()

df_future = df_future.merge(
    libur,
    on="tanggal",
    how="left"
)

df_future["is_libur"] = df_future["is_libur"].fillna(0).astype(int)

df_future["month"] = df_future["tanggal"].dt.month
df_future["month_sin"] = np.sin(2 * np.pi * df_future["month"] / 12)
df_future["month_cos"] = np.cos(2 * np.pi * df_future["month"] / 12)

df_future.head()


Unnamed: 0,lokasi_clean,tanggal,is_libur,month,month_sin,month_cos
0,DKI1,2025-09-01,0,9,-1.0,-1.83697e-16
1,DKI1,2025-09-02,0,9,-1.0,-1.83697e-16
2,DKI1,2025-09-03,0,9,-1.0,-1.83697e-16
3,DKI1,2025-09-04,0,9,-1.0,-1.83697e-16
4,DKI1,2025-09-05,0,9,-1.0,-1.83697e-16


In [78]:
# CELL 11 - recursive forecasting for max

all_preds = []

for lokasi, model in models.items():
    # historical data for this lokasi
    hist = (
        df_model[df_model["lokasi_clean"] == lokasi]
        .sort_values("tanggal")
        .copy()
    )

    series = hist["max"].tolist()

    future_l = (
        df_future[df_future["lokasi_clean"] == lokasi]
        .sort_values("tanggal")
        .copy()
    )

    preds = []

    for _, row in future_l.iterrows():
        feat = row.copy()

        # lag features
        feat["lag_1"] = series[-1]
        feat["lag_7"] = series[-7]
        feat["lag_30"] = series[-30]
        feat["lag_365"] = series[-365]

        # rolling features
        feat["roll_7"] = np.mean(series[-7:])
        feat["roll_30"] = np.mean(series[-30:])

        X = pd.DataFrame([feat])[FEATURES]

        pred = model.predict(X)[0]

        feat["pred_max"] = pred
        preds.append(feat)

        # üîÅ append prediction for next step
        series.append(pred)

    all_preds.append(pd.DataFrame(preds))

df_preds = pd.concat(all_preds, ignore_index=True)


In [79]:
df_preds

Unnamed: 0,lokasi_clean,tanggal,is_libur,month,month_sin,month_cos,lag_1,lag_7,lag_30,lag_365,roll_7,roll_30,pred_max
0,DKI1,2025-09-01,0,9,-1.0,-1.836970e-16,70.000000,114.000000,82.000000,75.0,83.714286,87.500000,74.492409
1,DKI1,2025-09-02,0,9,-1.0,-1.836970e-16,74.492409,95.000000,94.000000,72.0,78.070344,87.249747,93.515488
2,DKI1,2025-09-03,0,9,-1.0,-1.836970e-16,93.515488,92.000000,91.000000,70.0,77.858271,87.233597,97.847244
3,DKI1,2025-09-04,0,9,-1.0,-1.836970e-16,97.847244,71.000000,76.000000,79.0,78.693592,87.461838,95.244797
4,DKI1,2025-09-05,0,9,-1.0,-1.836970e-16,95.244797,67.000000,83.000000,83.0,82.157134,88.103331,82.604874
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,DKI5,2025-11-26,0,11,-0.5,8.660254e-01,73.677727,80.811089,91.285698,40.0,75.034019,80.430161,74.448120
451,DKI5,2025-11-27,0,11,-0.5,8.660254e-01,74.448120,75.187569,93.407219,42.0,74.125038,79.868896,76.022385
452,DKI5,2025-11-28,0,11,-0.5,8.660254e-01,76.022385,73.312027,89.354805,53.0,74.244286,79.289406,72.888481
453,DKI5,2025-11-29,0,11,-0.5,8.660254e-01,72.888481,71.573738,88.471092,56.0,74.183784,78.740532,78.594505


In [80]:
# CELL 12A - numeric ISPU ‚Üí kategori (evaluation only)

def max_to_kategori(x):
    if x <= 50:
        return "BAIK"
    elif x <= 100:
        return "SEDANG"
    else:
        return "TIDAK SEHAT"


In [81]:
# CELL 12B - backtest window

BACKTEST_START = "2024-01-01"
BACKTEST_END   = "2024-03-31"


In [82]:
# CELL 12C - recursive backtest

from sklearn.metrics import classification_report, confusion_matrix

all_backtests = []

for lokasi, model in models.items():
    hist = (
        df_model[
            (df_model["lokasi_clean"] == lokasi) &
            (df_model["tanggal"] < BACKTEST_START)
        ]
        .sort_values("tanggal")
        .copy()
    )

    future_true = (
        df_model[
            (df_model["lokasi_clean"] == lokasi) &
            (df_model["tanggal"] >= BACKTEST_START) &
            (df_model["tanggal"] <= BACKTEST_END)
        ]
        .sort_values("tanggal")
        .copy()
    )

    series = hist["max"].tolist()
    preds = []

    for _, row in future_true.iterrows():
        feat = row.copy()

        feat["lag_1"] = series[-1]
        feat["lag_7"] = series[-7]
        feat["lag_30"] = series[-30]
        feat["lag_365"] = series[-365]
        feat["roll_7"] = np.mean(series[-7:])
        feat["roll_30"] = np.mean(series[-30:])

        X = pd.DataFrame([feat])[FEATURES]
        pred = model.predict(X)[0]

        preds.append(pred)
        series.append(pred)

    future_true["pred_max"] = preds
    all_backtests.append(future_true)

df_backtest = pd.concat(all_backtests, ignore_index=True)


In [83]:
# CELL 12D - regression metrics

rmse = np.sqrt(mean_squared_error(df_backtest["max"], df_backtest["pred_max"]))
mae = mean_absolute_error(df_backtest["max"], df_backtest["pred_max"])

print("===== REGRESSION METRICS =====")
print(f"RMSE: {rmse:.2f}")
print(f"MAE : {mae:.2f}")


===== REGRESSION METRICS =====
RMSE: 18.88
MAE : 13.58


In [84]:
# CELL 12E - classification metrics

df_backtest["true_kat"] = df_backtest["max"].apply(max_to_kategori)
df_backtest["pred_kat"] = df_backtest["pred_max"].apply(max_to_kategori)

print("===== CLASSIFICATION REPORT =====")
print(classification_report(
    df_backtest["true_kat"],
    df_backtest["pred_kat"],
    zero_division=0
))

print("===== CONFUSION MATRIX =====")
display(
    pd.DataFrame(
        confusion_matrix(
            df_backtest["true_kat"],
            df_backtest["pred_kat"],
            labels=["BAIK", "SEDANG", "TIDAK SEHAT"]
        ),
        index=["True_BAIK", "True_SEDANG", "True_TIDAK_SEHAT"],
        columns=["Pred_BAIK", "Pred_SEDANG", "Pred_TIDAK_SEHAT"]
    )
)


===== CLASSIFICATION REPORT =====
              precision    recall  f1-score   support

        BAIK       0.33      0.17      0.23        76
      SEDANG       0.81      0.92      0.86       360
 TIDAK SEHAT       0.60      0.19      0.29        16

    accuracy                           0.77       452
   macro avg       0.58      0.43      0.46       452
weighted avg       0.73      0.77      0.74       452

===== CONFUSION MATRIX =====


Unnamed: 0,Pred_BAIK,Pred_SEDANG,Pred_TIDAK_SEHAT
True_BAIK,13,63,0
True_SEDANG,26,332,2
True_TIDAK_SEHAT,0,13,3


In [85]:
# CELL 1 - config & features

SEED = 42
TARGET = "max"

FEATURES = [
    "is_libur",
    "month_sin",
    "month_cos",
    "lag_1",
    "lag_7",
    "lag_30",
    "lag_365",
    "roll_7",
    "roll_30",
]


In [86]:
# CELL 2 - calendar features

df["month"] = df["tanggal"].dt.month
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)


In [87]:
# CELL 3 - lag & rolling features

df = df.sort_values(["lokasi_clean", "tanggal"])

for lag in [1, 7, 30, 365]:
    df[f"lag_{lag}"] = (
        df.groupby("lokasi_clean")[TARGET]
        .shift(lag)
    )

df["roll_7"] = (
    df.groupby("lokasi_clean")[TARGET]
    .shift(1)
    .rolling(7)
    .mean()
)

df["roll_30"] = (
    df.groupby("lokasi_clean")[TARGET]
    .shift(1)
    .rolling(30)
    .mean()
)

df = df.dropna().reset_index(drop=True)


In [88]:
# CELL 4 - time-based split

train_df = df[df["tanggal"] < "2024-01-01"]
val_df   = df[df["tanggal"] >= "2024-01-01"]

X_train = train_df[FEATURES]
y_train = train_df[TARGET]

X_val = val_df[FEATURES]
y_val = val_df[TARGET]


In [89]:
# CELL 5 - quantile models

def train_quantile(q):
    return XGBRegressor(
        objective="reg:quantileerror",
        quantile_alpha=q,
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=SEED,
        n_jobs=-1,
    )

models_q = {
    "q20": train_quantile(0.2),
    "q50": train_quantile(0.5),
    "q80": train_quantile(0.8),
}

for name, model in models_q.items():
    model.fit(X_train, y_train)
    print(f"Trained {name}")


Trained q20
Trained q50
Trained q80


In [90]:
# CELL 6 - regression metrics

from sklearn.metrics import mean_absolute_error

for name, model in models_q.items():
    preds = model.predict(X_val)
    mae = mean_absolute_error(y_val, preds)
    rmse = np.sqrt(((y_val - preds) ** 2).mean())

    print(f"{name} | MAE: {mae:.2f} | RMSE: {rmse:.2f}")


q20 | MAE: 13.16 | RMSE: 17.33
q50 | MAE: 11.28 | RMSE: 14.92
q80 | MAE: 14.19 | RMSE: 17.68


In [91]:
# CELL 7 - kategori evaluation

def ispu_to_kategori(x):
    if x <= 50:
        return "BAIK"
    elif x <= 100:
        return "SEDANG"
    else:
        return "TIDAK SEHAT"

from sklearn.metrics import classification_report, confusion_matrix

for name, model in models_q.items():
    preds = model.predict(X_val)

    y_true_cat = y_val.apply(ispu_to_kategori)
    y_pred_cat = pd.Series(preds).apply(ispu_to_kategori)

    print(f"\n===== {name} =====")
    print(classification_report(y_true_cat, y_pred_cat, zero_division=0))






===== q20 =====
              precision    recall  f1-score   support

        BAIK       0.48      0.67      0.56       361
      SEDANG       0.82      0.88      0.85      2303
 TIDAK SEHAT       0.17      0.00      0.01       319

    accuracy                           0.76      2983
   macro avg       0.49      0.52      0.47      2983
weighted avg       0.71      0.76      0.73      2983


===== q50 =====
              precision    recall  f1-score   support

        BAIK       0.60      0.41      0.49       361
      SEDANG       0.81      0.94      0.87      2303
 TIDAK SEHAT       0.45      0.13      0.20       319

    accuracy                           0.79      2983
   macro avg       0.62      0.49      0.52      2983
weighted avg       0.75      0.79      0.75      2983


===== q80 =====
              precision    recall  f1-score   support

        BAIK       0.67      0.17      0.27       361
      SEDANG       0.81      0.87      0.84      2303
 TIDAK SEHAT       0.34 

In [92]:
# CELL 7A - generate quantile predictions (DEFINE VARIABLES)

pred_q20 = models_q["q20"].predict(X_val)
pred_q50 = models_q["q50"].predict(X_val)
pred_q80 = models_q["q80"].predict(X_val)


In [94]:
df_quantile_preds = pd.DataFrame({
    "q20": pred_q20,
    "q50": pred_q50,
    "q80": pred_q80,
})

for q in ["q20", "q50", "q80"]:
    df_quantile_preds[f"{q}_kategori"] = df_quantile_preds[q].apply(ispu_to_kategori)

df_quantile_preds


Unnamed: 0,q20,q50,q80,q20_kategori,q50_kategori,q80_kategori
0,52.497654,63.633457,73.970520,SEDANG,SEDANG,SEDANG
1,59.275280,71.130219,84.430061,SEDANG,SEDANG,SEDANG
2,60.991829,73.520645,87.087013,SEDANG,SEDANG,SEDANG
3,50.928581,62.480930,76.516518,SEDANG,SEDANG,SEDANG
4,62.205132,80.001762,98.166733,SEDANG,SEDANG,SEDANG
...,...,...,...,...,...,...
2978,76.678474,85.106041,96.688026,SEDANG,SEDANG,SEDANG
2979,83.119110,90.929993,97.717064,SEDANG,SEDANG,SEDANG
2980,68.032433,79.183662,86.491074,SEDANG,SEDANG,SEDANG
2981,62.772232,73.182724,81.949821,SEDANG,SEDANG,SEDANG


In [95]:
counts = {}

for q in ["q20", "q50", "q80"]:
    counts[q] = (
        df_quantile_preds[f"{q}_kategori"]
        .value_counts()
        .reindex(["BAIK", "SEDANG", "TIDAK SEHAT"], fill_value=0)
    )

df_kategori_counts = pd.DataFrame(counts)
display(df_kategori_counts)


Unnamed: 0,q20,q50,q80
BAIK,506,245,90
SEDANG,2471,2647,2480
TIDAK SEHAT,6,91,413


In [101]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib

from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, confusion_matrix
from xgboost import XGBRegressor

SEED = 42
np.random.seed(SEED)


In [102]:

path = find_file("merged_libur_cuaca_ispu_ndvi.csv")
if path is None:
    raise FileNotFoundError("‚ùå merged_libur_cuaca_ispu_ndvi.csv not found")

df = pd.read_csv(path, parse_dates=["tanggal"])

df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

df[["tanggal", "lokasi_clean", "max"]].head()

Unnamed: 0,tanggal,lokasi_clean,max
0,2010-01-01,DKI1,73.0
1,2010-01-02,DKI1,33.0
2,2010-01-03,DKI1,27.0
3,2010-01-04,DKI1,22.0
4,2010-01-05,DKI1,25.0


In [103]:
df = df.dropna(subset=["tanggal", "lokasi_clean", "max"])
df["max"] = df["max"].astype(float)

df["lokasi_clean"].value_counts()


lokasi_clean
DKI4    3359
DKI2    3116
DKI5    2972
DKI3    2947
DKI1    2862
Name: count, dtype: int64

In [104]:
train_df = df[
    (df["tanggal"] >= "2010-01-01") &
    (df["tanggal"] <= "2024-06-30")
].copy()

val_df = df[
    (df["tanggal"] >= "2024-07-01") &
    (df["tanggal"] <= "2024-12-31")
].copy()

print("Train:", train_df.shape)
print("Val  :", val_df.shape)


Train: (13135, 41)
Val  : (918, 41)


In [105]:
def add_features(d):
    d = d.copy()

    d["lag_1"]  = d.groupby("lokasi_clean")["max"].shift(1)
    d["lag_7"]  = d.groupby("lokasi_clean")["max"].shift(7)
    d["lag_30"] = d.groupby("lokasi_clean")["max"].shift(30)

    d["roll_7"]  = d.groupby("lokasi_clean")["max"].shift(1).rolling(7).mean()
    d["roll_30"] = d.groupby("lokasi_clean")["max"].shift(1).rolling(30).mean()

    d["month"] = d["tanggal"].dt.month
    d["month_sin"] = np.sin(2 * np.pi * d["month"] / 12)
    d["month_cos"] = np.cos(2 * np.pi * d["month"] / 12)

    return d

train_df = add_features(train_df)
val_df   = add_features(val_df)


In [106]:
FEATURES = [
    "lag_1", "lag_7", "lag_30",
    "roll_7", "roll_30",
    "month_sin", "month_cos"
]

TARGET = "max"

train_df = train_df.dropna(subset=FEATURES + [TARGET])
val_df   = val_df.dropna(subset=FEATURES + [TARGET])


In [107]:
models = {}

for lokasi in train_df["lokasi_clean"].unique():
    tr = train_df[train_df["lokasi_clean"] == lokasi]
    
    X_tr = tr[FEATURES]
    y_tr = tr[TARGET]

    model = XGBRegressor(
        objective="reg:quantileerror",
        quantile_alpha=0.2,
        n_estimators=500,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=SEED,
    )

    model.fit(X_tr, y_tr)
    models[lokasi] = model

    print(f"‚úÖ Trained q20 model for {lokasi}")


‚úÖ Trained q20 model for DKI1
‚úÖ Trained q20 model for DKI2
‚úÖ Trained q20 model for DKI3
‚úÖ Trained q20 model for DKI4
‚úÖ Trained q20 model for DKI5


In [108]:
def ispu_to_kategori(x):
    if x <= 50:
        return "BAIK"
    elif x <= 100:
        return "SEDANG"
    else:
        return "TIDAK SEHAT"

for lokasi, model in models.items():
    val_loc = val_df[val_df["lokasi_clean"] == lokasi]

    X_val = val_loc[FEATURES]
    y_val = val_loc[TARGET]

    preds = model.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, preds))
    mae  = mean_absolute_error(y_val, preds)

    print(f"\n===== {lokasi} =====")
    print("RMSE:", rmse)
    print("MAE :", mae)

    y_true_cat = y_val.apply(ispu_to_kategori)
    y_pred_cat = pd.Series(preds).apply(ispu_to_kategori)

    print(classification_report(y_true_cat, y_pred_cat, zero_division=0))



===== DKI1 =====
RMSE: 17.318224670910045
MAE : 13.804281160428927
              precision    recall  f1-score   support

        BAIK       0.26      0.88      0.40         8
      SEDANG       0.91      0.85      0.88       136
 TIDAK SEHAT       0.00      0.00      0.00        10

    accuracy                           0.80       154
   macro avg       0.39      0.58      0.43       154
weighted avg       0.82      0.80      0.80       154


===== DKI2 =====
RMSE: 15.472352077285233
MAE : 11.733249193662173
              precision    recall  f1-score   support

        BAIK       0.69      0.75      0.72        12
      SEDANG       0.89      0.97      0.93       129
 TIDAK SEHAT       0.00      0.00      0.00        13

    accuracy                           0.87       154
   macro avg       0.53      0.57      0.55       154
weighted avg       0.80      0.87      0.83       154


===== DKI3 =====
RMSE: 15.963151129320623
MAE : 12.61019488565283
              precision    recall  

In [109]:
sample_path = find_file("sample_submission.csv")
if sample_path is None:
    raise FileNotFoundError("‚ùå sample_submission.csv not found")

df_sub = pd.read_csv(sample_path)

df_sub["tanggal"] = pd.to_datetime(df_sub["id"].str[:10])
df_sub["lokasi_clean"] = df_sub["id"].str.split("_").str[-1]

df_sub.head()


Unnamed: 0,id,category,tanggal,lokasi_clean
0,2025-09-01_DKI1,,2025-09-01,DKI1
1,2025-09-01_DKI2,,2025-09-01,DKI2
2,2025-09-01_DKI3,,2025-09-01,DKI3
3,2025-09-01_DKI4,,2025-09-01,DKI4
4,2025-09-01_DKI5,,2025-09-01,DKI5


In [110]:
all_preds = []

for lokasi, model in models.items():
    hist = (
        df[df["lokasi_clean"] == lokasi]
        .sort_values("tanggal")
        .copy()
    )

    series = hist["max"].tolist()

    future = (
        df_sub[df_sub["lokasi_clean"] == lokasi]
        .sort_values("tanggal")
        .copy()
    )

    preds = []

    for _, row in future.iterrows():
        feat = {}

        feat["lag_1"]  = series[-1]
        feat["lag_7"]  = series[-7]
        feat["lag_30"] = series[-30]

        feat["roll_7"]  = np.mean(series[-7:])
        feat["roll_30"] = np.mean(series[-30:])

        m = row["tanggal"].month
        feat["month_sin"] = np.sin(2 * np.pi * m / 12)
        feat["month_cos"] = np.cos(2 * np.pi * m / 12)

        X = pd.DataFrame([feat])[FEATURES]
        pred = model.predict(X)[0]

        series.append(pred)

        preds.append({
            "id": row["id"],
            "pred_max": pred
        })

    all_preds.append(pd.DataFrame(preds))

df_preds = pd.concat(all_preds, ignore_index=True)
df_preds.head()


Unnamed: 0,id,pred_max
0,2025-09-01_DKI1,73.172043
1,2025-09-02_DKI1,79.09446
2,2025-09-03_DKI1,81.515457
3,2025-09-04_DKI1,74.478813
4,2025-09-05_DKI1,74.103149


In [111]:
df_preds["kategori"] = df_preds["pred_max"].apply(ispu_to_kategori)

df_preds["kategori"].value_counts()


kategori
BAIK      236
SEDANG    219
Name: count, dtype: int64

In [113]:
submission = (
    df_preds[["id", "kategori"]]
    .sort_values("id")
    .reset_index(drop=True)
)

out_path = Path.cwd() / "eksjibus_v1.csv"
submission.to_csv(out_path, index=False)

print("‚úÖ eksjibus_v1.csv saved to:", out_path)
submission.head()


‚úÖ eksjibus_v1.csv saved to: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\matematic_model\xgboost\eksjibus_v1.csv


Unnamed: 0,id,kategori
0,2025-09-01_DKI1,SEDANG
1,2025-09-01_DKI2,SEDANG
2,2025-09-01_DKI3,SEDANG
3,2025-09-01_DKI4,SEDANG
4,2025-09-01_DKI5,SEDANG
