In [1]:
# CELL - 1 "imports"
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display




NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column




In [2]:
# CELL - 2 "load data"
path = find_file("merged_libur_cuaca_ispu_ndvi.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,is_libur
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,1
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,1
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,0


In [3]:
# CELL 1.1 - global random seed (REPRODUCIBILITY)

SEED = 42

import os
import random
import numpy as np

# python
random.seed(SEED)

# numpy
np.random.seed(SEED)

# hash seed (important for Python >=3.11)
os.environ["PYTHONHASHSEED"] = str(SEED)


In [4]:
# CELL 3 - datetime & sorting

df["tanggal"] = pd.to_datetime(df["tanggal"])
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

display(df[["tanggal", "lokasi_clean", "kategori"]].head())


Unnamed: 0,tanggal,lokasi_clean,kategori
0,2010-01-01,DKI1,SEDANG
1,2010-01-02,DKI1,BAIK
2,2010-01-03,DKI1,BAIK
3,2010-01-04,DKI1,BAIK
4,2010-01-05,DKI1,BAIK


In [None]:
# CELL 4 - kategori <-> ordinal proxy (FINAL: 3 classes only)

KATEGORI_TO_ISPU = {
    "BAIK": 25,
    "SEDANG": 75,
    "TIDAK SEHAT": 150,
}

ISPU_TO_KATEGORI_THRESHOLDS = [
    (50, "BAIK"),
    (100, "SEDANG"),
    (np.inf, "TIDAK SEHAT"),
]

def ispu_to_kategori(x):
    for t, k in ISPU_TO_KATEGORI_THRESHOLDS:
        if x <= t:
            return k


In [6]:
# CELL 5 - encode target

df["ispu_numeric"] = df["kategori"].map(KATEGORI_TO_ISPU)

display(df[["kategori", "ispu_numeric"]].drop_duplicates())


Unnamed: 0,kategori,ispu_numeric
0,SEDANG,75.0
1,BAIK,25.0
97,TIDAK SEHAT,150.0
2947,SANGAT TIDAK SEHAT,
3206,BERBAHAYA,


In [7]:
# CELL 6 - drop unusable rows

df = df.dropna(subset=["ispu_numeric", "lokasi_clean", "tanggal"])
df = df.reset_index(drop=True)

print("Rows remaining:", len(df))


Rows remaining: 15053


In [8]:
# CELL 7 - calendar features

df["month"] = df["tanggal"].dt.month
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)

display(df[["tanggal", "month_sin", "month_cos", "is_libur"]].head())


Unnamed: 0,tanggal,month_sin,month_cos,is_libur
0,2010-01-01,0.5,0.866025,1
1,2010-01-02,0.5,0.866025,1
2,2010-01-03,0.5,0.866025,1
3,2010-01-04,0.5,0.866025,0
4,2010-01-05,0.5,0.866025,0


In [9]:
# CELL 8 - lag features

LAGS = [1, 7, 30, 365]

for lag in LAGS:
    df[f"ispu_lag_{lag}"] = (
        df.groupby("lokasi_clean")["ispu_numeric"]
          .shift(lag)
    )

display(df.filter(regex="lag").head(20))


Unnamed: 0,ispu_lag_1,ispu_lag_7,ispu_lag_30,ispu_lag_365
0,,,,
1,75.0,,,
2,25.0,,,
3,25.0,,,
4,25.0,,,
5,25.0,,,
6,25.0,,,
7,25.0,75.0,,
8,75.0,25.0,,
9,75.0,25.0,,


In [10]:
# CELL 9 - rolling features

df["ispu_roll_7"] = (
    df.groupby("lokasi_clean")["ispu_numeric"]
      .shift(1)
      .rolling(7)
      .mean()
)

df["ispu_roll_30"] = (
    df.groupby("lokasi_clean")["ispu_numeric"]
      .shift(1)
      .rolling(30)
      .mean()
)


In [11]:
# CELL 10 - modeling frame

FEATURES = [
    "is_libur",
    "month_sin",
    "month_cos",
    "ispu_lag_1",
    "ispu_lag_7",
    "ispu_lag_30",
    "ispu_lag_365",
    "ispu_roll_7",
    "ispu_roll_30",
]

TARGET = "ispu_numeric"

df_model = df.dropna(subset=FEATURES + [TARGET]).reset_index(drop=True)

print("Model rows:", len(df_model))
display(df_model[FEATURES + [TARGET]].head())


Model rows: 13228


Unnamed: 0,is_libur,month_sin,month_cos,ispu_lag_1,ispu_lag_7,ispu_lag_30,ispu_lag_365,ispu_roll_7,ispu_roll_30,ispu_numeric
0,1,0.866025,-0.5,25.0,25.0,25.0,75.0,25.0,30.0,25.0
1,1,0.866025,-0.5,25.0,25.0,25.0,25.0,25.0,30.0,150.0
2,1,0.866025,-0.5,150.0,25.0,25.0,25.0,42.857143,34.166667,75.0
3,0,0.866025,-0.5,75.0,25.0,25.0,25.0,50.0,35.833333,75.0
4,0,0.866025,-0.5,75.0,25.0,25.0,25.0,57.142857,37.5,75.0


In [12]:
# CELL 11A - split A

train_A = df_model[
    (df_model["tanggal"] >= "2023-01-01") &
    (df_model["tanggal"] <= "2024-12-31")
]

test_A = df_model[
    (df_model["tanggal"] >= "2025-01-01") &
    (df_model["tanggal"] <= "2025-08-31")
]

print("Train A:", train_A.shape)
print("Test  A:", test_A.shape)


Train A: (3624, 50)
Test  A: (1203, 50)


In [13]:
# CELL 11B - split B (best practice)

train_B = df_model[
    (df_model["tanggal"] >= "2023-01-01") &
    (df_model["tanggal"] <= "2024-06-30")
]

val_B = df_model[
    (df_model["tanggal"] >= "2024-07-01") &
    (df_model["tanggal"] <= "2024-12-31")
]

print("Train B:", train_B.shape)
print("Val   B:", val_B.shape)


Train B: (2706, 50)
Val   B: (918, 50)


In [14]:
# CELL 12 - XGBoost Regressor

from xgboost import XGBRegressor

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    f1_score,
    classification_report,
    confusion_matrix,
)

xgb = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=SEED,
)


In [15]:
# CELL 13A - evaluation split A (FULL, SAFE)

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    classification_report,
    confusion_matrix,
)

# =========================
# Prepare train / test
# =========================
X_train = train_A[FEATURES]
y_train = train_A[TARGET]

X_test = test_A[FEATURES]
y_test = test_A[TARGET]

# =========================
# Train model
# =========================
xgb.fit(X_train, y_train)

# =========================
# Predict (numeric proxy)
# =========================
y_pred_num = xgb.predict(X_test)

# =========================
# Regression metrics
# =========================
rmse = np.sqrt(mean_squared_error(y_test, y_pred_num))
mae = mean_absolute_error(y_test, y_pred_num)

print("===== REGRESSION METRICS =====")
print(f"RMSE : {rmse:.3f}")
print(f"MAE  : {mae:.3f}")

# =========================
# Convert to kategori
# =========================
y_test_cat = y_test.apply(ispu_to_kategori)
y_pred_cat = pd.Series(y_pred_num, index=y_test.index).apply(ispu_to_kategori)

# =========================
# Classification metrics
# =========================
print("\n===== CLASSIFICATION REPORT =====")
print(classification_report(y_test_cat, y_pred_cat, zero_division=0))

print("\n===== CONFUSION MATRIX =====")
labels = sorted(y_test_cat.unique())

cm = confusion_matrix(y_test_cat, y_pred_cat, labels=labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"True_{l}" for l in labels],
    columns=[f"Pred_{l}" for l in labels],
)

display(cm_df)


===== REGRESSION METRICS =====
RMSE : 31.086
MAE  : 23.630

===== CLASSIFICATION REPORT =====
              precision    recall  f1-score   support

        BAIK       0.58      0.32      0.41       213
      SEDANG       0.74      0.78      0.76       849
 TIDAK SEHAT       0.25      0.33      0.28       141

    accuracy                           0.64      1203
   macro avg       0.52      0.48      0.48      1203
weighted avg       0.65      0.64      0.64      1203


===== CONFUSION MATRIX =====


Unnamed: 0,Pred_BAIK,Pred_SEDANG,Pred_TIDAK SEHAT
True_BAIK,68,143,2
True_SEDANG,50,659,140
True_TIDAK SEHAT,0,94,47


In [16]:
# CELL 13B - evaluation split B (FULL, SAFE)

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    classification_report,
    confusion_matrix,
)

# =========================
# Prepare train / validation
# =========================
X_train = train_B[FEATURES]
y_train = train_B[TARGET]

X_val = val_B[FEATURES]
y_val = val_B[TARGET]

# =========================
# Train model
# =========================
xgb.fit(X_train, y_train)

# =========================
# Predict (numeric proxy)
# =========================
y_val_num = xgb.predict(X_val)

# =========================
# Regression metrics
# =========================
rmse = np.sqrt(mean_squared_error(y_val, y_val_num))
mae = mean_absolute_error(y_val, y_val_num)

print("===== REGRESSION METRICS =====")
print(f"RMSE : {rmse:.3f}")
print(f"MAE  : {mae:.3f}")

# =========================
# Convert to kategori
# =========================
y_val_cat = y_val.apply(ispu_to_kategori)
y_pred_cat = pd.Series(y_val_num, index=y_val.index).apply(ispu_to_kategori)

# =========================
# Classification metrics
# =========================
print("\n===== CLASSIFICATION REPORT =====")
print(classification_report(y_val_cat, y_pred_cat, zero_division=0))

print("\n===== CONFUSION MATRIX =====")
labels = sorted(y_val_cat.unique())

cm = confusion_matrix(y_val_cat, y_pred_cat, labels=labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"True_{l}" for l in labels],
    columns=[f"Pred_{l}" for l in labels],
)

display(cm_df)


===== REGRESSION METRICS =====
RMSE : 27.036
MAE  : 18.122

===== CLASSIFICATION REPORT =====
              precision    recall  f1-score   support

        BAIK       0.76      0.33      0.46        89
      SEDANG       0.83      0.91      0.87       735
 TIDAK SEHAT       0.22      0.18      0.20        94

    accuracy                           0.78       918
   macro avg       0.61      0.47      0.51       918
weighted avg       0.76      0.78      0.76       918


===== CONFUSION MATRIX =====


Unnamed: 0,Pred_BAIK,Pred_SEDANG,Pred_TIDAK SEHAT
True_BAIK,29,60,0
True_SEDANG,9,667,59
True_TIDAK SEHAT,0,77,17


In [17]:
# CELL 14 - lokasi list

lokasi_list = sorted(df_model["lokasi_clean"].unique())
lokasi_list


['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']

In [18]:
# CELL 15 - evaluation helper

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    classification_report,
    confusion_matrix,
)

def evaluate_model_per_lokasi(
    model,
    train_df,
    test_df,
    features,
    target,
    title=""
):
    X_train = train_df[features]
    y_train = train_df[target]

    X_test = test_df[features]
    y_test = test_df[target]

    model.fit(X_train, y_train)
    y_pred_num = model.predict(X_test)

    # regression metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_num))
    mae = mean_absolute_error(y_test, y_pred_num)

    print(f"\n===== {title} =====")
    print(f"RMSE : {rmse:.3f}")
    print(f"MAE  : {mae:.3f}")

    # classification metrics
    y_test_cat = y_test.apply(ispu_to_kategori)
    y_pred_cat = pd.Series(y_pred_num, index=y_test.index).apply(ispu_to_kategori)

    print("\nClassification Report:")
    print(classification_report(y_test_cat, y_pred_cat, zero_division=0))

    labels = sorted(y_test_cat.unique())
    cm = confusion_matrix(y_test_cat, y_pred_cat, labels=labels)

    cm_df = pd.DataFrame(
        cm,
        index=[f"True_{l}" for l in labels],
        columns=[f"Pred_{l}" for l in labels],
    )

    display(cm_df)


In [19]:
# CELL 16 - define split B function

def split_rolling(df_lokasi):
    train_df = df_lokasi[
        (df_lokasi["tanggal"] >= "2010-01-01") &
        (df_lokasi["tanggal"] <= "2024-06-30")
    ]

    val_df = df_lokasi[
        (df_lokasi["tanggal"] >= "2024-07-01") &
        (df_lokasi["tanggal"] <= "2024-12-31")
    ]

    return train_df, val_df


In [20]:
# CELL 17 - train & evaluate per lokasi_clean (Split B)

from xgboost import XGBRegressor

for lokasi in lokasi_list:
    print(f"\n\n############################")
    print(f"### LOKASI: {lokasi}")
    print(f"############################")

    df_lokasi = df_model[df_model["lokasi_clean"] == lokasi]

    # split
    train_df, val_df = split_rolling(df_lokasi)

    print("Train rows:", len(train_df))
    print("Val rows  :", len(val_df))

    if len(train_df) < 200 or len(val_df) < 50:
        print("⚠️ Not enough data, skipping...")
        continue

    # model (same config for fairness)
    model = XGBRegressor(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42,
    )

    evaluate_model_per_lokasi(
        model=model,
        train_df=train_df,
        test_df=val_df,
        features=FEATURES,
        target=TARGET,
        title=f"{lokasi} (Rolling Validation)",
    )




############################
### LOKASI: DKI1
############################
Train rows: 2072
Val rows  : 184

===== DKI1 (Rolling Validation) =====
RMSE : 23.068
MAE  : 14.324

Classification Report:
              precision    recall  f1-score   support

        BAIK       0.71      0.52      0.60        29
      SEDANG       0.85      0.91      0.88       145
 TIDAK SEHAT       0.12      0.10      0.11        10

    accuracy                           0.80       184
   macro avg       0.56      0.51      0.53       184
weighted avg       0.79      0.80      0.79       184



Unnamed: 0,Pred_BAIK,Pred_SEDANG,Pred_TIDAK SEHAT
True_BAIK,15,14,0
True_SEDANG,6,132,7
True_TIDAK SEHAT,0,9,1




############################
### LOKASI: DKI2
############################
Train rows: 2173
Val rows  : 184

===== DKI2 (Rolling Validation) =====
RMSE : 24.508
MAE  : 17.469

Classification Report:
              precision    recall  f1-score   support

        BAIK       0.80      0.33      0.47        12
      SEDANG       0.86      0.92      0.89       153
 TIDAK SEHAT       0.27      0.21      0.24        19

    accuracy                           0.81       184
   macro avg       0.64      0.49      0.53       184
weighted avg       0.79      0.81      0.79       184



Unnamed: 0,Pred_BAIK,Pred_SEDANG,Pred_TIDAK SEHAT
True_BAIK,4,8,0
True_SEDANG,1,141,11
True_TIDAK SEHAT,0,15,4




############################
### LOKASI: DKI3
############################
Train rows: 2148
Val rows  : 183

===== DKI3 (Rolling Validation) =====
RMSE : 24.551
MAE  : 14.701

Classification Report:
              precision    recall  f1-score   support

        BAIK       0.00      0.00      0.00         3
      SEDANG       0.90      0.89      0.90       166
 TIDAK SEHAT       0.05      0.07      0.06        14

    accuracy                           0.81       183
   macro avg       0.32      0.32      0.32       183
weighted avg       0.82      0.81      0.82       183



Unnamed: 0,Pred_BAIK,Pred_SEDANG,Pred_TIDAK SEHAT
True_BAIK,0,3,0
True_SEDANG,0,148,18
True_TIDAK SEHAT,0,13,1




############################
### LOKASI: DKI4
############################
Train rows: 2564
Val rows  : 183

===== DKI4 (Rolling Validation) =====
RMSE : 21.184
MAE  : 13.071

Classification Report:
              precision    recall  f1-score   support

        BAIK       0.80      0.40      0.53        10
      SEDANG       0.90      0.96      0.93       161
 TIDAK SEHAT       0.00      0.00      0.00        12

    accuracy                           0.87       183
   macro avg       0.57      0.45      0.49       183
weighted avg       0.83      0.87      0.85       183



Unnamed: 0,Pred_BAIK,Pred_SEDANG,Pred_TIDAK SEHAT
True_BAIK,4,6,0
True_SEDANG,1,155,5
True_TIDAK SEHAT,0,12,0




############################
### LOKASI: DKI5
############################
Train rows: 2150
Val rows  : 184

===== DKI5 (Rolling Validation) =====
RMSE : 35.739
MAE  : 25.529

Classification Report:
              precision    recall  f1-score   support

        BAIK       0.85      0.63      0.72        35
      SEDANG       0.67      0.82      0.74       110
 TIDAK SEHAT       0.25      0.15      0.19        39

    accuracy                           0.64       184
   macro avg       0.59      0.53      0.55       184
weighted avg       0.62      0.64      0.62       184



Unnamed: 0,Pred_BAIK,Pred_SEDANG,Pred_TIDAK SEHAT
True_BAIK,22,11,2
True_SEDANG,4,90,16
True_TIDAK SEHAT,0,33,6


In [21]:
# CELL 16 - final rolling split (LOCKED)

def split_final(df_lokasi):
    train_df = df_lokasi[
        (df_lokasi["tanggal"] >= "2010-01-01") &
        (df_lokasi["tanggal"] <= "2025-08-31")
    ]
    return train_df


In [22]:
# CELL 17 - lokasi list

lokasi_list = sorted(df_model["lokasi_clean"].unique())
print(lokasi_list)


['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']


In [23]:
# CELL 18 - model factory (XGBoost, FINAL)

from xgboost import XGBRegressor

def make_model(seed):
    return XGBRegressor(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=seed,
    )


In [24]:
# CELL 19 - retrain per lokasi_clean (FINAL)

final_models = {}

for lokasi in lokasi_list:
    print(f"\n############################")
    print(f"### TRAIN FINAL MODEL: {lokasi}")
    print(f"############################")

    df_lokasi = df_model[df_model["lokasi_clean"] == lokasi]

    train_df = split_final(df_lokasi)

    print("Train rows:", len(train_df))

    if len(train_df) < 500:
        print("⚠️ Too little data, skipping...")
        continue

    X_train = train_df[FEATURES]
    y_train = train_df[TARGET]

    model = make_model(SEED)
    model.fit(X_train, y_train)

    final_models[lokasi] = model

print("\n✅ Training completed for models:")
print(list(final_models.keys()))



############################
### TRAIN FINAL MODEL: DKI1
############################
Train rows: 2498

############################
### TRAIN FINAL MODEL: DKI2
############################
Train rows: 2598

############################
### TRAIN FINAL MODEL: DKI3
############################
Train rows: 2573

############################
### TRAIN FINAL MODEL: DKI4
############################
Train rows: 2984

############################
### TRAIN FINAL MODEL: DKI5
############################
Train rows: 2575

✅ Training completed for models:
['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']


In [25]:
# CELL 20 - sanity check prediction

for lokasi, model in final_models.items():
    sample_row = (
        df_model[df_model["lokasi_clean"] == lokasi]
        .sort_values("tanggal")
        .iloc[-1:]
    )

    pred_num = model.predict(sample_row[FEATURES])[0]
    pred_cat = ispu_to_kategori(pred_num)

    print(f"{lokasi} → numeric: {pred_num:.2f}, kategori: {pred_cat}")


DKI1 → numeric: 80.20, kategori: SEDANG
DKI2 → numeric: 85.66, kategori: SEDANG
DKI3 → numeric: 83.14, kategori: SEDANG
DKI4 → numeric: 82.40, kategori: SEDANG
DKI5 → numeric: 73.86, kategori: SEDANG


In [26]:
# CELL 21 - imports for saving models

import joblib
from pathlib import Path


In [27]:
# CELL 22 - output directory for PKL

MODELS_DIR = Path.cwd() / "models_pkl"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print("Models will be saved to:", MODELS_DIR)


Models will be saved to: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\matematic_model\markov_fresh\models_pkl


In [28]:
# CELL 23 - save PKL per lokasi_clean

saved_paths = {}

for lokasi, model in final_models.items():
    pkl_path = MODELS_DIR / f"xgb_ispu_{lokasi}.pkl"
    joblib.dump(model, pkl_path)
    saved_paths[lokasi] = pkl_path
    print(f"✅ Saved {lokasi} model → {pkl_path.name}")

print("\nSaved models:")
for k, v in saved_paths.items():
    print(f"{k}: {v}")


✅ Saved DKI1 model → xgb_ispu_DKI1.pkl
✅ Saved DKI2 model → xgb_ispu_DKI2.pkl
✅ Saved DKI3 model → xgb_ispu_DKI3.pkl
✅ Saved DKI4 model → xgb_ispu_DKI4.pkl
✅ Saved DKI5 model → xgb_ispu_DKI5.pkl

Saved models:
DKI1: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\matematic_model\markov_fresh\models_pkl\xgb_ispu_DKI1.pkl
DKI2: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\matematic_model\markov_fresh\models_pkl\xgb_ispu_DKI2.pkl
DKI3: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\matematic_model\markov_fresh\models_pkl\xgb_ispu_DKI3.pkl
DKI4: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\matematic_model\markov_fresh\models_pkl\xgb_ispu_DKI4.pkl
DKI5: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\matematic_model\markov_fresh\models_pkl\xgb_ispu_DKI5.pkl


In [29]:
# CELL 24 - verify PKL integrity

for lokasi, pkl_path in saved_paths.items():
    loaded_model = joblib.load(pkl_path)

    sample_row = (
        df_model[df_model["lokasi_clean"] == lokasi]
        .sort_values("tanggal")
        .iloc[-1:]
    )

    pred_num = loaded_model.predict(sample_row[FEATURES])[0]
    pred_cat = ispu_to_kategori(pred_num)

    print(f"{lokasi} | loaded PKL → numeric: {pred_num:.2f}, kategori: {pred_cat}")


DKI1 | loaded PKL → numeric: 80.20, kategori: SEDANG
DKI2 | loaded PKL → numeric: 85.66, kategori: SEDANG
DKI3 | loaded PKL → numeric: 83.14, kategori: SEDANG
DKI4 | loaded PKL → numeric: 82.40, kategori: SEDANG
DKI5 | loaded PKL → numeric: 73.86, kategori: SEDANG


In [30]:
# CELL 25 - load sample submission

path_sub = find_file("sample_submission.csv")
if path_sub is None:
    raise FileNotFoundError("❌ sample_submission.csv not found")

df_sub = pd.read_csv(path_sub)
display(df_sub.head())


Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


In [31]:
# CELL 26 - parse tanggal & lokasi from id

df_sub["tanggal"] = pd.to_datetime(df_sub["id"].str[:10])
df_sub["lokasi_clean"] = df_sub["id"].str.split("_").str[1]

display(df_sub.head())


Unnamed: 0,id,category,tanggal,lokasi_clean
0,2025-09-01_DKI1,,2025-09-01,DKI1
1,2025-09-01_DKI2,,2025-09-01,DKI2
2,2025-09-01_DKI3,,2025-09-01,DKI3
3,2025-09-01_DKI4,,2025-09-01,DKI4
4,2025-09-01_DKI5,,2025-09-01,DKI5


In [32]:
# CELL 27 - attach is_libur for future dates (FIXED)

# load libur_processed.csv
path_libur = find_file("libur_processed.csv")
if path_libur is None:
    raise FileNotFoundError("❌ libur_processed.csv not found")

df_libur_processed = pd.read_csv(path_libur)

# ensure datetime
df_libur_processed["tanggal"] = pd.to_datetime(df_libur_processed["tanggal"])

# merge with submission frame
df_future = df_sub.merge(
    df_libur_processed[["tanggal", "is_libur"]],
    on="tanggal",
    how="left"
)

# fill missing (non-holiday)
df_future["is_libur"] = df_future["is_libur"].fillna(0).astype(int)

display(df_future.head())
df_future["is_libur"].value_counts()



Unnamed: 0,id,category,tanggal,lokasi_clean,is_libur
0,2025-09-01_DKI1,,2025-09-01,DKI1,0
1,2025-09-01_DKI2,,2025-09-01,DKI2,0
2,2025-09-01_DKI3,,2025-09-01,DKI3,0
3,2025-09-01_DKI4,,2025-09-01,DKI4,0
4,2025-09-01_DKI5,,2025-09-01,DKI5,0


is_libur
0    320
1    135
Name: count, dtype: int64

In [33]:
# CELL 28 - calendar features

df_future["month"] = df_future["tanggal"].dt.month
df_future["month_sin"] = np.sin(2 * np.pi * df_future["month"] / 12)
df_future["month_cos"] = np.cos(2 * np.pi * df_future["month"] / 12)

display(df_future.head())


Unnamed: 0,id,category,tanggal,lokasi_clean,is_libur,month,month_sin,month_cos
0,2025-09-01_DKI1,,2025-09-01,DKI1,0,9,-1.0,-1.83697e-16
1,2025-09-01_DKI2,,2025-09-01,DKI2,0,9,-1.0,-1.83697e-16
2,2025-09-01_DKI3,,2025-09-01,DKI3,0,9,-1.0,-1.83697e-16
3,2025-09-01_DKI4,,2025-09-01,DKI4,0,9,-1.0,-1.83697e-16
4,2025-09-01_DKI5,,2025-09-01,DKI5,0,9,-1.0,-1.83697e-16


In [34]:
# CELL 29 - lag & rolling features from history

future_rows = []

for lokasi in df_future["lokasi_clean"].unique():
    hist = (
        df_model[df_model["lokasi_clean"] == lokasi]
        .sort_values("tanggal")
        .copy()
    )

    if len(hist) < 365:
        raise ValueError(f"Not enough history for {lokasi}")

    for _, row in df_future[df_future["lokasi_clean"] == lokasi].iterrows():
        new_row = row.copy()

        # lag features
        new_row["ispu_lag_1"] = hist.iloc[-1]["ispu_numeric"]
        new_row["ispu_lag_7"] = hist.iloc[-7]["ispu_numeric"]
        new_row["ispu_lag_30"] = hist.iloc[-30]["ispu_numeric"]
        new_row["ispu_lag_365"] = hist.iloc[-365]["ispu_numeric"]

        # rolling features
        new_row["ispu_roll_7"] = hist.iloc[-7:]["ispu_numeric"].mean()
        new_row["ispu_roll_30"] = hist.iloc[-30:]["ispu_numeric"].mean()

        future_rows.append(new_row)

df_future_feat = pd.DataFrame(future_rows)
display(df_future_feat.head())


Unnamed: 0,id,category,tanggal,lokasi_clean,is_libur,month,month_sin,month_cos,ispu_lag_1,ispu_lag_7,ispu_lag_30,ispu_lag_365,ispu_roll_7,ispu_roll_30
0,2025-09-01_DKI1,,2025-09-01,DKI1,0,9,-1.0,-1.83697e-16,75.0,150.0,75.0,75.0,85.714286,87.5
5,2025-09-02_DKI1,,2025-09-02,DKI1,0,9,-1.0,-1.83697e-16,75.0,150.0,75.0,75.0,85.714286,87.5
10,2025-09-03_DKI1,,2025-09-03,DKI1,0,9,-1.0,-1.83697e-16,75.0,150.0,75.0,75.0,85.714286,87.5
15,2025-09-04_DKI1,,2025-09-04,DKI1,0,9,-1.0,-1.83697e-16,75.0,150.0,75.0,75.0,85.714286,87.5
20,2025-09-05_DKI1,,2025-09-05,DKI1,1,9,-1.0,-1.83697e-16,75.0,150.0,75.0,75.0,85.714286,87.5


In [35]:
# CELL 30 - load PKLs

import joblib
from pathlib import Path

MODELS_DIR = Path.cwd() / "models_pkl"

models = {}
for lokasi in df_future_feat["lokasi_clean"].unique():
    pkl_path = MODELS_DIR / f"xgb_ispu_{lokasi}.pkl"
    if not pkl_path.exists():
        raise FileNotFoundError(f"Model not found: {pkl_path}")
    models[lokasi] = joblib.load(pkl_path)
    print(f"Loaded model for {lokasi}")


Loaded model for DKI1
Loaded model for DKI2
Loaded model for DKI3
Loaded model for DKI4
Loaded model for DKI5


In [36]:
# CELL 31 - predict numeric ISPU

preds = []

for lokasi, model in models.items():
    df_loc = df_future_feat[df_future_feat["lokasi_clean"] == lokasi].copy()
    df_loc["pred_ispu"] = model.predict(df_loc[FEATURES])
    preds.append(df_loc)

df_preds = pd.concat(preds, ignore_index=True)
display(df_preds[["id", "pred_ispu"]].head())


Unnamed: 0,id,pred_ispu
0,2025-09-01_DKI1,74.820763
1,2025-09-02_DKI1,74.820763
2,2025-09-03_DKI1,74.820763
3,2025-09-04_DKI1,74.820763
4,2025-09-05_DKI1,75.704185


In [37]:
# CELL 32 - convert numeric ISPU to kategori

df_preds["kategori"] = df_preds["pred_ispu"].apply(ispu_to_kategori)

display(df_preds[["id", "pred_ispu", "kategori"]].head())


Unnamed: 0,id,pred_ispu,kategori
0,2025-09-01_DKI1,74.820763,SEDANG
1,2025-09-02_DKI1,74.820763,SEDANG
2,2025-09-03_DKI1,74.820763,SEDANG
3,2025-09-04_DKI1,74.820763,SEDANG
4,2025-09-05_DKI1,75.704185,SEDANG


In [38]:
# CELL 33 - write submission.csv (FINAL)

submission = (
    df_preds[["id", "kategori"]]
    .sort_values("id")
    .reset_index(drop=True)
)

out_path = Path.cwd() / "submission.csv"
submission.to_csv(out_path, index=False)

print("✅ submission.csv saved to:", out_path)
display(submission.head())


✅ submission.csv saved to: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\matematic_model\markov_fresh\submission.csv


Unnamed: 0,id,kategori
0,2025-09-01_DKI1,SEDANG
1,2025-09-01_DKI2,SEDANG
2,2025-09-01_DKI3,SEDANG
3,2025-09-01_DKI4,SEDANG
4,2025-09-01_DKI5,SEDANG


In [39]:
submission["kategori"].value_counts()


kategori
SEDANG    455
Name: count, dtype: int64