# Datavidia ISPU Prediction - Fixed Version

Notebook ini memperbaiki error `AttributeError: Can only use .dt accessor with datetimelike values` dan mengatasi masalah **Data Leakage**.

### Perubahan Utama:
1. **Datetime Fix**: Penambahan konversi `pd.to_datetime` yang lebih aman sebelum pembuatan fitur.
2. **Anti-Leakage**: Model hanya melihat data kemarin (t-1) dan sebelumnya untuk memprediksi hari ini.
3. **Submission Logic**: Menggunakan merge untuk memastikan urutan ID sesuai dengan `sample_submission.csv`.

In [9]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.metrics import classification_report, f1_score

# =========================
# 1. CONFIGURATION & DATA LOADING
# =========================
# Extra strings that pd.read_csv should treat as missing values.
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]

# Category name -> integer class code (insertion order is the class order).
LABEL_MAP = dict(zip(("BAIK", "SEDANG", "TIDAK SEHAT"), range(3)))

# Integer class code -> category name, used to decode predictions.
INV_LABEL_MAP = {code: label for label, code in LABEL_MAP.items()}

def find_file(name):
    """Return the first file matching *name* at or above the working directory.

    The current working directory is searched recursively first, then each
    of its parent directories (nearest first), also recursively.

    Args:
        name: File name or glob pattern handed to ``Path.rglob``.

    Returns:
        The ``Path`` of the first match, or ``None`` when nothing matches.
    """
    cwd = Path.cwd()
    for root in (cwd, *cwd.parents):
        # Take only the first hit from the lazy glob: collecting every match
        # first would walk the entire subtree (eventually the filesystem
        # root) even though just one result is ever returned.
        first_match = next(root.rglob(name), None)
        if first_match is not None:
            return first_match
    return None

# Locate the input files at or above the working directory.
path_main = find_file("merged_cuaca_ndvi_ispu.csv")
path_sub = find_file("sample_submission.csv")

if path_main is None:
    # The leading emoji of this message had been mis-decoded; restored here.
    raise FileNotFoundError("❌ File merged_cuaca_ndvi_ispu.csv tidak ditemukan!")

df = pd.read_csv(path_main, na_values=NA_VALUES)

# INITIAL PREPROCESSING
df["tanggal"] = pd.to_datetime(df["tanggal"])
# Fold the two most severe categories into "TIDAK SEHAT", the highest class
# defined in LABEL_MAP.
df["kategori"] = df["kategori"].replace({
    "SANGAT TIDAK SEHAT": "TIDAK SEHAT",
    "BERBAHAYA": "TIDAK SEHAT"
})
# NOTE(review): this removes every row without a label. If the rows to be
# predicted for the submission are unlabelled, they are gone from here on and
# no submission ID can match later - confirm against the data.
df = df.dropna(subset=["kategori"])
# Chronological order within each location; the per-location shift/rolling
# features built afterwards depend on this row order.
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)
# Categories that are not keys of LABEL_MAP become NaN in "y".
df["y"] = df["kategori"].map(LABEL_MAP)

print(f"Data loaded: {len(df)} rows")

Data loaded: 15257 rows


In [10]:
def create_features(data):
    """Add calendar features and lagged / rolling features per location.

    For every pollutant or weather column present in *data*, the following
    columns are added, computed within each ``lokasi_clean`` group and in the
    row order of *data*:

    * ``<col>_lag_1`` / ``<col>_lag_2``: the value one / two rows earlier;
    * ``<col>_roll7``: mean over the 7 preceding rows (current row excluded),
      NaN unless at least 3 of them are non-missing.

    Rows are not re-sorted here, so the caller must pass *data* ordered by
    date within each location for "previous row" to mean "previous
    observation".

    Args:
        data: DataFrame with ``tanggal`` and ``lokasi_clean`` columns plus any
            subset of the pollutant / weather columns listed in the body.

    Returns:
        A copy of *data* with the new columns appended; the input frame is
        left untouched.
    """
    data = data.copy()

    # Ensure a datetime dtype so the .dt accessor works.
    data["tanggal"] = pd.to_datetime(data["tanggal"])

    # Calendar features
    data["month"] = data["tanggal"].dt.month
    data["day_of_week"] = data["tanggal"].dt.dayofweek
    data["is_weekend"] = data["day_of_week"].isin([5, 6]).astype(int)

    # Base columns to derive history features from
    POLLUTANTS = ["pm_sepuluh", "sulfur_dioksida", "karbon_monoksida", "ozon", "nitrogen_dioksida"]
    # The temperature key must contain the real degree sign: a mis-decoded
    # spelling never matches the column, and the filter below would then
    # silently drop the feature.
    WEATHER = [
        "temperature_2m_mean (°C)", "relative_humidity_2m_mean (%)",
        "precipitation_sum (mm)", "wind_speed_10m_mean (km/h)", "ndvi"
    ]
    # Columns missing from the input are skipped rather than raising.
    COLS_TO_SHIFT = [c for c in POLLUTANTS + WEATHER if c in data.columns]

    for col in COLS_TO_SHIFT:
        # Group once per column and reuse it for all three derived features.
        per_location = data.groupby("lokasi_clean")[col]

        # Lags: values from 1 and 2 rows back, never the current row
        # (anti-leakage).
        data[f"{col}_lag_1"] = per_location.shift(1)
        data[f"{col}_lag_2"] = per_location.shift(2)

        # 7-row rolling mean; shift(1) first so the window ends at the
        # previous row.
        data[f"{col}_roll7"] = per_location.transform(
            lambda x: x.shift(1).rolling(7, min_periods=3).mean()
        )

    return data

# The emoji in this progress message had been mis-decoded; restored here.
print("🔨 Building features...")
df_feat = create_features(df)

# Use only lag/rolling and calendar columns as model inputs, NOT the
# same-day base readings.
FEATURES = [
    c for c in df_feat.columns
    if "_lag_" in c or "_roll" in c or c in ("month", "day_of_week", "is_weekend")
]
print(f"Total features: {len(FEATURES)}")

🔨 Building features...
Total features: 33


In [11]:
# =========================
# 3. TRAINING & VALIDATION
# =========================
SPLIT_DATE = "2024-12-31"
# Time-based split: rows strictly before SPLIT_DATE train the model, rows on
# or after it validate it. Rows without a label are excluded from both.
train_mask = (df_feat["tanggal"] < SPLIT_DATE) & (df_feat["y"].notna())
valid_mask = (df_feat["tanggal"] >= SPLIT_DATE) & (df_feat["y"].notna())

X_train, y_train = df_feat.loc[train_mask, FEATURES], df_feat.loc[train_mask, "y"]
X_valid, y_valid = df_feat.loc[valid_mask, FEATURES], df_feat.loc[valid_mask, "y"]

model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    n_estimators=1000,
    learning_rate=0.03,
    class_weight={0: 1.0, 1: 0.8, 2: 4.5},  # heavy penalty for misclassifying TIDAK SEHAT
    random_state=42
)

# NOTE(review): the validation fold drives early stopping and is also the
# fold reported below, so the reported scores are not fully held out.
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

y_pred = model.predict(X_valid)

# Pin the label set explicitly. Without `labels`, classification_report
# raises ValueError whenever a class is missing from the validation fold,
# because the number of observed classes no longer matches `target_names`.
# The same labels are passed to f1_score so both figures average over the
# same three classes.
class_ids = sorted(INV_LABEL_MAP)
class_names = [INV_LABEL_MAP[i] for i in class_ids]

print("\n--- VALIDATION REPORT ---")
print(f"Macro F1: {f1_score(y_valid, y_pred, labels=class_ids, average='macro'):.4f}")
print(classification_report(y_valid, y_pred, labels=class_ids, target_names=class_names))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002641 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5357
[LightGBM] [Info] Number of data points in the train set: 14049, number of used features: 33
[LightGBM] [Info] Start training from score -2.310997
[LightGBM] [Info] Start training from score -1.010063
[LightGBM] [Info] Start training from score -0.622425
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[341]	valid_0's multi_logloss: 0.732343

--- VALIDATION REPORT ---
Macro F1: 0.4979
              precision    recall  f1-score   support

        BAIK       0.55      0.45      0.50       218
      SEDANG       0.75      0.79      0.77       849
 TIDAK SEHAT       0.23      0.22      0.23       141

    accuracy                           0.66      1208
   macro avg       0.51      0.49      0.50      1208
weighted avg       0.65      0.66      0.66      1208

In [12]:
# =========================
# 4. GENERATE SUBMISSION
# =========================
# The emoji prefixes of the three status messages below had been mis-decoded;
# they are restored here.
if path_sub:
    sub = pd.read_csv(path_sub)

    # Pick the rows of the feature table whose ID appears in the sample
    # submission.
    # NOTE(review): df_feat only holds rows that survived the earlier
    # dropna on "kategori"; if the rows to predict are unlabelled, nothing
    # will match here - confirm against the data.
    sub_data = df_feat[df_feat["id"].isin(sub["id"])].copy()
    # Keep one row per ID: a repeated ID in the feature table would multiply
    # rows in the left merge below and break the submission's row count.
    sub_data = sub_data.drop_duplicates(subset="id")

    if not sub_data.empty:
        # Predict and decode class codes back to category names.
        preds = model.predict(sub_data[FEATURES])
        sub_data["category"] = [INV_LABEL_MAP[p] for p in preds]

        # Left-merge onto the sample IDs so the row order matches
        # sample_submission.
        final_sub = sub[["id"]].merge(sub_data[["id", "category"]], on="id", how="left")

        # IDs without a prediction (absent from the main dataset) fall back
        # to "SEDANG".
        final_sub["category"] = final_sub["category"].fillna("SEDANG")

        final_sub.to_csv("submission.csv", index=False)
        print("✅ submission.csv berhasil dibuat!")
    else:
        print("⚠️ ID di sample_submission tidak ditemukan dalam data fitur!")
else:
    print("❌ sample_submission.csv tidak ditemukan.")

⚠️ ID di sample_submission tidak ditemukan dalam data fitur!
