# Datavidia ISPU Prediction - Fixed Version

Notebook ini telah diperbaiki untuk mengatasi **Data Leakage** dan mengoptimalkan **Feature Engineering**. 

### Perbaikan Utama:
1. **Anti-Leakage**: Menggunakan `shift(1)` pada semua fitur polutan dan cuaca sehingga model hanya memprediksi berdasarkan data masa lalu.
2. **Efficient Rolling**: Menggunakan `groupby().transform()` untuk menghitung statistik bergerak (rolling mean) tanpa loop lambat.
3. **Calendar Features**: Menambahkan fitur bulan dan hari dalam seminggu.
4. **Balanced Weights**: Menggunakan `class_weight` untuk meningkatkan akurasi pada kategori minoritas (TIDAK SEHAT).

In [7]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



# Extra tokens that pd.read_csv should treat as missing values; passed as
# `na_values` wherever a CSV is loaded in this notebook.
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Locate ``name`` by searching ``start`` and then each of its ancestors.

    Directories are tried from ``start`` outward towards the filesystem root.
    Each one is searched recursively and the first match found is returned.

    Args:
        name: File name (or glob pattern) handed to ``Path.rglob``.
        start: Directory to begin from, as a ``Path`` or ``str``. ``None``
            (the default) means the current working directory at call time.

    Returns:
        The first matching ``Path``, or ``None`` when nothing matches.
    """
    # Resolve the default here: a `start=Path.cwd()` default is evaluated only
    # once, when the function is defined, and goes stale after a later chdir.
    start = Path.cwd() if start is None else Path(start)
    for ancestor in (start, *start.parents):
        # Only the first hit is used, so do not materialise every match.
        match = next(ancestor.rglob(name), None)
        if match is not None:
            return match
    return None


def find_files(file_map):
    """Resolve each file name in ``file_map`` to a path using ``find_file``.

    Args:
        file_map: Mapping of an arbitrary key to the file name to look up.

    Returns:
        A dict with the same keys mapped to the located paths. Entries whose
        file cannot be located are left out and a warning is printed instead.
    """
    located = {}
    for key, filename in file_map.items():
        hit = find_file(filename)
        if not hit:
            print(f"[WARNING] File not found: {filename}")
            continue
        located[key] = hit
    return located

# Locate the project's EDA helper module so that it can be imported.
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# Put the directory that holds script_eda.py on sys.path. Guarded so that
# re-running this cell does not keep appending the same entry.
eda_dir = str(eda_script_path.parent)
if eda_dir not in sys.path:
    sys.path.append(eda_dir)

# The import only works after the sys.path update above.
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column

# Load the merged weather / NDVI / ISPU table.
path = find_file("merged_cuaca_ndvi_ispu.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,...,cloud_cover_min (%),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,...,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,...,91.0,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,...,81.0,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,,2.0,16.0,15.0,6.0,22.0,...,17.0,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,,2.0,17.0,15.0,8.0,25.0,...,99.0,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023


### 2. Feature Engineering
Kita akan membuat fitur lag dan rolling secara efisien.

In [8]:
def create_features(data, columns=None):
    """Build calendar, lag and rolling-mean features per station.

    For every column in ``columns`` the following are added, computed within
    each ``lokasi_clean`` group in chronological order:

    * ``<col>_lag_1`` / ``_lag_2`` / ``_lag_3``: value 1, 2, 3 rows earlier.
    * ``<col>_roll7``: mean of up to 7 previous rows (at least 3 required).
    * ``<col>_roll3``: mean of up to 3 previous rows (at least 1 required).

    All of them use ``shift(1)`` or more, so a row never sees its own value.
    Lags count rows, not calendar days, so a gap in the dates is not detected.

    Args:
        data: Frame with at least ``tanggal``, ``lokasi_clean`` and the
            columns listed in ``columns``. It is not modified.
        columns: Columns to derive lag/rolling features from. ``None`` (the
            default) uses the built-in pollutant and weather columns.

    Returns:
        A new DataFrame with the same rows, row order and index as ``data``,
        ``tanggal`` converted to datetime, plus ``month``, ``day_of_week``,
        ``is_weekend`` and the lag/rolling columns.
    """
    # Copy before touching anything so the caller's frame is left untouched.
    # A fresh positional index lets the per-station results (computed on a
    # sorted view) align back unambiguously; the original index is restored
    # before returning.
    original_index = data.index
    data = data.copy().reset_index(drop=True)
    data["tanggal"] = pd.to_datetime(data["tanggal"])

    # Calendar features
    data["month"] = data["tanggal"].dt.month
    data["day_of_week"] = data["tanggal"].dt.dayofweek
    data["is_weekend"] = data["day_of_week"].isin([5, 6]).astype(int)

    if columns is None:
        pollutants = ["pm_sepuluh", "sulfur_dioksida", "karbon_monoksida", "ozon", "nitrogen_dioksida"]
        # The temperature column name carries a degree sign; it had been
        # garbled by a wrong text encoding and is restored here.
        weather = [
            "temperature_2m_mean (°C)", "relative_humidity_2m_mean (%)",
            "precipitation_sum (mm)", "wind_speed_10m_mean (km/h)", "ndvi"
        ]
        columns = pollutants + weather

    # shift()/rolling() operate on row order, so rows must be chronological
    # within each station for "one row earlier" to mean "the previous reading".
    ordered = data.sort_values(["lokasi_clean", "tanggal"])
    by_station = ordered.groupby("lokasi_clean", sort=False)

    for col in columns:
        per_station = by_station[col]

        # Lag 1 (previous reading), 2 and 3
        for lag in (1, 2, 3):
            data[f"{col}_lag_{lag}"] = per_station.shift(lag)

        # 7-row rolling mean over readings up to and including the previous one
        data[f"{col}_roll7"] = per_station.transform(
            lambda x: x.shift(1).rolling(7, min_periods=3).mean()
        )

        # 3-row rolling mean over readings up to and including the previous one
        data[f"{col}_roll3"] = per_station.transform(
            lambda x: x.shift(1).rolling(3, min_periods=1).mean()
        )

    data.index = original_index
    return data

print("🔨 Building features...")
df_feat = create_features(df)

# Final feature set: only the lag/rolling columns (past information) plus the
# calendar columns. Selection is by column-name pattern.
FEATURES = [c for c in df_feat.columns if "_lag_" in c or "_roll" in c or c in ["month", "day_of_week", "is_weekend"]]
print(f"Total features used: {len(FEATURES)}")

🔨 Building features...
Total features used: 53


### 3. Model Training & Validation

In [9]:
# Map the target labels to class ids.
# NOTE(review): LABEL_MAP, lgb, f1_score and classification_report are not
# defined or imported in the visible cells of this notebook; confirm that an
# earlier cell provides them before running the notebook top to bottom.
df_feat["y"] = df_feat["kategori"].map(LABEL_MAP)

# Chronological split: rows before SPLIT_DATE train, the rest validate.
SPLIT_DATE = "2024-12-31"

has_label = df_feat["y"].notna()
train_mask = (df_feat["tanggal"] < SPLIT_DATE) & has_label
valid_mask = (df_feat["tanggal"] >= SPLIT_DATE) & has_label

# .map() leaves NaN for labels missing from LABEL_MAP, which turns "y" into a
# float column. Both masks exclude the NaNs, so cast back to int here to hand
# the model (and the int-keyed class_weight below) real integer class ids.
X_train, y_train = df_feat.loc[train_mask, FEATURES], df_feat.loc[train_mask, "y"].astype(int)
X_valid, y_valid = df_feat.loc[valid_mask, FEATURES], df_feat.loc[valid_mask, "y"].astype(int)

print(f"Train size: {len(X_train)}, Valid size: {len(X_valid)}")

# LightGBM classifier
model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    n_estimators=1000,
    learning_rate=0.03,
    class_weight={0: 1.0, 1: 0.8, 2: 4.5}, # higher weight for class id 2 (TIDAK SEHAT per the original note)
    random_state=42
)

# The validation split drives early stopping and is also the split reported
# below, so the reported scores are tuned on the data they are measured on.
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

# Evaluation
y_pred = model.predict(X_valid)
print("\n--- VALIDATION REPORT ---")
print(f"Macro F1: {f1_score(y_valid, y_pred, average='macro'):.4f}")
# Pass labels and names together so every name stays attached to its own
# class id even when a class does not occur in the validation split.
print(classification_report(
    y_valid, y_pred,
    labels=list(LABEL_MAP.values()),
    target_names=list(LABEL_MAP.keys()),
))

Train size: 13845, Valid size: 1208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8808
[LightGBM] [Info] Number of data points in the train set: 13845, number of used features: 53
[LightGBM] [Info] Start training from score -2.265980
[LightGBM] [Info] Start training from score -0.965046
[LightGBM] [Info] Start training from score -0.662994
Training until validation scores don't improve for 50 rounds


[WinError 2] The system cannot find the file specified
  File "C:\Users\USER\AppData\Roaming\Python\Python312\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Program Files\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Early stopping, best iteration is:
[320]	valid_0's multi_logloss: 0.73406

--- VALIDATION REPORT ---
Macro F1: 0.5076
              precision    recall  f1-score   support

        BAIK       0.56      0.51      0.53       218
      SEDANG       0.76      0.79      0.77       849
 TIDAK SEHAT       0.23      0.21      0.22       141

    accuracy                           0.67      1208
   macro avg       0.52      0.50      0.51      1208
weighted avg       0.66      0.67      0.66      1208



### 4. Generate Submission
Kita memprediksi kategori untuk data yang ada di `sample_submission.csv`.

In [10]:
# Peek at the submission template to see which ids must be predicted.
path = find_file("sample_submission.csv")

if path is None:
    # The message previously named the merged dataset (copy-paste slip).
    raise FileNotFoundError("❌ sample_submission.csv tidak ditemukan")

sub = pd.read_csv(path, na_values=NA_VALUES)
sub.head()

Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


In [13]:
# ===============================
# 4. GENERATE SUBMISSION (FORECASTING)
# ===============================
path = find_file("sample_submission.csv")

if path is None:
    raise FileNotFoundError("❌ sample_submission.csv tidak ditemukan")

sub = pd.read_csv(path, na_values=NA_VALUES)

# Each id is "<date>_<station>": first piece is the date, second the station.
id_parts = sub["id"].str.split("_")
sub["tanggal"] = pd.to_datetime(id_parts.str[0])
sub["lokasi_clean"] = id_parts.str[1]

# Label used when a station has no history before the requested date.
FALLBACK_LABEL = "SEDANG"

# History in chronological order per station, with a fresh positional index so
# that rows can be addressed unambiguously below.
df_hist = df_feat.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

# Split the history per station once instead of re-filtering the whole table
# for every submission row.
hist_by_loc = {loc: grp for loc, grp in df_hist.groupby("lokasi_clean", sort=False)}

basis_rows = []   # df_hist row label used as the feature basis
target_rows = []  # matching row label in `sub`

for idx, loc, tgl in zip(sub.index, sub["lokasi_clean"], sub["tanggal"]):
    hist = hist_by_loc.get(loc)
    if hist is None:
        continue  # unknown station -> fallback label

    # Rows strictly before the prediction date
    past = hist.index[(hist["tanggal"] < tgl).to_numpy()]
    if len(past) == 0:
        continue  # no earlier data -> fallback label

    # The latest earlier row is the feature basis.
    # NOTE(review): that row's lag/rolling values were built with shift(1), so
    # they describe the readings before it; the last reading itself is not
    # used, and every later date of a station shares the same basis row.
    basis_rows.append(past[-1])
    target_rows.append(idx)

sub["category"] = FALLBACK_LABEL

if target_rows:
    # Same feature columns as in training
    X_pred = df_hist.loc[basis_rows, FEATURES].copy()
    X_pred.index = target_rows

    # In training the calendar features describe the row's own date, so here
    # they must describe the prediction date, not the date of the basis row.
    target_dates = sub.loc[target_rows, "tanggal"]
    calendar = {
        "month": target_dates.dt.month,
        "day_of_week": target_dates.dt.dayofweek,
        "is_weekend": target_dates.dt.dayofweek.isin([5, 6]).astype(int),
    }
    for name, values in calendar.items():
        if name in X_pred.columns:
            X_pred[name] = values.to_numpy()

    # One batched predict call instead of one call per row
    pred_nums = model.predict(X_pred)
    sub.loc[target_rows, "category"] = [INV_LABEL_MAP[p] for p in pred_nums]

n_fallback = len(sub) - len(target_rows)
if n_fallback:
    print(f"[WARNING] {n_fallback} rows had no history; used fallback '{FALLBACK_LABEL}'")

# Save the submission
sub[["id", "category"]].to_csv("submission.csv", index=False)

print("✅ submission.csv berhasil dibuat!")


✅ submission.csv berhasil dibuat!
