In [1]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column








In [2]:
path = find_file("merged_cuaca_ndvi_ispu.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,cloud_cover_min (%),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,91.0,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,81.0,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,17.0,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,99.0,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023


In [3]:
df["tanggal"] = pd.to_datetime(df["tanggal"])
df = df.sort_values("tanggal").reset_index(drop=True)


In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["kategori_enc"] = le.fit_transform(df["kategori"])


In [5]:
df["lag_1"] = df["kategori_enc"].shift(1)
df["lag_7"] = df["kategori_enc"].shift(7)
df["lag_3"]  = df["kategori_enc"].shift(3)
df["lag_14"] = df["kategori_enc"].shift(14)
df["rolling_mean_7"]  = df["kategori_enc"].shift(1).rolling(7).mean()
df["rolling_std_7"]   = df["kategori_enc"].shift(1).rolling(7).std()
df["rolling_mean_14"] = df["kategori_enc"].shift(1).rolling(14).mean()



df["rolling_mode_7"] = (
    df["kategori_enc"]
    .shift(1)
    .rolling(7)
    .apply(lambda x: pd.Series(x).mode().iloc[0])
)


df["dayofweek"] = df["tanggal"].dt.dayofweek
df["month"] = df["tanggal"].dt.month


In [6]:
df_model = df.dropna().reset_index(drop=True)


In [7]:
FEATURES = [
    "lag_1", "lag_3", "lag_7", "lag_14",
    "rolling_mode_7",
    "rolling_mean_7", "rolling_mean_14",
    "rolling_std_7",
    "dayofweek", "month"
]


X = df_model[FEATURES]
y = df_model["kategori_enc"]


In [8]:
WINDOWS = {
    "W1": (("2010-01-01", "2022-12-31"), ("2023-01-01", "2023-06-30")),
    "W2": (("2010-01-01", "2023-06-30"), ("2023-07-01", "2023-12-31")),
    "W3": (("2010-01-01", "2023-12-31"), ("2024-01-01", "2024-12-31")),
    "W4": (("2010-01-01", "2024-12-31"), ("2025-01-01", "2025-12-31")), 
}


In [9]:
train_rng, val_rng = WINDOWS["W3"]

train_mask = (df_model["tanggal"] >= train_rng[0]) & (df_model["tanggal"] <= train_rng[1])
val_mask   = (df_model["tanggal"] >= val_rng[0])   & (df_model["tanggal"] <= val_rng[1])

X_train, X_val = X[train_mask], X[val_mask]
y_train, y_val = y[train_mask], y[val_mask]


In [10]:
from lightgbm import LGBMClassifier

model = LGBMClassifier(
    objective="multiclass",
    class_weight="balanced",
    n_estimators=300,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 12216, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


In [11]:
from sklearn.metrics import f1_score

y_pred = model.predict(X_val)
print("Macro-F1:", f1_score(y_val, y_pred, average="macro"))


Macro-F1: 0.3847787976338824


In [12]:
path = find_file("sample_submission.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

sample_submission = pd.read_csv(path, na_values=NA_VALUES)
display(sample_submission.head())

Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


In [13]:
from collections import deque

history = deque(
    df_model["kategori_enc"].iloc[-14:],  # 14 hari terakhir
    maxlen=14
)

last_date = df_model["tanggal"].iloc[-1]
predictions = []
n_test = len(sample_submission)

for i in range(n_test):
    current_date = last_date + pd.Timedelta(days=1)

    hist = list(history)  # ⬅️ INI YANG KAMU KURANG

    X_step = pd.DataFrame([{
        "lag_1": hist[-1],
        "lag_3": hist[-3],
        "lag_7": hist[-7],
        "lag_14": hist[0],

        "rolling_mode_7": pd.Series(hist[-7:]).mode().iloc[0],
        "rolling_mean_7": np.mean(hist[-7:]),
        "rolling_mean_14": np.mean(hist),
        "rolling_std_7": np.std(hist[-7:]),

        "dayofweek": current_date.dayofweek,
        "month": current_date.month,
    }])

    # (boleh pakai predict dulu, nanti bisa upgrade ke proba)
    proba = model.predict_proba(X_step)[0]

# smoothing: tekan prediksi ekstrem
    proba = proba ** 1.2
    proba = proba / proba.sum()

    y_next = np.argmax(proba)
    predictions.append(y_next)


    history.append(y_next)     # ⬅️ BUKAN last_7
    last_date = current_date



In [14]:

submission = pd.DataFrame({
    "id": sample_submission["id"],
    "kategori": le.inverse_transform(predictions)
})

submission.to_csv("submission_final.csv", index=False)
