In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")


In [2]:
# Location of the input CSV. It defaults to the original local copy, but can be
# overridden with the ISPU_CSV_PATH environment variable so the notebook can be
# re-run on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "ISPU_CSV_PATH",
    r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\ispu_all_years_duplicate_handled.csv",
)

# The listed placeholder strings are parsed as missing values (NaN).
df = pd.read_csv(
    DATA_PATH,
    na_values=["---", "--", "", " ", "NA", "N/A"]
)

# Parse the date column and order rows chronologically within each station.
df['tanggal'] = pd.to_datetime(df['tanggal'])
df = df.sort_values(['stasiun', 'tanggal']).reset_index(drop=True)

# Keep only rows whose category is one of the recognised labels; rows with a
# missing or unexpected category are dropped.
labels = ['BAIK', 'SEDANG', 'TIDAK SEHAT', 'SANGAT TIDAK SEHAT', 'BERBAHAYA']
df = df[df['kategori'].isin(labels)].copy()

# Measurement columns used as model inputs.
features = [
    'pm_sepuluh',
    'pm_duakomalima',
    'sulfur_dioksida',
    'karbon_monoksida',
    'ozon',
    'nitrogen_dioksida'
]

# Negative readings are floored at 0; NaN values are left as NaN here.
df[features] = df[features].clip(lower=0)

# Encode the station name as an integer code so it can be used as a feature.
le = LabelEncoder()
df['stasiun_code'] = le.fit_transform(df['stasiun'])

print("Distribusi label asli:")
print(df['kategori'].value_counts())


Distribusi label asli:
kategori
SEDANG                10345
TIDAK SEHAT            2424
BAIK                   2286
SANGAT TIDAK SEHAT      203
BERBAHAYA                 1
Name: count, dtype: int64


In [3]:
# Collapse the categories into a binary target: 'BAIK' and 'SEDANG' map to
# 'AMAN'; every other category maps to 'TIDAK AMAN'.
df['label_risk'] = (
    df['kategori']
    .isin(['BAIK', 'SEDANG'])
    .map({True: 'AMAN', False: 'TIDAK AMAN'})
)

print("\nDistribusi label risk:")
print(df['label_risk'].value_counts())



Distribusi label risk:
label_risk
AMAN          12631
TIDAK AMAN     2628
Name: count, dtype: int64


In [4]:
# Walk-forward evaluation windows, one row per window:
# (name, last training date, first test date, last test date).
_window_bounds = [
    ('Window1', '2022-12-31', '2023-01-01', '2023-06-30'),
    ('Window2', '2023-06-30', '2023-07-01', '2023-12-31'),
    ('Window3', '2023-12-31', '2024-01-01', '2024-12-31'),
    ('Window4', '2024-12-31', '2025-01-01', '2025-12-31'),
]

# Mapping of window name -> {'train_end', 'test_start', 'test_end'} date strings.
windows = {
    name: {'train_end': train_end, 'test_start': test_start, 'test_end': test_end}
    for name, train_end, test_start, test_end in _window_bounds
}


In [5]:
# NOTE(review): this cell repeats the `windows` definition from the previous
# cell verbatim; the two can be merged into one. The definition is kept here so
# the cell still runs on its own, and a sanity check is added below.
windows = {
    'Window1': {'train_end': '2022-12-31', 'test_start': '2023-01-01', 'test_end': '2023-06-30'},
    'Window2': {'train_end': '2023-06-30', 'test_start': '2023-07-01', 'test_end': '2023-12-31'},
    'Window3': {'train_end': '2023-12-31', 'test_start': '2024-01-01', 'test_end': '2024-12-31'},
    'Window4': {'train_end': '2024-12-31', 'test_start': '2025-01-01', 'test_end': '2025-12-31'}
}


def _validate_windows(window_spec):
    """Check that every window trains strictly before it tests.

    Parameters
    ----------
    window_spec : dict
        Mapping of window name to a dict with 'train_end', 'test_start' and
        'test_end' date strings.

    Raises
    ------
    ValueError
        If a window does not satisfy train_end < test_start <= test_end, which
        would let test-period rows into the training split.
    """
    for name, bounds in window_spec.items():
        train_end = pd.Timestamp(bounds['train_end'])
        test_start = pd.Timestamp(bounds['test_start'])
        test_end = pd.Timestamp(bounds['test_end'])
        if not (train_end < test_start <= test_end):
            raise ValueError(
                f"{name}: expected train_end < test_start <= test_end, got "
                f"{bounds['train_end']}, {bounds['test_start']}, {bounds['test_end']}"
            )


_validate_windows(windows)


In [6]:
def _fill_missing_with_train_medians(frame, station_medians, feature_cols):
    """Build the model input matrix, imputing gaps from training statistics.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split to prepare (train or test); must contain `feature_cols` and
        'stasiun_code'.
    station_medians : pandas.DataFrame
        Per-station medians of `feature_cols`, indexed by 'stasiun_code' and
        computed on the training split only.
    feature_cols : list of str
        Names of the measurement columns.

    Returns
    -------
    pandas.DataFrame
        `feature_cols` plus 'stasiun_code', with missing values replaced by the
        station's training median, or by 0 when no such median exists.
    """
    design = frame[feature_cols + ['stasiun_code']]
    # reindex (rather than .loc) so that stations absent from the training
    # split yield NaN rows instead of a KeyError; those fall through to 0.
    fill_values = station_medians.reindex(frame['stasiun_code'].to_numpy())
    fill_values.index = frame.index  # align row-for-row with `frame`
    return design.fillna(fill_values).fillna(0)


# Walk-forward evaluation: for each window, fit on everything up to
# 'train_end' and score on the [test_start, test_end] period.
# Maps window name -> {'accuracy': float, 'f1_macro': float}.
#
# NOTE(review): the near-perfect scores suggest the label may be a
# deterministic function of the same-day measurements used as features —
# confirm that this is the intended prediction task.
results = {}

for window_name, dates in windows.items():
    print(f"\n{'='*60}")
    print(window_name.upper())
    print(f"{'='*60}")

    train = df[df['tanggal'] <= dates['train_end']]
    test = df[
        (df['tanggal'] >= dates['test_start']) &
        (df['tanggal'] <= dates['test_end'])
    ]

    print(f"Train: {len(train)} | Test: {len(test)}")

    # Nothing to fit or score on: skip instead of failing inside scikit-learn.
    if train.empty or test.empty:
        print("Skipped: empty train or test split.")
        continue

    # Imputation statistics come from the training split only, so no
    # information from the evaluation period reaches the features.
    station_medians = train.groupby('stasiun_code')[features].median()
    X_train = _fill_missing_with_train_medians(train, station_medians, features)
    X_test = _fill_missing_with_train_medians(test, station_medians, features)

    y_train = train['label_risk']
    y_test = test['label_risk']

    # 'TIDAK AMAN' is the minority class, so its errors are weighted 3x.
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=14,
        min_samples_leaf=5,
        class_weight={'AMAN': 1, 'TIDAK AMAN': 3},
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"Accuracy  : {acc:.3f}")
    print(f"F1-macro  : {f1:.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    results[window_name] = {
        'accuracy': acc,
        'f1_macro': f1
    }



WINDOW1
Train: 10428 | Test: 893
Accuracy  : 1.000
F1-macro  : 1.000

Classification Report:
              precision    recall  f1-score   support

        AMAN       1.00      1.00      1.00       836
  TIDAK AMAN       1.00      1.00      1.00        57

    accuracy                           1.00       893
   macro avg       1.00      1.00      1.00       893
weighted avg       1.00      1.00      1.00       893


WINDOW2
Train: 11321 | Test: 911
Accuracy  : 0.996
F1-macro  : 0.992

Classification Report:
              precision    recall  f1-score   support

        AMAN       1.00      0.99      1.00       758
  TIDAK AMAN       0.97      1.00      0.99       153

    accuracy                           1.00       911
   macro avg       0.99      1.00      0.99       911
weighted avg       1.00      1.00      1.00       911


WINDOW3
Train: 12232 | Test: 1824
Accuracy  : 0.996
F1-macro  : 0.989

Classification Report:
              precision    recall  f1-score   support

        

In [7]:
# Print one line per evaluated window with its accuracy and macro-F1,
# both rounded to three decimals.
print("\nFINAL SUMMARY")
for name, metrics in results.items():
    accuracy, macro_f1 = metrics['accuracy'], metrics['f1_macro']
    print(f"{name}: Accuracy={accuracy:.3f}, F1-macro={macro_f1:.3f}")



FINAL SUMMARY
Window1: Accuracy=1.000, F1-macro=1.000
Window2: Accuracy=0.996, F1-macro=0.992
Window3: Accuracy=0.996, F1-macro=0.989
Window4: Accuracy=0.999, F1-macro=0.998
