In [27]:
import pandas as pd
import numpy as np

In [28]:
d = pd.read_csv("d1.csv")
t = pd.read_csv("d2.csv")

In [31]:
display(d.info())
display(t.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tanggal              127 non-null    object 
 1   jam                  117 non-null    object 
 2   waktu                0 non-null      float64
 3   fase                 125 non-null    object 
 4   lokasi_perimeter     125 non-null    object 
 5   titik                49 non-null     float64
 6   kategori_kejadian    127 non-null    object 
 7   airline              127 non-null    object 
 8   runway_use           100 non-null    float64
 9   komponen_pesawat     59 non-null     object 
 10  dampak_pada_pesawat  29 non-null     object 
 11  kondisi_kerusakan    68 non-null     object 
 12  tindakan_perbaikan   39 non-null     object 
 13  sumber_informasi     113 non-null    object 
 14  remark               127 non-null    object 
 15  deskripsi            112 non-null    obj

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30181 entries, 0 to 30180
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   no                    30181 non-null  int64  
 1   act_type              30158 non-null  object 
 2   reg_no                30152 non-null  object 
 3   opr                   30181 non-null  object 
 4   flight_number_origin  30144 non-null  object 
 5   flight_number_dest    30175 non-null  object 
 6   ata                   30181 non-null  object 
 7   block_on              30181 non-null  object 
 8   block_off             30181 non-null  object 
 9   atd                   30181 non-null  object 
 10  ground_time           30138 non-null  object 
 11  org                   30144 non-null  object 
 12  des                   30175 non-null  object 
 13  ps                    30144 non-null  object 
 14  runway                30147 non-null  float64
 15  avio_a             

None

#### <b>preprocess

In [45]:
# ===== Fill kolom 'cuaca' di DataFrame `a` pakai open-meteo (client openmeteo_requests) =====
# - Pakai cache & retry (requests_cache + retry_requests)
# - Otomatis pilih endpoint Historical vs Forecast (batas ~3 bulan dari hari ini)
# - Rounding menit: < :30 turun, >= :30 naik (max 23)
# - Map WMO weather_code → deskripsi cuaca (ID)
#
# Prasyarat:
#   pip install openmeteo-requests requests-cache retry-requests

import pandas as pd
import numpy as np
import openmeteo_requests
import requests_cache
from retry_requests import retry
from datetime import datetime, timedelta, timezone

# ---------- KONFIG ----------
# Koordinat tetap untuk semua baris (ganti sesuai lokasi kamu).
# Kalau kamu sudah punya kolom latitude/longitude per baris, tinggal loop per (lat,lon) unik.
LAT_VAL = -7.38
LON_VAL = 112.7851

COL_TANGGAL = "tanggal"   # di `a`, bisa dalam format apapun yang bisa di-parse pandas
COL_JAM     = "jam"       # "HH:MM"
COL_CUACA   = "cuaca"     # akan ditulis oleh skrip ini

# ---------- WMO → Deskripsi ID ----------
WMO_DESC_ID = {
    0:"Cerah", 1:"Cerah Berawan", 2:"Berawan Sebagian", 3:"Berawan",
    45:"Berkabut", 48:"Rime Kabut",
    51:"Gerimis Ringan", 53:"Gerimis Sedang", 55:"Gerimis Lebat",
    56:"Gerimis Beku Ringan", 57:"Gerimis Beku Lebat",
    61:"Hujan Ringan", 63:"Hujan Sedang", 65:"Hujan Lebat",
    66:"Hujan Beku Ringan", 67:"Hujan Beku Lebat",
    71:"Salju Ringan", 73:"Salju Sedang", 75:"Salju Lebat",
    77:"Butiran Salju",
    80:"Hujan Gerimis", 81:"Hujan Lebat Sesaat", 82:"Hujan Sangat Lebat Sesaat",
    85:"Hujan Salju Ringan", 86:"Hujan Salju Lebat",
    95:"Badai Petir", 96:"Badai Petir + Hujan Es Ringan", 99:"Badai Petir + Hujan Es Lebat",
}

def weather_code_to_desc(code) -> str:
    try:
        return WMO_DESC_ID.get(int(code), "")
    except Exception:
        return ""

def round_hour_from_hhmm(hhmm: str) -> int:
    """Parse 'HH:MM' → nearest hour; >= :30 naik 1 jam; clamp 0..23."""
    if not isinstance(hhmm, str) or ":" not in hhmm:
        return 0
    hs, ms = hhmm.split(":", 1)
    try: h = int(hs)
    except: h = 0
    try: m = int(ms)
    except: m = 0
    h = max(0, min(23, h)); m = max(0, min(59, m))
    return h if m < 30 else min(23, h + 1)

def pick_base_url(date_ymd: str) -> str:
    """Pilih endpoint Open-Meteo berdasarkan apakah tanggal < ~3 bulan lalu."""
    try:
        dt = datetime.strptime(date_ymd, "%Y-%m-%d").replace(tzinfo=timezone.utc)
    except Exception:
        return "https://api.open-meteo.com/v1/forecast"
    today = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
    # buffer ~90 hari
    three_months_ago = today - timedelta(days=90)
    if dt < three_months_ago:
        return "https://historical-forecast-api.open-meteo.com/v1/forecast"
    return "https://api.open-meteo.com/v1/forecast"

# ---------- Setup Open-Meteo client (cache + retry) ----------
cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# ---------- Normalisasi tanggal & jam di DataFrame `a` ----------
# Pastikan kolom cuaca ada
if COL_CUACA not in a.columns:
    a[COL_CUACA] = ""

# Parse tanggal → YYYY-MM-DD
a["_tanggal_dt"] = pd.to_datetime(a[COL_TANGGAL], errors="coerce")
a["_tanggal_ymd"] = a["_tanggal_dt"].dt.strftime("%Y-%m-%d")

# Jam sebagai string aman
a["_jam_str"] = a[COL_JAM].astype(str).fillna("00:00")

# Baris yang perlu diisi cuaca
need_mask = a[COL_CUACA].astype(str).eq("") & a["_tanggal_ymd"].notna() & a["_jam_str"].notna()
need_df = a.loc[need_mask, ["_tanggal_ymd", "_jam_str"]].copy()

if not need_df.empty:
    # Kumpulkan tanggal unik yang perlu diambil
    unique_dates = sorted(set(need_df["_tanggal_ymd"]))

    # Kelompokkan tanggal berdasarkan endpoint (historical vs forecast)
    by_base = {}
    for d_ymd in unique_dates:
        base = pick_base_url(d_ymd)
        by_base.setdefault(base, []).append(d_ymd)

    # Kumpulan mapping "YYYY-MM-DDTHH" → code untuk semua tanggal yang diminta
    # (jam lokal sesuai timezone=auto dari Open-Meteo)
    code_map = {}

    for base_url, dates in by_base.items():
        start_date = min(dates)
        end_date   = max(dates)

        params = {
            "latitude": LAT_VAL,
            "longitude": LON_VAL,
            "start_date": start_date,
            "end_date": end_date,
            "hourly": "weather_code",
            "timezone": "auto",   # buat jam lokal sesuai lokasi
        }

        # Panggil API
        try:
            responses = openmeteo.weather_api(base_url, params=params)
        except Exception as e:
            # gagal panggil API untuk range ini; lanjutkan tanggal lain
            # (baris-barismu akan tetap kosong cuacanya)
            # print("Open-Meteo error:", e)
            continue

        # Ambil lokasi pertama (sesuai contoh lib)
        response = responses[0]
        hourly = response.Hourly()

        # Waktu mulai/akhir (epoch detik, sudah memperhitungkan timezone=auto)
        t0 = pd.to_datetime(hourly.Time(), unit="s")
        t1 = pd.to_datetime(hourly.TimeEnd(), unit="s")
        dt_seconds = int(hourly.Interval())

        # DataFrame hourly
        hourly_times = pd.date_range(
            start=t0, end=t1, freq=pd.Timedelta(seconds=dt_seconds), inclusive="left"
        )
        hourly_codes = hourly.Variables(0).ValuesAsNumpy()

        df_hourly = pd.DataFrame({
            "time": hourly_times,          # tz-aware / lokal sesuai timezone=auto
            "weather_code": hourly_codes,
        })

        # Buat key "YYYY-MM-DDTHH" untuk lookup cepat
        # (tanpa menit supaya bisa dicocokkan dengan hasil pembulatan jam)
        keys = df_hourly["time"].dt.strftime("%Y-%m-%dT%H")
        vals = df_hourly["weather_code"].astype("Int64")
        code_map.update(dict(zip(keys, vals)))

    # Setelah semua tanggal diambil, isi cuaca baris per baris
    def row_weather_desc(row):
        date_ymd = row["_tanggal_ymd"]
        jam      = row["_jam_str"]
        if not isinstance(date_ymd, str) or not isinstance(jam, str):
            return ""
        hr = round_hour_from_hhmm(jam)
        key = f"{date_ymd}T{str(hr).zfill(2)}"
        code = code_map.get(key, None)
        return weather_code_to_desc(code) if code is not None and pd.notna(code) else ""

    a.loc[need_mask, COL_CUACA] = a.loc[need_mask].apply(row_weather_desc, axis=1)

# ---------- Bereskan kolom helper ----------
a.drop(columns=["_tanggal_dt", "_tanggal_ymd", "_jam_str"], errors="ignore", inplace=True)

print("Baris yang diisi cuaca:", need_mask.sum())
a.head()


Baris yang diisi cuaca: 48


Unnamed: 0,tanggal,jam,waktu,cuaca,jumlah burung pada titik x,titik,fase,strike,latitude,longitude
0,2025-01-06T00:00:00.000Z,1970-01-01T18:17:00.000Z,Sore,,,1.0,Landing,1,-7.38,112.7851
1,2025-02-09T00:00:00.000Z,1970-01-01T16:10:00.000Z,Sore,,,8.0,Landing,1,-7.38,112.7851
2,2025-02-21T00:00:00.000Z,1970-01-01T10:18:00.000Z,Siang,,,2.0,Take Off,1,-7.38,112.7851
3,2025-01-01T00:00:00.000Z,1970-01-01T06:33:00.000Z,Pagi,,,3.0,Landing,1,-7.38,112.7851
4,2025-02-25T00:00:00.000Z,1970-01-01T12:55:00.000Z,Siang,,,7.0,Landing,1,-7.38,112.7851


In [44]:
pip install retry_requests

Collecting retry_requests
  Obtaining dependency information for retry_requests from https://files.pythonhosted.org/packages/b1/f3/8ce908497bebbc2790ef06240a2c0fb28c096abb59062d88f85090464a5f/retry_requests-2.0.0-py3-none-any.whl.metadata
  Downloading retry_requests-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Downloading retry_requests-2.0.0-py3-none-any.whl (15 kB)
Installing collected packages: retry_requests
Successfully installed retry_requests-2.0.0
Note: you may need to restart the kernel to use updated packages.


#### <b>modeling

In [None]:
# CatBoostClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# === 1. Pilih fitur & target ===
features = ["jumlah burung pada titik x", "titik", "hour", "dayofweek", "is_weekend", "waktu_Dini Hari", "waktu_Malam", "waktu_Pagi", "waktu_Siang", "waktu_Sore", "cuaca_Cerah Berawan", "cuaca_Hujan", "cuaca_Mendung", "fase_Take Off"]
target = "strike"

# Use the already processed X and y
# X = df_model[features]
# y = df_model[target]

# === 2. Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# === 3. Identifikasi kolom kategori ===
# Since we used one-hot encoding, there are no categorical features in X_train
# If you want to use CatBoost's internal categorical handling, you would need to adjust preprocessing.
# For now, we will treat all features as numerical as they are already one-hot encoded or numerical.
cat_features = [] # Assuming X is already preprocessed with one-hot encoding

# === 4. Hitung scale_pos_weight ===
neg, pos = y_train.value_counts()
scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)

# === 5. Definisikan CatBoost ===
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    eval_metric="AUC",
    random_seed=42,
    verbose=200,
    # cat_features=cat_features, # Remove this if treating all as numerical after one-hot
    scale_pos_weight=scale_pos_weight
)

# === 6. Training ===
# Create CatBoost Pool if using internal categorical handling, otherwise fit directly
# train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
# test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

# cat_model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=100)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100)


# === 7. Prediksi & Evaluasi ===
y_prob = cat_model.predict_proba(X_test)[:, 1]
y_pred = cat_model.predict(X_test)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

In [None]:
# CatBoostClassifier with treshold tuning
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# ambil probabilitas kelas 1 dari CatBoost
y_prob = cat_model.predict_proba(X_test)[:, 1]

# coba beberapa threshold
thresholds = [0.5, 0.3, 0.2, 0.1] # dicoba semakin besar (0,9)

for thr in thresholds:
    print(f"\n===== Threshold: {thr} =====")
    y_pred_thr = (y_prob >= thr).astype(int)

    cm = confusion_matrix(y_test, y_pred_thr)
    print("Confusion Matrix:\n", cm)

    print(classification_report(y_test, y_pred_thr, digits=4))

    roc_auc = roc_auc_score(y_test, y_prob)
    print("ROC-AUC:", roc_auc)

In [None]:
# EasyEnsembleClassifier (boosting khusus imbalance) & CatBoostClassifier (stabil di data imbalance)
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import EasyEnsembleClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Model base
easy = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)

cat = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,  # pakai imbalance ratio
    verbose=0,
    random_state=42
)

# Meta-model (level-2)
meta = LogisticRegression(max_iter=1000, class_weight="balanced")

# Stacking Ensemble
stack_model = StackingClassifier(
    estimators=[('easy', easy), ('cat', cat)],
    final_estimator=meta,
    cv=5,
    n_jobs=-1,
    passthrough=True  # biar meta-model juga dapat input fitur asli
)

# Training
stack_model.fit(X_train, y_train)

# Evaluasi
y_pred = stack_model.predict(X_test)
y_prob = stack_model.predict_proba(X_test)[:, 1]

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

# Makin jelek jangan dipake
# DO NOT USE THIS