In [1]:
#Cell 1: Imports & konfigurasi
#Import library dasar
import os, json, time, math
from collections import Counter, defaultdict

#Import library numerik dan manipulasi data
import numpy as np
import pandas as pd

#Import fungsi untuk split data train-test
from sklearn.model_selection import train_test_split

#CONFIG
DATAFILE = "Data.csv"  
DELAY_THRESHOLD = 15   # menit, ambang batas untuk label delay
TOP_K_CARRIER = 20     # jumlah top maskapai (AC) yang dijadikan fitur
TOP_K_AIRPORT = 30     # jumlah top airport (DEPSTN/ARRSTN) yang dijadikan fitur 
TEST_SIZE = 0.2        # proporsi data test
RANDOM_STATE = 42      # seed untuk reproducibility 
SAVE_OUTPUTS = True    # flag untuk simpan hasil output


In [None]:
#Cell 2: Auto-detect & Load CSV file
import os
import pandas as pd

#Cari file CSV apapun di folder kerja
csv_files = [f for f in os.listdir() if f.lower().endswith(".csv")]

#Jika tidak ada file CSV ditemukan, hentikan program
if len(csv_files) == 0:
    raise FileNotFoundError("Tidak ada file CSV ditemukan. Silakan upload dataset terlebih dahulu.")

#Jika hanya satu file CSV ditemukan, gunakan itu
elif len(csv_files) == 1:
    DATAFILE = csv_files[0]

#Jika ada banyak file CSV, tampilkan list dan pilih file pertama
else:
    print("Beberapa file CSV ditemukan, menggunakan file pertama:")
    for i, f in enumerate(csv_files, start=1):
        print(f"{i}. {f}")
    DATAFILE = csv_files[0]

print(f"\n=== FILE TERDETEKSI ===\nMenggunakan file: {DATAFILE}\n")

#Load data
df = pd.read_csv(DATAFILE, low_memory=False)
#Tampilkan baris pertama secara vertikal agar mudah dibaca
print("=== 5 BARIS PERTAMA (VERTIKAL) ===\n")
for col, val in df.iloc[0].items():
    print(f"{col:25} : {val}")

#Info singkat dataset: jumlah kolom, tipe data, memory usage
print("\n=== INFO DATA ===")
print(df.info())



=== FILE TERDETEKSI ===
Menggunakan file: Data.csv

=== 5 BARIS PERTAMA (VERTIKAL) ===

ID                        : train_id_0
DATOP                     : 2016-01-03
FLTID                     : TU 0712 
DEPSTN                    : CMN
ARRSTN                    : TUN
STD                       : 2016-01-03 10:30:00
STA                       : 2016-01-03 12.55.00
STATUS                    : ATA
AC                        : TU 32AIMN
target                    : 260.0

=== INFO DATA ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107833 entries, 0 to 107832
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      107833 non-null  object 
 1   DATOP   107833 non-null  object 
 2   FLTID   107833 non-null  object 
 3   DEPSTN  107833 non-null  object 
 4   ARRSTN  107833 non-null  object 
 5   STD     107833 non-null  object 
 6   STA     107833 non-null  object 
 7   STATUS  107833 non-null  object 
 8   AC      107833 no

In [3]:
#Cell 3: Column summary & quick checks
#Menampilkan daftar kolom
print("=== COLUMN SUMMARY ===")
print("Columns list (compact):", list(df.columns))

#Hitung jumlah null per kolom
print("\nNull counts per column (vertical):")
for c in df.columns:
    print(f" {c:15}: {df[c].isnull().sum()}")

#Tampilkan tipe data tiap kolom
print("\nData types (vertical):")
for c in df.columns:
    print(f" {c:15}: {df[c].dtype}")


=== COLUMN SUMMARY ===
Columns list (compact): ['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS', 'AC', 'target']

Null counts per column (vertical):
 ID             : 0
 DATOP          : 0
 FLTID          : 0
 DEPSTN         : 0
 ARRSTN         : 0
 STD            : 0
 STA            : 0
 STATUS         : 0
 AC             : 0
 target         : 0

Data types (vertical):
 ID             : object
 DATOP          : object
 FLTID          : object
 DEPSTN         : object
 ARRSTN         : object
 STD            : object
 STA            : object
 STATUS         : object
 AC             : object
 target         : float64


In [4]:
#Cell 4: Descriptive statistics for numeric columns
#Pilih kolom numeric
print("=== DESCRIPTIVE STATISTICS FOR NUMERIC COLUMNS ===")
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if len(num_cols) == 0:
    print(" No numeric columns detected.")

# deskriptif statistik: count, mean, std, min, quartile, max
else:
    desc = df[num_cols].describe().T
    for idx, row in desc.iterrows():
        print(f"\nColumn: {idx}")
        for stat_name, stat_val in row.items():
            print(f"  {stat_name:10}: {stat_val}")


=== DESCRIPTIVE STATISTICS FOR NUMERIC COLUMNS ===

Column: target
  count     : 107833.0
  mean      : 48.73301308504818
  std       : 117.13556208555867
  min       : 0.0
  25%       : 0.0
  50%       : 14.0
  75%       : 43.0
  max       : 3451.0


In [8]:
#Cell 5: Prepare target column 'target' and basic distribution

print("=== EXAMINE 'target' COLUMN ===")

#Cek apakah kolom 'target' ada
if 'target' not in df.columns:
    
    #Cari kolom kandidat yang mengandung 'target' atau 'delay'
    alt = [c for c in df.columns if 'target' in c.lower() or 'delay' in c.lower()]
    print("No exact 'target' column. Candidate columns:", alt)
    if alt:
        chosen = alt[0]
        print("Using candidate column:", chosen)
        df['target_raw'] = pd.to_numeric(df[chosen], errors='coerce')
    else:
        print("No candidate column found automatically. Columns list (first 30):")
        for i, c in enumerate(df.columns[:30], start=1):
            print(f" {i}. {c}")
        raise RuntimeError("Column 'target' not found and no candidate detected.")
else:
    df['target_raw'] = pd.to_numeric(df['target'], errors='coerce')

#Hitung jumlah nilai target yang valid
print("Count non-null target:", int(df['target_raw'].notnull().sum()))

#Menampilkan sampai 500 baris
pd.set_option('display.max_rows', 500)

#Tampilkan statistik target
s = df['target_raw'].dropna()
if s.empty:
    print("Target column has no numeric values.")
else:
    print("\nTarget stats:")
    print(f"  min   : {s.min()}")
    print(f"  25%   : {s.quantile(0.25)}")
    print(f"  median: {s.median()}")
    print(f"  mean  : {s.mean()}")
    print(f"  75%   : {s.quantile(0.75)}")
    print(f"  max   : {s.max()}")

    # Tampilkan sample target pertama 40 baris
    print("\nSample target values (first 40):")
    for idx, val in enumerate(s.head(40), start=1):
        print(f"  {idx}. {val}")


=== EXAMINE 'target' COLUMN ===
Count non-null target: 107833

Target stats:
  min   : 0.0
  25%   : 0.0
  median: 14.0
  mean  : 48.73301308504818
  75%   : 43.0
  max   : 3451.0

Sample target values (first 40):
  1. 260.0
  2. 20.0
  3. 0.0
  4. 0.0
  5. 22.0
  6. 53.0
  7. 10.0
  8. 15.0
  9. 16.0
  10. 21.0
  11. 12.0
  12. 18.0
  13. 0.0
  14. 0.0
  15. 18.0
  16. 0.0
  17. 37.0
  18. 149.0
  19. 30.0
  20. 19.0
  21. 27.0
  22. 0.0
  23. 31.0
  24. 10.0
  25. 24.0
  26. 9.0
  27. 0.0
  28. 0.0
  29. 0.0
  30. 0.0
  31. 9.0
  32. 49.0
  33. 0.0
  34. 24.0
  35. 12.0
  36. 7.0
  37. 20.0
  38. 20.0
  39. 125.0
  40. 13.0


In [9]:
#Cell 6: Create classification label 'is_delay' and show distribution

print("=== CREATE CLASSIFICATION LABEL (is_delay) ===")

#Buat label biner: 1 = delay >= DELAY_THRESHOLD, 0 = tidak delay, NaN jika target missing
df['is_delay'] = df['target_raw'].apply(lambda x: 1 if pd.notnull(x) and x >= DELAY_THRESHOLD else (0 if pd.notnull(x) else np.nan))

#Statistik singkat
cnt_total = len(df)
cnt_usable = int(df['is_delay'].notnull().sum())
cnt_delays = int(df['is_delay'].sum()) if df['is_delay'].notnull().any() else 0
print(f"Total rows           : {cnt_total}")
print(f"Rows with target     : {cnt_usable}")
print(f"Rows labeled delayed : {cnt_delays} ({(cnt_delays/cnt_usable*100) if cnt_usable>0 else 0:.2f}% of usable)")

#Tampilkan distribusi label
print("\nValue counts (is_delay):")
vc = df['is_delay'].value_counts(dropna=False)
for k,v in vc.items():
    print(f" is_delay={k} : {v}")


=== CREATE CLASSIFICATION LABEL (is_delay) ===
Total rows           : 107833
Rows with target     : 107833
Rows labeled delayed : 53009 (49.16% of usable)

Value counts (is_delay):
 is_delay=0 : 54824
 is_delay=1 : 53009


In [10]:
#Cell 7: EDA by categorical columns (DEPSTN, ARRSTN, AC, STATUS)
print("=== TOP GROUPS & DELAY RATES ===")

#Kolom kandidat untuk analisis
candidates = ['DEPSTN','ARRSTN','AC','STATUS']
for col in candidates:
    if col in df.columns:
        print(f"\nTop values for {col}:")
        vc = df[col].value_counts().head(20)    #Tampilkan 20 teratas
        for k,v in vc.items():
            mask = (df[col] == k) & (df['is_delay'].notnull())
            if mask.sum() > 0:
                rate = df.loc[mask, 'is_delay'].mean()  #Rata-rata delay
                print(f"  {k:12}: count={v:6}, delay_rate={rate:.3f}")
            else:
                print(f"  {k:12}: count={v:6}")
    else:
        print(f"\nColumn {col} not present.")


=== TOP GROUPS & DELAY RATES ===

Top values for DEPSTN:
  TUN         : count= 42522, delay_rate=0.498
  DJE         : count= 10252, delay_rate=0.237
  ORY         : count=  6755, delay_rate=0.643
  MIR         : count=  5248, delay_rate=0.265
  MRS         : count=  2845, delay_rate=0.568
  LYS         : count=  2358, delay_rate=0.690
  NCE         : count=  2227, delay_rate=0.507
  ALG         : count=  1573, delay_rate=0.701
  MXP         : count=  1534, delay_rate=0.548
  IST         : count=  1442, delay_rate=0.621
  FRA         : count=  1378, delay_rate=0.598
  BRU         : count=  1373, delay_rate=0.530
  CMN         : count=  1349, delay_rate=0.395
  FCO         : count=  1266, delay_rate=0.408
  TOE         : count=  1203, delay_rate=0.203
  CDG         : count=  1153, delay_rate=0.493
  JED         : count=  1138, delay_rate=0.788
  GVA         : count=  1114, delay_rate=0.525
  NBE         : count=   918, delay_rate=0.279
  SFA         : count=   917, delay_rate=0.254

To

In [11]:
#Cell 8: Preprocessing - parse DATOP/STD/STA -> dep_hour, sched_duration, handle categorical top-k
print("=== PREPROCESSING ===")

#Convert DATOP ke datetime
if 'DATOP' in df.columns:
    df['DATOP_dt'] = pd.to_datetime(df['DATOP'], errors='coerce')
else:
    df['DATOP_dt'] = pd.NaT

#Fungsi robust untuk gabungkan tanggal & jam
def parse_dt_with_date(date_series, time_series):
    res = []
    for d,t in zip(date_series, time_series):
        try:
            if pd.isna(d):
                res.append(pd.NaT); continue
            base = pd.to_datetime(d, errors='coerce')
            if pd.isna(base):
                res.append(pd.NaT); continue
            ts = str(t)
            ts_fixed = ts.replace('.',':').strip()
            # if ts_fixed likely includes full datetime
            if len(ts_fixed) > 12 and any(ch in ts_fixed for ch in ['-','/']):
                dt = pd.to_datetime(ts_fixed, errors='coerce')
                res.append(dt)
            else:
                dt_str = base.strftime("%Y-%m-%d") + " " + ts_fixed
                dt = pd.to_datetime(dt_str, errors='coerce')
                res.append(dt)
        except:
            res.append(pd.NaT)
    return pd.Series(res)

#Convert STD/STA ke datetime
if 'STD' in df.columns and 'STA' in df.columns and 'DATOP' in df.columns:
    df['STD_dt'] = parse_dt_with_date(df['DATOP'], df['STD'])
    df['STA_dt'] = parse_dt_with_date(df['DATOP'], df['STA'])
else:
    df['STD_dt'] = pd.NaT; df['STA_dt'] = pd.NaT

#Extract dep_hour dari STD_dt
df['dep_hour'] = df['STD_dt'].dt.hour
#Hitung durasi terjadwal dalam menit
df['sched_dur_min'] = (df['STA_dt'] - df['STD_dt']).dt.total_seconds() / 60.0
df['sched_dur_min'] = df['sched_dur_min'].apply(lambda x: x if pd.notnull(x) and x >= 0 else np.nan)

#Buat feature matrix
feature_df = pd.DataFrame(index=df.index)
feature_df['dep_hour'] = df['dep_hour']
feature_df['sched_dur_min'] = df['sched_dur_min']

#One-hot encoding top-k untuk kolom kategorikal
cat_cols = [c for c in ['DEPSTN','ARRSTN','AC','STATUS'] if c in df.columns]
for c in cat_cols:
    topk = df[c].value_counts().head(TOP_K_CARRIER).index.tolist()
    feature_df[c + "_top"] = df[c].apply(lambda x: x if x in topk else 'OTHER')
    dummies = pd.get_dummies(feature_df[c + "_top"], prefix=c, dummy_na=False)
    feature_df = pd.concat([feature_df, dummies], axis=1)
    feature_df.drop(columns=[c + "_top"], inplace=True)

#Coerce numeric dan isi NaN dengan median
for col in feature_df.columns:
    feature_df[col] = pd.to_numeric(feature_df[col], errors='coerce')
    if feature_df[col].isnull().any():
        med = feature_df[col].median()
        feature_df[col].fillna(med, inplace=True)

#Tampilkan contoh feature matrix
print("Feature matrix sample (first 6 rows, vertical):")
for ridx,row in feature_df.head(6).iterrows():
    print(f"\n-- Row {ridx} --")
    for c,v in row.items():
        print(f"{c:20}: {v}")


=== PREPROCESSING ===
Feature matrix sample (first 6 rows, vertical):

-- Row 0 --
dep_hour            : 10
sched_dur_min       : 145.0
DEPSTN_ALG          : False
DEPSTN_BRU          : False
DEPSTN_CDG          : False
DEPSTN_CMN          : True
DEPSTN_DJE          : False
DEPSTN_FCO          : False
DEPSTN_FRA          : False
DEPSTN_GVA          : False
DEPSTN_IST          : False
DEPSTN_JED          : False
DEPSTN_LYS          : False
DEPSTN_MIR          : False
DEPSTN_MRS          : False
DEPSTN_MXP          : False
DEPSTN_NBE          : False
DEPSTN_NCE          : False
DEPSTN_ORY          : False
DEPSTN_OTHER        : False
DEPSTN_SFA          : False
DEPSTN_TOE          : False
DEPSTN_TUN          : False
ARRSTN_ALG          : False
ARRSTN_BRU          : False
ARRSTN_CDG          : False
ARRSTN_CMN          : False
ARRSTN_DJE          : False
ARRSTN_FCO          : False
ARRSTN_FRA          : False
ARRSTN_GVA          : False
ARRSTN_IST          : False
ARRSTN_JED          : Fal

In [12]:
#Cell 9: Build X,y for regression & classification
print("=== BUILD X, y for REGRESSION & CLASSIFICATION ===")
X_full = feature_df.copy()              #Semua fitur
y_reg_full = df['target_raw'].copy()    #Target regression
y_clf_full = df['is_delay'].copy()      #Target classification

#Tampilkan info
print("Feature count:", X_full.shape[1])
print("Samples with regression target:", int(y_reg_full.notnull().sum()))
print("Samples with classification label:", int(y_clf_full.notnull().sum()))
print("Feature columns (first 40):")
for i,c in enumerate(X_full.columns[:40], start=1):
    print(f" {i}. {c}")


=== BUILD X, y for REGRESSION & CLASSIFICATION ===
Feature count: 70
Samples with regression target: 107833
Samples with classification label: 107833
Feature columns (first 40):
 1. dep_hour
 2. sched_dur_min
 3. DEPSTN_ALG
 4. DEPSTN_BRU
 5. DEPSTN_CDG
 6. DEPSTN_CMN
 7. DEPSTN_DJE
 8. DEPSTN_FCO
 9. DEPSTN_FRA
 10. DEPSTN_GVA
 11. DEPSTN_IST
 12. DEPSTN_JED
 13. DEPSTN_LYS
 14. DEPSTN_MIR
 15. DEPSTN_MRS
 16. DEPSTN_MXP
 17. DEPSTN_NBE
 18. DEPSTN_NCE
 19. DEPSTN_ORY
 20. DEPSTN_OTHER
 21. DEPSTN_SFA
 22. DEPSTN_TOE
 23. DEPSTN_TUN
 24. ARRSTN_ALG
 25. ARRSTN_BRU
 26. ARRSTN_CDG
 27. ARRSTN_CMN
 28. ARRSTN_DJE
 29. ARRSTN_FCO
 30. ARRSTN_FRA
 31. ARRSTN_GVA
 32. ARRSTN_IST
 33. ARRSTN_JED
 34. ARRSTN_LYS
 35. ARRSTN_MIR
 36. ARRSTN_MRS
 37. ARRSTN_MXP
 38. ARRSTN_NBE
 39. ARRSTN_NCE
 40. ARRSTN_ORY


In [14]:
#Cell 10: Train-test split for both tasks (preserve indices)
print("=== TRAIN-TEST SPLIT ===")
#Regression split
reg_mask = y_reg_full.notnull()
X_reg = X_full[reg_mask].values.astype(np.float64)
y_reg = y_reg_full[reg_mask].values.astype(np.float64)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=TEST_SIZE, random_state=RANDOM_STATE)

#Classification split
clf_mask = y_clf_full.notnull()
X_clf = X_full[clf_mask].values.astype(np.float64)
y_clf = y_clf_full[clf_mask].values.astype(np.int64)
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(X_clf, y_clf, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_clf if len(np.unique(y_clf))>1 else None)

print("Regression shapes:", X_reg_train.shape, X_reg_test.shape)
print("Classification shapes:", X_clf_train.shape, X_clf_test.shape)


=== TRAIN-TEST SPLIT ===
Regression shapes: (86266, 70) (21567, 70)
Classification shapes: (86266, 70) (21567, 70)


In [15]:
#Cell 11: Linear Regression (Normal Equation) & metrics
print("=== REGRESSION: NORMAL EQUATION ===")

#Fungsi tambah bias column (intercept)
def add_bias(X):
    return np.hstack([np.ones((X.shape[0],1), dtype=np.float64), X])

#Normal equation
if X_reg_train.shape[0] == 0:
    print("No regression training rows available.")
else:
    Xb = add_bias(X_reg_train)
    lambda_reg = 1e-6   #Regularization kecil
    XtX = Xb.T.dot(Xb) + lambda_reg * np.eye(Xb.shape[1])
    XtY = Xb.T.dot(y_reg_train)
    theta = np.linalg.pinv(XtX).dot(XtY)
    Xb_test = add_bias(X_reg_test)
    y_pred_reg = Xb_test.dot(theta)

    #Definisi metrics
    def regression_metrics(y_true, y_pred):
        n = len(y_true)
        mse = np.mean((y_true - y_pred)**2)
        rmse = math.sqrt(mse)
        mae = np.mean(np.abs(y_true - y_pred))
        ss_res = np.sum((y_true - y_pred)**2)
        ss_tot = np.sum((y_true - np.mean(y_true))**2)
        r2 = 1 - ss_res/ss_tot if ss_tot>0 else 0.0
        return {'mse':mse, 'rmse':rmse, 'mae':mae, 'r2':r2}

    metrics_reg_ne = regression_metrics(y_reg_test, y_pred_reg)
    print("Regression (Normal Eq) metrics:")
    for k,v in metrics_reg_ne.items():
        print(f" {k:6}: {v:.6f}")


=== REGRESSION: NORMAL EQUATION ===
Regression (Normal Eq) metrics:
 mse   : 13237.069596
 rmse  : 115.052465
 mae   : 54.558030
 r2    : 0.044021


In [16]:
#Cell 12: Linear Regression (Gradient Descent) & metrics
print("=== REGRESSION: GRADIENT DESCENT ===")
if X_reg_train.shape[0] == 0:
    print("No regression rows to train GD.")
else:
    def train_linear_gd(X, y, lr=0.0005, epochs=2000, reg=1e-6, verbose=False):
        m, n = X.shape
        w = np.zeros(n, dtype=np.float64)
        b = 0.0
        for ep in range(1, epochs+1):
            y_pred = X.dot(w) + b
            error = y_pred - y
            gw = (X.T.dot(error)) / m + reg * w
            gb = np.mean(error)
            w -= lr * gw
            b -= lr * gb
            if verbose and (ep==1 or ep%max(1, epochs//10)==0):
                loss = np.mean(error**2)/2 + 0.5*reg*np.sum(w*w)
                print(f" Epoch {ep}/{epochs} loss:{loss:.6f}")
        return w, b

    w_gd, b_gd = train_linear_gd(X_reg_train, y_reg_train, lr=0.0005, epochs=2000, reg=1e-6, verbose=False)
    y_pred_reg_gd = X_reg_test.dot(w_gd) + b_gd
    metrics_reg_gd = regression_metrics(y_reg_test, y_pred_reg_gd)
    print("Regression (GD) metrics:")
    for k,v in metrics_reg_gd.items():
        print(f" {k:6}: {v:.6f}")


=== REGRESSION: GRADIENT DESCENT ===


  gw = (X.T.dot(error)) / m + reg * w
  gw = (X.T.dot(error)) / m + reg * w
  w -= lr * gw


Regression (GD) metrics:
 mse   : nan
 rmse  : nan
 mae   : nan
 r2    : nan


In [17]:
#Cell 13: Logistic Regression manual for classification & metrics (vertical)
print("=== LOGISTIC REGRESSION (MANUAL) ===")
def sigmoid(z):
    z = np.asarray(z, dtype=np.float64)
    return 1.0 / (1.0 + np.exp(-z))

def train_logistic(X, y, lr=1.0, epochs=200, reg=0.01, verbose=True):
    m, n = X.shape
    w = np.zeros(n, dtype=np.float64)
    b = 0.0
    for ep in range(1, epochs+1):
        z = X.dot(w) + b
        preds = sigmoid(z)
        diff = preds - y
        dw = (X.T.dot(diff)) / m + reg * w
        db = np.mean(diff)
        w -= lr * dw
        b -= lr * db
        if verbose and (ep==1 or ep%max(1, epochs//10)==0):
            eps = 1e-12
            loss = - np.mean(y*np.log(preds+eps) + (1-y)*np.log(1-preds+eps)) + 0.5*reg*np.sum(w*w)
            print(f" Epoch {ep}/{epochs} loss:{loss:.6f}")
    return w, b

#Training logistic regression
if X_clf_train.shape[0] == 0:
    print("No classification training rows available.")
else:
    w_log, b_log = train_logistic(X_clf_train, y_clf_train, lr=1.0, epochs=200, reg=0.01, verbose=True)
    def predict_proba(X, w, b):
        return sigmoid(X.dot(w) + b)
    def predict_labels(X, w, b, thr=0.5):
        probs = predict_proba(X, w, b)
        return (probs >= thr).astype(int), probs
    y_clf_pred, y_clf_prob = predict_labels(X_clf_test, w_log, b_log, thr=0.5)

    def conf_matrix(y_true, y_pred):
        tp = int(((y_true==1) & (y_pred==1)).sum())
        tn = int(((y_true==0) & (y_pred==0)).sum())
        fp = int(((y_true==0) & (y_pred==1)).sum())
        fn = int(((y_true==1) & (y_pred==0)).sum())
        return tp, fp, tn, fn

    def classification_metrics(y_true, y_pred):
        tp, fp, tn, fn = conf_matrix(y_true, y_pred)
        accuracy = (tp+tn) / (tp+tn+fp+fn) if (tp+tn+fp+fn)>0 else 0
        precision = tp / (tp+fp) if (tp+fp)>0 else 0
        recall = tp / (tp+fn) if (tp+fn)>0 else 0
        f1 = 2*precision*recall/(precision+recall) if (precision+recall)>0 else 0
        return {'accuracy':accuracy,'precision':precision,'recall':recall,'f1':f1,'tp':tp,'fp':fp,'tn':tn,'fn':fn}

    metrics_clf = classification_metrics(y_clf_test, y_clf_pred)
    print("Classification metrics (threshold=0.5):")
    for k,v in metrics_clf.items():
        print(f" {k:10}: {v}")


=== LOGISTIC REGRESSION (MANUAL) ===
 Epoch 1/200 loss:11.897639


  return 1.0 / (1.0 + np.exp(-z))


 Epoch 20/200 loss:81.154907
 Epoch 40/200 loss:55.168619
 Epoch 60/200 loss:77.655897
 Epoch 80/200 loss:101.936542
 Epoch 100/200 loss:106.541188
 Epoch 120/200 loss:128.891501
 Epoch 140/200 loss:137.768570
 Epoch 160/200 loss:131.814197
 Epoch 180/200 loss:130.314720
 Epoch 200/200 loss:101.982776
Classification metrics (threshold=0.5):
 accuracy  : 0.4915843650020865
 precision : 0.4915843650020865
 recall    : 1.0
 f1        : 0.6591438962976779
 tp        : 10602
 fp        : 10965
 tn        : 0
 fn        : 0


In [18]:
#Cell 14: ROC AUC manual & threshold sweep (vertical)
print("=== ROC AUC & THRESHOLD SWEEP ===")
def auc_manual(y_true, y_score):
    idx = np.argsort(-y_score)
    y_sorted = y_true[idx]
    tp=0; fp=0
    P = int((y_true==1).sum()); N = int((y_true==0).sum())
    tps=[]; fps=[]
    for v in y_sorted:
        if v==1: tp+=1
        else: fp+=1
        tps.append(tp); fps.append(fp)
    auc = 0.0
    prev_fp=0; prev_tp=0
    for f,t in zip(fps,tps):
        x1 = prev_fp / N if N>0 else 0; x2 = f / N if N>0 else 0
        y1 = prev_tp / P if P>0 else 0; y2 = t / P if P>0 else 0
        auc += (x2-x1) * (y1+y2) / 2
        prev_fp=f; prev_tp=t
    return auc

#Hitung ROC AUC
if X_clf_train.shape[0] == 0:
    print("No classification rows for ROC.")
else:
    roc_auc_clf = auc_manual(y_clf_test, y_clf_prob)
    print(f"ROC AUC (manual): {roc_auc_clf:.6f}")
    
    #Sweep threshold untuk F1
    best_thr = 0.5; best_f1 = -1
    for thr in np.linspace(0.1,0.9,17):
        y_pred_thr = (y_clf_prob >= thr).astype(int)
        m = classification_metrics(y_clf_test, y_pred_thr)
        print(f" thr {thr:.2f} -> acc {m['accuracy']:.4f}, prec {m['precision']:.4f}, rec {m['recall']:.4f}, f1 {m['f1']:.4f}")
        if m['f1'] > best_f1:
            best_f1 = m['f1']; best_thr = thr
    print(f"Best threshold by F1: {best_thr:.2f} (F1={best_f1:.4f})")


=== ROC AUC & THRESHOLD SWEEP ===
ROC AUC (manual): 0.496753
 thr 0.10 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.15 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.20 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.25 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.30 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.35 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.40 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.45 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.50 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.55 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.60 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.65 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.70 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.75 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.80 -> acc 0.4916, prec 0.4916, rec 1.0000, f1 0.6591
 thr 0.85 -> acc 0.4916, prec 0.4916, r

In [None]:
#Cell 15: Error analysis (FP/FN)
print("=== ERROR ANALYSIS (CLASSIFICATION) ===")
if X_clf_train.shape[0] == 0:
    print("No classification predictions to analyze.")
else:
    fp_list = []; fn_list = []
    for i,(pred, true, prob) in enumerate(zip(y_clf_pred, y_clf_test, y_clf_prob)):
        if pred==1 and true==0 and len(fp_list)<10:
            fp_list.append((i, prob))
        if pred==0 and true==1 and len(fn_list)<10:
            fn_list.append((i, prob))
    print("\nFalse Positives (index_in_test, prob):")
    for idx,prob in fp_list:
        print(f" FP idx_in_test={idx} prob={prob:.4f}")
    print("\nFalse Negatives (index_in_test, prob):")
    for idx,prob in fn_list:
        print(f" FN idx_in_test={idx} prob={prob:.4f}")


=== ERROR ANALYSIS (CLASSIFICATION) ===

False Positives (index_in_test, prob):
 FP idx_in_test=0 prob=1.0000
 FP idx_in_test=2 prob=1.0000
 FP idx_in_test=4 prob=1.0000
 FP idx_in_test=5 prob=1.0000
 FP idx_in_test=7 prob=1.0000
 FP idx_in_test=9 prob=1.0000
 FP idx_in_test=11 prob=1.0000
 FP idx_in_test=12 prob=1.0000
 FP idx_in_test=18 prob=1.0000
 FP idx_in_test=19 prob=1.0000

False Negatives (index_in_test, prob):


In [None]:
#Cell 16: Save outputs (models & predictions)
print("=== SAVING OUTPUTS ===")
if SAVE_OUTPUTS:
    out_dir = "flights_output"
    os.makedirs(out_dir, exist_ok=True)

    #Simpan metadata model
    try:
        meta = {}
        if 'theta' in globals():
            meta['reg_normal_theta'] = theta.tolist()
        if 'w_gd' in globals():
            meta['reg_gd_weights'] = w_gd.tolist(); meta['reg_gd_bias'] = float(b_gd)
        if 'w_log' in globals():
            meta['logreg_weights'] = w_log.tolist(); meta['logreg_bias'] = float(b_log)
        meta['feature_names'] = X_full.columns.tolist()
        with open(os.path.join(out_dir, "models_meta.json"), "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)
    except Exception as e:
        print("Warning saving model meta:", e)
    #Simpan metrics
    try:
        metrics_all = {}
        if 'metrics_reg_ne' in globals(): metrics_all['reg_normal'] = metrics_reg_ne
        if 'metrics_reg_gd' in globals(): metrics_all['reg_gd'] = metrics_reg_gd
        if 'metrics_clf' in globals(): metrics_all['clf'] = metrics_clf
        if 'roc_auc_clf' in globals(): metrics_all['roc_auc'] = float(roc_auc_clf)
        if 'best_thr' in globals(): metrics_all['best_threshold_by_f1'] = float(best_thr)
        with open(os.path.join(out_dir, "metrics.json"), "w", encoding="utf-8") as f:
            json.dump(metrics_all, f, indent=2)
    except Exception as e:
        print("Warning saving metrics:", e)
    #Simpan predictions
    try:
        if 'y_reg_test' in globals():
            pred_reg_df = pd.DataFrame({'y_true': y_reg_test, 'y_pred_ne': y_pred_reg if 'y_pred_reg' in globals() else None, 'y_pred_gd': y_pred_reg_gd if 'y_pred_reg_gd' in globals() else None})
            pred_reg_df.to_csv(os.path.join(out_dir, "predictions_regression.csv"), index=False)
        if 'y_clf_test' in globals():
            pred_clf_df = pd.DataFrame({'y_true': y_clf_test, 'y_prob': y_clf_prob if 'y_clf_prob' in globals() else None, 'y_pred': y_clf_pred if 'y_clf_pred' in globals() else None})
            pred_clf_df.to_csv(os.path.join(out_dir, "predictions_classification.csv"), index=False)
        print("Saved outputs to", out_dir)
    except Exception as e:
        print("Warning saving predictions:", e)
else:
    print("SAVE_OUTPUTS is False -> skipping save.")


=== SAVING OUTPUTS ===
Saved outputs to flights_output


In [None]:
#Cell 17: Final Summary
print("=== FINAL SUMMARY ===")
print(f"Dataset total rows         : {len(df)}")
print(f"Rows used for regression   : {X_reg.shape[0] if 'X_reg' in globals() else 0}")
print(f"Rows used for classification: {X_clf.shape[0] if 'X_clf' in globals() else 0}")

#Ringkasan evaluasi regression
if 'metrics_reg_ne' in globals():
    print("\nRegression (Normal Eq) metrics:")
    for k,v in metrics_reg_ne.items():
        print(f" {k:6}: {v:.6f}")
if 'metrics_reg_gd' in globals():
    print("\nRegression (GD) metrics:")
    for k,v in metrics_reg_gd.items():
        print(f" {k:6}: {v:.6f}")
#Ringkasan evaluasi classification
if 'metrics_clf' in globals():
    print("\nClassification metrics:")
    for k,v in metrics_clf.items():
        print(f" {k:9}: {v}")
if 'roc_auc_clf' in globals():
    print(f"\nROC AUC (clf): {roc_auc_clf:.6f}")
if 'best_thr' in globals():
    print(f"Best threshold by F1: {best_thr:.2f} (F1={best_f1:.4f})")
#Lokasi penyimpanan output
if SAVE_OUTPUTS:
    print("\nSaved outputs folder:", "flights_output")


=== FINAL SUMMARY ===
Dataset total rows         : 107833
Rows used for regression   : 107833
Rows used for classification: 107833

Regression (Normal Eq) metrics:
 mse   : 13237.069596
 rmse  : 115.052465
 mae   : 54.558030
 r2    : 0.044021

Regression (GD) metrics:
 mse   : nan
 rmse  : nan
 mae   : nan
 r2    : nan

Classification metrics:
 accuracy : 0.4915843650020865
 precision: 0.4915843650020865
 recall   : 1.0
 f1       : 0.6591438962976779
 tp       : 10602
 fp       : 10965
 tn       : 0
 fn       : 0

ROC AUC (clf): 0.496753
Best threshold by F1: 0.10 (F1=0.6591)

Saved outputs folder: flights_output
