In [16]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

# ======================================================
# A) Load + sample data (300 rows per month-folder)
# ======================================================

base_dir = os.getcwd()
sampled_dfs = []

for folder in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder)

    # Skip if not a directory or not a month folder
    if not os.path.isdir(folder_path) or " " not in folder:
        continue

    csv_path = os.path.join(folder_path, "T_ONTIME_REPORTING.csv")

    if os.path.exists(csv_path):
        try:
            df_month = pd.read_csv(csv_path)

            sample_size = min(300, len(df_month))
            sampled = df_month.sample(n=sample_size, random_state=42)

            sampled_dfs.append(sampled)

        except Exception as e:
            print(f"Error reading {csv_path}: {e}")

if sampled_dfs:
    df = pd.concat(sampled_dfs, ignore_index=True)
    print("Loaded combined df shape:", df.shape)
    print(df.head())
else:
    raise RuntimeError("No data sampled. Check folder structure / file names.")

    


Loaded combined df shape: (1200, 24)
   YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK                FL_DATE  \
0  2025        2      4             6            7   4/6/2025 12:00:00 AM   
1  2025        2      4            18            5  4/18/2025 12:00:00 AM   
2  2025        2      4            21            1  4/21/2025 12:00:00 AM   
3  2025        2      4            25            5  4/25/2025 12:00:00 AM   
4  2025        2      4            25            5  4/25/2025 12:00:00 AM   

   OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID ORIGIN ORIGIN_STATE_NM  ...  \
0             5683.0              12478    JFK        New York  ...   
1             2923.0              11537    ELM        New York  ...   
2              781.0              12478    JFK        New York  ...   
3             2307.0              11292    DEN        Colorado  ...   
4             1114.0              11298    DFW           Texas  ...   

   DEST_CITY_NAME  DEST_STATE_NM  CRS_DEP_TIME DEP_TIME CRS_ARR_TIME ARR_

In [18]:
# ======================================================
# B) Time conversion helpers + delay features
# ======================================================

def hhmm_to_minutes(x):
    """Convert HHMM (e.g. 1325) -> minutes since midnight."""
    try:
        x = int(float(str(x).strip()))
        h, m = x // 100, x % 100
        if m >= 60:
            return np.nan
        return h * 60 + m
    except:
        return np.nan

# Actual vs scheduled arrival delay
df["ARR_TIME_MIN"] = df["ARR_TIME"].apply(hhmm_to_minutes)
df["CRS_ARR_TIME_MIN"] = df["CRS_ARR_TIME"].apply(hhmm_to_minutes)
df["ARR_DELAY_MIN"] = df["ARR_TIME_MIN"] - df["CRS_ARR_TIME_MIN"]

# Actual vs scheduled departure delay
df["DEP_TIME_MIN"] = df["DEP_TIME"].apply(hhmm_to_minutes)
df["CRS_DEP_TIME_MIN"] = df["CRS_DEP_TIME"].apply(hhmm_to_minutes)
df["DEP_DELAY_MIN"] = df["DEP_TIME_MIN"] - df["CRS_DEP_TIME_MIN"]

In [19]:
# ======================================================
# C) Drop columns you don't want
# ======================================================

cols_to_drop = [
    "FL_DATE", "ARR_DEL15", "FLIGHTS", "DEST_CITY_NAME", "ORIGIN_STATE_NM",
    "DEST_AIRPORT_SEQ_ID", "DEST_CITY_MARKET_ID", "ARR_TIME_BLK"
]
df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])




In [20]:
# ======================================================
# D) Holiday + cyclical scheduled departure time features
# ======================================================

df["Holiday?"] = (
    ((df["MONTH"] == 12) & (df["DAY_OF_MONTH"] >= 20)) |  # Dec 20–31
    ((df["MONTH"] == 1) & (df["DAY_OF_MONTH"] <= 5))   |  # Jan 1–5
    (df["MONTH"].isin([7, 8]))                         # July & August
).astype(int)

# Encode scheduled departure time (CRS_DEP_TIME_MIN) as cyclical features
df["DEP_TIME_sin"] = np.sin(2 * np.pi * df["CRS_DEP_TIME_MIN"] / 1440)
df["DEP_TIME_cos"] = np.cos(2 * np.pi * df["CRS_DEP_TIME_MIN"] / 1440)

# Optional route feature (kept because you created it)
df["ROUTE"] = df["ORIGIN"].astype(str) + "_" + df["DEST"].astype(str)

In [21]:
# ======================================================
# E) Classification target
#    Here: departure delay >= 15 minutes
#    (If you want ARR delay instead, change DEP_DELAY_MIN -> ARR_DELAY_MIN)
# ======================================================

DELAY_THRESHOLD_MIN = 15
df = df.dropna(subset=["DEP_DELAY_MIN"]).copy()
df["DELAYED_15"] = (df["DEP_DELAY_MIN"] >= DELAY_THRESHOLD_MIN).astype(int)

y = df["DELAYED_15"]


In [23]:
# ======================================================
# F) Build features (with OneHot for categorical)
# ======================================================

# Numeric features
numeric_features = [
    "MONTH",
    "DAY_OF_WEEK",
    "DAY_OF_MONTH",
    "DISTANCE",
    "DEP_TIME_sin",
    "DEP_TIME_cos",
    "Holiday?"
]

# Categorical features to one-hot encode
cat_features = ["ORIGIN", "DEST", "OP_CARRIER_FL_NUM"]

# Drop rows with missing feature values (important before encoding)
df = df.dropna(subset=numeric_features + cat_features + ["DELAYED_15"]).copy()
y = df["DELAYED_15"]

# One-hot encode categorical columns
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

X_cat = encoder.fit_transform(df[cat_features])
X_cat = pd.DataFrame(
    X_cat,
    columns=encoder.get_feature_names_out(cat_features),
    index=df.index
)

# Combine numeric + categorical
X = pd.concat([df[numeric_features], X_cat], axis=1)

print("Final feature matrix:", X.shape)
print("Class balance:", y.value_counts(normalize=True).round(3))


Final feature matrix: (1178, 1308)
Class balance: DELAYED_15
0    0.787
1    0.213
Name: proportion, dtype: float64


In [24]:
# ======================================================
# G) Train/test split (stratified)
# ======================================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [25]:
# ======================================================
# H) Train Random Forest Classifier
# ======================================================

rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

rf_model.fit(X_train, y_train)


In [26]:
# ======================================================
# I) Evaluate with classification metrics
# ======================================================

y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)[:, 1]

print("\n=== Classification Metrics (DELAYED_15) ===")
print(f"Accuracy:           {accuracy_score(y_test, y_pred):.3f}")
print(f"Balanced Accuracy:  {balanced_accuracy_score(y_test, y_pred):.3f}")
print(f"Precision (delay):  {precision_score(y_test, y_pred, zero_division=0):.3f}")
print(f"Recall (delay):     {recall_score(y_test, y_pred, zero_division=0):.3f}")
print(f"F1-score:           {f1_score(y_test, y_pred, zero_division=0):.3f}")
print(f"ROC-AUC:            {roc_auc_score(y_test, y_proba):.3f}")

print("\nConfusion Matrix [ [TN FP], [FN TP] ]:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(
    y_test, y_pred,
    target_names=["On-time (<15m)", "Delayed (>=15m)"],
    digits=3
))









=== Classification Metrics (DELAYED_15) ===
Accuracy:           0.686
Balanced Accuracy:  0.567
Precision (delay):  0.300
Recall (delay):     0.360
F1-score:           0.327
ROC-AUC:            0.574

Confusion Matrix [ [TN FP], [FN TP] ]:
[[144  42]
 [ 32  18]]

Classification Report:
                 precision    recall  f1-score   support

 On-time (<15m)      0.818     0.774     0.796       186
Delayed (>=15m)      0.300     0.360     0.327        50

       accuracy                          0.686       236
      macro avg      0.559     0.567     0.561       236
   weighted avg      0.708     0.686     0.696       236



Mean Squared Error (MSE): 893.13
Root Mean Squared Error (RMSE): 29.89
Mean Absolute Error (MAE): 14.06
R² Score: 0.60


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
