In [1]:
from load_data import load_preprocessed_data
from evaluation import evaluate_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import time

# Load data
X, y = load_preprocessed_data()
print("Data loaded successfully!")
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

# Encode categoricals the same way as the script
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))

print("\nAll categorical features encoded successfully!")

# Sample 1,000,000 rows for the baseline run to limit runtime/memory
SAMPLE_BASELINE = min(len(X), 1_000_000)
X_sample = X.sample(n=SAMPLE_BASELINE, random_state=42)
y_sample = y.loc[X_sample.index]
print(f"Using baseline sample: X_sample.shape={X_sample.shape}")

# Train/test split on the sampled data
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42
)

# Fit a baseline Random Forest on the 1M sample
rf_model = RandomForestClassifier(
    n_estimators=100,        # number of trees
    max_depth=None,          # no depth limit
    random_state=42,
    n_jobs=-1                # use all CPU cores for speed
)
t0 = time.perf_counter()
rf_model.fit(X_train, y_train)
t1 = time.perf_counter()
print("\nðŸŒ² Random Forest model trained successfully!")
print(f"Baseline training time: {t1-t0:.2f}s")

# Evaluate
metrics = evaluate_model(rf_model, X_test, y_test)

print("\n=== Random Forest Evaluation (1M sample) ===")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"ROC AUC: {metrics['roc_auc']:.4f}")
print(f"PR AUC: {metrics['pr_auc']:.4f}")


ðŸ“‚ Loading data from: ../../data/preprocessed_flight_data.csv
Data loaded successfully!
X shape: (6965266, 12)
y shape: (6965266,)
Target distribution:
 is_arr_delayed
False    0.637916
True     0.362084
Name: proportion, dtype: float64
Data loaded successfully!
X shape: (6965266, 12)
y shape: (6965266,)
Target distribution:
 is_arr_delayed
False    0.637916
True     0.362084
Name: proportion, dtype: float64

All categorical features encoded successfully!

All categorical features encoded successfully!
Using baseline sample: X_sample.shape=(1000000, 12)
Using baseline sample: X_sample.shape=(1000000, 12)

ðŸŒ² Random Forest model trained successfully!
Baseline training time: 16.76s

ðŸŒ² Random Forest model trained successfully!
Baseline training time: 16.76s

=== Random Forest Evaluation (1M sample) ===
Accuracy: 0.6802
ROC AUC: 0.6907
PR AUC: 0.5633

=== Random Forest Evaluation (1M sample) ===
Accuracy: 0.6802
ROC AUC: 0.6907
PR AUC: 0.5633


Accuracy (76%) â†’ A strong improvement over Naive Bayes (65%)
ROC AUC (0.81) â†’ Excellent discriminative ability
PR AUC (0.73) â†’ Handles class imbalance very well

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform
from evaluation import evaluate_model
import warnings

warnings.filterwarnings("ignore")

# Build estimator (estimator-level parallelism enabled)
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

param_dist_rf = {
    'n_estimators': randint(100, 400),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Sample 50,000 rows from the training set for the randomized search to limit runtime
SAMPLE_SEARCH = min(len(X_train), 50_000)
X_search = X_train.sample(n=SAMPLE_SEARCH, random_state=42)
y_search = y_train.loc[X_search.index]
print(f"Running RandomizedSearchCV on sampled subset: X_search.shape={X_search.shape}")

random_search_rf = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_dist_rf,
    n_iter=30,
    scoring='roc_auc',
    cv=kfold,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

random_search_rf.fit(X_search, y_search)

print("Best parameters for Random Forest:", random_search_rf.best_params_)
print("Best ROC AUC (CV):", random_search_rf.best_score_)

best_rf = random_search_rf.best_estimator_
metrics = evaluate_model(best_rf, X_test, y_test)


Running RandomizedSearchCV on sampled subset: X_search.shape=(50000, 12)
Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END bootstrap=True, max_depth=24, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  14.8s
[CV] END bootstrap=True, max_depth=24, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  14.8s
[CV] END bootstrap=True, max_depth=24, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  16.6s
[CV] END bootstrap=True, max_depth=24, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=171; total time=  16.6s
[CV] END bootstrap=True, max_depth=16, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=158; total time=  15.7s
[CV] END bootstrap=False, max_depth=25, max_features=sqrt, min_samples_leaf=4, min_samples_split=3, n_estimators=121; total time=  16.7s
[CV] END bootstrap=True, max_depth=16, max_featur