In [None]:

import os

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Print the working directory before touching the project-local modules:
# they are imported by bare name, so a wrong working directory is the first
# thing to rule out if the imports or the data load fail.
print("Current working directory:", os.getcwd())

from load_data import load_preprocessed_data
from evaluation import evaluate_model


# --- Load data -------------------------------------------------------------
X, y = load_preprocessed_data()
print("Data loaded successfully!")
print("X shape:", X.shape)
print("y shape:", y.shape)
# Class proportions: useful as the baseline for accuracy / PR AUC below.
print("Target distribution:\n", y.value_counts(normalize=True))


# --- Encode categorical features -------------------------------------------
# Every object-typed column is replaced by integer codes. Values are cast to
# str first so mixed-type / missing entries do not break the encoder.
# NOTE(review): LabelEncoder assigns arbitrary integer codes, and GaussianNB
# will treat those codes as continuous values — confirm this is intended.
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))

print("\n All categorical features encoded successfully!")


# --- Train / test split ----------------------------------------------------
# stratify=y keeps the delayed / not-delayed ratio identical in both splits;
# without it the imbalanced target can be distributed unevenly, which skews
# the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# --- Fit baseline model ----------------------------------------------------
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)


# --- Evaluate on the held-out test set -------------------------------------
metrics = evaluate_model(nb_model, X_test, y_test)


print("\n=== Model Evaluation ===")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"ROC AUC: {metrics['roc_auc']:.4f}")
print(f"PR AUC: {metrics['pr_auc']:.4f}")


Current working directory: /Users/info/Desktop/FlightDelay_ML_project/src/classification
📂 Loading data from: ../../data/preprocessed_flight_data.csv
Data loaded successfully!
X shape: (6965266, 12)
y shape: (6965266,)
Target distribution:
 is_arr_delayed
False    0.637916
True     0.362084
Name: proportion, dtype: float64

✅ All categorical features encoded successfully!

=== Model Evaluation ===
Accuracy: 0.6456
ROC AUC: 0.6268
PR AUC: 0.4679


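Accuracy (0.65) – the model correctly predicts delay status about 65% of the time. Note that always predicting "not delayed" would already score about 0.64 (the majority-class share), so accuracy alone shows only a marginal gain.
ROC AUC (0.63) – a bit above random guessing (0.5), showing the model does capture some patterns.
PR AUC (0.47) – decent, considering class imbalance (fewer delayed flights): a random classifier would score about 0.36, the share of delayed flights.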


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform
import warnings

# NOTE(review): this silences every warning for the rest of the kernel
# session, which can hide real problems. Consider narrowing it to the
# specific category/module that is noisy.
warnings.filterwarnings("ignore")

# Estimator to tune; GaussianNB exposes a single hyperparameter.
nb_clf = GaussianNB()

# var_smoothing is sampled on a log scale between 1e-10 and 1e-2.
param_dist_nb = {
    'var_smoothing': loguniform(1e-10, 1e-2)
}

# Stratified folds keep the class ratio stable across the 3 CV splits.
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


# 20 random draws x 3 folds = 60 fits, scored by ROC AUC, run in parallel.
random_search_nb = RandomizedSearchCV(
    estimator=nb_clf,
    param_distributions=param_dist_nb,
    n_iter=20,
    scoring='roc_auc',
    cv=kfold,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

# The search only ever sees the training split; the test split stays held out.
random_search_nb.fit(X_train, y_train)

print("Best parameters for Naive Bayes:", random_search_nb.best_params_)
print("Best ROC AUC (CV):", random_search_nb.best_score_)

# Evaluate on test set
best_nb = random_search_nb.best_estimator_
metrics = evaluate_model(best_nb, X_test, y_test)

# Report the held-out metrics in the same format as the baseline cell so the
# tuned model can be compared against it directly (previously the result was
# computed but never displayed).
print("\n=== Tuned Model Evaluation (test set) ===")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"ROC AUC: {metrics['roc_auc']:.4f}")
print(f"PR AUC: {metrics['pr_auc']:.4f}")


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END ................var_smoothing=9.915644566638397e-08; total time=  11.1s
[CV] END ................var_smoothing=9.915644566638397e-08; total time=  11.5s
[CV] END .................var_smoothing=0.004033800832600388; total time=  13.2s
[CV] END .................var_smoothing=0.004033800832600388; total time=  13.3s
[CV] END ................var_smoothing=7.177141927992012e-05; total time=  13.3s
[CV] END .................var_smoothing=0.004033800832600388; total time=  13.4s
[CV] END ................var_smoothing=7.177141927992012e-05; total time=  13.5s
[CV] END ................var_smoothing=9.915644566638397e-08; total time=  13.6s
[CV] END ................var_smoothing=7.177141927992012e-05; total time=   6.7s
[CV] END ................var_smoothing=6.155564318973019e-06; total time=   7.2s
[CV] END ...............var_smoothing=1.7707168643537793e-09; total time=   8.6s
[CV] END ................var_smoothing=6.1555643