In [1]:

import os
# Show where the notebook is running from; the recorded run loads the data
# via a relative path, so the working directory matters.
cwd = os.getcwd()
print("Current working directory:", cwd)

from load_data import load_preprocessed_data
from evaluation import evaluate_model
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# 1. Load data
# load_preprocessed_data() returns the feature table and the target together.
X, y = load_preprocessed_data()
print("Data loaded successfully!")

# Report the dimensions of both objects as a quick sanity check.
feature_shape = X.shape
print("X shape:", feature_shape)
target_shape = y.shape
print("y shape:", target_shape)

# normalize=True yields relative class frequencies rather than raw counts.
class_share = y.value_counts(normalize=True)
print("Target distribution:\n", class_share)

# 2. Encode categorical columns
# GaussianNB needs a fully numeric feature matrix, so every text-like column
# is replaced by integer codes. Columns are picked with select_dtypes so that
# pandas `category` columns are handled as well as plain `object` columns
# (an `== 'object'` test alone would let `category` columns through).
#
# The fitted encoders are kept in `label_encoders` (column name -> encoder) so
# the mapping can be inverted for interpretation or reapplied to new data.
#
# NOTE(review): this overwrites the columns of X in place. If
# load_preprocessed_data() returns a slice of a larger frame, take a copy of X
# first - confirm against the loader.
# NOTE(review): LabelEncoder is documented for target labels; the codes follow
# the sorted order of the string values, and GaussianNB will treat them as
# continuous quantities. Consider OrdinalEncoder / one-hot encoding or a
# categorical Naive Bayes variant for nominal features.
label_encoders = {}
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) gives one uniform type per column; missing values are
    # encoded through their string form instead of raising.
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

print("\n✅ All categorical features encoded successfully!")

# 3. Split data
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# 4. Train Naive Bayes model
# fit() returns the estimator itself, so construction and training can be
# chained into a single expression.
nb_model = GaussianNB().fit(X_train, y_train)

# 5. Evaluate model
# evaluate_model() scores the fitted model on the held-out rows; the result is
# indexed below by the keys 'accuracy', 'roc_auc' and 'pr_auc'.
metrics = evaluate_model(nb_model, X_test, y_test)

# 6. Display results
# Each metric is looked up and printed in turn, rounded to four decimals.
# NOTE(review): compare accuracy against the majority-class share printed in
# step 1 before reading it as model skill.
print("\n=== Model Evaluation ===")
accuracy = metrics['accuracy']
print(f"Accuracy: {accuracy:.4f}")
roc_auc = metrics['roc_auc']
print(f"ROC AUC: {roc_auc:.4f}")
pr_auc = metrics['pr_auc']
print(f"PR AUC: {pr_auc:.4f}")


Current working directory: /home/adam/university/sem5/machine_learning/Project/src/classification
📂 Loading data from: ../../data/preprocessed_flight_data.csv
Data loaded successfully!
X shape: (6965266, 12)
y shape: (6965266,)
Target distribution:
 is_arr_delayed
False    0.637916
True     0.362084
Name: proportion, dtype: float64

✅ All categorical features encoded successfully!

=== Model Evaluation ===
Accuracy: 0.6456
ROC AUC: 0.6268
PR AUC: 0.4679
