## Baseline Model

As a naive baseline, we always predict "no cancellation". Because only ~2.64% of flights are cancelled, this trivial rule is already correct for the vast majority of flights, which makes it the reference point that any real model has to beat.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, roc_auc_score, average_precision_score

# Load data (assuming you have a processed dataset with features)
# If you have a processed CSV from feature engineering, load it here:
# df = pd.read_csv("../data/processed_flights.csv")
# Otherwise, load raw data and do minimal processing:
csv_path = "../data/flights_sample_3m.csv"
df = pd.read_csv(csv_path)

# Columns that are only known once a flight has (or has not) operated.
# Keeping them as features would leak the target: a model could read the
# cancellation straight off e.g. a missing departure time instead of
# predicting it from information available before the flight.
# NOTE(review): these names assume the raw on-time-performance schema of this
# CSV -- confirm against df.columns. Names that are absent are skipped.
POST_FLIGHT_COLS = [
    "CANCELLATION_CODE", "DIVERTED",
    "DEP_TIME", "DEP_DELAY", "TAXI_OUT", "WHEELS_OFF",
    "WHEELS_ON", "TAXI_IN", "ARR_TIME", "ARR_DELAY",
    "ELAPSED_TIME", "AIR_TIME",
    "DELAY_DUE_CARRIER", "DELAY_DUE_WEATHER", "DELAY_DUE_NAS",
    "DELAY_DUE_SECURITY", "DELAY_DUE_LATE_AIRCRAFT",
]
leaky_cols = [col for col in POST_FLIGHT_COLS if col in df.columns]
print(f"Dropping {len(leaky_cols)} post-flight column(s) to avoid target leakage: {leaky_cols}")

# Separate features and target
X = df.drop(columns=["CANCELLED", *leaky_cols])  # Features
y = df["CANCELLED"]  # Target

# Split into train and test sets; stratify so both splits keep the same
# (very low) cancellation rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

test_cancel_rate = y_test.mean()
print(f"Training set size: {len(X_train):,}")
print(f"Test set size: {len(X_test):,}")
print(f"\nCancellation rate in test set: {test_cancel_rate:.4f} ({test_cancel_rate*100:.2f}%)")

Training set size: 2,400,000
Test set size: 600,000

Cancellation rate in test set: 0.0264 (2.64%)


In [5]:
# Baseline: a constant "no cancellation" prediction for every test flight
baseline_pred = np.zeros_like(y_test)

# Value counts of the predictions vs. the true labels, for a quick sanity check
predicted_counts = np.unique(baseline_pred, return_counts=True)
actual_counts = np.unique(y_test, return_counts=True)

print(f"Baseline predictions: {predicted_counts}")
print(f"\nActual test labels: {actual_counts}")


Baseline predictions: (array([0.]), array([600000]))

Actual test labels: (array([0., 1.]), array([584172,  15828]))


In [6]:
# Evaluate baseline performance
baseline_accuracy = accuracy_score(y_test, baseline_pred)
# zero_division=0: the baseline never predicts the positive class, so
# precision is 0/0; report it as 0 instead of emitting a warning.
baseline_precision = precision_score(y_test, baseline_pred, zero_division=0)
baseline_recall = recall_score(y_test, baseline_pred, zero_division=0)

# ROC-AUC and PR-AUC are ranking metrics and need a score per example.
# The baseline gives every flight the same score (0.0), so it cannot rank
# anything: ROC-AUC comes out at 0.5 (chance level, not 0.0) and average
# precision equals the positive-class rate.
baseline_proba = baseline_pred.astype(float)  # Convert to float (all 0.0)

# The metric calls can raise ValueError when the metric is undefined for the
# given labels (e.g. y_test holding a single class). In that case record NaN
# rather than 0.0, so an undefined score is not mistaken for a worst-case one.
try:
    baseline_roc_auc = roc_auc_score(y_test, baseline_proba)
except ValueError as e:
    baseline_roc_auc = float("nan")
    print(f"Note: ROC-AUC calculation issue: {e}")

try:
    baseline_pr_auc = average_precision_score(y_test, baseline_proba)
except ValueError as e:
    baseline_pr_auc = float("nan")
    print(f"Note: PR-AUC calculation issue: {e}")

print("Baseline Model Performance:")
print(f"  Accuracy:  {baseline_accuracy:.4f}")
print(f"  Precision: {baseline_precision:.4f}")
print(f"  Recall:    {baseline_recall:.4f}")
print(f"  ROC-AUC:   {baseline_roc_auc:.4f}")
print(f"  PR-AUC:    {baseline_pr_auc:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, baseline_pred))


Baseline Model Performance:
  Accuracy:  0.9736
  Precision: 0.0000
  Recall:    0.0000
  ROC-AUC:   0.5000
  PR-AUC:    0.0264

Confusion Matrix:
[[584172      0]
 [ 15828      0]]


**Interpretation:**
- This baseline predicts **no cancellations** for all flights
- It achieves ~97.36% accuracy (simply because most flights aren't cancelled)
- However, it has **0% recall** (fails to catch any cancellations) and **0% precision**
- **ROC-AUC = 0.5000**: This equals random guessing performance. When all predictions are 0.0, the ROC curve becomes a diagonal line (TPR = FPR), giving AUC = 0.5. This is the baseline for ROC-AUC (worse than random would be < 0.5)
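- **PR-AUC = 0.0264**: This equals the positive class rate (2.64% cancellation rate). When every example receives the same score (here 0.0 for all flights), the precision-recall curve has a single operating point whose precision is the proportion of positive examples, so the average precision collapses to the class prevalence. This represents the baseline for PR-AUC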
- Any real model should beat this baseline by achieving ROC-AUC > 0.5 and PR-AUC > 0.0264
- **Note**: For imbalanced datasets, PR-AUC is often more informative than ROC-AUC, as it focuses on the minority class performance

In [None]:
# Preprocessing: Handle categorical variables with label encoding
from sklearn.preprocessing import LabelEncoder
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any a later model fit may emit -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Code given to test-set categories that never appear in the training data.
# It must not be a valid encoder code (those start at 0); otherwise unseen
# values would be silently merged into the first real category.
UNSEEN_CATEGORY_CODE = -1

# Work on copies so the raw train/test splits stay untouched and this cell
# can be re-run safely.
X_train_processed = X_train.copy()
X_test_processed = X_test.copy()

# Identify categorical columns
categorical_cols = X_train_processed.select_dtypes(include=['object']).columns.tolist()

# Label encode categorical variables. The encoder is fitted on the *unique*
# training values only (same resulting classes_ as fitting on the full
# column, but without sorting millions of strings); both splits are then
# encoded with a plain dictionary lookup.
label_encoders = {}
for col in categorical_cols:
    # astype(str) turns missing values into the literal category 'nan'
    train_values = X_train_processed[col].astype(str)
    test_values = X_test_processed[col].astype(str)

    # Fit on training data only
    le = LabelEncoder()
    le.fit(np.asarray(train_values.unique(), dtype=object))

    # classes_ is sorted, and a class's code is its position in classes_
    category_to_code = {category: code for code, category in enumerate(le.classes_)}

    X_train_processed[col] = train_values.map(category_to_code).astype(int)
    X_test_processed[col] = (
        test_values.map(category_to_code)
        .fillna(UNSEEN_CATEGORY_CODE)
        .astype(int)
    )

    label_encoders[col] = le

# Fill remaining NaN values with the *training* medians (computed once and
# reused for the test set so no test statistics leak in). Columns that are
# entirely NaN in training have no median, so they fall back to 0.
train_medians = X_train_processed.median()
X_train_processed = X_train_processed.fillna(train_medians).fillna(0)
X_test_processed = X_test_processed.fillna(train_medians).fillna(0)

print(f"Processed training shape: {X_train_processed.shape}")
print(f"Processed test shape: {X_test_processed.shape}")
print(f"Categorical columns encoded: {categorical_cols}")


KeyboardInterrupt: 

In [None]:
# Shared scoring helper used by every model section below
def evaluate_model(y_true, y_pred, y_proba, model_name):
    """Score one model's predictions and return the metrics as a dict.

    Parameters:
        y_true: ground-truth labels.
        y_pred: hard class predictions, used for accuracy, precision and recall.
        y_proba: positive-class scores, used for ROC-AUC and PR-AUC.
        model_name: label stored under the 'Model' key.

    Returns:
        dict with the keys 'Model', 'Accuracy', 'Precision', 'Recall',
        'ROC-AUC' and 'PR-AUC'.
    """
    # zero_division=0 reports precision/recall as 0 when the denominator is 0
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_true, y_proba),
        'PR-AUC': average_precision_score(y_true, y_proba),
    }


## 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# class_weight='balanced' reweights the classes to offset the heavy imbalance
lr_params = {
    'class_weight': 'balanced',
    'max_iter': 1000,
    'random_state': 42,
    'n_jobs': -1,
}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train_processed, y_train)

# Hard labels for accuracy/precision/recall, column-1 scores for the AUC metrics
lr_pred = lr_model.predict(X_test_processed)
lr_proba = lr_model.predict_proba(X_test_processed)[:, 1]

# Score on the held-out test split and print every metric except the model name
lr_results = evaluate_model(y_test, lr_pred, lr_proba, 'Logistic Regression')
print("\nLogistic Regression Performance:")
print("\n".join(
    f"  {name}: {score:.4f}"
    for name, score in lr_results.items()
    if name != 'Model'
))

## 2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fitted on the full processed training split. Tree count, depth and the
# minimum split/leaf sizes are capped to keep training time manageable;
# raise n_estimators (or relax the caps) for a more thorough fit.
rf_params = dict(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    max_depth=20,
    min_samples_split=100,
    min_samples_leaf=50,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_processed, y_train)

# Hard labels plus column-1 scores from predict_proba
rf_pred = rf_model.predict(X_test_processed)
rf_proba = rf_model.predict_proba(X_test_processed)[:, 1]

# Score on the held-out test split
rf_results = evaluate_model(y_test, rf_pred, rf_proba, 'Random Forest')
print("\nRandom Forest Performance:")
for name in rf_results:
    if name == 'Model':
        continue
    print(f"  {name}: {rf_results[name]:.4f}")

## 3. XGBoost

In [None]:
# Only the import is guarded: XGBoost is an optional dependency. Training and
# evaluation run in the `else` branch so that an ImportError raised there is
# not misreported as "XGBoost not installed" and the model silently skipped.
try:
    import xgboost as xgb
except ImportError:
    print("XGBoost not installed. Install with: pip install xgboost")
    xgb_results = None  # the comparison cell skips models whose results are None
else:
    # Calculate scale_pos_weight for class imbalance
    # scale_pos_weight = number of negative samples / number of positive samples
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    xgb_model = xgb.XGBClassifier(
        n_estimators=100,  # Reduce for faster training
        max_depth=6,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss',
        tree_method='hist'  # Faster training method
    )

    xgb_model.fit(X_train_processed, y_train)

    # Predictions: hard labels plus column-1 scores from predict_proba
    xgb_pred = xgb_model.predict(X_test_processed)
    xgb_proba = xgb_model.predict_proba(X_test_processed)[:, 1]

    # Evaluate on the held-out test split
    xgb_results = evaluate_model(y_test, xgb_pred, xgb_proba, 'XGBoost')
    print("\nXGBoost Performance:")
    for metric, value in xgb_results.items():
        if metric != 'Model':
            print(f"  {metric}: {value:.4f}")

## 4. LightGBM

In [None]:
# Only the import is guarded: LightGBM is an optional dependency. Training and
# evaluation run in the `else` branch so that an ImportError raised there is
# not misreported as "LightGBM not installed" and the model silently skipped.
try:
    import lightgbm as lgb
except ImportError:
    print("LightGBM not installed. Install with: pip install lightgbm")
    lgb_results = None  # the comparison cell skips models whose results are None
else:
    print("Training LightGBM...")

    # Calculate scale_pos_weight for class imbalance
    # scale_pos_weight = number of negative samples / number of positive samples
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    lgb_model = lgb.LGBMClassifier(
        n_estimators=100,  # Reduce for faster training
        max_depth=6,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_jobs=-1,
        verbose=-1  # Suppress output
    )

    lgb_model.fit(X_train_processed, y_train)

    # Predictions: hard labels plus column-1 scores from predict_proba
    lgb_pred = lgb_model.predict(X_test_processed)
    lgb_proba = lgb_model.predict_proba(X_test_processed)[:, 1]

    # Evaluate on the held-out test split
    lgb_results = evaluate_model(y_test, lgb_pred, lgb_proba, 'LightGBM')
    print("\nLightGBM Performance:")
    for metric, value in lgb_results.items():
        if metric != 'Model':
            print(f"  {metric}: {value:.4f}")


## Model Comparison

In [None]:
# Gather the per-model metric dicts; the optional boosters are included only
# when their cell produced results (they are None if the package is missing)
all_results = [lr_results, rf_results] + [
    optional for optional in (xgb_results, lgb_results) if optional
]

# One row per trained model, indexed by model name
comparison_df = pd.DataFrame(all_results).set_index('Model')

# The always-predict-zero baseline, in the same shape as the model rows
baseline_results = {
    'Model': 'Baseline (All Zeros)',
    'Accuracy': baseline_accuracy,
    'Precision': baseline_precision,
    'Recall': baseline_recall,
    'ROC-AUC': baseline_roc_auc,
    'PR-AUC': baseline_pr_auc
}
baseline_df = pd.DataFrame([baseline_results]).set_index('Model')

# Baseline goes first so every model row can be read against it
comparison_df = pd.concat([baseline_df, comparison_df])

rule = "=" * 70
print("\n" + rule)
print("MODEL COMPARISON")
print(rule)
print(comparison_df.round(4))
print("\n" + rule)

# For each metric, report the row (baseline included) with the highest value
print("\nBest performing models by metric:")
metrics_to_compare = ['Accuracy', 'Precision', 'Recall', 'ROC-AUC', 'PR-AUC']
for metric in metrics_to_compare:
    top_model = comparison_df[metric].idxmax()
    top_score = comparison_df.loc[top_model, metric]
    print(f"  {metric:12s}: {top_model:20s} ({top_score:.4f})")