## Stage 3: Model Training, Cost-Sensitive Learning, and Evaluation

## Objective
Implement a robust ML pipeline using ColumnTransformer, apply
cost-sensitive techniques to handle the class imbalance (18.49% delay rate),
and justify the optimal model based on the operational metric, RECALL.

### 1. Configuration and Library Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve
import numpy as np
import matplotlib.pyplot as plt
import joblib

### 2. Load Data and Define Features

In [2]:
try:
    df = pd.read_csv('../data/interim/02_feature_engineered_data.csv', parse_dates=['fecha_i', 'fecha_o'])
    print(f"Data loaded from interim with {len(df)} rows.")
except FileNotFoundError:
    raise SystemExit("Data loading failed. Cannot proceed to Stage 3.")

Data loaded from interim with 68206 rows.


  df = pd.read_csv('../data/interim/02_feature_engineered_data.csv', parse_dates=['fecha_i', 'fecha_o'])


### 3. Data Split and Preprocessing Pipeline

In [3]:
FEATURES = ['month', 'dianom', 'tipovuelo', 'opera', 'siglades', 'period_day', 
            'tavg', 'tavg_is_missing', 
            'opera_historical_delay_rate', 'dest_historical_delay_rate'] 
TARGET = 'delay_15'

# Drop rows where 'opera_historical_delay_rate' might be NaN (due to lack of historical data)
df_model = df.dropna(subset=FEATURES).copy() 
X = df_model[FEATURES]
y = df_model[TARGET]

# Stratified Split: Ensures the 18.49% delay rate is maintained in both sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nTraining set size: {len(X_train)} | Test set size: {len(X_test)}")


Training set size: 54564 | Test set size: 13642


### 3.1 Preprocessing Pipeline (ColumnTransformer)

In [5]:
categorical_features = ['dianom', 'tipovuelo', 'opera', 'siglades', 'period_day']
numeric_features = ['month', 'tavg', 'tavg_is_missing', 'opera_historical_delay_rate', 'dest_historical_delay_rate']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features), # Scale numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features) # OHE categorical features
    ],
    remainder='passthrough'
)

# Apply the preprocessor and save it for deployment 
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
joblib.dump(preprocessor, '../models/preprocessor_final.pkl')
print("\nPreprocessor fitted, transformed data, and saved for deployment.")


Preprocessor fitted, transformed data, and saved for deployment.


### 4. Model Training with Cost-Sensitive Learning


- Function to calculate scale_pos_weight for tree models (XGBoost)
- Weight = (Count of Negative Samples / Count of Positive Samples)

In [6]:
scale_pos_weight = (y_train.value_counts()[0] / y_train.value_counts()[1])
print(f"Calculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

models = {
    # Use class_weight='balanced' to address class imbalance for linear/non-parametric models
    "Logistic_Regression": LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced'),
    "Random_Forest": RandomForestClassifier(random_state=42, class_weight='balanced'),
    
    # Use scale_pos_weight for XGBoost
    "XGBoost": XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_estimators=100
    )
}

# Train all models
y_probas = {}
for name, model in models.items():
    model.fit(X_train_processed, y_train)
    y_probas[name] = model.predict_proba(X_test_processed)[:, 1]
    print(f"Trained {name}.")

Calculated scale_pos_weight for XGBoost: 4.41
Trained Logistic_Regression.
Trained Random_Forest.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trained XGBoost.


### 5. Evaluation

In [7]:
def evaluate_model(y_test, y_proba, model_name, threshold=0.5):
    y_pred = (y_proba > threshold).astype(int)
    print(f"\n--- Model: {model_name} (Threshold: {threshold:.2f}) ---")
    
    # Senior Rationale: For alerting, Recall is primary.
    print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))

# Evaluate models with cost-sensitive settings at default threshold (0.5)
for name, y_proba in y_probas.items():
    evaluate_model(y_test, y_proba, name)



--- Model: Logistic_Regression (Threshold: 0.50) ---
ROC AUC: 0.6535
              precision    recall  f1-score   support

           0       0.87      0.64      0.74     11119
           1       0.27      0.58      0.37      2523

    accuracy                           0.63     13642
   macro avg       0.57      0.61      0.55     13642
weighted avg       0.76      0.63      0.67     13642


--- Model: Random_Forest (Threshold: 0.50) ---
ROC AUC: 0.6353
              precision    recall  f1-score   support

           0       0.83      0.90      0.87     11119
           1       0.32      0.20      0.25      2523

    accuracy                           0.77     13642
   macro avg       0.58      0.55      0.56     13642
weighted avg       0.74      0.77      0.75     13642


--- Model: XGBoost (Threshold: 0.50) ---
ROC AUC: 0.6914
              precision    recall  f1-score   support

           0       0.88      0.69      0.78     11119
           1       0.31      0.59      0.40  

#### 5.1 Threshold Tuning (Operational Decision-Making)

In [8]:
# Plot Precision-Recall curve to find the optimal operational point.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probas['Logistic_Regression'])

# We aim for an operationally acceptable Recall (e.g., 65%) with the best possible Precision.
# Plotting helps, but we can programmatically find a point or use a justified value.
# Let's use a lower threshold (0.35) often needed for high recall in imbalanced datasets.
optimal_threshold_lr = 0.35

evaluate_model(y_test, y_probas['Logistic_Regression'], "Logistic Regression (Tuned for High Recall)", threshold=optimal_threshold_lr)


--- Model: Logistic Regression (Tuned for High Recall) (Threshold: 0.35) ---
ROC AUC: 0.6535
              precision    recall  f1-score   support

           0       0.92      0.22      0.35     11119
           1       0.21      0.91      0.34      2523

    accuracy                           0.35     13642
   macro avg       0.56      0.56      0.35     13642
weighted avg       0.79      0.35      0.35     13642



### 5.2 Final Model Selection

In [11]:
# Create and train the Voting Classifier ensemble for robustness.
final_model = VotingClassifier(
    estimators=[(n, models[n]) for n in models.keys()], 
    voting='soft', 
    weights=[1, 1, 1]
)
final_model.fit(X_train_processed, y_train)
final_y_proba = final_model.predict_proba(X_test_processed)[:, 1]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [12]:
# Evaluate the final robust ensemble
evaluate_model(y_test, final_y_proba, "Final Voting Classifier Ensemble")
joblib.dump(final_model, '../models/voting_classifier_final.pkl')
joblib.dump(models['XGBoost'], '../models/xgb_final.pkl')

print("\nFinal Voting Classifier saved to /models/voting_classifier_final.pkl.")
print("XGBoost model saved separately for Feature Importance analysis in Stage 4.")


--- Model: Final Voting Classifier Ensemble (Threshold: 0.50) ---
ROC AUC: 0.6839
              precision    recall  f1-score   support

           0       0.85      0.86      0.86     11119
           1       0.36      0.34      0.35      2523

    accuracy                           0.76     13642
   macro avg       0.60      0.60      0.60     13642
weighted avg       0.76      0.76      0.76     13642


Final Voting Classifier saved to /models/voting_classifier_final.pkl.
XGBoost model saved separately for Feature Importance analysis in Stage 4.
