# Airline No-Show Optimizer - Model Development

This notebook develops and evaluates machine learning models to predict passenger no-show probability.

In [None]:
# Extend the import search path with '../src' before the custom-module imports
# further down. The path is relative to the current working directory, so this
# presumably expects the notebook to be run from its own folder — TODO confirm.
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, including any raised while fitting models; consider a narrower
# filter (specific category/module) so real problems stay visible.
warnings.filterwarnings('ignore')

# Import custom modules
# (project-local; presumably resolved through the '../src' entry added above)
from data_preprocessing import DataPreprocessor
from feature_engineering import FeatureEngineer
from models import ModelTrainer
from optimization import RevenueOptimizer

# Set plotting style
# NOTE(review): the 'seaborn-v0_8' style name presumably exists only in newer
# Matplotlib releases (3.6+) — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

## 1. Data Loading and Preprocessing

In [None]:
# Loading of the processed EDA output is left disabled in this template;
# enable the two lines below once the CSV exists.
# data_path = '../data/processed/airline_data_processed.csv'
# df = pd.read_csv(data_path)

# Until the data is loaded, tell the reader where the file is expected and how
# to produce it. A single call emits both lines, one per row.
print(
    "Please ensure your processed data is available at '../data/processed/airline_data_processed.csv'",
    "Or run the exploratory analysis notebook first to generate the processed data.",
    sep="\n",
)

In [None]:
# Initialize preprocessor and feature engineer
# Both classes are imported from project-local modules at the top of the
# notebook and are constructed here with no arguments.
preprocessor = DataPreprocessor()
feature_engineer = FeatureEngineer()

# Clean and prepare data
# Disabled in this template: `df` only exists once the `pd.read_csv` line in
# the data-loading cell above has been enabled.
# df_clean = preprocessor.clean_data(df)
# print(f"Data shape after cleaning: {df_clean.shape}")

## 2. Feature Engineering

In [None]:
# Create comprehensive feature set
# df_features = feature_engineer.create_all_features(df_clean)
# print(f"Data shape after feature engineering: {df_features.shape}")

# Display new feature columns
# new_features = set(df_features.columns) - set(df_clean.columns)
# print(f"\nNew features created: {len(new_features)}")
# print(list(new_features)[:20])  # Show first 20 new features

## 3. Feature Selection

In [None]:
# Select most important features
# Disabled in this template: needs `df_features` from the feature-engineering
# cell above.
# NOTE(review): `select_features` is given the target column and the full
# `df_features` frame, and it runs before the train/test split in the next
# section. If the selection scores features against the target, rows that
# later land in the test set influence which features are kept — consider
# selecting on the training split only. Confirm against FeatureEngineer.
# target_column = 'no_show'
# important_features = feature_engineer.select_features(df_features, target_column)
# print(f"Selected {len(important_features)} important features")
# print(important_features)

## 4. Train-Test Split

In [None]:
# Prepare data for modeling
# X = df_features[important_features]
# y = df_features[target_column]

# Split data
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# print(f"Training set size: {X_train.shape}")
# print(f"Test set size: {X_test.shape}")
# print(f"No-show rate in training: {y_train.mean():.3f}")
# print(f"No-show rate in test: {y_test.mean():.3f}")

## 5. Model Training

In [None]:
# Initialize model trainer
# ModelTrainer is imported from the project-local `models` module and is
# constructed with no arguments.
model_trainer = ModelTrainer()

# Train all models
# Disabled in this template: needs X_train / y_train / X_test / y_test from the
# (also commented-out) train-test split cell above.
# models = model_trainer.train_all_models(X_train, y_train, X_test, y_test)

# Display model performance
# The summary below reads `models` as a mapping of model name -> dict with
# 'train_score' and 'test_score' entries.
# performance_summary = pd.DataFrame({
#     'Model': list(models.keys()),
#     'Train_Score': [models[m]['train_score'] for m in models.keys()],
#     'Test_Score': [models[m]['test_score'] for m in models.keys()]
# })
# 
# print("Model Performance Summary:")
# print(performance_summary.round(4))

## 6. Model Evaluation

In [None]:
# Detailed evaluation of best model
# Disabled in this template: needs `performance_summary` and `models` from the
# training cell above.
# NOTE(review): the "best" model is the row with the highest Test_Score, and
# the metrics printed below are computed on that same test set. Selecting and
# reporting on one split makes the reported numbers optimistic; consider
# picking the model on a validation split or cross-validation score instead.
# best_model_name = performance_summary.loc[performance_summary['Test_Score'].idxmax(), 'Model']
# best_model = models[best_model_name]

# print(f"Best model: {best_model_name}")
# NOTE(review): the label assumes 'test_score' is an accuracy — confirm against
# ModelTrainer.
# print(f"Test accuracy: {best_model['test_score']:.4f}")

# # Detailed metrics
# The entries read here are 'predictions' (passed to classification_report)
# and 'probabilities' (passed to roc_auc_score).
# y_pred = best_model['predictions']
# y_pred_proba = best_model['probabilities']
# 
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))
# 
# print(f"\nAUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

In [None]:
# Visualize model performance
# fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# # ROC Curve
# fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
# auc_score = roc_auc_score(y_test, y_pred_proba)
# 
# axes[0,0].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})')
# axes[0,0].plot([0, 1], [0, 1], 'k--')
# axes[0,0].set_xlabel('False Positive Rate')
# axes[0,0].set_ylabel('True Positive Rate')
# axes[0,0].set_title('ROC Curve')
# axes[0,0].legend()

# # Confusion Matrix
# cm = confusion_matrix(y_test, y_pred)
# sns.heatmap(cm, annot=True, fmt='d', ax=axes[0,1], cmap='Blues')
# axes[0,1].set_xlabel('Predicted')
# axes[0,1].set_ylabel('Actual')
# axes[0,1].set_title('Confusion Matrix')

# # Prediction Distribution
# axes[1,0].hist(y_pred_proba[y_test == 0], bins=30, alpha=0.7, label='No Show = 0', density=True)
# axes[1,0].hist(y_pred_proba[y_test == 1], bins=30, alpha=0.7, label='No Show = 1', density=True)
# axes[1,0].set_xlabel('Predicted Probability')
# axes[1,0].set_ylabel('Density')
# axes[1,0].set_title('Prediction Distribution')
# axes[1,0].legend()

# # Feature Importance (if available)
# if 'feature_importance' in best_model:
#     importance = best_model['feature_importance']
#     top_features = dict(sorted(importance.items(), key=lambda x: x[1], reverse=True)[:15])
#     
#     feature_names = list(top_features.keys())
#     feature_values = list(top_features.values())
#     
#     axes[1,1].barh(range(len(feature_names)), feature_values)
#     axes[1,1].set_yticks(range(len(feature_names)))
#     axes[1,1].set_yticklabels(feature_names)
#     axes[1,1].set_xlabel('Importance')
#     axes[1,1].set_title('Top 15 Feature Importances')
# else:
#     axes[1,1].text(0.5, 0.5, 'Feature importance\nnot available', 
#                    ha='center', va='center', transform=axes[1,1].transAxes)
#     axes[1,1].set_title('Feature Importance')

# plt.tight_layout()
# plt.show()

## 7. Hyperparameter Tuning

In [None]:
# Hyperparameter tuning for best model
# print("Performing hyperparameter tuning...")
# tuning_results = model_trainer.hyperparameter_tuning(X_train, y_train, best_model_name)

# print(f"\nBest parameters: {tuning_results['best_params']}")
# print(f"Best cross-validation score: {tuning_results['best_score']:.4f}")

# # Evaluate tuned model
# tuned_model = tuning_results['best_model']
# tuned_predictions = tuned_model.predict(X_test)
# tuned_probabilities = tuned_model.predict_proba(X_test)[:, 1]
# tuned_accuracy = tuned_model.score(X_test, y_test)
# tuned_auc = roc_auc_score(y_test, tuned_probabilities)

# print(f"\nTuned model performance:")
# print(f"Accuracy: {tuned_accuracy:.4f}")
# print(f"AUC: {tuned_auc:.4f}")

## 8. Model Interpretation

In [None]:
# Feature importance analysis
# if 'feature_importance' in best_model:
#     feature_importance = model_trainer.get_feature_importance(best_model_name, top_n=20)
#     
#     importance_df = pd.DataFrame({
#         'Feature': list(feature_importance.keys()),
#         'Importance': list(feature_importance.values())
#     })
#     
#     print("Top 20 Most Important Features:")
#     print(importance_df.round(4))
#     
#     # Visualize feature importance
#     plt.figure(figsize=(10, 8))
#     plt.barh(range(len(importance_df)), importance_df['Importance'])
#     plt.yticks(range(len(importance_df)), importance_df['Feature'])
#     plt.xlabel('Feature Importance')
#     plt.title('Top 20 Feature Importances')
#     plt.gca().invert_yaxis()
#     plt.tight_layout()
#     plt.show()

## 9. Business Impact Analysis

In [None]:
# Analyze model performance across different passenger segments
# Disabled in this template: needs X_test / y_test from the split cell and
# y_pred / y_pred_proba from the evaluation cell above.
# test_data = X_test.copy()
# test_data['actual_no_show'] = y_test
# test_data['predicted_no_show'] = y_pred
# test_data['predicted_probability'] = y_pred_proba

# # Performance by probability ranges
# NOTE(review): with pd.cut's defaults the bins are right-closed and the lowest
# edge is excluded, i.e. (0, 0.1], (0.1, 0.3], (0.3, 0.7], (0.7, 1.0]. A
# predicted probability of exactly 0.0 therefore gets no category (NaN) and is
# dropped from the groupby below; pass include_lowest=True when enabling this.
# test_data['risk_category'] = pd.cut(test_data['predicted_probability'], 
#                                     bins=[0, 0.1, 0.3, 0.7, 1.0],
#                                     labels=['Low Risk', 'Medium Risk', 'High Risk', 'Very High Risk'])

# Per risk bucket: number of passengers, observed no-show rate, and mean
# predicted probability (the flattened column names below follow that order).
# risk_analysis = test_data.groupby('risk_category').agg({
#     'actual_no_show': ['count', 'mean'],
#     'predicted_probability': 'mean'
# }).round(3)

# risk_analysis.columns = ['count', 'actual_no_show_rate', 'predicted_probability']
# print("Model Performance by Risk Category:")
# print(risk_analysis)

## 10. Revenue Optimization

In [None]:
# Initialize revenue optimizer
# RevenueOptimizer is imported from the project-local `optimization` module
# and is constructed with no arguments.
revenue_optimizer = RevenueOptimizer()

# Example optimization for a specific flight
# Disabled in this template: needs `test_data` from the (commented-out)
# business-impact cell above.
# NOTE(review): `sample(150)` has no random_state and the prices come from an
# unseeded np.random.normal, so results differ on every run; sampling without
# replacement also fails if `test_data` has fewer than 150 rows. A normal
# draw with mean 400 and std 150 can occasionally produce negative ticket
# prices — consider clipping or a different distribution.
# flight_capacity = 180
# sample_flight_data = test_data.sample(150).copy()  # Simulate a flight with 150 bookings
# sample_flight_data['no_show_probability'] = sample_flight_data['predicted_probability']
# sample_flight_data['ticket_price'] = np.random.normal(400, 150, len(sample_flight_data))  # Sample prices

# # Optimize overbooking
# optimization_results = revenue_optimizer.optimize_overbooking_rate(
#     sample_flight_data, flight_capacity
# )

# The loop below treats `optimization_results` as a dict and formats float
# values to four decimals; everything else is printed as-is.
# print("Optimization Results:")
# for key, value in optimization_results.items():
#     if isinstance(value, float):
#         print(f"{key}: {value:.4f}")
#     else:
#         print(f"{key}: {value}")

## 11. Model Deployment Preparation

In [None]:
# Save the best model
# Disabled in this template: needs `best_model_name` from the evaluation cell.
# NOTE(review): the '../results/' directory must exist before writing — the
# open() call below will not create it; whether `save_model` does is not
# visible here. Confirm, or create the directory first.
# model_save_path = '../results/best_model.pkl'
# model_trainer.save_model(best_model_name, model_save_path)
# print(f"Model saved to {model_save_path}")

# Save feature list for deployment
# Writes one selected feature name per line.
# NOTE(review): open() is called without an explicit encoding, so the file
# uses the platform default; consider encoding='utf-8' when enabling this.
# feature_list_path = '../results/feature_list.txt'
# with open(feature_list_path, 'w') as f:
#     for feature in important_features:
#         f.write(f"{feature}\n")
# print(f"Feature list saved to {feature_list_path}")

## 12. Key Findings and Recommendations

In [None]:
# Placeholder findings report. Every bracketed value is a literal placeholder
# string, not a computed result; fill them in by hand once the modelling cells
# above have been enabled and run. One call prints the whole report, with each
# argument on its own line and empty strings acting as blank separator lines.
print(
    "=== MODEL PERFORMANCE SUMMARY ===",
    "Best model: [TO BE DETERMINED]",
    "Test accuracy: [TO BE CALCULATED]",
    "AUC score: [TO BE CALCULATED]",
    "",
    "=== KEY FEATURES ===",
    "Most important predictors: [TO BE IDENTIFIED]",
    "",
    "=== BUSINESS IMPACT ===",
    "Expected revenue improvement: [TO BE CALCULATED]",
    "Optimal overbooking rate: [TO BE OPTIMIZED]",
    "",
    "=== NEXT STEPS ===",
    "1. Deploy model to production environment",
    "2. Implement real-time prediction pipeline",
    "3. Monitor model performance and retrain as needed",
    "4. A/B test optimization strategies",
    "5. Expand to additional routes and airlines",
    sep="\n",
)