# Hotels.com Customer Churn Analysis
## Main Analysis Notebook

This notebook orchestrates the complete churn analysis pipeline using modular Python code from the `src/` folder.

**Features:**
- ✅ Loads saved models if they exist (no retraining needed)
- ✅ Generates all visualizations
- ✅ Complete statistical analysis
- ✅ Customer risk scoring

**Run all cells sequentially to execute the full analysis.**


In [None]:
# --- Environment setup ---------------------------------------------------
# Ignore all warnings from here on; this is installed before the project
# modules are imported so it also covers anything raised at import time.
import warnings
warnings.filterwarnings('ignore')

# Put the current working directory on the import path so that the `src`
# package resolves (the notebook must be started next to the `src/` folder).
import sys
sys.path.append('.')

# Project modules, imported in the same order as before.
from src import (
    config,
    data_loader,
    visualizations,
    statistical_tests,
    models,
)

# Apply the plot style provided by src.config
config.setup_plot_style()

for status_line in ("✓ All modules imported successfully",
                    "✓ Plot style configured"):
    print(status_line)


## Step 1: Data Loading and Preprocessing


In [None]:
# Load the raw data and run the summary helper on it *before* preprocessing
# (the summary's return value is not kept; presumably it prints — confirm).
df = data_loader.load_data()
data_loader.get_data_summary(df)

# Derive the two frames used by the rest of the notebook:
#   df_processed -> input to the EDA plots and statistical tests
#   customer_df  -> aggregate of df_processed, fed to models.prepare_features
df_processed = data_loader.preprocess_data(df)
customer_df = data_loader.aggregate_to_customer_level(df_processed)


## Step 2: Exploratory Data Analysis


In [None]:
# Overall churn distribution, drawn from the preprocessed frame
# (df_processed) rather than the customer-level aggregate (customer_df).
visualizations.plot_churn_distribution(df_processed)


In [None]:
# Churn by categorical variables, driven by a small spec table.
# Each entry: (heading, optional legend line, column, display label, extra kwargs)
category_specs = (
    ("Customer Type Analysis:", None,
     'customer_type', 'Customer Type', {}),
    ("\nLoyalty Tier Analysis:",
     "  0 = Not a member, 1 = Base member, 2 = Silver/Gold member",
     'loyalty_tier', 'Loyalty Tier', {}),
    ("\nPlatform Analysis:", None,
     'platform', 'Platform', {}),
    ("\nMarketing Channel Analysis:", None,
     'marketing_channel', 'Marketing Channel', {'figsize': (12, 6)}),
)

category_results = []
for heading, legend_line, column_name, display_label, extra_kwargs in category_specs:
    print(heading)
    if legend_line is not None:
        print(legend_line)
    category_results.append(
        visualizations.plot_churn_by_category(
            df_processed, column_name, display_label, **extra_kwargs
        )
    )

# Expose the results under the same names the notebook used before.
(customer_type_analysis,
 loyalty_analysis,
 platform_analysis,
 marketing_analysis) = category_results


In [None]:
# Binary flags analysis: the flag columns to plot are taken from
# config.BINARY_FLAGS instead of being listed in the notebook.
visualizations.plot_binary_flags(df_processed, config.BINARY_FLAGS)


In [None]:
# Numerical feature analysis over the columns in config.NUMERICAL_COLS:
# first the mean-comparison helper (return value not kept; presumably it
# prints its table — confirm), then the distribution plots for the same columns.
statistical_tests.calculate_mean_comparison(df_processed, config.NUMERICAL_COLS)
visualizations.plot_numerical_distributions(df_processed, config.NUMERICAL_COLS)


In [None]:
# Correlation analysis on the preprocessed frame. Which columns enter the
# heatmap is decided inside the helper and is not visible from this notebook.
visualizations.plot_correlation_heatmap(df_processed)


## Step 3: Statistical Significance Testing


In [None]:
# --- Significance testing --------------------------------------------------
# Numerical columns (config.NUMERICAL_COLS) go through the t-test helper.
ttest_results = statistical_tests.perform_ttest(
    df_processed,
    config.NUMERICAL_COLS,
)

# Categorical and flag columns go through the chi-square helper.
chi_square_cols = [
    'customer_type',
    'loyalty_tier',
    'platform',
    'marketing_channel',
    'coupon_flag',
    'pay_now_flag',
    'cancel_flag',
]
chi_square_results = statistical_tests.perform_chi_square_tests(
    df_processed,
    chi_square_cols,
)


## Step 4: Model Training and Evaluation

**Note:** If models already exist in the `results/` folder, they will be loaded instead of retraining.


In [None]:
# Prepare features
X, y, feature_cols, model_df_dummies = models.prepare_features(customer_df)

# Default to training. Only a complete set of saved models whose feature
# columns match the current data switches this off. The following cells
# branch on this flag.
training_needed = True

# Check if models exist
models_exist_flag = models.models_exist()

if models_exist_flag:
    print("\n" + "="*60)
    print("LOADING EXISTING MODELS (Skipping training)")
    print("="*60)
    saved_models = models.load_models()

    if not saved_models:
        # models_exist() said yes but load_models() returned nothing usable.
        print("\n⚠ Saved models could not be loaded - retraining")
    elif list(saved_models['feature_cols']) != list(feature_cols):
        # The saved scaler/estimators were presumably fitted on the saved
        # column list. If prepare_features() now yields different columns
        # (or a different order), feeding X to them would fail or silently
        # misalign features, so retrain instead of trusting the saved models.
        print("\n⚠ Saved feature columns differ from the current data - retraining")
    else:
        lr_model = saved_models['logistic_regression']
        rf_model = saved_models['random_forest']
        gb_model = saved_models['gradient_boosting']
        scaler = saved_models['scaler']
        feature_cols = saved_models['feature_cols']

        # Split data for evaluation.
        # NOTE(review): test_size / random_state are hard-coded here; they
        # need to mirror what models.split_and_scale_data used at training
        # time, otherwise this "test" split may contain rows the loaded
        # models were trained on — confirm against src/models.
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, stratify=y
        )
        # Reuse the already-fitted scaler (transform only, no refit).
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Get predictions: logistic regression consumes the scaled features,
        # the two tree-based models consume the unscaled ones.
        y_pred_lr = lr_model.predict(X_test_scaled)
        y_prob_lr = lr_model.predict_proba(X_test_scaled)[:, 1]
        y_pred_rf = rf_model.predict(X_test)
        y_prob_rf = rf_model.predict_proba(X_test)[:, 1]
        y_pred_gb = gb_model.predict(X_test)
        y_prob_gb = gb_model.predict_proba(X_test)[:, 1]

        print("\n✓ Models loaded successfully!")
        training_needed = False

if training_needed:
    print("\n" + "="*60)
    print("TRAINING NEW MODELS")
    print("="*60)


In [None]:
# Imports for the coefficient chart. They sit at cell level so they are
# available on both the training path and the loaded-model path.
# (`pd` is not used in this cell; the import is kept in case other cells
# rely on the name.)
import matplotlib.pyplot as plt
import pandas as pd
from src.config import COLORS

if training_needed:
    # Split and scale data
    X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler = models.split_and_scale_data(X, y)

    # Train Logistic Regression on the scaled features
    lr_model, y_pred_lr, y_prob_lr = models.train_logistic_regression(
        X_train_scaled, y_train, X_test_scaled, y_test
    )
else:
    print("⏭ Skipping training - using loaded models")

# Odds ratios and the coefficient chart only need the fitted model and its
# feature names, both of which exist on either path. They are therefore
# produced for loaded models too, the same way the Random Forest cell always
# shows its feature importance.
odds_ratios = models.get_logistic_regression_odds_ratios(lr_model, feature_cols)

# Visualize LR coefficients.
# The chart shows the first 15 rows of odds_ratios; whether those are the
# "top" features depends on how the helper orders its output — confirm.
fig, ax = plt.subplots(figsize=(10, 8))
top_features = odds_ratios.head(15)
colors = [COLORS['positive'] if c > 0 else COLORS['negative'] for c in top_features['Coefficient']]
ax.barh(range(len(top_features)), top_features['Coefficient'], color=colors,
        edgecolor='white', linewidth=1)
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features['Feature'])
ax.set_xlabel('Coefficient (Log Odds)', fontsize=12)
ax.set_title('Logistic Regression: Top 15 Feature Coefficients', fontweight='bold')
ax.axvline(x=0, color=COLORS['text'], linestyle='-', linewidth=1.5)
# Flip the axis so the first row is drawn at the top, then lay out.
ax.invert_yaxis()
plt.tight_layout()
plt.show()


In [None]:
# Obtain the Random Forest: fit it when training is required, otherwise keep
# the instance that was loaded earlier.
if not training_needed:
    print("⏭ Skipping Random Forest training - using loaded model")
else:
    rf_model, y_pred_rf, y_prob_rf = models.train_random_forest(
        X_train, y_train, X_test, y_test
    )
    print("\n--- Random Forest Feature Importance ---")

# The importance chart is drawn on both paths from whichever model is in scope.
rf_importance = visualizations.plot_feature_importance(
    feature_cols,
    rf_model.feature_importances_,
    title='Random Forest: Top 15 Feature Importance',
)


In [None]:
# Gradient Boosting: nothing to do when the model was loaded earlier;
# otherwise fit it on the unscaled train/test split.
if not training_needed:
    print("⏭ Skipping Gradient Boosting training - using loaded model")
else:
    gb_model, y_pred_gb, y_prob_gb = models.train_gradient_boosting(
        X_train,
        y_train,
        X_test,
        y_test,
    )


## Step 5: Model Comparison and Risk Scoring


In [None]:
# Bundle the per-model outputs once; the order is LR, RF, GB throughout.
model_probabilities = (y_prob_lr, y_prob_rf, y_prob_gb)
model_predictions = (y_pred_lr, y_pred_rf, y_pred_gb)

# ROC curves and metrics for the three models
metrics_comparison = visualizations.plot_model_comparison(
    y_test, *model_probabilities, *model_predictions
)

# Confusion matrices for the same predictions
print("\n--- Confusion Matrices ---")
visualizations.plot_confusion_matrices(y_test, *model_predictions)


In [None]:
# Customer Risk Scoring (using Random Forest - best model)
# NOTE(review): rf_model is hard-coded here; "best model" is not derived from
# metrics_comparison, so confirm it still holds whenever models are retrained.
# Scoring runs over the full feature matrix X, not only the test split.
customer_scores = models.score_customers(rf_model, X, model_df_dummies)
visualizations.plot_risk_segmentation(customer_scores)


## Summary


In [None]:
print("=" * 70)
print("                      ANALYSIS SUMMARY")
print("=" * 70)

# Fetch the Random Forest row once instead of repeating the mask lookup per
# metric. An IndexError here means metrics_comparison has no row whose
# 'Model' value is 'Random Forest'.
rf_metrics = metrics_comparison.loc[metrics_comparison['Model'] == 'Random Forest'].iloc[0]


def _summary_row(label: str, value: str) -> str:
    """Return one 'label  value' line with the value starting at a fixed column."""
    return f"  {label:<33}{value}"


# NOTE(review): 'Best Model' and the key drivers are fixed text, not derived
# from metrics_comparison or the fitted models — re-check after retraining.
summary_sections = [
    ("DATA SUMMARY", [
        _summary_row("Total Bookings Analysed:", f"{len(df):>10,}"),
        _summary_row("Unique Customers:", f"{len(customer_df):>10,}"),
        _summary_row("Overall Churn Rate:", f"{df['churn_flag'].mean()*100:>10.2f}%"),
        _summary_row("Date Range:", f"{df['bk_date'].min()} to {df['bk_date'].max()}"),
    ]),
    ("MODEL PERFORMANCE", [
        _summary_row("Best Model:", "Random Forest"),
        _summary_row("ROC-AUC Score:", f"{rf_metrics['ROC-AUC']:>10.4f}"),
        _summary_row("Accuracy:", f"{rf_metrics['Accuracy']:>10.4f}"),
    ]),
    ("KEY DRIVERS OF CHURN", [
        "  1. Customer Type (New vs Existing)",
        "  2. Loyalty Tier (Non-member vs Member)",
        "  3. Cancellation Behaviour",
        "  4. Website Engagement (Visit duration, pages viewed)",
        "  5. Number of Previous Bookings",
    ]),
]

# Size the box from its content so the right-hand border stays aligned even
# when a value is wider than expected (e.g. a long bk_date representation).
title_indent = " " * 24
widest_line = max(
    len(text)
    for title, body in summary_sections
    for text in [title_indent + title, *body]
)
inner_width = max(69, widest_line + 2)
rule = "─" * inner_width

print()
for position, (title, body) in enumerate(summary_sections):
    # The first section opens the box; later ones start with a separator.
    if position == 0:
        print("┌" + rule + "┐")
    else:
        print("├" + rule + "┤")
    heading = title_indent + title
    print(f"│{heading:<{inner_width}}│")
    print("├" + rule + "┤")
    for text in body:
        print(f"│{text:<{inner_width}}│")
print("└" + rule + "┘")
print()

print("✅ Analysis Complete - Ready for presentation to leadership team")
# NOTE(review): this notebook never calls a save function itself; the lines
# below assume the training helpers in src.models persist to results/ — confirm.
print("✅ Models saved to: results/")
print("   • logistic_regression.pkl")
print("   • random_forest.pkl")
print("   • gradient_boosting.pkl")
print("   • scaler.pkl")
print("   • feature_cols.pkl")
