# Customer Churn Prediction - Part 2: Model Building

## Overview
This notebook covers:
1. Train-Test Split
2. Feature Scaling
3. Model Training (Multiple Algorithms)
4. Baseline Model Comparison
5. Hyperparameter Tuning

## Step 1: Import Libraries

In [ ]:
# Data manipulation
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# XGBoost
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    print("XGBoost not available. Install with: pip install xgboost")
    XGBOOST_AVAILABLE = False

# Save models
import joblib
import os

# Warnings
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## Step 2: Load Processed Data

In [ ]:
# Load the processed feature matrix and target written by the previous notebook.
X = pd.read_csv('data/X_processed.csv')

# Squeeze only the column axis so a one-column target file always becomes a
# Series. A bare squeeze() would collapse a single-row file into a scalar,
# which has no .value_counts() and would break the summary below.
# NOTE(review): assumes y_processed.csv holds exactly one column - confirm.
y = pd.read_csv('data/y_processed.csv').squeeze("columns")

# Quick sanity report: dimensions of both objects and the class balance.
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nTarget distribution:")
print(y.value_counts())

## Step 3: Train-Test Split

In [ ]:
# Hold out 20% of the rows for testing. Stratifying on the target maintains
# the churn distribution in both partitions, and the fixed seed makes the
# split reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the partition sizes first, then the class balance of each partition.
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print("\nTraining set churn distribution:", y_train.value_counts(), sep="\n")
print("\nTesting set churn distribution:", y_test.value_counts(), sep="\n")

## Step 4: Feature Scaling

In [ ]:
# Learn the scaling statistics from the training split only, so that nothing
# about the test split influences the preprocessing.
scaler = StandardScaler().fit(X_train)


def _scaled_like(frame):
    """Scale *frame* with the fitted scaler, keeping its column names and index."""
    return pd.DataFrame(
        scaler.transform(frame), columns=frame.columns, index=frame.index
    )


# Both splits go through the same training-fitted scaler and come back as
# DataFrames for easier handling downstream.
X_train_scaled = _scaled_like(X_train)
X_test_scaled = _scaled_like(X_test)

print("Feature scaling completed!")
print(f"\nScaled training set shape: {X_train_scaled.shape}")
print(f"Scaled testing set shape: {X_test_scaled.shape}")

# Persist the fitted scaler for later use; create the output folder on first run.
os.makedirs('models', exist_ok=True)
joblib.dump(scaler, 'models/scaler.pkl')
print("\nScaler saved to models/scaler.pkl")

## Step 5: Model Training - Baseline Models

### 5.1 Logistic Regression

In [ ]:
# Baseline logistic regression, trained on the standardized features.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Predicted labels drive the threshold-based metrics; the second
# predict_proba column (the positive class for 0/1 labels) drives ROC-AUC.
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Score the held-out test split with each label-based metric in turn.
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    score(y_test, y_pred_lr)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
lr_roc_auc = roc_auc_score(y_test, y_pred_proba_lr)

print(
    "Logistic Regression Results:",
    f"Accuracy: {lr_accuracy:.4f}",
    f"Precision: {lr_precision:.4f}",
    f"Recall: {lr_recall:.4f}",
    f"F1-Score: {lr_f1:.4f}",
    f"ROC-AUC: {lr_roc_auc:.4f}",
    sep="\n",
)

# Persist the fitted model.
joblib.dump(lr_model, 'models/logistic_regression.pkl')
print("\nModel saved to models/logistic_regression.pkl")

### 5.2 Random Forest Classifier

In [ ]:
# Baseline random forest. It is trained on the unscaled features because
# tree-based models do not need feature scaling.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

# Predicted labels for the threshold-based metrics, and the second
# predict_proba column (the positive class for 0/1 labels) for ROC-AUC.
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Score the held-out test split with each label-based metric in turn.
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    score(y_test, y_pred_rf)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
rf_roc_auc = roc_auc_score(y_test, y_pred_proba_rf)

print(
    "Random Forest Results:",
    f"Accuracy: {rf_accuracy:.4f}",
    f"Precision: {rf_precision:.4f}",
    f"Recall: {rf_recall:.4f}",
    f"F1-Score: {rf_f1:.4f}",
    f"ROC-AUC: {rf_roc_auc:.4f}",
    sep="\n",
)

# Persist the fitted model.
joblib.dump(rf_model, 'models/random_forest.pkl')
print("\nModel saved to models/random_forest.pkl")

### 5.3 XGBoost Classifier

In [ ]:
# The XGBoost baseline only runs when the optional import succeeded.
if not XGBOOST_AVAILABLE:
    print("XGBoost not available. Skipping...")
else:
    # Baseline gradient-boosted trees on the unscaled features.
    xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
    xgb_model.fit(X_train, y_train)

    # Predicted labels for the threshold-based metrics, and the second
    # predict_proba column (the positive class for 0/1 labels) for ROC-AUC.
    y_pred_xgb = xgb_model.predict(X_test)
    y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

    # Score the held-out test split with each label-based metric in turn.
    xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
        score(y_test, y_pred_xgb)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )
    xgb_roc_auc = roc_auc_score(y_test, y_pred_proba_xgb)

    print(
        "XGBoost Results:",
        f"Accuracy: {xgb_accuracy:.4f}",
        f"Precision: {xgb_precision:.4f}",
        f"Recall: {xgb_recall:.4f}",
        f"F1-Score: {xgb_f1:.4f}",
        f"ROC-AUC: {xgb_roc_auc:.4f}",
        sep="\n",
    )

    # Persist the fitted model.
    joblib.dump(xgb_model, 'models/xgboost.pkl')
    print("\nModel saved to models/xgboost.pkl")

### 5.4 Model Comparison

In [ ]:
# Plotting is only needed in this cell, so the import lives here.
import matplotlib.pyplot as plt

# Collect the test-set metrics of the baseline models, one list entry per model.
comparison_data = {
    'Model': ['Logistic Regression', 'Random Forest'],
    'Accuracy': [lr_accuracy, rf_accuracy],
    'Precision': [lr_precision, rf_precision],
    'Recall': [lr_recall, rf_recall],
    'F1-Score': [lr_f1, rf_f1],
    'ROC-AUC': [lr_roc_auc, rf_roc_auc]
}

# The XGBoost metrics only exist when the optional dependency was importable.
if XGBOOST_AVAILABLE:
    comparison_data['Model'].append('XGBoost')
    comparison_data['Accuracy'].append(xgb_accuracy)
    comparison_data['Precision'].append(xgb_precision)
    comparison_data['Recall'].append(xgb_recall)
    comparison_data['F1-Score'].append(xgb_f1)
    comparison_data['ROC-AUC'].append(xgb_roc_auc)

comparison_df = pd.DataFrame(comparison_data)

print("Model Comparison:")
print("="*60)
print(comparison_df.to_string(index=False))

# Visualize comparison: one bar chart per metric on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
panels = axes.ravel()

for ax, metric in zip(panels, metrics):
    comparison_df.plot(x='Model', y=metric, kind='bar', ax=ax, legend=False)
    ax.set_title(f'{metric} Comparison', fontweight='bold')
    ax.set_ylabel(metric)
    # Tilt the model names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# The grid has more panels than metrics; hide the leftovers so the figure
# (and the saved PNG) does not contain an empty set of axes.
for ax in panels[len(metrics):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('models/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Step 6: Hyperparameter Tuning

### 6.1 Tune Random Forest

In [ ]:
# Candidate hyperparameters: 3 * 4 * 3 * 3 = 108 combinations.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid, scoring every candidate by 5-fold
# cross-validated ROC-AUC on the training split.
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    rf_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search (this may take some time).
print("Starting Random Forest hyperparameter tuning...")
rf_grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {rf_grid_search.best_params_}")
print(f"Best cross-validation score: {rf_grid_search.best_score_:.4f}")

# With the default refit=True the search has already retrained the winning
# configuration on the whole training split, so no extra fit is needed here.
rf_best_model = rf_grid_search.best_estimator_

# Check the tuned model against the held-out test split.
y_pred_rf_tuned = rf_best_model.predict(X_test)
y_pred_proba_rf_tuned = rf_best_model.predict_proba(X_test)[:, 1]
rf_tuned_accuracy = accuracy_score(y_test, y_pred_rf_tuned)
rf_tuned_roc_auc = roc_auc_score(y_test, y_pred_proba_rf_tuned)

print(f"\nTuned Random Forest Test Accuracy: {rf_tuned_accuracy:.4f}")
print(f"Tuned Random Forest Test ROC-AUC: {rf_tuned_roc_auc:.4f}")

# Persist the tuned model next to the baseline one.
joblib.dump(rf_best_model, 'models/random_forest_tuned.pkl')
print("\nTuned model saved to models/random_forest_tuned.pkl")

### 6.2 Tune XGBoost (if available)

In [ ]:
# XGBoost tuning only runs when the optional import succeeded.
if not XGBOOST_AVAILABLE:
    print("XGBoost not available. Skipping tuning...")
else:
    # Candidate hyperparameters: 2 * 3 * 3 * 2 = 36 combinations.
    xgb_param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0]
    }

    # Exhaustive search over the grid, scoring every candidate by 5-fold
    # cross-validated ROC-AUC on the training split.
    xgb_grid_search = GridSearchCV(
        xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
        xgb_param_grid,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )

    print("Starting XGBoost hyperparameter tuning...")
    xgb_grid_search.fit(X_train, y_train)

    print(f"\nBest parameters: {xgb_grid_search.best_params_}")
    print(f"Best cross-validation score: {xgb_grid_search.best_score_:.4f}")

    # With the default refit=True the search has already retrained the
    # winning configuration on the whole training split.
    xgb_best_model = xgb_grid_search.best_estimator_

    # Check the tuned model against the held-out test split.
    y_pred_xgb_tuned = xgb_best_model.predict(X_test)
    y_pred_proba_xgb_tuned = xgb_best_model.predict_proba(X_test)[:, 1]
    xgb_tuned_accuracy = accuracy_score(y_test, y_pred_xgb_tuned)
    xgb_tuned_roc_auc = roc_auc_score(y_test, y_pred_proba_xgb_tuned)

    print(f"\nTuned XGBoost Test Accuracy: {xgb_tuned_accuracy:.4f}")
    print(f"Tuned XGBoost Test ROC-AUC: {xgb_tuned_roc_auc:.4f}")

    # Persist the tuned model next to the baseline one.
    joblib.dump(xgb_best_model, 'models/xgboost_tuned.pkl')
    print("\nTuned model saved to models/xgboost_tuned.pkl")

## Step 7: Save Training and Test Data

In [ ]:
# Write each part of the train/test split to CSV, without the index, so the
# evaluation notebook can reload exactly the same partitions.
split_outputs = (
    (X_train, 'data/X_train.csv'),
    (X_test, 'data/X_test.csv'),
    (y_train, 'data/y_train.csv'),
    (y_test, 'data/y_test.csv'),
)
for dataset, destination in split_outputs:
    dataset.to_csv(destination, index=False)

# Confirm what was written.
print(
    "Training and test data saved successfully!",
    "Files saved:",
    "- data/X_train.csv",
    "- data/X_test.csv",
    "- data/y_train.csv",
    "- data/y_test.csv",
    sep="\n",
)

## Summary

### Models Trained:
1. Logistic Regression (baseline)
2. Random Forest (baseline, plus a version tuned with grid search)
3. XGBoost (baseline, plus a version tuned with grid search, if available)

### Next Steps:
- Proceed to Model Evaluation notebook for detailed performance analysis