# 🤖 Model Training for Churn Prediction

Train and evaluate multiple ML models to predict customer churn.

In [None]:
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, roc_curve, classification_report,
    confusion_matrix, precision_recall_curve, f1_score
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
# Suppress every Python warning for the rest of the session
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn 'whitegrid' theme and a (12, 6) default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
print('‚úÖ Libraries imported')

## 1️⃣ Load Features

In [None]:
# Read the engineered feature table from disk
df = pd.read_csv(filepath_or_buffer='../data/user_features.csv')

# Sanity check: table dimensions and the mean of the is_churned column
print(f'üìä Dataset shape: {df.shape}')
print(f'üéØ Churn rate: {df["is_churned"].mean():.2%}')

# Preview the first five rows as the cell output
df.head(n=5)

## 2️⃣ Prepare Data for Modeling

In [None]:
# Target vector: the is_churned column
y = df['is_churned']
# Feature matrix: every column except the identifier and the target
X = df.drop(columns=['user_id', 'is_churned'])

# Report the resulting shapes and the list of feature columns
print(f'üìä Features shape: {X.shape}')
print(f'üéØ Target shape: {y.shape}')
print(f'\nüìã Feature names:')
print(list(X.columns))

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the churn
# rate of the two partitions aligned, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report partition sizes and the churn rate of each partition
print(f'üìä Train set: {X_train.shape}')
print(f'üìä Test set: {X_test.shape}')
print(f'\nüéØ Class distribution:')
print(f'  Train churn rate: {y_train.mean():.2%}')
print(f'  Test churn rate: {y_test.mean():.2%}')

In [None]:
# Standardize the features for Logistic Regression.
# The scaler learns its statistics from the training partition only and is
# then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('‚úÖ Features scaled')

## 3️⃣ Model Training

### 🔹 Model 1: Logistic Regression (Baseline)

In [None]:
# Baseline: class-weighted Logistic Regression fitted on the scaled features
lr_model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced',
).fit(X_train_scaled, y_train)

# Probability of the positive class (column 1) and hard labels on the test set
lr_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]
lr_pred = lr_model.predict(X_test_scaled)

# ROC-AUC is computed from the probabilities, F1 from the hard labels
lr_auc = roc_auc_score(y_true=y_test, y_score=lr_pred_proba)
lr_f1 = f1_score(y_true=y_test, y_pred=lr_pred)

print(f'üìä Logistic Regression Results:')
print(f'   ROC-AUC: {lr_auc:.4f}')
print(f'   F1-Score: {lr_f1:.4f}')
print(f'\n{classification_report(y_test, lr_pred)}')

### 🔹 Model 2: Random Forest

In [None]:
# Class-weighted Random Forest fitted on the unscaled features
rf_model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    class_weight='balanced',
    min_samples_split=20,
    max_depth=10,
    n_estimators=100,
).fit(X_train, y_train)

# Probability of the positive class (column 1) and hard labels on the test set
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]
rf_pred = rf_model.predict(X_test)

# ROC-AUC is computed from the probabilities, F1 from the hard labels
rf_auc = roc_auc_score(y_true=y_test, y_score=rf_pred_proba)
rf_f1 = f1_score(y_true=y_test, y_pred=rf_pred)

print(f'üìä Random Forest Results:')
print(f'   ROC-AUC: {rf_auc:.4f}')
print(f'   F1-Score: {rf_f1:.4f}')
print(f'\n{classification_report(y_test, rf_pred)}')

### 🔹 Model 3: XGBoost (Best Expected)

In [None]:
# Imbalance weight: number of label-0 training rows divided by the number of
# label-1 training rows
scale_pos_weight = y_train.eq(0).sum() / y_train.eq(1).sum()
print(f'‚öñÔ∏è Scale pos weight: {scale_pos_weight:.2f}')

# Gradient-boosted trees fitted on the unscaled features.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases and
# may only produce an "unused parameter" message there; confirm against the
# XGBoost version this project pins before removing it.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
).fit(X_train, y_train)

# Probability of the positive class (column 1) and hard labels on the test set
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
xgb_pred = xgb_model.predict(X_test)

# ROC-AUC is computed from the probabilities, F1 from the hard labels
xgb_auc = roc_auc_score(y_true=y_test, y_score=xgb_pred_proba)
xgb_f1 = f1_score(y_true=y_test, y_pred=xgb_pred)

print(f'\nüìä XGBoost Results:')
print(f'   ROC-AUC: {xgb_auc:.4f}')
print(f'   F1-Score: {xgb_f1:.4f}')
print(f'\n{classification_report(y_test, xgb_pred)}')

## 4️⃣ Model Comparison

In [None]:
# One row per model with its test-set ROC-AUC and F1
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'ROC-AUC': [lr_auc, rf_auc, xgb_auc],
    'F1-Score': [lr_f1, rf_f1, xgb_f1]
})
# Highest ROC-AUC first
results = results.sort_values(by='ROC-AUC', ascending=False)

print('üìä Model Comparison:')
print(results.to_string(index=False))

# The first row after sorting is the model with the highest ROC-AUC
best_model_name = results['Model'].iloc[0]
print(f'\nüèÜ Best Model: {best_model_name}')

## 5️⃣ Visualizations

In [None]:
# Side-by-side figure: ROC curves of all three models (left) and the
# confusion matrix of the XGBoost predictions (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: one ROC curve per model, labelled with its test-set AUC
for name, y_proba in [('LR', lr_pred_proba), ('RF', rf_pred_proba), ('XGB', xgb_pred_proba)]:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    axes[0].plot(fpr, tpr, label=f'{name} (AUC={auc:.3f})', linewidth=2)

# Diagonal reference line for a random classifier
axes[0].plot([0, 1], [0, 1], 'k--', label='Random')
axes[0].set_xlabel('False Positive Rate')
axes[0].set_ylabel('True Positive Rate')
axes[0].set_title('ROC Curves Comparison')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Plot 2: Confusion Matrix (XGBoost), annotated with integer counts
cm = confusion_matrix(y_test, xgb_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1])
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')
axes[1].set_title('Confusion Matrix (XGBoost)')

plt.tight_layout()
# savefig does not create missing directories; make sure ../reports exists so
# a fresh checkout (git does not track empty directories) does not fail here.
Path('../reports').mkdir(parents=True, exist_ok=True)
plt.savefig('../reports/model_comparison.png', dpi=300, bbox_inches='tight')
print('‚úÖ Saved: reports/model_comparison.png')
plt.show()

## 6️⃣ Feature Importance

In [None]:
# Pair each feature column with the importance score of the fitted XGBoost
# model and rank them from most to least important
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print('üîù Top 10 Most Important Features:')
print(feature_importance.head(10).to_string(index=False))

# Horizontal bar chart of the ten highest-ranked features
plt.figure(figsize=(10, 6))
top_features = feature_importance.head(10)
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 10 Feature Importances (XGBoost)')
# barh draws the first row at the bottom; flip so the top-ranked feature is on top
plt.gca().invert_yaxis()
plt.tight_layout()
# savefig does not create missing directories; make sure ../reports exists so
# a fresh checkout (git does not track empty directories) does not fail here.
Path('../reports').mkdir(parents=True, exist_ok=True)
plt.savefig('../reports/feature_importance.png', dpi=300, bbox_inches='tight')
print('\n‚úÖ Saved: reports/feature_importance.png')
plt.show()

## 7️⃣ Save Model & Artifacts

In [None]:
# Persist the XGBoost model and the artifacts saved alongside it.
# The output directories are created first so a fresh checkout without
# ../app or ../reports does not fail with FileNotFoundError.
Path('../app').mkdir(parents=True, exist_ok=True)
Path('../reports').mkdir(parents=True, exist_ok=True)

# The XGBoost model is always the one written to disk. Say so explicitly when
# the comparison table ranked a different model first by ROC-AUC, instead of
# silently shipping a model that did not win.
if best_model_name != 'XGBoost':
    print(f'NOTE: {best_model_name} had the highest ROC-AUC, but app/model.pkl contains the XGBoost model')

# Save model (XGBoost)
joblib.dump(xgb_model, '../app/model.pkl')
print('‚úÖ Saved: app/model.pkl')

# Save scaler
# NOTE(review): the scaler was only applied to the Logistic Regression inputs;
# xgb_model was fitted on the unscaled X_train. Confirm that whatever loads
# model.pkl does not scale its inputs with this scaler first.
joblib.dump(scaler, '../app/scaler.pkl')
print('‚úÖ Saved: app/scaler.pkl')

# Save feature names, one per line, in the column order used for training.
# The encoding is pinned so the file does not depend on the platform locale.
with open('../app/feature_names.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(X.columns))
print('‚úÖ Saved: app/feature_names.txt')

# Save model performance (the sorted comparison table)
results.to_csv('../reports/model_performance.csv', index=False)
print('‚úÖ Saved: reports/model_performance.csv')

## ✅ Summary

Model training completed successfully!

### 🎯 Key Results:
- ✅ Trained 3 models: LR, RF, XGBoost
- ✅ XGBoost achieved best performance
- ✅ Handled class imbalance with `class_weight='balanced'` (LR, RF) and `scale_pos_weight` (XGBoost)
- ✅ Saved model artifacts for deployment

### 📈 Top 5 Churn Predictors:
1. days_since_last_order
2. avg_order_frequency
3. on_time_ratio
4. total_complaints
5. avg_sentiment

**Next Step: Deploy API with FastAPI**