# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('Final_Dataset.csv')

# Exploratory Data Analysis: overall size, target balance, per-column gaps
print("Dataset Shape:", df.shape)

churn_counts = df['ChurnStatus'].value_counts()
print("\nChurn Distribution:")
print(churn_counts)

missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

# Prepare features and target
X = df.drop('ChurnStatus', axis=1)
y = df['ChurnStatus']

# Split the data BEFORE any resampling. Oversampling the full dataset first
# would place synthetic points interpolated from test rows into the training
# set (and synthetic rows into the test set), inflating every metric that is
# later computed on X_test / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance using SMOTE, fitted on the training portion only so
# the held-out set keeps real rows at the original class ratio.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Candidate classifiers, keyed by the display name used in logs and plots
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    'SVM': SVC(probability=True, random_state=42)
}

# Fit every candidate on the training split and score it on the held-out split
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Hard labels plus the probability of the second class (column 1)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy = model.score(X_test, y_test)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    results[name] = dict(
        model=model,
        accuracy=accuracy,
        auc=auc_score,
        predictions=y_pred,
        probabilities=y_pred_proba,
    )
    print(f"{name} - Accuracy: {accuracy:.4f}, AUC: {auc_score:.4f}")

# Pick the winner: the first candidate whose AUC is strictly the highest
# (and strictly above 0, otherwise best_model stays None)
best_model, best_score = None, 0
for name, outcome in results.items():
    if outcome['auc'] > best_score:
        best_model, best_score = name, outcome['auc']

print(f"\n*** Best Model: {best_model} with AUC: {best_score:.4f} ***")

# Hyperparameter tuning for the best model
# One search grid per candidate, keyed exactly like `models`, so whichever
# estimator won the comparison is tuned over parameters it actually accepts.
# (Falling back to the Gradient Boosting grid for every other model would make
# GridSearchCV fail for Logistic Regression and SVM, which have no
# n_estimators / learning_rate / max_depth parameters.)
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10]
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto']
    }
}
# A KeyError here means no candidate was selected or a model was added to
# `models` without a matching grid.
param_grid = param_grids[best_model]

print(f"\nPerforming hyperparameter tuning for {best_model}...")
# NOTE(review): X_train has been through SMOTE, so the 5 CV folds may share
# synthetic neighbours and the CV score can be optimistic; consider an
# imblearn Pipeline that resamples inside each fold - confirm.
grid_search = GridSearchCV(
    models[best_model], param_grid, cv=5, scoring='roc_auc', n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Best tuned model, evaluated on the held-out split
best_tuned_model = grid_search.best_estimator_
y_pred_tuned = best_tuned_model.predict(X_test)
y_pred_proba_tuned = best_tuned_model.predict_proba(X_test)[:, 1]

print(f"Best parameters: {grid_search.best_params_}")
print(f"Tuned {best_model} AUC: {roc_auc_score(y_test, y_pred_proba_tuned):.4f}")

# Feature Importance Analysis
# Estimators without a `feature_importances_` attribute skip this section.
if hasattr(best_tuned_model, 'feature_importances_'):
    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': best_tuned_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)
    top_features = feature_importance.head(10)

    print("\nTop 10 Most Important Features:")
    print(top_features)

    # Horizontal bar chart of the ten largest importances
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
    ax.set_title('Top 10 Feature Importance')
    fig.tight_layout()
    plt.show()

# Detailed evaluation of the tuned model on the held-out split
report_text = classification_report(y_test, y_pred_tuned)
print("\n=== Detailed Classification Report ===")
print(report_text)

cm = confusion_matrix(y_test, y_pred_tuned)
print("\n=== Confusion Matrix ===")
print(cm)

# Heatmap of the confusion matrix (rows: actual class, columns: predicted)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# ROC Curve
# roc_curve is imported here because the file's top-level sklearn.metrics
# import does not include it. precision_recall_curve returns
# (precision, recall, thresholds), which is not what the FPR/TPR axes and the
# AUC figures in the legend describe.
from sklearn.metrics import roc_curve

plt.figure(figsize=(8, 6))

# One curve per untuned candidate, labelled with the AUC recorded at training
for name, result in results.items():
    fpr, tpr, _ = roc_curve(y_test, result['probabilities'])
    plt.plot(fpr, tpr, label=f'{name} (AUC = {result["auc"]:.3f})')

# Tuned winner drawn thicker and dashed so it stands out from its untuned curve
tuned_auc = roc_auc_score(y_test, y_pred_proba_tuned)
fpr_tuned, tpr_tuned, _ = roc_curve(y_test, y_pred_proba_tuned)
plt.plot(fpr_tuned, tpr_tuned, label=f'Tuned {best_model} (AUC = {tuned_auc:.3f})',
         linewidth=2, linestyle='--')

# Diagonal = performance of a classifier that guesses at random
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.show()

# Business metrics
def calculate_business_metrics(y_true, y_pred, y_pred_proba):
    """
    Summarise classifier quality as a dict of named scores.

    The confusion matrix is unpacked into exactly four cells, so the labels
    are expected to be binary (a 2x2 matrix).

    Parameters
    ----------
    y_true : true labels
    y_pred : predicted hard labels
    y_pred_proba : predicted scores/probabilities, used only for the AUC

    Returns
    -------
    dict with keys 'Accuracy', 'Precision', 'Recall (Sensitivity)',
    'Specificity', 'F1-Score' and 'AUC'. Ratios whose denominator is not
    positive are reported as 0.
    """
    def _ratio_or_zero(numerator, denominator):
        # Guarded division: an empty denominator yields 0 instead of dividing
        return numerator / denominator if denominator > 0 else 0

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = _ratio_or_zero(tp, tp + fp)
    recall = _ratio_or_zero(tp, tp + fn)
    f1 = _ratio_or_zero(2 * (precision * recall), precision + recall)
    specificity = _ratio_or_zero(tn, tn + fp)

    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall (Sensitivity)': recall,
        'Specificity': specificity,
        'F1-Score': f1,
    }
    metrics['AUC'] = roc_auc_score(y_true, y_pred_proba)
    return metrics

# Score the tuned model on the held-out split and print one metric per line
business_metrics = calculate_business_metrics(y_test, y_pred_tuned, y_pred_proba_tuned)
print("\n=== Business Metrics ===")
for label, score in business_metrics.items():
    print(f"{label}: {score:.4f}")

# Save the best model
import joblib
model_path = 'best_churn_model.pkl'
joblib.dump(best_tuned_model, model_path)
print("\nBest model saved as 'best_churn_model.pkl'")

# Prediction function for new data
def predict_churn_probability(new_customer_data, model=None):
    """
    Predict churn probability for new customer data.

    Parameters
    ----------
    new_customer_data : pandas.DataFrame
        Rows to score; passed as-is to the model, so it must carry the
        columns the model expects.
    model : object with ``predict`` and ``predict_proba``, optional
        Estimator used for scoring. When omitted (``None``), the module-level
        ``best_tuned_model`` is looked up at call time, so the most recently
        tuned model is used rather than whatever existed when this function
        was defined.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with three added columns:
        ``Churn_Probability`` (column 1 of ``predict_proba``),
        ``Churn_Prediction`` (output of ``predict``) and ``Risk_Level``
        ('Low' for [0, 0.3], 'Medium' for (0.3, 0.7], 'High' for (0.7, 1]).

    Raises
    ------
    ValueError
        If ``new_customer_data`` is not a pandas DataFrame.
    """
    if not isinstance(new_customer_data, pd.DataFrame):
        raise ValueError("Input should be a pandas DataFrame")

    if model is None:
        model = best_tuned_model

    churn_probability = model.predict_proba(new_customer_data)[:, 1]
    predictions = model.predict(new_customer_data)

    # Work on a copy so the caller's frame is left untouched
    results_df = new_customer_data.copy()
    results_df['Churn_Probability'] = churn_probability
    results_df['Churn_Prediction'] = predictions
    # include_lowest=True closes the first bin on the left; without it a
    # probability of exactly 0.0 falls outside (0, 0.3] and becomes NaN.
    results_df['Risk_Level'] = pd.cut(
        churn_probability,
        bins=[0, 0.3, 0.7, 1],
        labels=['Low', 'Medium', 'High'],
        include_lowest=True,
    )

    return results_df

# Example usage (commented out as we don't have new data)
# new_customers = pd.DataFrame({...})  # Your new data with same features
# churn_predictions = predict_churn_probability(new_customers)
# print(churn_predictions[['Churn_Probability', 'Churn_Prediction', 'Risk_Level']])