| Output File                   | Description                                             | Format |
| ----------------------------- | ------------------------------------------------------- | ------ |
| all_customers_predictions.csv | Predictions and probabilities for each customer         | CSV    |
| shap_all_customers.csv        | Customer, predictions, probabilities, SHAP explanations | CSV    |
| shap_all_customers.json       | Same as above, formatted for UI/API                     | JSON   |

In [6]:
import pandas as pd
import joblib

# Score every customer with the trained churn model and write one row per
# customer (ID, predicted label, churn probability) to a CSV file.

# 1. Load the feature matrix for all customers.
#    NOTE(review): nothing is dropped here, so this CSV is assumed to already
#    exclude the target column -- confirm against the feature-export step.
X_full = pd.read_csv('customer_churn_features_with_nlp_ready.csv')

# 2. Load customer IDs from the processed CSV. IDs are matched to X_full purely
#    by row position, so both files must list customers in the same order.
customer_ids = pd.read_csv('processed_customer_churn_data_with_feedback.csv')['customerID']

# Fail fast, before loading the model and predicting, if the two files cannot
# be aligned row-for-row. An equal row count does not prove the order matches,
# but a different count guarantees the IDs would be attached to the wrong rows.
if len(customer_ids) != len(X_full):
    raise ValueError(
        f"Row count mismatch: {len(X_full)} feature rows vs "
        f"{len(customer_ids)} customer IDs; cannot align predictions to customers."
    )

# 3. Load the trained model from the models/ directory.
#    joblib.load unpickles the file, so only load model files you trust.
model = joblib.load('../models/best_xgb_model.pkl')

# 4. Predict churn for all customers. Column 1 of predict_proba is kept as the
#    churn probability.
churn_preds = model.predict(X_full)
churn_probs = model.predict_proba(X_full)[:, 1]

# 5. Combine IDs, predicted labels and probabilities into one DataFrame.
df_all_preds = pd.DataFrame({
    'customerID': customer_ids,
    'churn_prediction': churn_preds,
    'churn_probability': churn_probs,
})

# 6. Save output to CSV (for UI, dashboards, etc.); an existing file is overwritten.
df_all_preds.to_csv('all_customers_predictions.csv', index=False)

print("Saved all customer predictions for full dataset.")


Saved all customer predictions for full dataset.


In [2]:
import shap
import numpy as np
import pandas as pd
import joblib
import os


# Score every customer, compute SHAP values for the full dataset, and export the
# predictions together with each customer's five most influential features as
# both a JSON array and a CSV file.

# Load full dataset features and customer IDs. IDs are matched to X_full purely
# by row position, so both files must list customers in the same order.
X_full = pd.read_csv('customer_churn_features_with_nlp_ready.csv')
customer_ids = pd.read_csv('processed_customer_churn_data_with_feedback.csv')['customerID']

# Fail fast, before the expensive prediction and SHAP steps, if the two files
# cannot be aligned row-for-row.
if len(customer_ids) != len(X_full):
    raise ValueError(
        f"Row count mismatch: {len(X_full)} feature rows vs "
        f"{len(customer_ids)} customer IDs; cannot align explanations to customers."
    )

# Load trained XGBoost model.
# joblib.load unpickles the file, so only load model files you trust.
model = joblib.load('../models/best_xgb_model.pkl')

# Predict churn on the full dataset; column 1 of predict_proba is kept as the
# churn probability.
churn_preds = model.predict(X_full)
churn_probs = model.predict_proba(X_full)[:, 1]

# Initialize SHAP explainer
explainer = shap.TreeExplainer(model)

# Compute SHAP values for the full dataset (this may be time-consuming).
# NOTE(review): the indexing below assumes a 2-D (rows x features) array --
# confirm for the installed shap version and model type.
shap_values = explainer.shap_values(X_full)

# Determine top 5 SHAP features per customer: rank by absolute SHAP value
# (descending, via argsort on the negated magnitudes) and keep the first five.
shap_abs = np.abs(shap_values)
top_indices = np.argsort(-shap_abs, axis=1)[:, :5]
feature_names = X_full.columns

# The exported values keep their sign; only the ranking uses magnitudes.
top_shap_features = [
    {
        "features": feature_names[feature_idxs].tolist(),
        "values": shap_values[i, feature_idxs].tolist(),
    }
    for i, feature_idxs in enumerate(top_indices)
]

# Combine results with customer IDs
df_shap = pd.DataFrame({
    'customerID': customer_ids,
    'churn_prediction': churn_preds,
    'churn_probability': churn_probs,
    'top_shap_features': top_shap_features,
})

# Paths for output files
json_path = '../metrics/shap_all_customers.json'
csv_path = '../metrics/shap_all_customers.csv'

# Make sure the output directory exists so the writes below cannot fail on a
# fresh checkout. No pre-delete is needed: to_json and to_csv overwrite an
# existing file by default, so re-running never appends duplicates.
for out_path in (json_path, csv_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Save SHAP explanations and predictions to valid JSON array and CSV
df_shap.to_json(json_path, orient='records', indent=4)
df_shap.to_csv(csv_path, index=False)

print("Full dataset SHAP explanations and predictions saved without duplication.")


Full dataset SHAP explanations and predictions saved without duplication.
