In [None]:
# PROJECT #3 – Predictive Lead Scoring (RandomForest + SHAP explanations)
# 41k Bank Marketing clients → probability of buying a term deposit
# Run all cells – takes ~2 minutes

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
import shap
import matplotlib.pyplot as plt
%matplotlib inline

# Read the raw Bank Marketing export (semicolon-delimited CSV)
df = pd.read_csv('bank-additional-full.csv', sep=';')

# Quick sanity report: table size and class balance of the raw target column
n_leads, n_columns = df.shape
print(f"Loaded {n_leads:,} leads with {n_columns} columns")
target_share = df['y'].value_counts(normalize=True).round(3)
print("Target distribution:", target_share)

# Target encoding: 'yes' -> 1, every other value -> 0
df['y'] = (df['y'] == 'yes').astype(int)

# Simple feature engineering (you can add more later)
# Left-closed bins so each label matches its range: '<30' is age < 30,
# '30-40' is [30, 40), '40-60' is [40, 60) and '60+' is age >= 60.
# The open-ended top edge keeps any age above 100 from turning into NaN.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 30, 40, 60, np.inf],
    right=False,
    labels=['<30', '30-40', '40-60', '60+'],
)
df['has_housing_loan'] = (df['housing'] == 'yes').astype(int)
df['previous_success'] = (df['poutcome'] == 'success').astype(int)

# One-hot encode categoricals.
# 'default' and 'loan' are included because, in the public UCI file, they are
# string columns ('yes'/'no'/'unknown'); left unencoded they reach the model as
# raw text and RandomForestClassifier.fit raises "could not convert string to
# float".
cat_cols = [
    'job', 'marital', 'education', 'default', 'loan', 'contact',
    'month', 'day_of_week', 'poutcome', 'age_group',
]
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Features & target
# Dropped columns:
#   y        - the target itself
#   age      - replaced by the age_group dummies
#   housing  - replaced by has_housing_loan
#   duration - length of the last call; per the UCI dataset notes it is only
#              known after the call has happened, so using it to score leads
#              *before* calling them is target leakage.
# The float cast turns bool dummies into 0.0/1.0 and fails loudly here (rather
# than deep inside the model) if any text column is still present.
X = df_encoded.drop(columns=['y', 'age', 'housing', 'duration']).astype(float)
y = df_encoded['y']

# Hold out 25% of the leads for evaluation, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Random forest hyperparameters, kept together so they are easy to tweak
rf_params = {
    'n_estimators': 300,
    'max_depth': 12,
    'min_samples_leaf': 10,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Score the hold-out set: hard labels and positive-class probabilities
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

# Headline metric plus the five most important features
test_auc = roc_auc_score(y_test, y_prob)
print(f"\nROC-AUC: {test_auc:.4f}")
print("Top 5 feature importances:")
importances = pd.Series(rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head())

# Add predictions to original test set (for client delivery).
# Rows are selected by index *label* (.loc): X_test.index holds labels carried
# over from df, and they only coincide with positions while df still has its
# default RangeIndex. The selection keeps X_test's row order, so y_prob lines
# up with test_results row for row.
test_results = df.loc[X_test.index].copy().reset_index(drop=True)

# NOTE(review): rf is trained with class_weight='balanced', so these scores are
# probably not calibrated conversion rates - confirm before presenting them to
# a client as literal probabilities.
test_results['conversion_probability_%'] = (y_prob * 100).round(1)

# Left-closed bins so every score lands in the tier its label describes:
#   Silver [0, 0.5), Gold [0.5, 0.7), Platinum [0.7, 1.0].
# With pandas' default right-closed bins a score of exactly 0.0 became NaN,
# 0.5 was filed under "<50%" and 0.7 missed the ">=70%" tier. The open-ended
# top edge keeps a score of exactly 1.0 inside Platinum.
test_results['predicted_tier'] = pd.cut(
    y_prob,
    bins=[0, 0.5, 0.7, np.inf],
    right=False,
    labels=['Silver (<50%)', 'Gold (50-70%)', 'Platinum (≥70%)'],
)

# Save full client-ready file
test_results.to_csv('03_Bank_Predictive_Scored_Leads.csv', index=False)
print("\nSaved → 03_Bank_Predictive_Scored_Leads.csv (client delivery file)")

# SHAP explanations (this is what makes clients go “wow”)
def _positive_class_shap(raw_values):
    """Return the class-1 SHAP matrix (rows x features) from shap's raw output.

    Older shap releases return a list with one array per class; newer ones
    return a single array shaped (rows, features, classes). A 2-D array is
    passed through unchanged.
    """
    if isinstance(raw_values, list):
        return np.asarray(raw_values[1])
    raw_values = np.asarray(raw_values)
    if raw_values.ndim == 3:
        return raw_values[:, :, 1]
    return raw_values


explainer = shap.TreeExplainer(rf)
X_explain = X_test.iloc[:300]  # first 300 for speed
shap_values = explainer.shap_values(X_explain)
shap_positive = _positive_class_shap(shap_values)

# expected_value is per-class for a classifier; take the last (positive) class,
# which also works if shap hands back a single scalar.
base_value = float(np.atleast_1d(explainer.expected_value)[-1])

# 1. Summary plot
shap.summary_plot(shap_positive, X_explain, show=False)
plt.title('SHAP Summary – Why Leads Convert')
plt.tight_layout()
plt.show()

# 2. Waterfall for the #1 hottest lead
# best_idx is a position in the FULL test set, so it cannot be used to index
# the 300-row SHAP sample above; explain that single row on its own instead.
best_idx = int(np.argmax(y_prob))
best_row = X_test.iloc[[best_idx]]
best_shap = _positive_class_shap(explainer.shap_values(best_row))[0]
shap.plots.waterfall(shap.Explanation(
    values=best_shap,
    base_values=base_value,
    data=best_row.iloc[0].to_numpy(dtype=float),
    feature_names=list(X_test.columns),  # otherwise bars read "Feature 0", ...
), show=False)
# The '%' format spec already multiplies by 100, so pass the raw probability.
plt.title(f'SHAP Waterfall – Why This Lead Has {y_prob[best_idx]:.1%} Probability')
plt.tight_layout()
plt.show()

# Closing banner, one message per line
closing_lines = (
    "\nPROJECT #3 COMPLETE",
    "You now have the exact notebook that wins $4k–$10k gigs",
)
for banner_line in closing_lines:
    print(banner_line)