In [None]:
pip install scikit-learn xgboost shap

In [9]:
import os
from datetime import datetime

import pandas as pd
import shap
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
#Load data from CSV (export from Snowflake for simplicity)
# The export lives at a machine-specific location, so the path can be overridden
# with the CHURN_DATA_CSV environment variable. When that variable is unset the
# original hard-coded path is used, so existing runs behave exactly as before.
DEFAULT_CSV_PATH = r"C:\Users\maxwell.bicking\Downloads\2025-03-31 12_13pm (2).csv"
path = os.environ.get("CHURN_DATA_CSV", DEFAULT_CSV_PATH)

df = pd.read_csv(path)

In [None]:
#Convert labels
# Encode the 'Yes'/'No' text labels as 1/0 in new <LABEL>_BINARY columns,
# leaving the original text columns in place.
yes_no_to_int = {'Yes': 1, 'No': 0}
for label_col in ('CHURNED', 'PUSHED'):
    df[f'{label_col}_BINARY'] = df[label_col].map(yes_no_to_int)


#Handle missing values (people who never donated, went to meetings, logged in, etc)
# A missing "days since ..." value means the activity never happened.

# Flag the "never happened" rows separately with a binary column.
# These flags MUST be derived before the fill below: once the NaNs have been
# replaced, .isna() is False for every row and the flag would be all zeros.
# The `not in df.columns` guard keeps already-computed flags intact if this
# cell is re-run after the fill has been applied to the same frame.
never_flags = {
    'NEVER_ATTENDED_EVENT': 'DAYS_SINCE_MOST_RECENT_EVENT',
    'NEVER_DONATED': 'DAYS_SINCE_MOST_RECENT_DONATION',
}
for flag_col, source_col in never_flags.items():
    if flag_col not in df.columns:
        df[flag_col] = df[source_col].isna().astype(int)

#Fill with a very large number (e.g., max + 1)
# so a missing value sorts as "longer ago than anything observed" in that column.
for days_col in [
    'DAYS_SINCE_LAST_ACTIVITY',
    'DAYS_SINCE_JOINED',
    'DAYS_SINCE_MOST_RECENT_EVENT',
    'DAYS_SINCE_MOST_RECENT_DONATION',
]:
    max_days = df[days_col].max()
    df[days_col] = df[days_col].fillna(max_days + 1)

# Find every column whose dtype is bool ...
bool_cols = df.select_dtypes(include='bool').columns

# ... and re-encode each one as 0/1 integers, one column at a time.
for bool_col in bool_cols:
    df[bool_col] = df[bool_col].astype(int)


In [None]:
#Drop original targets for training
# Both the raw text labels and their 0/1 encodings are excluded from the
# predictors; every remaining column of df is used as a feature.
label_columns = ['CHURNED', 'CHURNED_BINARY', 'PUSHED', 'PUSHED_BINARY']
y = df['CHURNED_BINARY']  # use PUSHED_BINARY for push prediction

#Handle categorical columns (XGBoost needs numeric input)
# dummy_na=True also emits an indicator column for missing categories.
X = pd.get_dummies(df.drop(columns=label_columns), dummy_na=True)

#Train/test split
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#Train XGBoost model
# - `use_label_encoder` is not passed: the labels are already 0/1 integers, and
#   current XGBoost releases no longer accept the parameter (it only produces
#   an "unused parameter" warning).
# - `missing` is left at its default (NaN) so XGBoost handles NaNs natively;
#   the parameter is documented as a float, so None is not passed.
# - random_state pins the model seed (same value as the train/test split above)
#   so repeated runs fit the same model.
model = xgb.XGBClassifier(
    eval_metric='logloss',
    random_state=42,
)
model.fit(X_train, y_train)

#Predictions and evaluation
# Score the held-out rows two ways: hard class predictions feed the
# classification report, while the probabilities in column 1 of
# predict_proba feed the ROC AUC.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

report_text = classification_report(y_test, y_pred)
print(report_text)

auc_value = roc_auc_score(y_test, y_prob)
print("ROC AUC:", auc_value)



In [None]:
# SHAP for feature importance
# Build the explainer from the fitted model with the training features as the
# background data (masker), then compute SHAP values for the held-out rows.
explainer = shap.Explainer(model, masker=X_train)
shap_values = explainer(X_test)

# Summary plot limited to the 20 top-ranked features.
shap.summary_plot(shap_values, features=X_test, max_display=20)