In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

Load Data

In [None]:
"""
syn data for pub: r"C:\Users\maxwell.bicking\downloads\donation_data_syn.csv"
"""

contacts_df = pd.read_csv(r"C:\Users\maxwell.bicking\Downloads\contacts_for_donor_propensity.csv")

#import median income by zip code data from Census Bureau
census_df = pd.read_csv(r"C:\Users\maxwell.bicking\data-science-portfolio\Donor Propensity\Median Income by ZIP.csv")

census_df['ZIP'] = census_df['Geographic Area Name'].str.strip().str[-5:] #add zip column to join to contact table

df = contacts_df.merge(
    census_df[['ZIP', 'Median Income']],
    left_on='MAILING_ZIP_CODE',
    right_on='ZIP',
    how='left'
)

df = df.drop(columns=['ZIP', 'DAYS_SINCE_MOST_RECENT_EVENT', 'TOTAL_MEETING_PAID_AMOUNT_LAST_YEAR', 'TOTAL_MEETING_PAID_AMOUNT'])

In [3]:
# Preview the first 10 rows of df as a quick sanity check.
df.head(10)

Unnamed: 0,MAILING_COUNTRY,MAILING_ZIP_CODE,AGE,HAS_OPTED_OUT_OF_EMAIL,DO_NOT_CALL,DAYS_SINCE_CREATED,DAYS_SINCE_MODIFIED,DAYS_SINCE_LAST_ACTIVITY,HOME_DO_NOT_CALL,MOBILE_DO_NOT_CALL,...,DAYS_SINCE_MOST_RECENT_DONATION,TOTAL_DONATION_AMOUNT,TOTAL_OPPORTUNITIES,TOTAL_AMOUNT_LAST_YEAR,TOTAL_OPPORTUNITIES_LAST_YEAR,ASSOCIATED_WITH_MEMBERSHIP,TITLE_CHANGE,PUSHED,CHURNED,Median Income
0,United States,19008,,False,False,2372,37,1268.0,False,False,...,10145.0,50.0,1,0.0,0,0,0,0,0,127646.0
1,United States,55905,73.0,False,False,2372,37,242.0,False,False,...,884.0,25.0,1,0.0,0,1,0,1,0,
2,United States,10023,,False,False,492,37,463.0,False,False,...,493.0,104.0,1,0.0,0,0,0,0,0,157866.0
3,United States,8033,,False,False,1408,37,,False,False,...,1418.0,50.0,1,0.0,0,0,0,0,0,143661.0
4,United States,21012,,False,False,1012,92,1003.0,False,False,...,1034.0,5.0,1,0.0,0,0,0,0,0,137544.0
5,United States,19382,,False,False,706,37,,False,False,...,706.0,50.0,1,0.0,0,0,0,0,0,126159.0
6,United States,10021,,False,False,153,37,,False,False,...,153.0,100.0,1,100.0,1,0,0,0,0,156712.0
7,United States,1940,,False,False,490,37,,False,False,...,506.0,104.75,1,0.0,0,0,0,0,0,171044.0
8,United States,94040,,False,False,2372,37,1268.0,False,False,...,3072.0,50.0,1,0.0,0,0,0,0,0,184494.0
9,United States,55372,,False,False,2094,37,,False,False,...,2094.0,25.0,1,0.0,0,0,0,0,0,140835.0


In [4]:
# Show column dtypes and non-null counts to see which columns need null handling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85374 entries, 0 to 85373
Data columns (total 55 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   MAILING_COUNTRY                  75077 non-null  object 
 1   MAILING_ZIP_CODE                 73909 non-null  object 
 2   AGE                              30409 non-null  float64
 3   HAS_OPTED_OUT_OF_EMAIL           85374 non-null  bool   
 4   DO_NOT_CALL                      85374 non-null  bool   
 5   DAYS_SINCE_CREATED               85374 non-null  int64  
 6   DAYS_SINCE_MODIFIED              85374 non-null  int64  
 7   DAYS_SINCE_LAST_ACTIVITY         37851 non-null  float64
 8   HOME_DO_NOT_CALL                 85374 non-null  bool   
 9   MOBILE_DO_NOT_CALL               85374 non-null  bool   
 10  OTHER_DO_NOT_CALL                85374 non-null  bool   
 11  PERSONAL_EMAIL_OPT_OUT           85374 non-null  bool   
 12  WORK_DO_NOT_CALL  

In [None]:
df[["GENDER", "RACE", "MAILING_COUNTRY", "MAILING_ZIP_CODE", 
    "INCOME_LEVEL", "INSTITUTION_TYPE", "PRIMARY_RESEARCH_AREA", 
    "HIGHEST_DEGREE", "POLITICAL_PARTY"]] = df[["GENDER", "RACE", "MAILING_COUNTRY", "MAILING_ZIP_CODE", 
    "INCOME_LEVEL", "INSTITUTION_TYPE", "PRIMARY_RESEARCH_AREA", 
    "HIGHEST_DEGREE", "POLITICAL_PARTY"]] .fillna("Unknown", inplace = True)

df["NET_WORTH_QUARTILE"] = pd.qcut(df["NET_WORTH"], q=4, labels=[1, 2, 3, 4])
df["NET_WORTH_QUARTILE"] = df["NET_WORTH_QUARTILE"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "HIGHEST_DEGREE", "POLITICAL_PARTY"]] .fillna("Unknown", inplace = True)


### To do:

- Ensure TOTAL_DONATION_AMOUNT exists
- Add contact ID faker
- Nulls:
  - Gender, race, country, zip, income level, institution type, primary research area, highest degree, political party -> "Unknown"
  - Member type, membership status -> "nonmember"
 
- Many DAYS_SINCE columns contain nulls; fill each with that column's maximum value

SUGGESTED COLUMNS:
- Total number of donations
- First gift amount
- Time since first gift
- Net worth (or wealth score)
- Event attendance
- Engagement metrics (volunteer, emails, etc.)

df["donation_growth_rate"] = df["total_donated_last_2y"] / df["total_donated_first_2y"]

Add binary column "has donated in the last year"

Add binary column "is top donor" for total >$10,000

Add HAS_MADE_LARGE_DONATION and/or LARGEST_DONATION

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# ---------------------------
# Step 1: Data Loading and Exploration
# ---------------------------
# Replace 'contacts_data.csv' with the path to your CSV file
df = pd.read_csv('contacts_data.csv')

# Quick look at the data
print("Data shape:", df.shape)
print(df.head())

# ---------------------------
# Step 2: Data Preprocessing
# ---------------------------
# Define the target column and determine feature columns.
# In this example, we predict "top_donor" (assumed to be 0/1 or similar).
target_column = 'top_donor'
# Columns that are not predictive or have too many unique values (e.g., MAILING_ZIP_CODE).
drop_columns = ['MAILING_ZIP_CODE']  # You can add others if needed
# Columns that encode the answer. Elsewhere in this notebook top_donor is derived
# from TOTAL_DONATION_AMOUNT; keeping it as a feature lets the model read the
# label directly and makes every low-donation contact in Step 7 score near zero.
# NOTE(review): other donation-total columns may leak the target too — confirm.
leakage_columns = ['TOTAL_DONATION_AMOUNT']
non_feature_columns = [target_column] + drop_columns + leakage_columns

# Separate features and target
X = df.drop(columns=non_feature_columns)
y = df[target_column]

# Booleans are kept as numerical 0/1 features.
bool_cols = X.select_dtypes(include='bool').columns
X = X.astype({col: int for col in bool_cols})

# Categorical columns to one-hot encode. 'Median Income' is a continuous number,
# so it is treated as numerical: filling it with the string 'missing' would give
# a mixed str/float column that OneHotEncoder rejects.
categorical_cols = [
    'MAILING_COUNTRY', 'GENDER', 'INCOME_LEVEL', 'INSTITUTION_TYPE',
    'MEMBER_TYPE', 'MEMBERSHIP_STATUS', 'PRIMARY_RESEARCH_AREA',
    'RACE', 'POLITICAL_PARTY', 'HIGHEST_DEGREE'
]
# Every remaining numeric column is a numerical feature. Selecting by dtype keeps
# unexpected text columns away from the median imputer and the scaler; columns in
# neither list are dropped by the ColumnTransformer.
numerical_cols = [
    col for col in X.select_dtypes(include='number').columns
    if col not in categorical_cols
]

# ---------------------------
# Step 3: Building a Preprocessing Pipeline
# ---------------------------
# The ColumnTransformer:
# - imputes numerical columns with the median, then scales them;
# - imputes categorical columns with the constant 'missing', then one-hot encodes them.
# Imputation lives inside the pipeline so its statistics are learned from the
# training folds only and are re-used unchanged on the test set and in Step 7.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# ---------------------------
# Step 4: Splitting Data into Training and Test Sets
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# Step 5: Building and Training the Predictive Model
# ---------------------------
# We create a pipeline that performs the preprocessing then fits a Random Forest classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Optionally, you can do hyperparameter tuning via GridSearchCV.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters from GridSearchCV:", grid_search.best_params_)
print("Best ROC-AUC score from GridSearchCV:", grid_search.best_score_)

# Use the best estimator for evaluation
model = grid_search.best_estimator_

# ---------------------------
# Step 6: Model Evaluation
# ---------------------------
# Evaluate on the test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC score:", roc_auc)

# ---------------------------
# Step 7: Identifying Potential Donors with Limited Donation History
# ---------------------------
# In this section, we want to identify contacts who have not yet donated or have donated very little,
# but whose attributes are similar to our top donors.
#
# Here, we assume that "TOTAL_DONATION_AMOUNT" is available in the original DataFrame.
# Define a threshold below which you consider a donation as minimal.
donation_threshold = 10  # Adjust the threshold as appropriate (for example, $10)

# Identify contacts who have given little or nothing (you might also want to consider using MOST_RECENT_DONATION_AMOUNT)
low_donors = df[df['TOTAL_DONATION_AMOUNT'] < donation_threshold].copy()

# Build the feature frame exactly as X was built. Missing values are handled by
# the fitted pipeline, so these contacts are imputed with the training
# statistics rather than with medians of the low-donor subset.
X_low_donors = low_donors.drop(columns=non_feature_columns)
low_bool_cols = X_low_donors.select_dtypes(include='bool').columns
X_low_donors = X_low_donors.astype({col: int for col in low_bool_cols})

# Generate predicted probabilities for these contacts.
low_donors_probs = model.predict_proba(X_low_donors)[:, 1]

# Append the predicted probabilities to the low_donors DataFrame for ranking.
low_donors['predicted_top_donor_score'] = low_donors_probs

# Sort contacts by predicted probability of being a top donor (descending order).
potential_donors = low_donors.sort_values(by='predicted_top_donor_score', ascending=False)

print("\nTop potential donors from contacts with low donation history:")
print(potential_donors[['predicted_top_donor_score', 'TOTAL_DONATION_AMOUNT']].head(10))

# ---------------------------
# Step 8: Conclusion and Next Steps
# ---------------------------
# The script above demonstrates:
# 1. Data preprocessing including handling missing values, encoding, and scaling.
# 2. Splitting your data into training and test sets.
# 3. Building a predictive model with hyperparameter tuning.
# 4. Evaluating the model's performance using several metrics.
# 5. Using the model to identify contacts who look like top donors but have donated little.
#
# Next steps could include:
# - Further feature engineering and exploration (e.g., clustering analysis on high-probability candidates).
# - Testing additional models and ensemble methods.
# - Validating the model’s predictions with domain experts and iterating.

### Bulk below

In [11]:
# -------- Step 2: Define Target --------
# A contact is a "top donor" when their lifetime giving is at or above the
# 90th percentile of TOTAL_DONATION_AMOUNT.
donation_cutoff = df["TOTAL_DONATION_AMOUNT"].quantile(0.90)
df["top_donor"] = (df["TOTAL_DONATION_AMOUNT"] >= donation_cutoff).astype(int)

# -------- Step 3: Feature Engineering --------
# Fixed dollar bands (despite the column name, these are not true quartiles):
#   0 -> nothing given, 1 -> (0, 100], 2 -> (100, 1000], 3 -> above 1000.
# The last edge is np.inf rather than the observed maximum so the bin edges stay
# strictly increasing even when no donation exceeds 1000 (pd.cut rejects
# non-monotonic edges).
df["DONATION_QUARTILE"] = pd.cut(
    df["TOTAL_DONATION_AMOUNT"],
    bins=[-1, 0, 100, 1000, np.inf],
    labels=[0, 1, 2, 3]  # You can relabel these too
)

In [12]:
df["DONATION_QUARTILE"].describe()
df["DONATION_QUARTILE"].value_counts().head(10)

DONATION_QUARTILE
0    56244
1    22854
2     5480
3      795
Name: count, dtype: int64

In [14]:
# Per-column dtype table, for display in the next cell.
df_datatypes = pd.DataFrame(df.dtypes)
# Number of missing values per column. (df.count() returns the NON-null count,
# which is the opposite of what this variable's name says.)
df_null_count = df.isna().sum()

In [15]:
# Display the per-column dtype table.
df_datatypes

Unnamed: 0,0
MAILING_COUNTRY,object
MAILING_ZIP_CODE,object
AGE,float64
HAS_OPTED_OUT_OF_EMAIL,bool
DO_NOT_CALL,bool
DAYS_SINCE_CREATED,int64
DAYS_SINCE_MODIFIED,int64
DAYS_SINCE_LAST_ACTIVITY,float64
HOME_DO_NOT_CALL,bool
MOBILE_DO_NOT_CALL,bool


In [None]:
# Names used by this cell that the notebook's import cell does not provide.
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif

# -------- Step 2: Define Target --------
# Top donors are contacts at or above the 90th percentile of lifetime giving.
donation_cutoff = df["TOTAL_DONATION_AMOUNT"].quantile(0.90)
df["top_donor"] = (df["TOTAL_DONATION_AMOUNT"] >= donation_cutoff).astype(int)

# -------- Step 3: Feature Engineering --------
# Five equal-sized buckets of lifetime giving. The amounts are ranked first
# because many contacts share the same amount (e.g. 0), which makes the raw
# quantile edges collide and pd.qcut raise "Bin edges must be unique".
# Caveat: tied amounts can land in neighbouring buckets.
bucket_labels = ["Very Low", "Low", "Medium", "High", "Very High"]
df["donation_bucket"] = pd.qcut(
    df["TOTAL_DONATION_AMOUNT"].rank(method="first"), q=5, labels=bucket_labels
)

# Columns computed from the same amount that defines top_donor. They are kept in
# df but excluded from every feature set, otherwise they simply restate the label.
# NOTE(review): other donation-total columns may leak the target too — confirm.
leakage_cols = ["TOTAL_DONATION_AMOUNT", "donation_bucket", "DONATION_QUARTILE"]
# "Unnamed: 0" is a leftover CSV index column when present, not a feature.
non_feature_cols = leakage_cols + ["Unnamed: 0"]

# -------- Step 4: Handle Nulls --------
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = [
    col for col in df.select_dtypes(include="number").columns if col != "top_donor"
]

# Cast to object first: filling a category-dtype column with a value that is not
# one of its categories raises instead of adding the new label.
df[categorical_cols] = df[categorical_cols].astype(object).fillna("No Answer")
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

cat_features = [col for col in categorical_cols if col not in non_feature_cols]
num_features = [col for col in numerical_cols if col not in non_feature_cols]

# -------- Step 5: Feature Importance --------
# Chi-square test of independence between each categorical feature and the
# target, computed on the contingency table of observed counts. (Label-encoding
# the categories and scoring the integer codes would treat arbitrary codes as
# magnitudes.)
chi2_rows = []
for col in cat_features:
    contingency = pd.crosstab(df[col].astype(str), df["top_donor"])
    chi2_stat, p_value, _, _ = chi2_contingency(contingency)
    chi2_rows.append({
        "Feature": col,
        "Importance": chi2_stat,
        "p_value": p_value,
        "Method": "Chi2"
    })
chi2_scores = pd.DataFrame(chi2_rows, columns=["Feature", "Importance", "p_value", "Method"])

# Mutual information between each numerical feature and the target; seeded so
# the scores are reproducible across runs.
X_num = df[num_features]
mi_scores = mutual_info_classif(X_num, df["top_donor"], random_state=42)
mi_df = pd.DataFrame({
    "Feature": num_features,
    "Importance": mi_scores,
    "Method": "Mutual_Info"
})

# -------- Step 6: Correlation Heatmap --------
corr_matrix = df[num_features + ["top_donor"]].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

# -------- Step 7: Combine Feature Scores --------
# Chi2 statistics and mutual-information scores are on different scales, so the
# combined ranking is only a rough guide.
feature_scores = pd.concat([chi2_scores, mi_df])
feature_scores = feature_scores.sort_values("Importance", ascending=False)
print("\nTop Features (Pre-Model):")
print(feature_scores.head(10))

# -------- Step 8: Prepare Data for Modeling --------
# MAILING_ZIP_CODE is dropped as well: one-hot encoding it would add one dense
# column per distinct ZIP code.
model_drop_cols = ["top_donor", "MAILING_ZIP_CODE"] + non_feature_cols
# errors="ignore" because not every listed column exists in every version of df.
X = pd.get_dummies(df.drop(columns=model_drop_cols, errors="ignore"), drop_first=True)
y = df["top_donor"]
X = X.fillna(0)

# -------- Step 9: Train/Test Split --------
# Split first, then fit the scaler on the training rows only, so test-set
# statistics do not influence the transformation applied during training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept under its original name for any later use.
X_scaled = scaler.transform(X)

# -------- Step 10: Train Model --------
# class_weight="balanced" reweights classes, since top donors are the minority class.
model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# -------- Step 11: Evaluate --------
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

# -------- Step 12: Feature Importance Plot --------
model_feature_importance = pd.Series(model.feature_importances_, index=X.columns)
model_top_features = model_feature_importance.sort_values(ascending=False).head(10)

print("\nTop Features (Model-Based):")
print(model_top_features)

model_top_features.plot(kind="barh", title="Top 10 Features (Random Forest)")
plt.gca().invert_yaxis()  # most important feature at the top
plt.xlabel("Importance")
plt.tight_layout()
plt.show()