In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

Load Data

In [44]:
contacts_df = pd.read_csv(r"C:\Users\maxwell.bicking\Downloads\contacts_for_donor_propensity.csv")

#import median income by zip code data from Census Bureau
census_df = pd.read_csv(r"C:\Users\maxwell.bicking\data-science-portfolio\Donor Propensity\Median Income by ZIP.csv")

census_df['ZIP'] = census_df['Geographic Area Name'].str.strip().str[-5:] #add zip column to join to contact table

df = contacts_df.merge(
    census_df[['ZIP', 'Median Income']],
    left_on='MAILING_ZIP_CODE',
    right_on='ZIP',
    how='left'
)

df = df.drop(columns=['ZIP', 'DAYS_SINCE_MOST_RECENT_EVENT', 'TOTAL_MEETING_PAID_AMOUNT_LAST_YEAR', 'TOTAL_MEETING_PAID_AMOUNT'])

In [None]:
"""
Cleaning steps:

Nulls:
-Gender, race, country, zip, Income level, institution type, primary research area, highest degree, political party -> unknown
-Member type, mem status -> nonmember

-Tons of DAYS_SINCE columns to worry about, will fill with max vals

SUGGESTED COLUMNS:
-Total number of donations
-First gift amount
-Time since first gift
-Net worth (or wealth score)
-Event attendance
-Engagement metrics (volunteer, emails, etc.)

df["donation_growth_rate"] = df["total_donated_last_2y"] / df["total_donated_first_2y"]

Add binary column "has donated in the last year"

Add binary column "is top donor" for total >$10,000

Add HAS_MADE_LARGE_DONATION and/or LARGEST_DONATION

Add net worth category i.e.
        CASE 
            WHEN CON."net_worth" < 10000000 THEN '1 - Standard'
            WHEN CON."net_worth" < 100000000 THEN '2 - High'
            WHEN CON."net_worth" < 1000000000 THEN '3 - Very High'
            WHEN CON."net_worth" >= 1000000000 THEN '4 - Ultra High'

NEED TO FIX TOTAL AMOUNT LOGIC            







"""

df[["GENDER", "RACE", "MAILING_COUNTRY", "MAILING_ZIP_CODE", 
    "INCOME_LEVEL", "INSTITUTION_TYPE", "PRIMARY_RESEARCH_AREA", 
    "HIGHEST_DEGREE", "POLITICAL_PARTY"]] = df[["GENDER", "RACE", "MAILING_COUNTRY", "MAILING_ZIP_CODE", 
    "INCOME_LEVEL", "INSTITUTION_TYPE", "PRIMARY_RESEARCH_AREA", 
    "HIGHEST_DEGREE", "POLITICAL_PARTY"]] .fillna("Unknown", inplace = True)





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "HIGHEST_DEGREE", "POLITICAL_PARTY"]] .fillna("Unknown", inplace = True)


In [None]:
# ---------------- Step 1: Load Data ----------------
donations = pd.read_csv("donations.csv", parse_dates=["donation_date"])
contacts = pd.read_csv("contacts.csv")

# ---------------- Step 2: Aggregate Donations ----------------
cutoff = pd.to_datetime("today")
two_years_ago = cutoff - pd.DateOffset(years=2)

recent = donations[donations["donation_date"] >= two_years_ago]
early = donations[donations["donation_date"] < two_years_ago]

donated_last_2y = recent.groupby("contact_id")["amount"].sum().rename("total_donated_last_2y")
donated_first_2y = early.groupby("contact_id")["amount"].sum().rename("total_donated_first_2y")
total_donated = donations.groupby("contact_id")["amount"].sum().rename("total_donated")

# Merge donation aggregates into contact-level data
donor_agg = pd.concat([total_donated, donated_last_2y, donated_first_2y], axis=1).fillna(0)
df = contacts.merge(donor_agg, how="left", on="contact_id").fillna(0)

# ---------------- Step 3: Engineer Features ----------------
df["donation_growth_rate"] = df["total_donated_last_2y"] / (df["total_donated_first_2y"] + 1)

# ---------------- Step 4: Define Target ----------------
donation_cutoff = df["total_donated"].quantile(0.90)
df["top_donor"] = (df["total_donated"] >= donation_cutoff).astype(int)
target_col = "top_donor"

# ---------------- Step 5: Handle Nulls ----------------
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
numerical_cols = [col for col in numerical_cols if col != target_col]

df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
df[categorical_cols] = df[categorical_cols].fillna("No Answer")

# ---------------- Step 6: Feature Importance ----------------
# Chi-Squared (Categoricals)
"""
The Chi-Squared Test is a statistical method used to determine whether two categorical 
variables are independent — in our case, whether a categorical feature (like gender, 
race, or member type) is related to the target variable (top_donor).

When you're building a predictive model, you want to know:
    --“Does this feature actually tell me something about who donates?”

Chi-squared helps answer that question before you train a model.
For example:
    --Is donation behavior distributed equally across genders?
    --Are people of different membership statuses more or less likely to donate?
If the answer is no — and the test is statistically significant — then that feature might be informative and worth keeping.

How It Works (Conceptually)
It compares two distributions:
    --Observed counts: how many people of each category actually donated
    --Expected counts: how many would have donated if donation was random
Then it asks:
    --Are the differences between observed and expected counts too big to be due to chance?
It outputs:
    --A Chi-squared value (the test statistic)
    --A p-value (probability of observing that result by chance)
If the p-value is low (usually < 0.05), we say:
“This feature is significantly related to the target.”
"""


le = LabelEncoder()
X_cat = df[categorical_cols].apply(lambda col: le.fit_transform(col.astype(str)))
chi2_vals, p_vals = chi2(X_cat, df[target_col])
chi2_scores = pd.DataFrame({
    "Feature": categorical_cols,
    "Importance": chi2_vals,
    "p_value": p_vals,
    "Method": "Chi2"
})

# Mutual Info (Numericals)
X_num = df[numerical_cols].fillna(0)
mi_scores = mutual_info_classif(X_num, df[target_col])
mi_df = pd.DataFrame({
    "Feature": numerical_cols,
    "Importance": mi_scores,
    "Method": "Mutual_Info"
})

# Correlation Heatmap
corr_matrix = df[numerical_cols + [target_col]].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

# Combine Scores
feature_scores = pd.concat([chi2_scores, mi_df])
feature_scores = feature_scores.sort_values("Importance", ascending=False)
print("\nTop Features (Pre-Model):")
print(feature_scores.head(10))

# ---------------- Step 7: Modeling ----------------
# One-hot encode categoricals
X = pd.get_dummies(df.drop(columns=[target_col, "contact_id"]), drop_first=True)
y = df[target_col]
X_scaled = StandardScaler().fit_transform(X)

# Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train
model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

# Feature Importances
model_feature_importance = pd.Series(model.feature_importances_, index=X.columns)
model_top_features = model_feature_importance.sort_values(ascending=False).head(10)
print("\nTop Features (Model-Based):")
print(model_top_features)

# Plot
model_top_features.plot(kind="barh", title="Top 10 Features (Random Forest)")
plt.gca().invert_yaxis()
plt.xlabel("Importance")
plt.tight_layout()
plt.show()