In [4]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

Load Data

In [32]:
# Input files.
# NOTE(review): both paths are absolute and machine-specific, so the notebook only
# runs on the original author's computer; consider a configurable data directory.
CONTACTS_CSV = r"C:\Users\maxwell.bicking\Downloads\contacts_for_donor_propensity.csv"
CENSUS_CSV = r"C:\Users\maxwell.bicking\data-science-portfolio\Donor Propensity\Median Income by ZIP.csv"

# Read the ZIP code as text. Left to type inference, a purely numeric column becomes
# int/float: leading zeros are lost ("01234" -> 1234) and the key can no longer be
# merged with the string ZIP derived from the census file below.
contacts_df = pd.read_csv(CONTACTS_CSV, dtype={'MAILING_ZIP_CODE': str})

# Median income by ZIP code, from the Census Bureau.
census_df = pd.read_csv(CENSUS_CSV)

# The last five characters of 'Geographic Area Name' are used as the ZIP join key.
census_df['ZIP'] = census_df['Geographic Area Name'].str.strip().str[-5:]

# Income lookup keyed on a private column name so it cannot clash with contact columns.
# Rows without a key are dropped (pandas merges NaN keys with each other) and the key
# is de-duplicated so the left join can never multiply contact rows.
income_by_zip = (
    census_df[['ZIP', 'Median Income']]
    .rename(columns={'ZIP': '_zip5'})
    .dropna(subset=['_zip5'])
    .drop_duplicates(subset='_zip5')
)

# Join on the first five characters so ZIP+4 values ("12345-6789") still match.
# MAILING_ZIP_CODE itself is left untouched in the result.
df = (
    contacts_df
    .assign(_zip5=contacts_df['MAILING_ZIP_CODE'].str.strip().str[:5])
    .merge(income_by_zip, on='_zip5', how='left')
    .drop(columns='_zip5')
)

In [None]:
# Number of missing values in each column of the merged frame.
df.isnull().sum()

In [None]:
"""
Cleaning steps:

Nulls:
- Gender, race -> "No answer"
- Country, zip, income level, institution type, primary research area, highest degree, political party -> unknown
- Member type, membership status -> nonmember


Add binary column "has donated in the last year"

Add binary column "is top donor" for total >$1M

Add net worth category, i.e.
        CASE
            WHEN CON."net_worth" < 10000000 THEN '1 - Standard'
            WHEN CON."net_worth" < 100000000 THEN '2 - High'
            WHEN CON."net_worth" < 1000000000 THEN '3 - Very High'
            WHEN CON."net_worth" >= 1000000000 THEN '4 - Ultra High'
        END

TODO: fix the total amount logic.


"""



'\nCleaning steps:\n\nNulls:\n-Gender, race -> "No answer"\n-country, zip, Income level, institution type, primary research area, highest degree, political party -> unknown\n-Member type, mem status -> nonmember\n\n'

In [None]:
# -------- Step 1: Load data --------
# NOTE(review): "your_data.csv" is a placeholder and this assignment replaces any `df`
# built in earlier cells; point it at the prepared modelling table.
df = pd.read_csv("your_data.csv")  # Update this path
target_col = "donated_last_year"   # Binary target column

# -------- Step 2: Split features --------
# The target is excluded from BOTH feature lists. Otherwise a string-coded target
# would be mode-imputed below and later scored against itself as a "feature".
categorical_cols = [
    col
    for col in df.select_dtypes(include=["object", "category"]).columns
    if col != target_col
]
numerical_cols = [
    col
    for col in df.select_dtypes(include=["int64", "float64"]).columns
    if col != target_col
]

# handle nulls: median for numeric columns, most frequent value for categorical ones.
# NOTE(review): these statistics are computed on all rows, i.e. before the train/test
# split further down, so a little test-set information reaches the training data.
if numerical_cols:
    df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
if categorical_cols:
    category_modes = df[categorical_cols].mode()
    # mode() yields no rows when every categorical value is missing; skip the fill
    # in that case instead of failing on .iloc[0].
    if not category_modes.empty:
        df[categorical_cols] = df[categorical_cols].fillna(category_modes.iloc[0])



# -------- Step 3: Chi-Squared for Categoricals --------
# Test each categorical feature for independence from the target using its
# category-by-target contingency table. Running sklearn's chi2 on LabelEncoder output
# is not valid: it treats the arbitrary integer codes as counts, so the statistic
# changes with the order in which categories happen to be numbered.
chi2_vals, p_vals = [], []
for col in categorical_cols:
    contingency = pd.crosstab(df[col].astype(str), df[target_col])
    # First two items of the result are the test statistic and its p-value.
    statistic, p_value = chi2_contingency(contingency)[:2]
    chi2_vals.append(statistic)
    p_vals.append(p_value)

chi2_scores = pd.DataFrame({
    "Feature": categorical_cols,
    "Importance": chi2_vals,
    "p_value": p_vals,
    "Method": "Chi2"
})

# -------- Step 4: Mutual Info for Numericals --------
# Any NaN still present in the numeric features is replaced with 0 before scoring.
X_num = df[numerical_cols].fillna(0)
# random_state pins the estimator's internal randomness so the scores are identical
# on every run, matching the seed (42) used for the split and the model.
mi_scores = mutual_info_classif(X_num, df[target_col], random_state=42)
mi_df = pd.DataFrame({
    "Feature": numerical_cols,
    "Importance": mi_scores,
    "Method": "Mutual_Info"
})

# -------- Step 5: Correlation Matrix (Optional) --------
# Pairwise correlations between the numeric features and the target, shown as an
# annotated heatmap.
corr_matrix = df[[*numerical_cols, target_col]].corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=corr_ax)
corr_ax.set_title("Correlation Matrix")
plt.show()

# -------- Step 6: Combine Feature Scores --------
# Chi-square statistics and mutual-information scores are on different scales, so a
# single global sort on "Importance" mostly ranks features by which method scored
# them. Rank within each method instead and show the top 10 of each.
feature_scores = (
    pd.concat([chi2_scores, mi_df], ignore_index=True)
    .sort_values(["Method", "Importance"], ascending=[True, False])
    .reset_index(drop=True)
)
print("\nTop Features (Pre-Model):")
print(feature_scores.groupby("Method", sort=False).head(10))

# -------- Step 7: One-Hot Encode --------
X = pd.get_dummies(df.drop(columns=[target_col]), drop_first=True)
y = df[target_col]

# -------- Step 8: Train/Test Split, then Scale --------
# Split BEFORE scaling and stratify on the target: the held-out rows must not
# influence the scaler's statistics, and both partitions should keep the same class
# balance as the full data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept for any later cell that still refers to X_scaled.
X_scaled = scaler.transform(X)

# -------- Step 9: Train Model --------
# Random forest with balanced class weights and a fixed seed.
model = RandomForestClassifier(random_state=42, class_weight="balanced")
model.fit(X_train, y_train)

# -------- Step 10: Evaluate Model --------
# Hard labels feed the classification report; the second predict_proba column
# feeds the ROC AUC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

clf_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", clf_report)

auc_score = roc_auc_score(y_test, y_prob)
print("ROC AUC Score:", auc_score)

# -------- Step 11: Model-Based Feature Importance --------
# Pair each fitted importance with its encoded column name and keep the ten largest.
model_feature_importance = pd.Series(data=model.feature_importances_, index=X.columns)
model_top_features = model_feature_importance.sort_values(ascending=False).iloc[:10]
print("\nTop Features (Model-Based):")
print(model_top_features)

# Optional: Plot feature importances (y-axis inverted so the top feature is drawn first)
importance_ax = model_top_features.plot.barh(title="Top 10 Features (Random Forest)")
importance_ax.invert_yaxis()
importance_ax.set_xlabel("Importance")
plt.show()
