In [1]:
# Python Dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [3]:
# read in and clean Constituent Data
constituent_data = pd.read_csv('constituents_sh.csv')


constituent_data.rename(columns={
    'Constituent type': 'constituent_type',
    'First gift amount': 'first_gift_amount',
    'Constituent codes': 'constituent_codes',
    'Lifetime giving': 'lifetime_giving',
    'First gift type': 'first_gift_type',
    'First gift date': 'first_gift_date',
    'First gift fund': 'first_gift_fund',
    'Latest gift amount': 'latest_gift_amount',
    'Latest gift date': 'latest_gift_date',
    'Latest gift fund': 'latest_gift_fund',
    'Greatest gift amount': 'greatest_gift_amount',
    'Greatest gift type': 'greatest_gift_type',
    'Greatest gift date': 'greatest_gift_date',
    'Greatest gift fund': 'greatest_gift_fund',
    'Marital status': 'marital_status',
    'Solicit codes': 'solicit_codes',
    'Student Organizations': 'student_organizations',
    'Fraternal Organizations': 'fraternal_organizations',
    "Men's Athletics": 'mens_athletics',
    "Scholarship Recipient": 'scholarship_recipient',
    "Women's Athletics": 'womens_athletics'
}, inplace=True)

# Remove rows where 'first_gift_date' is null
constituent_data_cleaned = constituent_data.dropna(subset=['first_gift_date'])

constituent_data_cleaned.head()

Unnamed: 0,ConstituentID,City,State,Postcode,Country,constituent_type,constituent_codes,lifetime_giving,first_gift_amount,first_gift_type,...,Education,Gender,marital_status,solicit_codes,student_organizations,Arts,fraternal_organizations,mens_athletics,scholarship_recipient,womens_athletics
1,43735,Hermantown,MN,55811-1755,United States,Organization,Other Organizations (8/29/2012 - No end date),50,50,One-time gift,...,,,,,,,,,,
2,60145,Esko,MN,55733-9645,United States,Organization,Other Organizations (No start date - No end date),50,50,One-time gift,...,,,,,,,,,,
3,21332,,,,United States,Organization,Business (No start date - No end date),25,25,One-time gift,...,,,,,,,,,,
5,21351,,,,United States,Organization,Business (No start date - No end date),125,125,One-time gift,...,,,,,,,,,,
6,36583,Superior,WI,54880-2934,United States,Organization,Business (No start date - No end date),450,50,One-time gift,...,,,,,,,,,,


In [5]:
# read in and clean Gift Data
gift_data = pd.read_csv('gifts_sh.csv')

gift_data.rename(columns={
    'Gift ID': 'gift_ID',
    'Is anonymous': 'is_anonymous',
    'Gift subtype': 'gift_subtype',
    'Gift constituency': 'gift_constituency',
    'Payment method': 'payment_method',
    'Online Gift': 'online_gift',
    "Superior Fund": 'Superior_fund'
}, inplace=True)

# Remove rows where 'Type' is pledge
gift_data_cleaned = gift_data[gift_data['Type'] != 'Pledge']

# Export the cleaned DataFrame to a CSV file
gift_data_cleaned.to_csv('cleaned_gift_data.csv', index=False)


  gift_data = pd.read_csv('gifts_sh.csv')


In [7]:
# merge data
combined_data = pd.merge(constituent_data_cleaned, gift_data_cleaned, on='ConstituentID', how='inner')

In [9]:
# export .csv
combined_data.to_csv('merged_data.csv', index=False)

In [11]:
# only alumni
# 'constituent_codes' contains the word 'alumni'
alumni_data = combined_data[combined_data['constituent_codes'].str.contains('alumni', case=False, na=False)]

# Check the result
alumni_data.head()

Unnamed: 0,ConstituentID,City,State,Postcode,Country,constituent_type,constituent_codes,lifetime_giving,first_gift_amount,first_gift_type,...,Amount,Fund,Campaign,Appeal,gift_subtype,gift_constituency,payment_method,Package,online_gift,Superior_fund
14,44089,Mesa,AZ,85205-5437,United States,Individual,Alumni (No start date - No end date),350.0,300,Pledge,...,250,Men's Basketball Fund,Annual,Champions Club,Credit Card,Alumni,Credit card,,,
15,44089,Mesa,AZ,85205-5437,United States,Individual,Alumni (No start date - No end date),350.0,300,Pledge,...,100,Men's Basketball Fund,Annual,Locker Stall Wrap - Basketball,Credit Card,Alumni,Credit card,,,
19,39070,Elk Mound,WI,54739-9298,United States,Individual,Alumni (No start date - No end date),52.0,52,One-time gift,...,52,Superior Fund,Annual,2010-11 Phone-a-Thon,,Alumni,Other,Fall 2010-11 Calling Program,,
87,15605,Tarpon Springs,FL,34689-7537,United States,Individual,Alumni (No start date - No end date),1300.0,50,Pledge,...,100,Superior Fund,Annual,2013-2014 Fall Direct Mail,,Alumni,Personal check,,,
88,15605,Tarpon Springs,FL,34689-7537,United States,Individual,Alumni (No start date - No end date),1300.0,50,Pledge,...,100,Superior Fund,Annual,2012-2013 Fall Direct Mail,,Alumni,Personal check,,,


In [13]:
# Extract the last 4 digits as graduation year
alumni_data['graduation_year'] = alumni_data['Education'].str.extract(r'(\d{4})').fillna('0000')

# Convert 'graduation_year' to numeric (optional, if needed for further analysis)
alumni_data['graduation_year'] = pd.to_numeric(alumni_data['graduation_year'], errors='coerce', downcast='integer')

# For missing years (NaNs after conversion), fill with 0 or '0000'
alumni_data['graduation_year'] = alumni_data['graduation_year'].fillna(0).astype(int)

# Remove the last 4 digits (year) from the 'Education' column
alumni_data['Education'] = alumni_data['Education'].str.replace(r',\d{4}$', '', regex=True)

# Check the result
print(alumni_data[['Education', 'graduation_year']].head())


                                      Education  graduation_year
14    UW-Superior,Primary,Physical Education,BS             2013
15    UW-Superior,Primary,Physical Education,BS             2013
19  UW-Superior,Primary,Elementary Education,BS             2007
87               UW-Superior,Primary,Physics,BS             1974
88               UW-Superior,Primary,Physics,BS             1974


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['graduation_year'] = alumni_data['Education'].str.extract(r'(\d{4})').fillna('0000')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['graduation_year'] = pd.to_numeric(alumni_data['graduation_year'], errors='coerce', downcast='integer')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

In [17]:
# Extract everything after the last comma as 'degree'
alumni_data['degree'] = alumni_data['Education'].str.split(',').str[-1].str.strip()

# Remove everything after the last comma (including the comma itself) from the 'Education' column
alumni_data['Education'] = alumni_data['Education'].str.rsplit(',', n=1).str[0].str.strip()

# Check the result
print(alumni_data[['Education', 'degree']].head())


                                   Education degree
14    UW-Superior,Primary,Physical Education     BS
15    UW-Superior,Primary,Physical Education     BS
19  UW-Superior,Primary,Elementary Education     BS
87               UW-Superior,Primary,Physics     BS
88               UW-Superior,Primary,Physics     BS


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['degree'] = alumni_data['Education'].str.split(',').str[-1].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['Education'] = alumni_data['Education'].str.rsplit(',', n=1).str[0].str.strip()


In [19]:
# Optionally, save the updated DataFrame to the same CSV file
alumni_data.to_csv('alumni_data.csv', index=False)

## Gifts to Superior Fund

In [None]:
# Filter data to include only donors who gave to the 'Superior Fund'
superior_fund_data = combined_data[combined_data['Fund'] == 'Superior Fund']

In [None]:
# Convert marital status, fraternal organizations, mens athletics scholarship recipient, women's athletics, gender

In [None]:
# extract grad years

In [None]:
# 2. Group by 'ConstituentID' and sum the 'Amount' of donations per constituent
# Retain other relevant features, like age, latest gift amount, etc.
superior_fund_grouped = superior_fund_data.groupby('ConstituentID').agg({
    'Amount': 'sum',  # Sum all the donation amounts per constituent
    'first_gift_amount': 'first',  # Keep the first gift amount
    'latest_gift_amount': 'first',  # Keep the latest gift amount
    'greatest_gift_amount': 'first',  # Keep the greatest gift amount
    'Age': 'first',
    'Gender': 'first',
    'lifetime_giving': 'first',
    'first_gift_date': 'first',
    'latest_gift_date': 'first',
    'greatest_gift_date': 'first',
    'marital_status': 'first',
    'student_organizations': 'first',
    'fraternal_organizations': 'first',
    'mens_athletics': 'first',
    'scholarship_recipient': 'first',
    "Women's Athletics": 'womens_athletics'
}).reset_index()

In [None]:
# Select features for PCA
numeric_columns = [
    'first_gift_amount', 'lifetime_giving', 'latest_gift_amount', 
    'greatest_gift_amount', 'age', 'AgeInMonths'
    # Add more numerical columns as needed
]