In [1]:
# Python Dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [5]:
# read in and clean Constituent Data
constituent_data = pd.read_csv('constituents_sh.csv')


constituent_data.rename(columns={
    'Constituent type': 'constituent_type',
    'First gift amount': 'first_gift_amount',
    'Constituent codes': 'constituent_codes',
    'Lifetime giving': 'lifetime_giving',
    'First gift type': 'first_gift_type',
    'First gift date': 'first_gift_date',
    'First gift fund': 'first_gift_fund',
    'Latest gift amount': 'latest_gift_amount',
    'Latest gift date': 'latest_gift_date',
    'Latest gift fund': 'latest_gift_fund',
    'Greatest gift amount': 'greatest_gift_amount',
    'Greatest gift type': 'greatest_gift_type',
    'Greatest gift date': 'greatest_gift_date',
    'Greatest gift fund': 'greatest_gift_fund',
    'Marital status': 'marital_status',
    'Solicit codes': 'solicit_codes',
    'Student Organizations': 'student_organizations',
    'Fraternal Organizations': 'fraternal_organizations',
    "Men's Athletics": 'mens_athletics',
    "Scholarship Recipient": 'scholarship_recipient',
    "Women's Athletics": 'womens_athletics'
}, inplace=True)

# Remove rows where 'first_gift_date' is null
constituent_data_cleaned = constituent_data.dropna(subset=['first_gift_date'])

constituent_data_cleaned.head()

Unnamed: 0,ConstituentID,City,State,Postcode,Country,constituent_type,constituent_codes,lifetime_giving,first_gift_amount,first_gift_type,...,Education,Gender,marital_status,solicit_codes,student_organizations,Arts,fraternal_organizations,mens_athletics,scholarship_recipient,womens_athletics
1,43735,Hermantown,MN,55811-1755,United States,Organization,Other Organizations (8/29/2012 - No end date),50,50,One-time gift,...,,,,,,,,,,
2,60145,Esko,MN,55733-9645,United States,Organization,Other Organizations (No start date - No end date),50,50,One-time gift,...,,,,,,,,,,
3,21332,,,,United States,Organization,Business (No start date - No end date),25,25,One-time gift,...,,,,,,,,,,
5,21351,,,,United States,Organization,Business (No start date - No end date),125,125,One-time gift,...,,,,,,,,,,
6,36583,Superior,WI,54880-2934,United States,Organization,Business (No start date - No end date),450,50,One-time gift,...,,,,,,,,,,


In [7]:
# read in and clean Gift Data
gift_data = pd.read_csv('gifts_sh.csv')

gift_data.rename(columns={
    'Gift ID': 'gift_ID',
    'Is anonymous': 'is_anonymous',
    'Gift subtype': 'gift_subtype',
    'Gift constituency': 'gift_constituency',
    'Payment method': 'payment_method',
    'Online Gift': 'online_gift',
    "Superior Fund": 'Superior_fund'
}, inplace=True)

# Remove rows where 'Type' is pledge
gift_data_cleaned = gift_data[gift_data['Type'] != 'Pledge']

# Export the cleaned DataFrame to a CSV file
gift_data_cleaned.to_csv('cleaned_gift_data.csv', index=False)


  gift_data = pd.read_csv('gifts_sh.csv')


In [9]:
# merge data
combined_data = pd.merge(constituent_data_cleaned, gift_data_cleaned, on='ConstituentID', how='inner')

In [11]:
# export .csv
combined_data.to_csv('merged_data.csv', index=False)

In [15]:
# only alumni
# 'constituent_codes' contains the word 'alumni'
alumni_data = combined_data[combined_data['constituent_codes'].str.contains('alumni', case=False, na=False)]

# Check the result
alumni_data.head()

Unnamed: 0,ConstituentID,City,State,Postcode,Country,constituent_type,constituent_codes,lifetime_giving,first_gift_amount,first_gift_type,...,Amount,Fund,Campaign,Appeal,gift_subtype,gift_constituency,payment_method,Package,online_gift,Superior_fund
14,44089,Mesa,AZ,85205-5437,United States,Individual,Alumni (No start date - No end date),350.0,300,Pledge,...,250,Men's Basketball Fund,Annual,Champions Club,Credit Card,Alumni,Credit card,,,
15,44089,Mesa,AZ,85205-5437,United States,Individual,Alumni (No start date - No end date),350.0,300,Pledge,...,100,Men's Basketball Fund,Annual,Locker Stall Wrap - Basketball,Credit Card,Alumni,Credit card,,,
19,39070,Elk Mound,WI,54739-9298,United States,Individual,Alumni (No start date - No end date),52.0,52,One-time gift,...,52,Superior Fund,Annual,2010-11 Phone-a-Thon,,Alumni,Other,Fall 2010-11 Calling Program,,
87,15605,Tarpon Springs,FL,34689-7537,United States,Individual,Alumni (No start date - No end date),1300.0,50,Pledge,...,100,Superior Fund,Annual,2013-2014 Fall Direct Mail,,Alumni,Personal check,,,
88,15605,Tarpon Springs,FL,34689-7537,United States,Individual,Alumni (No start date - No end date),1300.0,50,Pledge,...,100,Superior Fund,Annual,2012-2013 Fall Direct Mail,,Alumni,Personal check,,,


In [17]:
columns_to_clean = ['Amount', 'first_gift_amount', 'lifetime_giving', 
                    'latest_gift_amount', 'greatest_gift_amount']

for col in columns_to_clean:
    alumni_data[col] = alumni_data[col].replace({',': ''}, regex=True).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data[col] = alumni_data[col].replace({',': ''}, regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data[col] = alumni_data[col].replace({',': ''}, regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data[col] = alumni_data[col].replace

In [19]:
# Extract the last 4 digits as graduation year
alumni_data['graduation_year'] = alumni_data['Education'].str.extract(r'(\d{4})').fillna('0000')

# Convert 'graduation_year' to numeric (optional, if needed for further analysis)
alumni_data['graduation_year'] = pd.to_numeric(alumni_data['graduation_year'], errors='coerce', downcast='integer')

# For missing years (NaNs after conversion), fill with 0 or '0000'
alumni_data['graduation_year'] = alumni_data['graduation_year'].fillna(0).astype(int)

# Remove the last 4 digits (year) from the 'Education' column
alumni_data['Education'] = alumni_data['Education'].str.replace(r',\d{4}$', '', regex=True)

# Check the result
print(alumni_data[['Education', 'graduation_year']].head())


                                      Education  graduation_year
14    UW-Superior,Primary,Physical Education,BS             2013
15    UW-Superior,Primary,Physical Education,BS             2013
19  UW-Superior,Primary,Elementary Education,BS             2007
87               UW-Superior,Primary,Physics,BS             1974
88               UW-Superior,Primary,Physics,BS             1974


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['graduation_year'] = alumni_data['Education'].str.extract(r'(\d{4})').fillna('0000')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['graduation_year'] = pd.to_numeric(alumni_data['graduation_year'], errors='coerce', downcast='integer')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

In [21]:
# Extract everything after the last comma as 'degree'
alumni_data['degree'] = alumni_data['Education'].str.split(',').str[-1].str.strip()

# Remove everything after the last comma (including the comma itself) from the 'Education' column
alumni_data['Education'] = alumni_data['Education'].str.rsplit(',', n=1).str[0].str.strip()

# Check the result
print(alumni_data[['Education', 'degree']].head())


                                   Education degree
14    UW-Superior,Primary,Physical Education     BS
15    UW-Superior,Primary,Physical Education     BS
19  UW-Superior,Primary,Elementary Education     BS
87               UW-Superior,Primary,Physics     BS
88               UW-Superior,Primary,Physics     BS


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['degree'] = alumni_data['Education'].str.split(',').str[-1].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['Education'] = alumni_data['Education'].str.rsplit(',', n=1).str[0].str.strip()


In [23]:
# Replace 'female' with 1, 'male' with 2
alumni_data['Gender'] = alumni_data['Gender'].replace({'Unknown': 0, 'Female': 1, 'Male': 2})

# Replace missing values (NaN or empty) with 0
alumni_data['Gender'] = alumni_data.replace({'': np.nan})['Gender'].fillna(0)

# Check the result
print(alumni_data['Gender'].value_counts())


Gender
2    16787
1    12083
0       15
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['Gender'] = alumni_data['Gender'].replace({'Unknown': 0, 'Female': 1, 'Male': 2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['Gender'] = alumni_data.replace({'': np.nan})['Gender'].fillna(0)


In [25]:
# Update marital_status
alumni_data['marital_status'] = alumni_data['marital_status'].replace({'Unknown': 0, 'Married': 1, 'Single': 2, 'Divorced': 3, 'Widowed': 4, 'Engaged': 5, 'Separated': 6})

# Replace missing values (NaN or empty) with 0
alumni_data['marital_status'] = alumni_data['marital_status'].fillna(0)

# If the column contains empty strings instead of NaNs, handle that too
alumni_data['marital_status'] = alumni_data['marital_status'].replace('', 0)

# Check the result
print(alumni_data['marital_status'].value_counts())

marital_status
1.0    20467
0.0     4506
4.0     2150
3.0     1016
2.0      739
5.0        5
6.0        2
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['marital_status'] = alumni_data['marital_status'].replace({'Unknown': 0, 'Married': 1, 'Single': 2, 'Divorced': 3, 'Widowed': 4, 'Engaged': 5, 'Separated': 6})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['marital_status'] = alumni_data['marital_status'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [27]:
# Ensure both NaN and empty strings are treated as missing values
alumni_data['student_organizations'] = np.where(alumni_data['student_organizations'].fillna('') == '', 0, 1)

# Check the result
print(alumni_data['student_organizations'].value_counts())



student_organizations
0    21998
1     6887
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['student_organizations'] = np.where(alumni_data['student_organizations'].fillna('') == '', 0, 1)


In [29]:
# Ensure both NaN and empty strings are treated as missing values
alumni_data['mens_athletics'] = np.where(alumni_data['mens_athletics'].fillna('') == '', 0, 1)

# Check the result
print(alumni_data['mens_athletics'].value_counts())


mens_athletics
0    25182
1     3703
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['mens_athletics'] = np.where(alumni_data['mens_athletics'].fillna('') == '', 0, 1)


In [31]:
# Ensure both NaN and empty strings are treated as missing values
alumni_data['fraternal_organizations'] = np.where(alumni_data['fraternal_organizations'].fillna('') == '', 0, 1)

# Check the result
print(alumni_data['fraternal_organizations'].value_counts())


fraternal_organizations
0    22916
1     5969
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['fraternal_organizations'] = np.where(alumni_data['fraternal_organizations'].fillna('') == '', 0, 1)


In [33]:
# Ensure both NaN and empty strings are treated as missing values
alumni_data['scholarship_recipient'] = np.where(alumni_data['scholarship_recipient'].fillna('') == '', 0, 1)

# Check the result
print(alumni_data['scholarship_recipient'].value_counts())


scholarship_recipient
0    22027
1     6858
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['scholarship_recipient'] = np.where(alumni_data['scholarship_recipient'].fillna('') == '', 0, 1)


In [35]:
# Ensure both NaN and empty strings are treated as missing values
alumni_data['womens_athletics'] = np.where(alumni_data['womens_athletics'].fillna('') == '', 0, 1)

# Check the result
print(alumni_data['womens_athletics'].value_counts())


womens_athletics
0    28124
1      761
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_data['womens_athletics'] = np.where(alumni_data['womens_athletics'].fillna('') == '', 0, 1)


In [37]:
# save the updated DataFrame to the same CSV file
alumni_data.to_csv('alumni_data.csv', index=False)

## Gifts to Superior Fund

In [39]:
# Handle missing Age (e.g., dropping rows with missing Age)
alumni_data_cleaned = alumni_data.dropna(subset=['Age'])

In [41]:
# Filter data to include only donors who gave to the 'Superior Fund'
superior_fund_data = alumni_data_cleaned[alumni_data_cleaned['Fund'] == 'Superior Fund']
superior_fund_data.count

<bound method DataFrame.count of        ConstituentID       City State    Postcode        Country  \
19             39070  Elk Mound    WI  54739-9298  United States   
105             9194     Wilton    CT  06897-4639  United States   
106             9194     Wilton    CT  06897-4639  United States   
111            37486     Hudson    WI  54016-7746  United States   
113            37486     Hudson    WI  54016-7746  United States   
...              ...        ...   ...         ...            ...   
49508           6689  Fairbanks    AK  99709-6657  United States   
49509           6689  Fairbanks    AK  99709-6657  United States   
49513           9642   Superior    WI  54880-6539  United States   
49514           9642   Superior    WI  54880-6539  United States   
49515           9642   Superior    WI  54880-6539  United States   

      constituent_type                     constituent_codes  lifetime_giving  \
19          Individual  Alumni (No start date - No end date)         

In [43]:
# Group by 'ConstituentID' and sum the 'Amount' of donations per constituent
# Retain other relevant features, like age, latest gift amount, etc.
superior_fund_grouped = superior_fund_data.groupby('ConstituentID').agg({
    'Amount': 'sum',  # Sum all the donation amounts per constituent
    'first_gift_amount': 'first',  # Keep the first entry found
    'latest_gift_amount': 'first',  # Keep the first entry found
    'greatest_gift_amount': 'first',  # Keep the first entry found
    'Age': 'first', # Keep the first entry found
    'Gender': 'first', # Keep the first entry found
    'lifetime_giving': 'first', # Keep the first entry found
    'first_gift_date': 'first', # Keep the first entry found
    'latest_gift_date': 'first', # Keep the first entry found
    'greatest_gift_date': 'first', # Keep the first entry found
    'marital_status': 'first', # Keep the first entry found
    'student_organizations': 'first', # Keep the first entry found
    'fraternal_organizations': 'first', # Keep the first entry found
    'mens_athletics': 'first', # Keep the first entry found
    'scholarship_recipient': 'first', # Keep the first entry found
    'womens_athletics': 'first' # Keep the first entry found
}).reset_index()

In [45]:
# Select features for PCA
numeric_columns = [
    'Amount', 'first_gift_amount', 'lifetime_giving', 'latest_gift_amount', 
    'greatest_gift_amount', 'Age', 'Gender', 'marital_status', 
    'student_organizations', 'fraternal_organizations', 
    'mens_athletics', 'scholarship_recipient', 'womens_athletics'
]

# Filter the selected columns
selected_features = superior_fund_grouped[numeric_columns]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(selected_features)

# Check the result
print(scaled_data[:5])  # Print the first 5 rows of the standardized data


[[-0.03304531 -0.20028163 -0.07923342 -0.05302507 -0.06563387  0.47250113
   1.05188021 -1.27160086  2.38310022 -0.35898919 -0.29207543 -0.64795105
  -0.17188367]
 [-0.08148654  0.17101494 -0.08736338 -0.06217488 -0.07636783  0.18101785
   1.05188021 -1.27160086 -0.41962146 -0.35898919 -0.29207543  1.54332645
  -0.17188367]
 [-0.06140115 -0.11291773 -0.08099096 -0.05302507 -0.07636783  0.90972605
   1.05188021 -0.10681447  2.38310022 -0.35898919 -0.29207543 -0.64795105
  -0.17188367]
 [-0.08479472 -0.24396358 -0.09228018 -0.0704097  -0.07750086  0.39963031
  -0.94594441  2.2227583  -0.41962146 -0.35898919 -0.29207543  1.54332645
  -0.17188367]
 [-0.08290433 -0.15659968 -0.09184888 -0.06766476 -0.07726232 -0.62056118
  -0.94594441 -1.27160086 -0.41962146 -0.35898919 -0.29207543 -0.64795105
  -0.17188367]]
