This notebook contains Helen's code for loading and getting counts of the Open Secrets candidates and individual contributions files.

In [1]:
# KEEP
# import libraries
import pandas as pd
import regex as re
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

# set default seaborn theme
# sns.reset_defaults()
# sns.set_theme()


In [2]:
# Read in the candidates table (2020)
# Incumbents only

column_names = ["Cycle", "FECCandID", "CID", "FirstLastP",
                "Party", "DistIDRunFor", "DistIDCurr", "CurrCand",
                "CycleCand", "CRPICO", "RecipCode", "NoPacs"]

cand_df = pd.read_csv("C:/Users/hefla/Documents/Work/IPS/Area 990/Data Sources/Politicking/cands20.txt", 
                      sep = ",", # separated by commas
                      quotechar = "|", # fields are surrounded by pipes
                      encoding = "latin1",
                      header = None,
                      names = column_names
)

# Incumbents only
cand_df["DistIDCurr"] = cand_df["DistIDCurr"].astype(str).str.strip()
cand_df = cand_df[
    cand_df["DistIDCurr"].notna() &
    (cand_df["DistIDCurr"] != "") &
    (cand_df["DistIDCurr"] != "PRES") &
    (cand_df["DistIDCurr"].str.lower() != "nan")
]

cand_df.head()
# cand_df.info()

Unnamed: 0,Cycle,FECCandID,CID,FirstLastP,Party,DistIDRunFor,DistIDCurr,CurrCand,CycleCand,CRPICO,RecipCode,NoPacs
13,2020,H0AL02087,N00030768,Martha Roby (R),R,AL02,AL02,,,I,RI,
24,2020,H0AL05163,N00030910,Mo Brooks (R),R,AL05,AL05,Y,Y,I,RW,
28,2020,H0AL07086,N00030622,Terri A Sewell (D),D,AL07,AL07,Y,Y,I,DW,
30,2020,H0AR01083,N00030770,Rick Crawford (R),R,AR01,AR01,Y,Y,I,RW,
37,2020,H0AR03055,N00031857,Steve Womack (R),R,AR03,AR03,Y,Y,I,RW,


In [3]:
# Read in the individual contributions table (2020)

column_names = ["Cycle", "FECTransIS", "ContribID", "Contrib",
                "RecipID", "Orgname", "UltOrg", "RealCode",
                "Date", "Amount", "Street", "City",
                "State", "Zip", "RecipCode", "Type",
                "CmteID", "OtherID", "Gender", "Microfilm",
                "Occupation", "Employer", "Source"]

contrib_df = pd.read_csv("C:/Users/hefla/Documents/Work/IPS/Area 990/Data Sources/Politicking/indivs20.txt", 
                      sep = ",", # separated by commas
                      quotechar = "|", # fields are surrounded by pipes
                      encoding = "latin1",
                      header = None,
                      names = column_names,
                      low_memory = False
)
contrib_df.head()
# cand_df.info()

Unnamed: 0,Cycle,FECTransIS,ContribID,Contrib,RecipID,Orgname,UltOrg,RealCode,Date,Amount,...,Zip,RecipCode,Type,CmteID,OtherID,Gender,Microfilm,Occupation,Employer,Source
0,2020,4030220201214334986,p0004869853,"LONNBERG, CARL",C00721712,[24T Contribution],,Z9500,12/20/2019,10000,...,94117,DP,24T,C00401224,C00721712,M,202001299171851035,PARTNER,BOSTON CONSULTING GROUP,Rept
1,2020,4030220201214334988,k0001516259,"LOVO, MARIO",N00044240,[24T Contribution],,Z9500,12/08/2019,250,...,33134,DL,24T,C00401224,C00696153,M,202001299171881052,LAWYER,SELF,Rept
2,2020,4030220201214335206,h3003526289,"LOGUE, KATHERINE",C00401224,,,J1200,07/09/2019,5,...,60010,PI,15,C00401224,,F,202001299171836150,NOT EMPLOYED,NONE,P/PAC
3,2020,4051220201742609379,m00016536071,"YINGLING, JOHN",N00044240,Steamboat Wharf of Provincetown,,G2900,01/20/2020,25,...,2657,DL,15E,C00696153,C00401224,M,202004159219618050,RESTAURANT MANAGER,STEAMBOAT WHARF OF PROVINCETOWN,temp
4,2020,4051220201742609381,i3003912456,"YODAIKEN, VICTOR",N00044240,Finite State Machine Labs,,Z9600,03/31/2020,250,...,78733,DL,15E,C00696153,C00401224,M,202004159219618050,BUSINESSMAN,FINITE STATE MACHINE LABS INC.,RptEF


In [26]:
# Field cleanup for candidates table

# Split the DistIDCurr (current district) field into separate columns for state and district (as integer)
cand_df["State"] = cand_df["DistIDCurr"].str[0:2]
# cand_df["District"] = cand_df["DistIDCurr"].str[2:4].fillna(0).astype(int)
cand_df["District"] = cand_df["DistIDCurr"].str[2:4].fillna(0)
cand_df.loc[cand_df["District"].astype(str).str.startswith("S"), "District"] = "0"
cand_df["District"] = cand_df["District"].astype(int)

# Strip off anything fitting the pattern " (<Party>)" at the end of the LastFirstP field
cand_df['FirstLastP'] = cand_df.apply(
    lambda row: row['FirstLastP'].replace(f" ({row['Party']})", ""),
    axis=1
)

# Strip off any remaining single-character codes of D, R, 3, or I surrounded by parentheses 
# at the end of candidate names (leaving all other things in parentheses intact)
cand_df['FirstLastP'] = cand_df['FirstLastP'].str.replace(
    r"\s*\((D|R|3|I)\)$", "", regex=True
)

# Trim off any trailing whitespaces from LsatFirstP
cand_df['FirstLastP'] = cand_df['FirstLastP'].str.strip()

# Make sure the candidate full name, party, and state fields are uppercased
cand_df["FirstLastP"] = cand_df["FirstLastP"].str.upper()
cand_df["Party"] = cand_df["Party"].str.upper()
cand_df["State"] = cand_df["State"].str.upper()

cand_df.info()
print(cand_df.columns)
cand_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 538 entries, 13 to 8037
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Cycle         538 non-null    int64 
 1   FECCandID     538 non-null    object
 2   CID           538 non-null    object
 3   FirstLastP    538 non-null    object
 4   Party         538 non-null    object
 5   DistIDRunFor  538 non-null    object
 6   DistIDCurr    538 non-null    object
 7   CurrCand      538 non-null    object
 8   CycleCand     538 non-null    object
 9   CRPICO        538 non-null    object
 10  RecipCode     538 non-null    object
 11  NoPacs        538 non-null    object
 12  State         538 non-null    object
 13  District      538 non-null    int64 
dtypes: int64(2), object(12)
memory usage: 63.0+ KB
Index(['Cycle', 'FECCandID', 'CID', 'FirstLastP', 'Party', 'DistIDRunFor',
       'DistIDCurr', 'CurrCand', 'CycleCand', 'CRPICO', 'RecipCode', 'NoPacs',
       'State', 'Dis

Unnamed: 0,Cycle,FECCandID,CID,FirstLastP,Party,DistIDRunFor,DistIDCurr,CurrCand,CycleCand,CRPICO,RecipCode,NoPacs,State,District
13,2020,H0AL02087,N00030768,MARTHA ROBY,R,AL02,AL02,,,I,RI,,AL,2
24,2020,H0AL05163,N00030910,MO BROOKS,R,AL05,AL05,Y,Y,I,RW,,AL,5
28,2020,H0AL07086,N00030622,TERRI A SEWELL,D,AL07,AL07,Y,Y,I,DW,,AL,7
30,2020,H0AR01083,N00030770,RICK CRAWFORD,R,AR01,AR01,Y,Y,I,RW,,AR,1
37,2020,H0AR03055,N00031857,STEVE WOMACK,R,AR03,AR03,Y,Y,I,RW,,AR,3


In [27]:
# QC of party stripping
# Select rows where FirstLastP ends with parentheses (e.g., "Name (XYZ)")
rows_with_parens = cand_df[cand_df['FirstLastP'].str.contains(r"\(.*\)$", regex=True)]
# Display them
print(rows_with_parens)

# QC of DistIDCurr
print(cand_df["DistIDCurr"].unique()[:20])  # first 20 unique entries


Empty DataFrame
Columns: [Cycle, FECCandID, CID, FirstLastP, Party, DistIDRunFor, DistIDCurr, CurrCand, CycleCand, CRPICO, RecipCode, NoPacs, State, District]
Index: []
['AL02' 'AL05' 'AL07' 'AR01' 'AR03' 'AZ04' 'CA07' 'CA11' 'CA03' 'CA25'
 'CA28' 'CA27' 'CA37' 'CA53' 'CT03' 'DC00' 'FL05' 'FL11' 'FL24' 'FL22']


In [28]:
# Field cleanup for individual contributions table

# Convert Date in contrib_df to a datetime column
contrib_df['Date'] = pd.to_datetime(contrib_df['Date'])

In [29]:
# Join candidates and individual contributions into a new data frame
# with summary columns on contribution day, count, and amount
# for each incumbent candidate

# Group contributions by recipient and date
daily_contrib_sum = (
    contrib_df
    .groupby(['RecipID', contrib_df['Date'].dt.date])  # group by candidate and day
    .agg(
        contrib_count=('Amount', 'count'),    # number of contributions
        contrib_amount=('Amount', 'sum')      # total amount per day
    )
    .reset_index()
    .rename(columns={'Date': 'contrib_date'})  # rename date column
)

# Filter out candidates where they're not incumbents (DistIDCurr not null)
# Or they're not currently the president (DistIDCurr = 'PRES')
# Or they're not running for president (DistIDRunFor = 'PRES')
cand_df_filtered = cand_df[
    (cand_df['DistIDCurr'].notnull()) & 
    (cand_df['DistIDCurr'] != "PRES") # & 
    # (cand_df['DistIDRunFor'] != "PRES")
]

# Merge the daily contributions summary data with the candidate data (left join)
# cand_contrib_df = cand_df.merge(
# Only load incumbents (where DistIDCurr is not null)
cand_contrib_df = cand_df_filtered.merge(
    daily_contrib_sum,
    left_on = 'CID',
    right_on = 'RecipID',
    how = 'left'
)

# Drop the RecipID column (don't need it since we have CID)
cand_contrib_df = cand_contrib_df.drop(columns=['RecipID'])

# Fill missing counts and amounts with 0 for candidates with no contributions (per day)
cand_contrib_df['contrib_count'] = cand_contrib_df['contrib_count'].fillna(0).astype(int)
cand_contrib_df['contrib_amount'] = cand_contrib_df['contrib_amount'].fillna(0)

cand_contrib_df.head()

Unnamed: 0,Cycle,FECCandID,CID,FirstLastP,Party,DistIDRunFor,DistIDCurr,CurrCand,CycleCand,CRPICO,RecipCode,NoPacs,State,District,contrib_date,contrib_count,contrib_amount
0,2020,H0AL02087,N00030768,MARTHA ROBY,R,AL02,AL02,,,I,RI,,AL,2,2019-01-10,1,2500.0
1,2020,H0AL02087,N00030768,MARTHA ROBY,R,AL02,AL02,,,I,RI,,AL,2,2019-02-25,1,500.0
2,2020,H0AL02087,N00030768,MARTHA ROBY,R,AL02,AL02,,,I,RI,,AL,2,2019-02-28,2,750.0
3,2020,H0AL02087,N00030768,MARTHA ROBY,R,AL02,AL02,,,I,RI,,AL,2,2019-03-14,2,1000.0
4,2020,H0AL02087,N00030768,MARTHA ROBY,R,AL02,AL02,,,I,RI,,AL,2,2019-03-28,1,500.0


In [30]:
# Create a data frame that only has the files we need for matching

cand_contrib_matching_df = cand_contrib_df[["FirstLastP", "CID", "Party", "State", "District", "Cycle"]].rename(
    columns={
        "FirstLastP": "name",
        "CID": "id",
        "Party": "match_attribute1",
        "State": "match_attribute2",
        "District": "match_attribute3",
        "Cycle": "append_attribute1",
        "": "append_attribute2",
        "": "append_attribute3"
    }
)

# Adding blank columns needed for python matching process
cand_contrib_matching_df["append_attribute2"] = ""
cand_contrib_matching_df["append_attribute3"] = ""

cand_contrib_matching_df.head()

Unnamed: 0,name,id,match_attribute1,match_attribute2,match_attribute3,append_attribute1,append_attribute2,append_attribute3
0,MARTHA ROBY,N00030768,R,AL,2,2020,,
1,MARTHA ROBY,N00030768,R,AL,2,2020,,
2,MARTHA ROBY,N00030768,R,AL,2,2020,,
3,MARTHA ROBY,N00030768,R,AL,2,2020,,
4,MARTHA ROBY,N00030768,R,AL,2,2020,,


In [31]:
# Output the joined cand_contrib_df to a CSV file

cand_contrib_matching_df.to_csv(
#     r"C:\Users\hefla\Documents\Work\IPS\Area 990\Data Sources\Politicking\cand_contrib_summary_2020.csv",
    "../matching/match_candidates_2020_test.csv",
    index=False,  # no row numbers
    encoding="latin1"
)

In [13]:
# Column counts

# cand_contrib_df["CID"].nunique()
# df.value_counts(["animal", "animal_group", "title", "author"]).reset_index(name="count")
key_column_counts = cand_contrib_df[["CID", "Party", "RecipCode", "NoPacs", "contrib_count", "State", "District"]].nunique()
print(key_column_counts)

CID               538
Party               5
RecipCode          10
NoPacs              1
contrib_count    1707
State              56
District           54
dtype: int64


In [46]:
# Top recipients

# group_counts = cand_contrib_df.groupby("FirstLastP")["contrib_amount"].sum()
# top_groups = group_counts[group_counts >= 1000000]
# print(top_groups)

group_counts = cand_contrib_df.groupby("FirstLastP")["contrib_amount"].sum()
top_groups = group_counts.sort_values(ascending=False)
top_groups_formatted = top_groups.map(lambda x: f"${x:,.0f}")

# Filter to only include candidates with contributions over a certain amount
# top_groups = top_groups[top_groups >= 1_000_000]

print(top_groups_formatted)


FirstLastP
Bernie Sanders      $106,916,776
Elizabeth Warren     $64,557,578
David Perdue         $62,080,234
Lindsey Graham       $43,903,595
Amy Klobuchar        $43,031,783
                        ...     
Rob Woodall              $-4,978
Pat Roberts              $-7,800
Dennis Heck             $-10,208
Lamar Alexander         $-37,900
Tom Udall               $-89,580
Name: contrib_amount, Length: 538, dtype: object
