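This notebook contains Helen's code for loading and cleaning the DCInbox export, getting counts of its key columns, and writing out the politician match targets used for matching against the Open Secrets candidates and individual contributions files.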

In [29]:
# KEEP
# import libraries
import pandas as pd
# import regex as re
# import seaborn as sns
# import matplotlib.pyplot as plt

# set default seaborn theme
# sns.reset_defaults()
# sns.set_theme()


In [37]:
# Load the raw DCInbox export into a DataFrame.
# The file is decoded as latin1; every other read_csv option (comma separator,
# header taken from the first row) is left at its default.
dcinbox_raw_df = pd.read_csv(
    "../data/dcinbox/dcinbox_export_119th_through9_19_2025.csv", encoding="latin1"
)

# Preview the first few rows
dcinbox_raw_df.head()

Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party,Chamber,Nickname,ID
0,"Before the weekend, three things from Washington:","96 Dear , This week, the House advanced a co...",1758303499000,M001208,119,Lucy,McBath,1960-06-01,F,GA,6.0,Democrat,House,,259320
1,Trump vs. Kimmel: My Take,96 Do you believe the federal government shou...,1758298234000,F000483,119,Laura,Friedman,1966-12-03,F,CA,30.0,Democrat,House,,259315
2,Congressman Fulcher's Legislative Update,"September 19, 2025 Dear , After a busy and p...",1758290357000,F000469,119,Russ,Fulcher,1973-07-19,M,ID,1.0,Republican,House,,259316
3,The Wied Wire 9/19,News from Congressman Tony Wied Wied About Su...,1758289885000,W000829,119,Tony,Wied,1976-05-03,M,WI,8.0,Republican,House,,259317
4,Washington in a Week - Senate Unanimously Pass...,Senate Unanimously Passes Lee Resolution Hono...,1758289877000,L000577,119,Mike,Lee,1971-06-04,M,UT,,Republican,Senate,,259318


In [38]:
# Show the column labels of the raw DCInbox frame
raw_columns = dcinbox_raw_df.columns
print(raw_columns)

Index(['Subject', 'Body', 'Unix Timestamp', 'BioGuide ID', 'Congress',
       'First Name', 'Last Name', 'Date of Birth', 'Gender', 'State',
       'District', 'Party', 'Chamber', 'Nickname', 'ID'],
      dtype='object')


In [39]:
# Field cleanup for the DCInbox table

# Derive a NEW frame from the raw one instead of aliasing it, and rename without
# inplace=True. Previously `dcinbox_df = dcinbox_raw_df` made both names point at
# the same object, so this cell mutated the raw frame; running it a second time
# renamed the derived one-letter "Party" column to "Party Full" as well, leaving
# two "Party Full" columns and breaking the `.str[0]` step below.
# The original "Party" column (full party name) is kept as "Party Full".
dcinbox_df = dcinbox_raw_df.rename(columns={"Party": "Party Full"})

# Convert the unix timestamp (milliseconds) to a datetime column
dcinbox_df["Date"] = pd.to_datetime(dcinbox_df["Unix Timestamp"], unit="ms")

# Convert district to an integer; rows with no district (senators) are filled with 0
dcinbox_df["District"] = dcinbox_df["District"].fillna(0).astype(int)

# Create a column that is the concatenated first and last name of the politician
# (for matching later to Open Secrets data)
dcinbox_df["Full Name"] = dcinbox_df["First Name"] + " " + dcinbox_df["Last Name"]

# Add a "Party" column holding just the first character of the full party name
dcinbox_df["Party"] = (
    dcinbox_df["Party Full"]
    .fillna("")
    .astype(str)
    .str[0]
)

dcinbox_df.head()

Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party Full,Chamber,Nickname,ID,Date,Full Name,Party
0,"Before the weekend, three things from Washington:","96 Dear , This week, the House advanced a co...",1758303499000,M001208,119,Lucy,McBath,1960-06-01,F,GA,6,Democrat,House,,259320,2025-09-19 17:38:19,Lucy McBath,D
1,Trump vs. Kimmel: My Take,96 Do you believe the federal government shou...,1758298234000,F000483,119,Laura,Friedman,1966-12-03,F,CA,30,Democrat,House,,259315,2025-09-19 16:10:34,Laura Friedman,D
2,Congressman Fulcher's Legislative Update,"September 19, 2025 Dear , After a busy and p...",1758290357000,F000469,119,Russ,Fulcher,1973-07-19,M,ID,1,Republican,House,,259316,2025-09-19 13:59:17,Russ Fulcher,R
3,The Wied Wire 9/19,News from Congressman Tony Wied Wied About Su...,1758289885000,W000829,119,Tony,Wied,1976-05-03,M,WI,8,Republican,House,,259317,2025-09-19 13:51:25,Tony Wied,R
4,Washington in a Week - Senate Unanimously Pass...,Senate Unanimously Passes Lee Resolution Hono...,1758289877000,L000577,119,Mike,Lee,1971-06-04,M,UT,0,Republican,Senate,,259318,2025-09-19 13:51:17,Mike Lee,R


In [40]:
# Sanity-check the cleaned columns: preview the full party names, then show the
# dtypes of the "Party Full" and "Date" columns.
party_full = dcinbox_df["Party Full"]
print(party_full.head())
print(party_full.dtype)
print(dcinbox_df["Date"].dtype)

0      Democrat
1      Democrat
2    Republican
3    Republican
4    Republican
Name: Party Full, dtype: object
object
datetime64[ns]


In [None]:
# Build the match-target frame: one row per distinct politician, identified by
# Full Name, Party, State, and District, laid out in the column format the
# Python matching process expects.
#
# TODO: we should figure out what the "cycle" actually means, or whether to use
# the Congress number instead, before filtering rows by date (e.g. 2019-2020).

# Source column -> column name expected by the matching process
match_column_names = {
    "Full Name": "name",
    "Party": "match_attribute1",
    "State": "match_attribute2",
    "District": "match_attribute3",
}

# Keep only the fields we need for matching. dcinbox_df has one row per email,
# so drop repeated (name, party, state, district) combinations to end up with
# distinct politicians rather than one target row per message.
pols_matching_df = (
    dcinbox_df[list(match_column_names)]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=match_column_names)
)

# Adding blank columns needed for python matching process
pols_matching_df["id"] = ""
pols_matching_df["append_attribute1"] = ""
pols_matching_df["append_attribute2"] = ""
pols_matching_df["append_attribute3"] = ""

# Reorder columns so "id" is second
cols = ["name", "id", "match_attribute1", "match_attribute2", "match_attribute3",
        "append_attribute1", "append_attribute2", "append_attribute3"]
pols_matching_df = pols_matching_df[cols]

print(pols_matching_df.head())

             name match_attribute1 match_attribute2  match_attribute3
0     Lucy McBath                D               GA                 6
1  Laura Friedman                D               CA                30
2    Russ Fulcher                R               ID                 1
3       Tony Wied                R               WI                 8
4        Mike Lee                R               UT                 0


In [24]:
# Number of distinct values in each key column of the cleaned DCInbox frame
key_columns = ["Full Name", "Party", "State", "District", "Gender"]
key_column_counts = dcinbox_df[key_columns].nunique()
print(key_column_counts)

Full Name    258
Party          2
State         48
District      38
Gender         2
dtype: int64


In [45]:
# Write the match targets out as a CSV file (latin1-encoded, without the
# DataFrame index / row numbers).
match_targets_path = "../matching/match_targets_2020_test.csv"
pols_matching_df.to_csv(match_targets_path, index=False, encoding="latin1")