This notebook contains Helen's code for loading and getting counts of the Open Secrets candidates and individual contributions files.

In [1]:
# KEEP
# import libraries
import pandas as pd
# import regex as re
# import seaborn as sns
# import matplotlib.pyplot as plt

# set default seaborn theme
# sns.reset_defaults()
# sns.set_theme()


In [2]:
# Read in the DCInbox table

# column_names = ["Cycle", "FECCandID", "CID", "FirstLastP",
#                 "Party", "DistIDRunFor", "DistIDCurr", "CurrCand",
#                 "CycleCand", "CRPICO", "RecipCode", "NoPacs"]

dcinbox_df = pd.read_csv("dcinbox_export__election s.csv", 
                      # sep = ",", # separated by commas
                      # quotechar = "|", # fields are surrounded by pipes
                      encoding = "latin1"
                      # header = None,
                      # names = column_names
)

dcinbox_df.head()

Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party,Chamber,Nickname,ID
0,District Update: Rep. Hill Calls for a Federal...,"Email from Rep. Hill Friends, I wish I had be...",1749234818000,H001072,119,J.,Hill,1956-12-05,M,AR,2.0,Republican,House,,254570
1,"Fighting Hunger in New Jersey, Diving Into Tru...",General Newsletter From the Office of Senator...,1749204622000,K000394,119,Andy,Kim,1982-07-12,M,NJ,,Democrat,Senate,,254488
2,"Fighting Hunger in New Jersey, Diving Into Tru...",General Newsletter From the Office of Senator...,1749148525000,K000394,119,Andy,Kim,1982-07-12,M,NJ,,Democrat,Senate,,254493
3,EC From DC: Town Hall Tonight with Rep. Davids...,Click here to from these updates | Forwar...,1745240413000,C001061,119,Emanuel,Cleaver,1944-10-26,M,MO,5.0,Democrat,House,,252395
4,Moran Minute: He is Risen! âï¸,Moran Minute: He is Risen! âï¸ As We Celeb...,1745073029000,M001224,119,Nathaniel,Moran,1974-07-22,M,TX,1.0,Republican,House,,252343


In [None]:
# print(dcinbox_df.columns)
# dcinbox_df = dcinbox_df.rename(
#     columns=lambda x: x if x not in dcinbox_df.columns.duplicated() else f"{x}_dup")
# dcinbox_df = dcinbox_df.loc[:, ~dcinbox_df.columns.duplicated()]
print(dcinbox_df.columns)

Index(['Subject', 'Body', 'Unix Timestamp', 'BioGuide ID', 'Congress',
       'First Name', 'Last Name', 'Date of Birth', 'Gender', 'State',
       'District', 'Party Full', 'Chamber', 'Nickname', 'ID', 'Date',
       'Full Name'],
      dtype='object')


In [17]:
# Field cleanup for candidates table

# drop duplicate Party Full column
# dcinbox_df = dcinbox_df.loc[:, ~dcinbox_df.columns.duplicated()]
# print(dcinbox_df.columns)

# Convert unix timestamp to a datetime column - how??
dcinbox_df["Date"] = pd.to_datetime(dcinbox_df["Unix Timestamp"], unit="ms")

# Convert district to an integer and fill senators with 0
dcinbox_df["District"] = dcinbox_df["District"].fillna(0).astype(int)

# create a column that is the concatenated first and last name of the politician
# (for matching later to Open Secrets data)
dcinbox_df["Full Name"] = dcinbox_df["First Name"] + " " + dcinbox_df["Last Name"]

# Rename party to party_short
dcinbox_df.rename(columns={"Party": "Party Full"}, inplace=True)
# Add party column that just is one character
dcinbox_df["Party"] = (
    dcinbox_df["Party Full"]
    .fillna("")
    .astype(str)
    .str[0]
)

# dcinbox_df.info()
dcinbox_df.head()

Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party Full,Chamber,Nickname,ID,Date,Full Name,Party
0,District Update: Rep. Hill Calls for a Federal...,"Email from Rep. Hill Friends, I wish I had be...",1749234818000,H001072,119,J.,Hill,1956-12-05,M,AR,2,Republican,House,,254570,2025-06-06 18:33:38,J. Hill,R
1,"Fighting Hunger in New Jersey, Diving Into Tru...",General Newsletter From the Office of Senator...,1749204622000,K000394,119,Andy,Kim,1982-07-12,M,NJ,0,Democrat,Senate,,254488,2025-06-06 10:10:22,Andy Kim,D
2,"Fighting Hunger in New Jersey, Diving Into Tru...",General Newsletter From the Office of Senator...,1749148525000,K000394,119,Andy,Kim,1982-07-12,M,NJ,0,Democrat,Senate,,254493,2025-06-05 18:35:25,Andy Kim,D
3,EC From DC: Town Hall Tonight with Rep. Davids...,Click here to from these updates | Forwar...,1745240413000,C001061,119,Emanuel,Cleaver,1944-10-26,M,MO,5,Democrat,House,,252395,2025-04-21 13:00:13,Emanuel Cleaver,D
4,Moran Minute: He is Risen! âï¸,Moran Minute: He is Risen! âï¸ As We Celeb...,1745073029000,M001224,119,Nathaniel,Moran,1974-07-22,M,TX,1,Republican,House,,252343,2025-04-19 14:30:29,Nathaniel Moran,R


In [18]:
print(dcinbox_df["Party Full"].head())
print(dcinbox_df["Party Full"].dtype)
print(dcinbox_df["Date"].dtype)

0    Republican
1      Democrat
2      Democrat
3      Democrat
4    Republican
Name: Party Full, dtype: object
object
datetime64[ns]


In [23]:
# Make new data frame that only includes the distinct politicians with their Congress, 
# First Name, Last Name, Gender, State, District, and Party 
# to use for match targets

# Filter rows where Date is in 2019 or 2020 <-- we should figure out what the "cycle" actually means,
# or if we should use the congress number instead
mask = dcinbox_df["Date"].between("2019-01-01", "2020-12-31")

pols_for_matching_df = dcinbox_df.loc[mask, ["Full Name", "Party", "State", "District", "Gender"]]

# Drop duplicates on Full Name while keeping the first occurrence
# pols_for_matching_df = filtered.drop_duplicates(subset="Full Name")
# pols_for_matching_df = pols_for_matching_df.reset_index(drop=True)

print(pols_for_matching_df.head())

            Full Name Party State  District Gender
237    James Lankford     R    OK         0      M
238    Robert Portman     R    OH         0      M
239         Rand Paul     R    KY         0      M
240  Marsha Blackburn     R    TN         0      F
241    Robert Portman     R    OH         0      M


In [24]:
# Column counts

# cand_contrib_df["CID"].nunique()
key_column_counts = dcinbox_df[["Full Name", "Party", "State", "District", "Gender"]].nunique()
print(key_column_counts)

Full Name    258
Party          2
State         48
District      38
Gender         2
dtype: int64


In [25]:
# Output the match targets to a CSV file

pols_for_matching_df.to_csv(
    r"C:\Users\hefla\Documents\School\Classes\CSYS 5870\DC Inbox\dcinbox_match_targets_2020.csv",
    index=False,  # no row numbers
    encoding="latin1"
)