This notebook contains Helen's Python code for loading and getting counts of the DC Inbox newsletter content files.

In [79]:
# KEEP
# import libraries
import pandas as pd
# import regex as re
# import seaborn as sns
# import matplotlib.pyplot as plt

# set default seaborn theme
# sns.reset_defaults()
# sns.set_theme()


In [80]:
# Load the raw DC Inbox export and keep only its named columns.
#
# Alternate input file (swap into raw_export_path to use it):
#     "../data/dcinbox/dcinbox_export_119th_through9_19_2025.csv"
raw_export_path = "../data/dcinbox/dcinbox_export_116.csv"

dcinbox_raw_df = pd.read_csv(
    raw_export_path,
    encoding="latin1",
    quotechar='"',    # so that literal quotation marks inside newsletters are handled right
    engine="python",  # chosen because it might cope better with very long messages
)

# Columns whose label starts with "Unnamed" were messing up the read of the
# file, so they are dropped. The .copy() gives later cells an independent frame.
unnamed = dcinbox_raw_df.columns.str.contains("^Unnamed")
named_only = ~unnamed
dcinbox_clean_columns_df = dcinbox_raw_df.loc[:, named_only].copy()

print(len(dcinbox_clean_columns_df))
dcinbox_clean_columns_df.head()

30872


Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party,Chamber,Nickname,ID
0,The Leader's Daily Schedule - 1/3/21,Kevin McCarthy - Republican Leader Leader's Da...,1609630000000.0,M001165,116,Kevin,McCarthy,1/26/65,M,CA,23,Republican,House,,176119
1,Join me (virtually) for the swearing-in of the...,Join me (virtually) for the swearing-in of th...,1609610000000.0,H001066,116,Steven,Horsford,4/29/73,M,NV,4,Democrat,House,,176120
2,"2020, A Year in Review","2020, A Year in Review Looking Back on 2020 2...",1609590000000.0,G000576,116,Glenn,Grothman,7/3/55,M,WI,6,Republican,House,,176121
3,RESPONSE REQUESTED: Stay Updated on the 117th ...,RESPONSE REQUESTED: Stay Updated on the 117th...,1609590000000.0,M001190,116,Markwayne,Mullin,7/26/77,M,OK,2,Republican,House,,176122
4,"Thank You TX-11, and Godspeed","Thank You TX-11, and Godspeed January 2, 2021...",1609580000000.0,C001062,116,K.,Conaway,6/11/48,M,TX,11,Republican,House,,176123


In [81]:
# Field cleanup for the DC Inbox table, done on a working copy so that
# dcinbox_clean_columns_df is left untouched.
dcinbox_df = dcinbox_clean_columns_df.copy()

# Date: read the Unix timestamp as milliseconds. Values that are not numeric
# are coerced to missing first, and only the calendar date is kept.
timestamp_ms = pd.to_numeric(dcinbox_df["Unix Timestamp"], errors="coerce")
dcinbox_df["Date"] = pd.to_datetime(timestamp_ms, unit="ms").dt.date

# District: integer district number; missing or non-numeric entries
# (senators are filled this way) become 0.
district_number = pd.to_numeric(dcinbox_df["District"], errors="coerce")
dcinbox_df["District"] = district_number.fillna(0).astype(int)

# Full Name: uppercased "FIRST LAST", used later for matching to Open Secrets data.
first_name_upper = dcinbox_df["First Name"].str.upper()
last_name_upper = dcinbox_df["Last Name"].str.upper()
dcinbox_df["Full Name"] = first_name_upper + " " + last_name_upper

# Keep the original party text under "Party Full" so that "Party" can hold
# the one-letter code built below.
dcinbox_df = dcinbox_df.rename(columns={"Party": "Party Full"})

# First character of the party text, uppercased (missing when the party is blank).
party_initial = (
    dcinbox_df["Party Full"]
    .fillna("")
    .astype(str)
    .str[0]
    .str.upper()
)
dcinbox_df["Party Truncated"] = party_initial

# One-letter party code: D and R are kept, everything else becomes O (Other).
dcinbox_df["Party"] = party_initial.map({"D": "D", "R": "R"}).fillna("O")

# Make sure state and party are uppercased (Party can only hold D/R/O here,
# so its upper() is a safeguard rather than a transformation).
dcinbox_df["State"] = dcinbox_df["State"].fillna("").str.strip().str.upper()
dcinbox_df["Party"] = dcinbox_df["Party"].str.upper()

print(len(dcinbox_df))
dcinbox_df.head()

30872


Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party Full,Chamber,Nickname,ID,Date,Full Name,Party Truncated,Party
0,The Leader's Daily Schedule - 1/3/21,Kevin McCarthy - Republican Leader Leader's Da...,1609630000000.0,M001165,116,Kevin,McCarthy,1/26/65,M,CA,23,Republican,House,,176119,2021-01-02,KEVIN MCCARTHY,R,R
1,Join me (virtually) for the swearing-in of the...,Join me (virtually) for the swearing-in of th...,1609610000000.0,H001066,116,Steven,Horsford,4/29/73,M,NV,4,Democrat,House,,176120,2021-01-02,STEVEN HORSFORD,D,D
2,"2020, A Year in Review","2020, A Year in Review Looking Back on 2020 2...",1609590000000.0,G000576,116,Glenn,Grothman,7/3/55,M,WI,6,Republican,House,,176121,2021-01-02,GLENN GROTHMAN,R,R
3,RESPONSE REQUESTED: Stay Updated on the 117th ...,RESPONSE REQUESTED: Stay Updated on the 117th...,1609590000000.0,M001190,116,Markwayne,Mullin,7/26/77,M,OK,2,Republican,House,,176122,2021-01-02,MARKWAYNE MULLIN,R,R
4,"Thank You TX-11, and Godspeed","Thank You TX-11, and Godspeed January 2, 2021...",1609580000000.0,C001062,116,K.,Conaway,6/11/48,M,TX,11,Republican,House,,176123,2021-01-02,K. CONAWAY,R,R


In [82]:
# Load the table of politicians matched between DC Inbox and FEC data.

# Column labels applied to the file in place of its own header row.
column_names = [
    "target_id", "dcinbox_name", "fec_cid", "fec_name",
    "fec_party", "fec_state", "fec_district",
    "fec_cycle", "append_attribute2", "append_attribute3",
    "similarity_score", "matched_scorer",
]

matched_pols_read_options = {
    "encoding": "latin1",
    "quotechar": '"',       # quoted fields may contain literal quotation marks
    "names": column_names,
    "header": 0,            # row 0 is the file's header; `names` replaces it
}
matched_pols_df = pd.read_csv(
    "../data/matched/matched_politicians_2020_test.csv",
    **matched_pols_read_options,
)

print(len(matched_pols_df))
matched_pols_df.head()

394


Unnamed: 0,target_id,dcinbox_name,fec_cid,fec_name,fec_party,fec_state,fec_district,fec_cycle,append_attribute2,append_attribute3,similarity_score,matched_scorer
0,,KEVIN MCCARTHY,N00028152,KEVIN MCCARTHY,R,CA,23,2020,,,100.0,WRatio
1,,STEVEN HORSFORD,N00033638,STEVEN HORSFORD,D,NV,4,2020,,,100.0,WRatio
2,,GLENN GROTHMAN,N00036409,GLENN S GROTHMAN,R,WI,6,2020,,,100.0,token_set_ratio
3,,MARKWAYNE MULLIN,N00033410,MARKWAYNE MULLIN,R,OK,2,2020,,,100.0,WRatio
4,,K. CONAWAY,N00026041,MIKE CONAWAY,R,TX,11,2020,,,90.0,partial_ratio


In [144]:
# Load the summarized candidate contributions file, using the file's own
# header row for column names.

# Columns present in the file that this notebook does not need.
unneeded_contrib_columns = [
    "DistIDRunFor",
    "DistIDCurr",
    "CurrCand",
    "CycleCand",
    "RecipCode",
    "CRPICO",
]

cand_contribs_df = pd.read_csv(
    "../data/fec/candidate_contributions_2020_test.csv",
    encoding="latin1",
).drop(columns=unneeded_contrib_columns)

print(len(cand_contribs_df))
cand_contribs_df.head()

85572


Unnamed: 0,Cycle,FECCandID,CID,FirstLastP,Party Original,NoPacs,State,District,Party,contrib_date,contrib_count,contrib_amount
0,2020,H0AL05163,N00030910,MO BROOKS,R,,AL,5,R,2019-11-11,2,2750.0
1,2020,H0AL05163,N00030910,MO BROOKS,R,,AL,5,R,2019-12-23,1,2800.0
2,2020,H0AL05163,N00030910,MO BROOKS,R,,AL,5,R,2020-01-06,1,100.0
3,2020,H0AL05163,N00030910,MO BROOKS,R,,AL,5,R,2020-01-15,1,250.0
4,2020,H0AL05163,N00030910,MO BROOKS,R,,AL,5,R,2020-01-17,6,9000.0


In [145]:
# Summarize the DC Inbox data to one row per politician ("Full Name") per day,
# with newsletter and body word counts, so it can be matched with FEC data.
# This could eventually go into the DC Inbox ETL notebook instead -- this is just for testing.

# Make sure Date is datetime (it is stored as plain date objects after field cleanup)
dcinbox_df["Date"] = pd.to_datetime(dcinbox_df["Date"])

# Word count of each newsletter body (whitespace-separated tokens)
dcinbox_df["body_word_count"] = dcinbox_df["Body"].str.split().str.len()
print(len(dcinbox_df))

# Every column that is neither a grouping key nor aggregated explicitly is
# carried along using its first non-missing value within the day.
group_keys = ["Full Name", "Date"]
not_carried = {"Subject", "Body", "body_word_count", "Date", "Full Name", "Party Truncated"}
other_cols = [col for col in dcinbox_df.columns if col not in not_carried]

dcinbox_summary_df = dcinbox_df.groupby(group_keys, as_index=False).agg(
    # "size" counts every newsletter row; "count" would silently leave out
    # newsletters whose Subject is missing.
    newsletter_count=("Subject", "size"),
    body_word_count_sum=("body_word_count", "sum"),
    # Column names containing spaces cannot be written as keyword arguments,
    # so the carried columns are passed as an unpacked dictionary.
    **{col: (col, "first") for col in other_cols},
)

# Only keep records for House members
dcinbox_summary_df = dcinbox_summary_df[dcinbox_summary_df["Chamber"] == "House"]

# Drop unneeded columns
dcinbox_summary_df = dcinbox_summary_df.drop(
    columns=["Nickname", "Date of Birth", "First Name", "Last Name", "BioGuide ID", "Unix Timestamp"]
)

print(len(dcinbox_summary_df))
dcinbox_summary_df.head()

30872
25785


Unnamed: 0,Full Name,Date,newsletter_count,body_word_count_sum,Congress,Gender,State,District,Party Full,Chamber,ID,Party
0,A. FERGUSON,2019-01-13,1,357,116,M,GA,3,Republican,House,160063,R
1,A. FERGUSON,2019-01-27,1,528,116,M,GA,3,Republican,House,159679,R
2,A. FERGUSON,2019-02-10,1,457,116,M,GA,3,Republican,House,159236,R
3,A. FERGUSON,2019-03-03,1,430,116,M,GA,3,Republican,House,158641,R
4,A. FERGUSON,2019-03-17,1,271,116,M,GA,3,Republican,House,158135,R


In [146]:
# Joining QC: print the row count of each input to the joins, in this order.
# (The original dcinbox_df can be added to the tuple for comparison.)
qc_frames = (
    dcinbox_summary_df,  # DC Inbox file with counts of newsletters by day
    matched_pols_df,     # politicians matched between DC Inbox and FEC data
    cand_contribs_df,    # summary FEC file of candidates and contributions by day
)

for qc_frame in qc_frames:
    print(len(qc_frame))


25785
394
85572


In [147]:
# Attach the matched FEC identity to each row of the summarized DC Inbox data.
# This is a left join, so every summary row is kept whether or not it has a match.
#
# For testing, a small subset of dcinbox_summary_df (e.g. a single ID or a
# single Date) can be used as the left frame instead.
print(len(dcinbox_summary_df))

# The join is on the politician's name only; state, district and party are
# available on both sides but are not used as keys.
dc_inbox_sum_df = pd.merge(
    dcinbox_summary_df,
    matched_pols_df,
    how="left",
    left_on=["Full Name"],
    right_on=["dcinbox_name"],
)

# Drop the columns we don't need
columns_not_needed = [
    "fec_state", "fec_district", "fec_party",
    "target_id", "append_attribute2", "append_attribute3",
    "matched_scorer", "Party Full", "dcinbox_name",
]
dc_inbox_sum_df = dc_inbox_sum_df.drop(columns=columns_not_needed)

# Field cleanup & type correction: rows without a match get 0 for the cycle
# and the similarity score, and both become integers; Date is kept as datetime.
dc_inbox_sum_df = dc_inbox_sum_df.assign(
    fec_cycle=lambda frame: frame["fec_cycle"].fillna(0).astype(int),
    similarity_score=lambda frame: frame["similarity_score"].fillna(0).astype(int),
    Date=lambda frame: pd.to_datetime(frame["Date"]),
)

print(len(dc_inbox_sum_df))
dc_inbox_sum_df.head()

25785
25785


Unnamed: 0,Full Name,Date,newsletter_count,body_word_count_sum,Congress,Gender,State,District,Chamber,ID,Party,fec_cid,fec_name,fec_cycle,similarity_score
0,A. FERGUSON,2019-01-13,1,357,116,M,GA,3,House,160063,R,N00039090,DREW FERGUSON,2020,90
1,A. FERGUSON,2019-01-27,1,528,116,M,GA,3,House,159679,R,N00039090,DREW FERGUSON,2020,90
2,A. FERGUSON,2019-02-10,1,457,116,M,GA,3,House,159236,R,N00039090,DREW FERGUSON,2020,90
3,A. FERGUSON,2019-03-03,1,430,116,M,GA,3,House,158641,R,N00039090,DREW FERGUSON,2020,90
4,A. FERGUSON,2019-03-17,1,271,116,M,GA,3,House,158135,R,N00039090,DREW FERGUSON,2020,90


In [150]:
# QC for joining: pick one day and one surname, and check how many rows each
# side of the FEC join holds for that combination.

test_date = "2020-02-07"
test_name = "SCHIFF"

# Both date columns are converted in place. The FEC join keys on contrib_date
# as a datetime, so this conversion also matters outside this QC check.
dc_inbox_sum_df["Date"] = pd.to_datetime(dc_inbox_sum_df["Date"])
cand_contribs_df["contrib_date"] = pd.to_datetime(cand_contribs_df["contrib_date"])

# DC Inbox side: rows for the test day, then those whose name contains test_name
newsletters_on_test_date = dc_inbox_sum_df.loc[dc_inbox_sum_df["Date"] == test_date]
test_dcinbox_df = newsletters_on_test_date.loc[
    newsletters_on_test_date["Full Name"].str.contains(test_name)
]

# FEC side: the same two filters against contrib_date and FirstLastP
contribs_on_test_date = cand_contribs_df.loc[cand_contribs_df["contrib_date"] == test_date]
test_fec_df = contribs_on_test_date.loc[
    contribs_on_test_date["FirstLastP"].str.contains(test_name)
]

print(len(test_dcinbox_df))
print(len(test_fec_df))
test_fec_df.head()

1
1


Unnamed: 0,Cycle,FECCandID,CID,FirstLastP,Party Original,NoPacs,State,District,Party,contrib_date,contrib_count,contrib_amount
1216,2020,H0CA27085,N00009585,ADAM SCHIFF,D,,CA,28,D,2020-02-07,267,42987.0


In [152]:
# Left-join the per-day DC Inbox summary to the per-day FEC contribution
# summary, matching on the politician's CID and the calendar day.
# For a quick test, test_dcinbox_df / test_fec_df can be substituted for the
# two full frames.

print(len(dc_inbox_sum_df))
print(len(cand_contribs_df))

# Both date keys must be datetime for the merge: pandas refuses to merge a
# datetime column with an object (string) column. contrib_date is read from
# CSV as text, so it is converted here rather than relying on an earlier QC
# cell having done it. The conversions are applied to copies, so neither
# input frame is modified, and they are no-ops when the columns are already
# datetime.
newsletters_by_day_df = dc_inbox_sum_df.assign(
    Date=pd.to_datetime(dc_inbox_sum_df["Date"])
)
contribs_by_day_df = cand_contribs_df.assign(
    contrib_date=pd.to_datetime(cand_contribs_df["contrib_date"])
)

joined_all_df = newsletters_by_day_df.merge(
    contribs_by_day_df,
    left_on=["fec_cid", "Date"],
    right_on=["CID", "contrib_date"],
    how="left",
)

# NOTE: contrib_count and contrib_amount stay missing (NaN) for days with no
# matching contribution row; they are not filled with 0 here.
# Columns present in both inputs (State, District, Party) receive pandas'
# default _x / _y suffixes.

print(len(joined_all_df))
joined_all_df.head()

25785
85572
25785


Unnamed: 0,Full Name,Date,newsletter_count,body_word_count_sum,Congress,Gender,State_x,District_x,Chamber,ID,...,CID,FirstLastP,Party Original,NoPacs,State_y,District_y,Party_y,contrib_date,contrib_count,contrib_amount
0,A. FERGUSON,2019-01-13,1,357,116,M,GA,3,House,160063,...,,,,,,,,NaT,,
1,A. FERGUSON,2019-01-27,1,528,116,M,GA,3,House,159679,...,,,,,,,,NaT,,
2,A. FERGUSON,2019-02-10,1,457,116,M,GA,3,House,159236,...,,,,,,,,NaT,,
3,A. FERGUSON,2019-03-03,1,430,116,M,GA,3,House,158641,...,,,,,,,,NaT,,
4,A. FERGUSON,2019-03-17,1,271,116,M,GA,3,House,158135,...,,,,,,,,NaT,,


In [155]:
# Write the final joined table to a CSV file under the project's data folder.
joined_output_path = "../data/matched/joined_all_2020_test.csv"

joined_all_df.to_csv(
    joined_output_path,
    encoding="latin1",
    index=False,  # leave the row numbers out of the file
)

In [153]:
# Column counts: number of distinct values in each key column of the cleaned
# DC Inbox table.
# NOTE(review): the recorded output shows 98 distinct State values and 45
# distinct Gender values, which looks too high for clean data -- possibly rows
# whose fields were shifted when the CSV was parsed. TODO confirm.
key_columns = ["Full Name", "Party", "State", "District", "Gender"]
key_column_counts = dcinbox_df.loc[:, key_columns].nunique()
print(key_column_counts)

Full Name    552
Party          3
State         98
District      54
Gender        45
dtype: int64
