This notebook contains Helen's Python code for loading and getting counts of the DC Inbox newsletter content files.

In [1]:
# import libraries
import pandas as pd

In [2]:
# Load the raw DC Inbox export into a data frame

dcinbox_raw_df = pd.read_csv(
    "../data/dcinbox/dcinbox_export_116.csv",
    encoding="latin1",
    quotechar='"',     # so literal quotation marks inside newsletter text are handled
    engine="python",   # the python parser might cope better with very long messages
)

# Keep only the named columns: pandas labels header-less columns "Unnamed: N",
# and those extra columns were interfering with the read of the file
is_named_column = ~dcinbox_raw_df.columns.str.startswith("Unnamed")
dcinbox_clean_columns_df = dcinbox_raw_df.loc[:, is_named_column].copy()

dcinbox_clean_columns_df.head()

Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party,Chamber,Nickname,ID
0,The Leader's Daily Schedule - 1/3/21,Kevin McCarthy - Republican Leader Leader's Da...,1609630000000.0,M001165,116,Kevin,McCarthy,1/26/65,M,CA,23,Republican,House,,176119
1,Join me (virtually) for the swearing-in of the...,Join me (virtually) for the swearing-in of th...,1609610000000.0,H001066,116,Steven,Horsford,4/29/73,M,NV,4,Democrat,House,,176120
2,"2020, A Year in Review","2020, A Year in Review Looking Back on 2020 2...",1609590000000.0,G000576,116,Glenn,Grothman,7/3/55,M,WI,6,Republican,House,,176121
3,RESPONSE REQUESTED: Stay Updated on the 117th ...,RESPONSE REQUESTED: Stay Updated on the 117th...,1609590000000.0,M001190,116,Markwayne,Mullin,7/26/77,M,OK,2,Republican,House,,176122
4,"Thank You TX-11, and Godspeed","Thank You TX-11, and Godspeed January 2, 2021...",1609580000000.0,C001062,116,K.,Conaway,6/11/48,M,TX,11,Republican,House,,176123


In [3]:
# QC: list the retained column labels to confirm the expected DC Inbox fields came through
retained_columns = dcinbox_clean_columns_df.columns
print(retained_columns)

Index(['Subject', 'Body', 'Unix Timestamp', 'BioGuide ID', 'Congress',
       'First Name', 'Last Name', 'Date of Birth', 'Gender', 'State',
       'District', 'Party', 'Chamber', 'Nickname', 'ID'],
      dtype='object')


In [4]:
# Field cleanup for DC Inbox table

# Work on a copy so this cell can be re-run without altering dcinbox_clean_columns_df
dcinbox_df = dcinbox_clean_columns_df.copy()

# Convert the millisecond Unix timestamp to a calendar date.
# The column is kept as datetime64 with the time-of-day zeroed (normalize) instead of
# Python date objects, so .min()/.max()/.between() work on it without re-parsing.
# errors="coerce" on both steps turns non-numeric or out-of-range values into NaT
# rather than raising.
timestamp_ms = pd.to_numeric(dcinbox_df["Unix Timestamp"], errors="coerce")
dcinbox_df["Date"] = pd.to_datetime(timestamp_ms, unit="ms", errors="coerce").dt.normalize()

# Convert district to an integer; missing or non-numeric values become 0
# (this is how senators end up with district 0)
# NOTE(review): numeric but implausible values pass through unchanged -- the District QC
# printout includes 1585070000000 -- which suggests some rows were mis-parsed on load.
dcinbox_df["District"] = (
    pd.to_numeric(dcinbox_df["District"], errors="coerce")
    .fillna(0)
    .astype(int)
)

# Upper-cased "FIRST LAST" name of the politician (for matching later to FEC data)
dcinbox_df["Full Name"] = (
    dcinbox_df["First Name"].str.upper() + " " + dcinbox_df["Last Name"].str.upper()
)

# Keep the original party text under "Party Full" ...
dcinbox_df = dcinbox_df.rename(columns={"Party": "Party Full"})
# ... and add a one-character "Party" column: D, R, or O (Other).
# Only the first character of the party text is inspected; missing or empty values
# and anything not starting with D/R fall through to "O". The result is already
# upper-case, so no further case normalisation is needed.
dcinbox_df["Party"] = (
    dcinbox_df["Party Full"]
    .fillna("")
    .astype(str)
    .str[0]
    .str.upper()
    .map({"D": "D", "R": "R"})
    .fillna("O")
)

# Make sure state is trimmed and uppercased (missing values become "")
dcinbox_df["State"] = dcinbox_df["State"].fillna("").str.strip().str.upper()

dcinbox_df.head()

Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party Full,Chamber,Nickname,ID,Date,Full Name,Party
0,The Leader's Daily Schedule - 1/3/21,Kevin McCarthy - Republican Leader Leader's Da...,1609630000000.0,M001165,116,Kevin,McCarthy,1/26/65,M,CA,23,Republican,House,,176119,2021-01-02,KEVIN MCCARTHY,R
1,Join me (virtually) for the swearing-in of the...,Join me (virtually) for the swearing-in of th...,1609610000000.0,H001066,116,Steven,Horsford,4/29/73,M,NV,4,Democrat,House,,176120,2021-01-02,STEVEN HORSFORD,D
2,"2020, A Year in Review","2020, A Year in Review Looking Back on 2020 2...",1609590000000.0,G000576,116,Glenn,Grothman,7/3/55,M,WI,6,Republican,House,,176121,2021-01-02,GLENN GROTHMAN,R
3,RESPONSE REQUESTED: Stay Updated on the 117th ...,RESPONSE REQUESTED: Stay Updated on the 117th...,1609590000000.0,M001190,116,Markwayne,Mullin,7/26/77,M,OK,2,Republican,House,,176122,2021-01-02,MARKWAYNE MULLIN,R
4,"Thank You TX-11, and Godspeed","Thank You TX-11, and Godspeed January 2, 2021...",1609580000000.0,C001062,116,K.,Conaway,6/11/48,M,TX,11,Republican,House,,176123,2021-01-02,K. CONAWAY,R


In [5]:
# Sanity checks on individual fields of the cleaned data set

# Distinct district numbers
print(dcinbox_df["District"].unique())

# Re-parse Date as datetime (unparseable values become NaT), store it back,
# and report the covered date range
parsed_dates = pd.to_datetime(dcinbox_df["Date"], errors='coerce')
dcinbox_df["Date"] = parsed_dates
print("Min Date:", parsed_dates.min())
print("Max Date:", parsed_dates.max())

# Distinct values of the full party text, then of the one-letter party code
for party_column in ("Party Full", "Party"):
    print(dcinbox_df[party_column].unique())

[           23             4             6             2            11
             0             5             1            46            27
            13             3             7            24             9
             8            53            28            12            20
            36            15            10            17            16
            52            34            26            30            21
            25            19            18            39            51
            32            41            22            14            48
            49            33            38            47            45
            44            29            35            31            42
            37            40 1585070000000            50]
Min Date: 2019-01-03 00:00:00
Max Date: 2021-01-02 00:00:00
['Republican' 'Democrat' nan ' and severity of this breach'
 ' and air. As co-founder and co-chair of the Senate Ukraine Caucus'
 ' and local governments' ' New Jersey-base

In [6]:
# QC of third party politicians (2020 cycle)
# Rows are selected by last name only, so any other member sharing one of these
# surnames is listed as well.

independents = ["Sanders", "Gonzalez", "Amash", "King", "Sablan"]
has_independent_surname = dcinbox_df["Last Name"].isin(independents)
indie_rows = dcinbox_df[has_independent_surname]

qc_columns = ["First Name", "Last Name", "Party", "District", "State"]
print(indie_rows[qc_columns].drop_duplicates())

      First Name Last Name Party  District State
29      Gregorio    Sablan     D         0    MP
42         Angus      King     D         0    ME
205      Bernard   Sanders     D         0    VT
412      Anthony  Gonzalez     R        16    OH
445      Vicente  Gonzalez     D        15    TX
9721       Steve      King     R         4    IA
23168     Justin     Amash     R         3    MI


In [7]:
# Build the match-target table: one row per distinct politician, carrying the
# name, party, state and district fields used for matching.

# Row filters: date window for the desired cycle (e.g. the 116th Congress),
# House chamber only, and a non-zero district.
# NOTE(review): District was filled with 0 wherever it was missing or non-numeric,
# so `office_filter` also drops any House row with district 0 (the earlier QC shows
# Sablan with District 0). Confirm this is intended now that `chamber_filter`
# already restricts to the House.
dcinbox_df["Date"] = pd.to_datetime(dcinbox_df["Date"])
date_filter = dcinbox_df["Date"].between("2019-01-03", "2021-01-02")
chamber_filter = dcinbox_df["Chamber"] == "House"
office_filter = dcinbox_df["District"] != 0

# Source column -> column name expected by the matching process
match_column_names = {
    "Full Name": "name",
    "Party": "match_attribute1",
    "State": "match_attribute2",
    "District": "match_attribute3",
}

# Select the filtered rows and needed columns, rename them, and drop exact duplicates
pols_matching_df = (
    dcinbox_df.loc[date_filter & office_filter & chamber_filter, list(match_column_names)]
    .rename(columns=match_column_names)
    .drop_duplicates()
)

# Blank columns required by the python matching process
blank_columns = ["id", "append_attribute1", "append_attribute2", "append_attribute3"]
pols_matching_df = pols_matching_df.assign(**{column: "" for column in blank_columns})

# Final column order, with "id" second
cols = ["name", "id", "match_attribute1", "match_attribute2", "match_attribute3",
        "append_attribute1", "append_attribute2", "append_attribute3"]
pols_matching_df = pols_matching_df[cols]

print(len(pols_matching_df))
print(pols_matching_df.head())

414
               name id match_attribute1 match_attribute2  match_attribute3  \
0    KEVIN MCCARTHY                   R               CA                23   
1   STEVEN HORSFORD                   D               NV                 4   
2    GLENN GROTHMAN                   R               WI                 6   
3  MARKWAYNE MULLIN                   R               OK                 2   
4        K. CONAWAY                   R               TX                11   

  append_attribute1 append_attribute2 append_attribute3  
0                                                        
1                                                        
2                                                        
3                                                        
4                                                        


In [8]:
# Write the match targets out as a CSV file (no index column, latin1-encoded)

match_targets_path = "../matching/match_targets_2020_test.csv"
pols_matching_df.to_csv(match_targets_path, index=False, encoding="latin1")

In [9]:
# Number of distinct values in each of the key columns

key_columns = ["Full Name", "Party", "State", "District", "Gender"]
key_column_counts = dcinbox_df[key_columns].nunique()
print(key_column_counts)

Full Name    552
Party          3
State         98
District      54
Gender        45
dtype: int64
