This notebook contains Helen's python code for loading and getting counts of the DC Inbox newletter content files.

In [32]:
# KEEP
# import libraries
import pandas as pd
# import regex as re
# import seaborn as sns
# import matplotlib.pyplot as plt

# set default seaborn theme
# sns.reset_defaults()
# sns.set_theme()


In [33]:
# Read in the DCInbox table

# column_names = ["Subject", "Body", "Unix Timestamp", "BioGuide ID",
#                 "Congress", "First Name", "Last Name", "Date of Birth",
#                 "Gender", "State", "District", "Party",
#                 "Chamber", "Nickname", "ID"]

dcinbox_raw_df = pd.read_csv(
#     "../data/dcinbox/dcinbox_export_119th_through9_19_2025.csv",
    "../data/dcinbox/dcinbox_export_116.csv",
    # sep = ",",            # columns are separated by commas
    encoding = "latin1",
    quotechar = '"',   # Make sure actual quotation marks in newsletters are handled right
    engine = 'python'    # This might be able to handle very long messages better,
    # names = column_names
)

# Remove all unnamed columns, because they're messing up the read of the file
unnamed = dcinbox_raw_df.columns.str.contains('^Unnamed')
# dcinbox_clean_columns_df = dcinbox_raw_df.loc[:, ~unnamed]
dcinbox_clean_columns_df = dcinbox_raw_df.loc[:, ~unnamed].copy()

dcinbox_clean_columns_df.head()

Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party,Chamber,Nickname,ID
0,The Leader's Daily Schedule - 1/3/21,Kevin McCarthy - Republican Leader Leader's Da...,1609630000000.0,M001165,116,Kevin,McCarthy,1/26/65,M,CA,23,Republican,House,,176119
1,Join me (virtually) for the swearing-in of the...,Join me (virtually) for the swearing-in of th...,1609610000000.0,H001066,116,Steven,Horsford,4/29/73,M,NV,4,Democrat,House,,176120
2,"2020, A Year in Review","2020, A Year in Review Looking Back on 2020 2...",1609590000000.0,G000576,116,Glenn,Grothman,7/3/55,M,WI,6,Republican,House,,176121
3,RESPONSE REQUESTED: Stay Updated on the 117th ...,RESPONSE REQUESTED: Stay Updated on the 117th...,1609590000000.0,M001190,116,Markwayne,Mullin,7/26/77,M,OK,2,Republican,House,,176122
4,"Thank You TX-11, and Godspeed","Thank You TX-11, and Godspeed January 2, 2021...",1609580000000.0,C001062,116,K.,Conaway,6/11/48,M,TX,11,Republican,House,,176123


In [34]:
# QC to see why it's reading in all 826 columns of the CSV file

# dcinbox_raw_df = pd.read_csv("../data/dcinbox/dcinbox_export_116.csv")
print(len(dcinbox_raw_df.columns))
print(dcinbox_raw_df.columns.tolist())

with open("../data/dcinbox/dcinbox_export_116.csv", "r", encoding="utf-8") as f:
    for _ in range(3):
        print(f.readline())

# It turns out it's reading in a ton of columns with trailing commas, 
# and only the first 15 are real.
unnamed = dcinbox_raw_df.columns.str.contains("^Unnamed")
print("Unnamed columns:", unnamed.sum())
print("Named columns:", (~unnamed).sum())

826
['Subject', 'Body', 'Unix Timestamp', 'BioGuide ID', 'Congress', 'First Name', 'Last Name', 'Date of Birth', 'Gender', 'State', 'District', 'Party', 'Chamber', 'Nickname', 'ID', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed

In [35]:
# print(dcinbox_df.columns)
# dcinbox_df = dcinbox_df.rename(
#     columns=lambda x: x if x not in dcinbox_df.columns.duplicated() else f"{x}_dup")
# dcinbox_df = dcinbox_df.loc[:, ~dcinbox_df.columns.duplicated()]
print(dcinbox_clean_columns_df.columns)

Index(['Subject', 'Body', 'Unix Timestamp', 'BioGuide ID', 'Congress',
       'First Name', 'Last Name', 'Date of Birth', 'Gender', 'State',
       'District', 'Party', 'Chamber', 'Nickname', 'ID'],
      dtype='object')


In [36]:
# Field cleanup for candidates table

# drop duplicate Party Full column
# dcinbox_df = dcinbox_df.loc[:, ~dcinbox_df.columns.duplicated()]
# print(dcinbox_df.columns)

# dcinbox_df = dcinbox_clean_columns_df
dcinbox_df = dcinbox_clean_columns_df.copy()

# Convert unix timestamp to a datetime column
# dcinbox_df["Date"] = pd.to_datetime(dcinbox_df["Unix Timestamp"], unit="ms")
# dcinbox_df['Date'] = pd.to_datetime(dcinbox_df['Date'], errors='coerce')
dcinbox_df["Date"] = pd.to_datetime(
    pd.to_numeric(dcinbox_df["Unix Timestamp"], errors="coerce"),
    unit="ms"
).dt.date

# Convert district to an integer and fill senators with 0
# dcinbox_df["District"] = dcinbox_df["District"].fillna(0).astype(int)
dcinbox_df["District"] = (
    pd.to_numeric(dcinbox_df["District"], errors = "coerce")
    .fillna(0)
    .astype(int)
)

# create a column that is the uppercased, concatenated first and last name of the politician
# (for matching later to Open Secrets data)
dcinbox_df["Full Name"] = dcinbox_df["First Name"].str.upper() + " " + dcinbox_df["Last Name"].str.upper()

# Rename party to party_short
dcinbox_df.rename(columns={"Party": "Party Full"}, inplace=True)
# Add party column that just is one character
dcinbox_df["Party Truncated"] = (
    dcinbox_df["Party Full"]
    .fillna("")
    .astype(str)
    .str[0]
    .str.upper()
)
# Add party column that just is one character mapped to D, R, or O (Other)
dcinbox_df["Party"] = (
    dcinbox_df["Party Full"]
    .fillna("")
    .astype(str)
    .str[0]
    .str.upper()
    .map({"D": "D", "R": "R"})
    .fillna("O")
)

# Make sure state and party are uppercased
dcinbox_df["State"] = dcinbox_df["State"].fillna("").str.strip().str.upper()
dcinbox_df["Party"] = dcinbox_df["Party"].str.upper()

# dcinbox_df.info()
dcinbox_df.head()

Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party Full,Chamber,Nickname,ID,Date,Full Name,Party Truncated,Party
0,The Leader's Daily Schedule - 1/3/21,Kevin McCarthy - Republican Leader Leader's Da...,1609630000000.0,M001165,116,Kevin,McCarthy,1/26/65,M,CA,23,Republican,House,,176119,2021-01-02,KEVIN MCCARTHY,R,R
1,Join me (virtually) for the swearing-in of the...,Join me (virtually) for the swearing-in of th...,1609610000000.0,H001066,116,Steven,Horsford,4/29/73,M,NV,4,Democrat,House,,176120,2021-01-02,STEVEN HORSFORD,D,D
2,"2020, A Year in Review","2020, A Year in Review Looking Back on 2020 2...",1609590000000.0,G000576,116,Glenn,Grothman,7/3/55,M,WI,6,Republican,House,,176121,2021-01-02,GLENN GROTHMAN,R,R
3,RESPONSE REQUESTED: Stay Updated on the 117th ...,RESPONSE REQUESTED: Stay Updated on the 117th...,1609590000000.0,M001190,116,Markwayne,Mullin,7/26/77,M,OK,2,Republican,House,,176122,2021-01-02,MARKWAYNE MULLIN,R,R
4,"Thank You TX-11, and Godspeed","Thank You TX-11, and Godspeed January 2, 2021...",1609580000000.0,C001062,116,K.,Conaway,6/11/48,M,TX,11,Republican,House,,176123,2021-01-02,K. CONAWAY,R,R


In [37]:
# Various checks on the fields

# print(dcinbox_df["Party Full"].head())
# print(dcinbox_df["Party Full"].dtype)
# print(dcinbox_df["Date"].dtype)
dcinbox_df["Date"] = pd.to_datetime(dcinbox_df["Date"], errors='coerce')

print(dcinbox_df["District"].unique())

print("Min Date:", dcinbox_df["Date"].min())
print("Max Date:", dcinbox_df["Date"].max())

print(dcinbox_df["Party"].unique())
# print(dcinbox_df["Party Full"].str[0].value_counts())
print(dcinbox_df["Party Truncated"].value_counts())

print(dcinbox_df["District"].unique())

[           23             4             6             2            11
             0             5             1            46            27
            13             3             7            24             9
             8            53            28            12            20
            36            15            10            17            16
            52            34            26            30            21
            25            19            18            39            51
            32            41            22            14            48
            49            33            38            47            45
            44            29            35            31            42
            37            40 1585070000000            50]
Min Date: 2019-01-03 00:00:00
Max Date: 2021-01-02 00:00:00
['R' 'D' 'O']
Party Truncated
R    16104
D    14666
        42
I        8
M        2
'        1
S        1
Name: count, dtype: int64
[           23             4           

In [38]:
# Investigating weird values for district

weird_values = [1585070000000]

weird_party_rows = dcinbox_df[dcinbox_df["District"].isin(weird_values)]
print(weird_party_rows[["First Name", "Last Name", "District"]])

                  First Name  \
14699   Suite 629 Charleston   

                                               Last Name       District  
14699   WV 25302 Phone: (304) 342-5855 Fax: (304) 343...  1585070000000  


In [39]:
# Investigating third parties

# bernie_rows = dcinbox_df[dcinbox_df["Last Name"].str.startswith("Sand", na=False)]
# print(bernie_rows[["First Name", "Last Name", "Party"]])

independents = ["Sanders", "Gonzalez", "Amash", "King", "Sablan"]
indie_rows = dcinbox_df[dcinbox_df["Last Name"].isin(independents)]
print(indie_rows[["First Name", "Last Name", "Party Truncated"]].drop_duplicates())

weird_values = ["M", "'", " ", "S"]

weird_party_rows = dcinbox_df[dcinbox_df["Party Truncated"].str[0].isin(weird_values)]
print(weird_party_rows["Party Truncated"].count())
print(weird_party_rows[["First Name", "Last Name", "Party Truncated"]])


      First Name Last Name Party Truncated
29      Gregorio    Sablan               D
42         Angus      King               D
205      Bernard   Sanders               D
412      Anthony  Gonzalez               R
445      Vicente  Gonzalez               D
9721       Steve      King               R
23168     Justin     Amash               R
46
                                              First Name  \
238     to learn from this and strengthen our cyber d...   
433                                           prosperous   
752     which includes a number of key recommendation...   
1273     keeping a log and working with the lab. Portman   
4092    and communicate. Many of these changes have b...   
5759                                             Portman   
5929                    which supports rural communities   
6463    were able to hide behind an intermediary who ...   
6895    said Portman. ""There are many large and smal...   
7962                                       $3.1 milli

In [40]:
# Make new data frame that only includes the distinct politicians with their Congress, 
# First Name, Last Name, Gender, State, District, and Party 
# to use for match targets

# Filter rows for desired cycle (e.g. the 116th Congress)
# and only House members
dcinbox_df["Date"] = pd.to_datetime(dcinbox_df["Date"])
date_filter = dcinbox_df["Date"].between("2019-11-03", "2020-11-02")
chamber_filter = dcinbox_df["Chamber"] == "House"
office_filter = dcinbox_df["District"] != 0

# pols_for_matching_df = dcinbox_df.loc[mask, ["Full Name", "Party", "State", "District"]]
# test_df = dcinbox_df.loc[date_filter & office_filter, ["Full Name", "Party", "State", "District"]]
# print(test_df["District"].value_counts())

# Create a data frame that only has the files we need for matching
# pols_matching_df = dcinbox_df[["Full Name", "Party", "State", "District"]].rename(
pols_matching_df = dcinbox_df.loc[date_filter & office_filter & chamber_filter, ["Full Name", "Party", "State", "District"]].rename(
    columns={
        "Full Name": "name",
        "Party": "match_attribute1",
        "State": "match_attribute2",
        "District": "match_attribute3"
    }
)

# Drop duplicate rows based on all columns
pols_matching_df = pols_matching_df.drop_duplicates()

# Adding blank columns needed for python matching process
pols_matching_df["id"] = ""
pols_matching_df["append_attribute1"] = ""
pols_matching_df["append_attribute2"] = ""
pols_matching_df["append_attribute3"] = ""

# Reordering columns so "id" is second
cols = ["name", "id", "match_attribute1", "match_attribute2", "match_attribute3",
        "append_attribute1", "append_attribute2", "append_attribute3"]
pols_matching_df = pols_matching_df[cols]

print(len(pols_matching_df))
print(pols_matching_df.head())

400
                        name id match_attribute1 match_attribute2  \
1978           BRAD WENSTRUP                   R               OH   
1980  DEBBIE MUCARSEL-POWELL                   D               FL   
1981             MARTHA ROBY                   R               AL   
1982           TULSI GABBARD                   D               HI   
1983             DEVIN NUNES                   R               CA   

      match_attribute3 append_attribute1 append_attribute2 append_attribute3  
1978                 2                                                        
1980                26                                                        
1981                 2                                                        
1982                 2                                                        
1983                22                                                        


In [41]:
# Output the match targets to a CSV file

pols_matching_df.to_csv(
    "../matching/match_targets_2020_test.csv",
    index=False,  # no row numbers
    encoding="latin1"
)

In [42]:
# Column counts

# cand_contrib_df["CID"].nunique()
key_column_counts = dcinbox_df[["Full Name", "Party", "State", "District", "Gender"]].nunique()
print(key_column_counts)
# print(dcinbox_df["Chamber"].count())

Full Name    552
Party          3
State         98
District      54
Gender        45
dtype: int64


In [43]:
# Read Kevin McCarthy's email

kevin_df = dcinbox_df[(dcinbox_df["Full Name"].str.contains("MCCARTHY", na = False))]
# bernie_rows = dcinbox_df[dcinbox_df["Last Name"].str.startswith("Sand", na=False)]
kevin_df["Date"] = pd.to_datetime(kevin_df["Date"])
kevin_df = kevin_df[(kevin_df["Date"] >= "2020-02-01") & (kevin_df["Date"] <= "2020-02-29")]

print(kevin_df[["Subject", "Body", "Date", "Full Name"]])

kevin_df.to_csv(
    "../matching/Kevin_McCarthy_Feb_2020_emails.csv",
    index=False,  # no row numbers
    encoding="latin1"
)

                                                 Subject  \
16288      The Leader's Weekly Schedule - Week of 3/2/20   
16390              The Leader's Daily Schedule - 2/28/20   
16392                               Trump in Bakersfield   
16417              The Leader's Daily Schedule - 2/27/20   
16446              The Leader's Daily Schedule - 2/26/20   
16476  Spectrum News 1 Interview on McCarthy's Bill, ...   
16479              The Leader's Daily Schedule - 2/25/20   
16560     The Leader's Weekly Schedule - Week of 2/24/20   
16561  Cold and Cruel: California Must Stop Discrimin...   
16626                Thanks for the visit, Mr. President   
16752                                           One Week   
16909              The Leader's Daily Schedule - 2/13/20   
16943              The Leader's Daily Schedule - 2/12/20   
16970              The Leader's Daily Schedule - 2/11/20   
17016              The Leader's Daily Schedule - 2/10/20   
17082     The Leader's Weekly Schedule -

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kevin_df["Date"] = pd.to_datetime(kevin_df["Date"])
