This notebook contains Helen's python code for loading and getting counts of the DC Inbox newletter content files.

In [61]:
# KEEP
# import libraries
import pandas as pd
import numpy as np
# import regex as re
# import seaborn as sns
# import matplotlib.pyplot as plt

# set default seaborn theme
# sns.reset_defaults()
# sns.set_theme()


In [62]:
# Read in the raw DCInbox table

dcinbox_raw_df = pd.read_csv(
#     "../data/dcinbox/dcinbox_export_119th_through9_19_2025.csv",
    "../data/dcinbox/dcinbox_export_116.csv",
    # sep = ",",            # columns are separated by commas
    encoding = "latin1",
    quotechar = '"',   # Make sure actual quotation marks in newsletters are handled right
    engine = 'python'    # This might be able to handle very long messages better,
    # names = column_names
)

# Remove all unnamed columns, because they're messing up the read of the file
unnamed = dcinbox_raw_df.columns.str.contains('^Unnamed')
# dcinbox_clean_columns_df = dcinbox_raw_df.loc[:, ~unnamed]
dcinbox_clean_columns_df = dcinbox_raw_df.loc[:, ~unnamed].copy()

print(len(dcinbox_clean_columns_df))
dcinbox_clean_columns_df.head()

30872


Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party,Chamber,Nickname,ID
0,The Leader's Daily Schedule - 1/3/21,Kevin McCarthy - Republican Leader Leader's Da...,1609630000000.0,M001165,116,Kevin,McCarthy,1/26/65,M,CA,23,Republican,House,,176119
1,Join me (virtually) for the swearing-in of the...,Join me (virtually) for the swearing-in of th...,1609610000000.0,H001066,116,Steven,Horsford,4/29/73,M,NV,4,Democrat,House,,176120
2,"2020, A Year in Review","2020, A Year in Review Looking Back on 2020 2...",1609590000000.0,G000576,116,Glenn,Grothman,7/3/55,M,WI,6,Republican,House,,176121
3,RESPONSE REQUESTED: Stay Updated on the 117th ...,RESPONSE REQUESTED: Stay Updated on the 117th...,1609590000000.0,M001190,116,Markwayne,Mullin,7/26/77,M,OK,2,Republican,House,,176122
4,"Thank You TX-11, and Godspeed","Thank You TX-11, and Godspeed January 2, 2021...",1609580000000.0,C001062,116,K.,Conaway,6/11/48,M,TX,11,Republican,House,,176123


In [63]:
# Field cleanup for DC Inbox table

# drop duplicate Party Full column
# dcinbox_df = dcinbox_df.loc[:, ~dcinbox_df.columns.duplicated()]
# print(dcinbox_df.columns)

# dcinbox_df = dcinbox_clean_columns_df
dcinbox_df = dcinbox_clean_columns_df.copy()

# Convert unix timestamp to a datetime column
# dcinbox_df["Date"] = pd.to_datetime(dcinbox_df["Unix Timestamp"], unit="ms")
# dcinbox_df['Date'] = pd.to_datetime(dcinbox_df['Date'], errors='coerce')
dcinbox_df["Date"] = pd.to_datetime(
    pd.to_numeric(dcinbox_df["Unix Timestamp"], errors="coerce"),
    unit="ms"
).dt.date

# Convert district to an integer and fill senators with 0
# dcinbox_df["District"] = dcinbox_df["District"].fillna(0).astype(int)
dcinbox_df["District"] = (
    pd.to_numeric(dcinbox_df["District"], errors = "coerce")
    .fillna(0)
    .astype(int)
)

# create a column that is the uppercased, concatenated first and last name of the politician
# (for matching later to Open Secrets data)
dcinbox_df["Full Name"] = dcinbox_df["First Name"].str.upper() + " " + dcinbox_df["Last Name"].str.upper()

# Rename party to party_short
dcinbox_df.rename(columns={"Party": "Party Full"}, inplace=True)
# Add party column that just is one character
dcinbox_df["Party Truncated"] = (
    dcinbox_df["Party Full"]
    .fillna("")
    .astype(str)
    .str[0]
    .str.upper()
)
# Add party column that just is one character mapped to D, R, or O (Other)
dcinbox_df["Party"] = (
    dcinbox_df["Party Full"]
    .fillna("")
    .astype(str)
    .str[0]
    .str.upper()
    .map({"D": "D", "R": "R"})
    .fillna("O")
)

# Make sure state and party are uppercased
dcinbox_df["State"] = dcinbox_df["State"].fillna("").str.strip().str.upper()
dcinbox_df["Party"] = dcinbox_df["Party"].str.upper()

print(len(dcinbox_df))
# dcinbox_df.info()
dcinbox_df.head()

30872


Unnamed: 0,Subject,Body,Unix Timestamp,BioGuide ID,Congress,First Name,Last Name,Date of Birth,Gender,State,District,Party Full,Chamber,Nickname,ID,Date,Full Name,Party Truncated,Party
0,The Leader's Daily Schedule - 1/3/21,Kevin McCarthy - Republican Leader Leader's Da...,1609630000000.0,M001165,116,Kevin,McCarthy,1/26/65,M,CA,23,Republican,House,,176119,2021-01-02,KEVIN MCCARTHY,R,R
1,Join me (virtually) for the swearing-in of the...,Join me (virtually) for the swearing-in of th...,1609610000000.0,H001066,116,Steven,Horsford,4/29/73,M,NV,4,Democrat,House,,176120,2021-01-02,STEVEN HORSFORD,D,D
2,"2020, A Year in Review","2020, A Year in Review Looking Back on 2020 2...",1609590000000.0,G000576,116,Glenn,Grothman,7/3/55,M,WI,6,Republican,House,,176121,2021-01-02,GLENN GROTHMAN,R,R
3,RESPONSE REQUESTED: Stay Updated on the 117th ...,RESPONSE REQUESTED: Stay Updated on the 117th...,1609590000000.0,M001190,116,Markwayne,Mullin,7/26/77,M,OK,2,Republican,House,,176122,2021-01-02,MARKWAYNE MULLIN,R,R
4,"Thank You TX-11, and Godspeed","Thank You TX-11, and Godspeed January 2, 2021...",1609580000000.0,C001062,116,K.,Conaway,6/11/48,M,TX,11,Republican,House,,176123,2021-01-02,K. CONAWAY,R,R


In [64]:
# Read in the matched politicians table

column_names = ["target_id", "dcinbox_name", "fec_cid", "fec_name",
                "fec_party", "fec_state", "fec_district",
                "fec_cycle", "append_attribute2", "append_attribute3",
                "similarity_score", "matched_scorer"]

matched_pols_df = pd.read_csv(
    "../data/matched/matched_politicians_2020_test.csv",
    encoding = "latin1",
    quotechar = '"',   # Make sure actual quotation marks in newsletters are handled right
    names = column_names,
    header = 0
)

print(len(matched_pols_df))
matched_pols_df.head()

394


Unnamed: 0,target_id,dcinbox_name,fec_cid,fec_name,fec_party,fec_state,fec_district,fec_cycle,append_attribute2,append_attribute3,similarity_score,matched_scorer
0,,KEVIN MCCARTHY,N00028152,KEVIN MCCARTHY,R,CA,23,2020,,,100.0,WRatio
1,,STEVEN HORSFORD,N00033638,STEVEN HORSFORD,D,NV,4,2020,,,100.0,WRatio
2,,GLENN GROTHMAN,N00036409,GLENN S GROTHMAN,R,WI,6,2020,,,100.0,token_set_ratio
3,,MARKWAYNE MULLIN,N00033410,MARKWAYNE MULLIN,R,OK,2,2020,,,100.0,WRatio
4,,K. CONAWAY,N00026041,MIKE CONAWAY,R,TX,11,2020,,,90.0,partial_ratio


In [65]:
# Read in the summarized FEC candidate contributions file

# column_names = ["target_id", "dcinbox_name", "fec_cid", "fec_name",
#                 "fec_party", "fec_state", "fec_district",
#                 "fec_cycle", "append_attribute2", "append_attribute3",
#                 "similarity_score", "matched_scorer"]

cand_contribs_raw_df = pd.read_csv(
    "../data/fec/candidate_contributions_2020_test.csv",
    encoding = "latin1",
    # quotechar = '"',   # Make sure actual quotation marks are handled right
    # names = column_names,
    # header = 0
)

# Drop the columns we don't need
cand_contribs_raw_df = cand_contribs_raw_df.drop(columns=['DistIDRunFor', 'DistIDCurr', "CurrCand", "CycleCand", 
                                                  "RecipCode", "CRPICO", "FECCandID"])

print(len(cand_contribs_raw_df))
cand_contribs_raw_df.head()

38565


Unnamed: 0,Cycle,CID,FirstLastP,Party Original,NoPacs,State,District,Party,contrib_date,contrib_count,contrib_amount
0,2020,N00030910,MO BROOKS,R,,AL,5,R,2019-11-11,2,2750.0
1,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-06,1,100.0
2,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-15,1,250.0
3,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-17,6,9000.0
4,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-21,1,500.0


In [66]:
# Clean up columns in the summarized FEC candidate contributions dataframe

# Rename columns so they won't clash with the DC Inbox data file
cand_contribs_raw_df = cand_contribs_raw_df.rename(columns={
    "Cycle": "fec_cycle",
    "CID": "fec_cid",
    "FirstLastP": "fec_name",
    "Party Original": "fec_party_original",
    "NoPacs": "no_pacs",
    "State": "fec_state",
    "District": "fec_district",
    "Party": "fec_party"    
})

# Set correct types on some columns
# cand_contribs_df["contrib_amount"] = cand_contribs_df["contrib_amount"].fillna(0).astype(int)
# cand_contribs_window_df["window_contrib_amount_sum"] = cand_contribs_window_df["window_contrib_amount_sum"].fillna(0).astype(int)
# cand_contribs_window_df["window_contrib_count_sum"] = cand_contribs_window_df["window_contrib_count_sum"].fillna(0).astype(int)
cand_contribs_raw_df["contrib_count"] = cand_contribs_raw_df["contrib_count"].fillna(0)
cand_contribs_raw_df["contrib_amount"] = cand_contribs_raw_df["contrib_amount"].fillna(0).astype(int)

# Get all unique candidates
# candidates = cand_contribs_raw_df[["fec_name", "fec_cycle", "fec_state", "fec_district", "fec_party"]].drop_duplicates()

# Ensure contrib_date is datetime
cand_contribs_raw_df["contrib_date"] = pd.to_datetime(cand_contribs_raw_df["contrib_date"])

# Define the full date range
start_date = pd.Timestamp("2019-11-03")
end_date = pd.Timestamp("2020-11-02")
num_days = (end_date - start_date).days + 1  # inclusive

# Filter to target period
mask = (cand_contribs_raw_df["contrib_date"] >= start_date) & (cand_contribs_raw_df["contrib_date"] <= end_date)
filtered = cand_contribs_raw_df.loc[mask].copy()

# Compute total contributions per candidate
total_stats = (
    filtered.groupby(["fec_name", "fec_cycle", "fec_state", "fec_district", "fec_party"], dropna=False)
    .agg(
        total_contrib_count=("contrib_count", "sum"),
        total_contrib_amount=("contrib_amount", "sum")
    )
    .reset_index()
)

# Calculate average per day over the entire period
total_stats["avg_daily_contrib_count"] = total_stats["total_contrib_count"] / num_days
total_stats["avg_daily_contrib_amount"] = total_stats["total_contrib_amount"] / num_days

# Merge averages back into the original dataframe
cand_contribs_df = cand_contribs_raw_df.merge(
    total_stats[["fec_name", "fec_cycle", "fec_state", "fec_district", "fec_party", 
                 "avg_daily_contrib_count", "avg_daily_contrib_amount"]],
    on=["fec_name", "fec_cycle", "fec_state", "fec_district", "fec_party"],
    how="left"
)

cand_contribs_df.head()

Unnamed: 0,fec_cycle,fec_cid,fec_name,fec_party_original,no_pacs,fec_state,fec_district,fec_party,contrib_date,contrib_count,contrib_amount,avg_daily_contrib_count,avg_daily_contrib_amount
0,2020,N00030910,MO BROOKS,R,,AL,5,R,2019-11-11,2,2750,0.087432,109.907104
1,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-06,1,100,0.087432,109.907104
2,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-15,1,250,0.087432,109.907104
3,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-17,6,9000,0.087432,109.907104
4,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-21,1,500,0.087432,109.907104


In [67]:
# Create a summarized data frame of the DC Inbox data with newsletter & term counts by day

# Make sure Date is datetime
dcinbox_df['Date'] = pd.to_datetime(dcinbox_df['Date'])

# Create a new column with word counts in Body
dcinbox_df['body_word_count'] = dcinbox_df['Body'].str.split().str.len()
print(len(dcinbox_df))

# Group by Full Name and Date
dcinbox_summary_df = dcinbox_df.groupby(['Full Name', 'Date'], as_index=False).agg(
    newsletter_count = ('Subject', 'count'),
    body_word_count_sum = ('body_word_count', 'sum') 
)

# Compute total newsletters per politician
dcinbox_summary_df["total_newsletters"] = dcinbox_summary_df.groupby("Full Name")["newsletter_count"].transform("sum")

# Merge back other columns using the first occurrence per day for all other columns
other_cols = [col for col in dcinbox_df.columns if col not in ['Subject', 'Body', 'body_word_count', 'Date', "Full Name", "Party Truncated"]]
dcinbox_summary_df = dcinbox_summary_df.merge(
    dcinbox_df.groupby(['Full Name', 'Date'])[other_cols].first().reset_index(),
    on=['Full Name', 'Date'],
    how='left'
)

# Only find records that are House members
dcinbox_summary_df = dcinbox_summary_df[(dcinbox_summary_df["Chamber"] == "House")]

# Drop unneeded columns
dcinbox_summary_df = dcinbox_summary_df.drop(columns=['Nickname', 'Date of Birth', "First Name", "Last Name", "BioGuide ID", "Unix Timestamp"])

print(len(dcinbox_summary_df))
dcinbox_summary_df.head()

30872
25785


Unnamed: 0,Full Name,Date,newsletter_count,body_word_count_sum,total_newsletters,Congress,Gender,State,District,Party Full,Chamber,ID,Party
0,A. FERGUSON,2019-01-13,1,357,44,116,M,GA,3,Republican,House,160063,R
1,A. FERGUSON,2019-01-27,1,528,44,116,M,GA,3,Republican,House,159679,R
2,A. FERGUSON,2019-02-10,1,457,44,116,M,GA,3,Republican,House,159236,R
3,A. FERGUSON,2019-03-03,1,430,44,116,M,GA,3,Republican,House,158641,R
4,A. FERGUSON,2019-03-17,1,271,44,116,M,GA,3,Republican,House,158135,R


In [68]:
# Joining QC

# Original dcinbox file
# print(len(dcinbox_df))

# Dcinbox file with counts of newsletters by day
print(len(dcinbox_summary_df))

# file with politicians matched between dcinbox and FEC data
print(len(matched_pols_df))

# Summary FEC file of candidates and contributions by day
print(len(cand_contribs_df))


25785
394
38565


In [69]:
# Join summarized DC Inbox and matched politicians data into a new data frame (left join)

# Small subset for testing
# summary_test_df = dcinbox_summary_df[dcinbox_summary_df["ID"] == "158135"]
# summary_test_df = dcinbox_summary_df[dcinbox_summaryary_test_df.head())
# print(len(summary_test_df))
print(len(dcinbox_summary_df))

# dc_inbox_sum_df = summary_test_df.merge(
dc_inbox_sum_df = dcinbox_summary_df.merge(
    matched_pols_df,
    left_on = ["Full Name"], # , "State", "District", "Party"],
    right_on = ["dcinbox_name"], # "fec_state", "fec_district", "fec_party"],
    how = 'left'
)

# Drop the columns we don't need
dc_inbox_sum_df = dc_inbox_sum_df.drop(columns=['fec_state', 'fec_district', 'fec_party', "target_id", "append_attribute2", "append_attribute3",
                                                "matched_scorer", "Party Full", "dcinbox_name", "ID"])

# Field cleanup & type correction
dc_inbox_sum_df["Date"] == "2020-02-01"
# print(summ_df['fec_cycle'] = dc_inbox_sum_df['fec_cycle'].fillna(0).astype(int)
dc_inbox_sum_df['similarity_score'] = dc_inbox_sum_df['similarity_score'].fillna(0).astype(int)
dc_inbox_sum_df['Date'] = pd.to_datetime(dc_inbox_sum_df['Date'])

# Fill missing counts and amounts with 0 for candidates with no contributions (per day)
# dc_inbox_sum_df['contrib_count'] = dc_inbox_sum_df['contrib_count'].fillna(0).astype(int)
# dc_inbox_sum_df['contrib_amount'] = dc_inbox_sum_df['contrib_amount'].fillna(0)

# Rename columns so they won't clash with the FEC data file
dc_inbox_sum_df = dc_inbox_sum_df.rename(columns={
    "Full Name": "dc_name",
    "Date": "dc_date",
    "Congress": "dc_congress",
    "Gender": "dc_gender",
    "State": "dc_state",
    "District": "dc_district",
    "Chamber": "dc_chamber",
    "Party": "dc_party"    
})

print(len(dc_inbox_sum_df))
dc_inbox_sum_df.head()
# dc_inbox_sum_df.info()

25785
25785


Unnamed: 0,dc_name,dc_date,newsletter_count,body_word_count_sum,total_newsletters,dc_congress,dc_gender,dc_state,dc_district,dc_chamber,dc_party,fec_cid,fec_name,fec_cycle,similarity_score
0,A. FERGUSON,2019-01-13,1,357,44,116,M,GA,3,House,R,N00039090,DREW FERGUSON,2020.0,90
1,A. FERGUSON,2019-01-27,1,528,44,116,M,GA,3,House,R,N00039090,DREW FERGUSON,2020.0,90
2,A. FERGUSON,2019-02-10,1,457,44,116,M,GA,3,House,R,N00039090,DREW FERGUSON,2020.0,90
3,A. FERGUSON,2019-03-03,1,430,44,116,M,GA,3,House,R,N00039090,DREW FERGUSON,2020.0,90
4,A. FERGUSON,2019-03-17,1,271,44,116,M,GA,3,House,R,N00039090,DREW FERGUSON,2020.0,90


In [70]:
# Join summarized FEC and DC Inbox into a new data frame (left join) by day & politician,
# For use in computing rolling window contribution amount & count columns

print(len(dc_inbox_sum_df))
print(len(cand_contribs_df))

# Date field type corrections
dc_inbox_sum_df['dc_date'] = pd.to_datetime(dc_inbox_sum_df['dc_date'])
cand_contribs_df['contrib_date'] = pd.to_datetime(cand_contribs_df['contrib_date'])

window_col_join_df = cand_contribs_df.merge(
# joined_all_df = test_dcinbox_df.merge(
    # dc_inbox_sum_df,
    dc_inbox_sum_df.drop(columns=["fec_cycle", "fec_name", "dc_party", "dc_chamber", "dc_district",
                                  "dc_name", "newsletter_count", "body_word_count_sum",
                                  "dc_state", "dc_gender", "dc_congress", "similarity_score"]),
    left_on = ['fec_cid', "contrib_date"],
    right_on = ['fec_cid', "dc_date"],
    how = 'left'
)

# Drop the columns we don't need
# window_col_join_df = window_col_join_df.drop(columns=["dc_date"])

# Fill missing counts and amounts with 0 for candidates with no contributions (per day)
# dc_inbox_sum_df['contrib_count'] = dc_inbox_sum_df['body_word_count_sum'].fillna(0).astype(int)
# window_col_join_df['dc_date'] = window_col_join_df['dc_date'].fillna(0)

print(len(window_col_join_df))
window_col_join_df.head()
# dc_inbox_sum_df.info()

25785
38565
38565


Unnamed: 0,fec_cycle,fec_cid,fec_name,fec_party_original,no_pacs,fec_state,fec_district,fec_party,contrib_date,contrib_count,contrib_amount,avg_daily_contrib_count,avg_daily_contrib_amount,dc_date,total_newsletters
0,2020,N00030910,MO BROOKS,R,,AL,5,R,2019-11-11,2,2750,0.087432,109.907104,NaT,
1,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-06,1,100,0.087432,109.907104,NaT,
2,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-15,1,250,0.087432,109.907104,NaT,
3,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-17,6,9000,0.087432,109.907104,NaT,
4,2020,N00030910,MO BROOKS,R,,AL,5,R,2020-01-21,1,500,0.087432,109.907104,NaT,


In [71]:
# Add a rolling window of three days after the initial date to the joined FEC/DC Inbox data frame

# Set window days
input_window_days = 2

# Use a small test set at first
# window_col_df = window_col_join_df[window_col_join_df["fec_name"].str.contains("PELOSI")].copy()
# window_col_df = window_col_df[window_col_df["contrib_date"] >= "2019-11-19"].copy()

window_col_df = window_col_join_df.copy()

# Make sure date column is datetime
window_col_df["contrib_date"] = pd.to_datetime(window_col_df["contrib_date"])
# joined_all_window_df.loc[:, "dc_date"] = pd.to_datetime(joined_all_window_df["dc_date"])

# Fill NaNs in contrib columns to avoid issues during summing
window_col_df["contrib_count"] = window_col_df["contrib_count"].fillna(0).astype(int)
window_col_df["contrib_amount"] = window_col_df["contrib_amount"].fillna(0).astype(int)

# Define the group columns (unique identifiers for rolling sums)
group_cols = ["fec_name", "fec_cycle", "fec_state", "fec_district", "fec_party"]

# Sort by group and date
window_col_df = window_col_df.sort_values(by=group_cols + ["contrib_date"])

# Define a general rolling window function
def add_forward_rolling_sum(df, value_cols, window_days = 2):
    # value_cols should be a list, e.g. ["contrib_count", "contrib_amount"]
    days_col = "window_days"
    df[days_col] = 0  # initialize shared days column

    for value_col in value_cols:
        new_col = f"window_{value_col}"
        df[new_col] = 0

    for name, g in df.groupby(group_cols, sort=False):
        g = g.sort_values("contrib_date")
        dates = g["contrib_date"].reset_index(drop=True)
        dc_dates = g["dc_date"].reset_index(drop=True)
        results = {val: np.zeros(len(g), dtype=int) for val in value_cols}
        actual_days = np.zeros(len(g), dtype=int)

        for i, current_date in enumerate(dates):
            # Default window end
            window_end = current_date + pd.Timedelta(days=window_days)

            # Include all dates within window initially
            mask = (dates >= current_date) & (dates <= window_end)

            # Stop before first non-NaT dc_date if found
            next_reset_idx = np.where((dates > current_date) & pd.notna(dc_dates))[0]
            if len(next_reset_idx) > 0:
                first_reset_date = dates[next_reset_idx[0]]
                mask = mask & (dates < first_reset_date)
                window_end = min(window_end, first_reset_date - pd.Timedelta(days=1))

            # Store actual number of days used
            actual_days[i] = (window_end - current_date).days + 1

            # Compute rolling sums for all specified value columns
            for value_col in value_cols:
                values = g[value_col].fillna(0).astype(int).reset_index(drop=True)
                results[value_col][i] = int(values[mask].sum())

        # Write results back to df
        for value_col in value_cols:
            new_col = f"window_{value_col}"
            df.loc[g.index, new_col] = results[value_col]
        df.loc[g.index, days_col] = actual_days

    return df


# Compute rolling sums for count and amount
# window_col_df = add_forward_rolling_sum(window_col_df, "contrib_count", window_days=input_window_days)
# window_col_df = add_forward_rolling_sum(window_col_df, "contrib_amount", window_days=input_window_days)
window_col_df = add_forward_rolling_sum(
    window_col_df,
    value_cols=["contrib_count", "contrib_amount"],
    window_days = input_window_days
)

# Check results
# joined_all_window_df.head()[["dc_date", "dc_name", "contrib_count", "window_contrib_count", 
#                              "contrib_amount", "window_contrib_amount"]]
window_col_df.head()


Unnamed: 0,fec_cycle,fec_cid,fec_name,fec_party_original,no_pacs,fec_state,fec_district,fec_party,contrib_date,contrib_count,contrib_amount,avg_daily_contrib_count,avg_daily_contrib_amount,dc_date,total_newsletters,window_days,window_contrib_count,window_contrib_amount
19671,2020,N00040888,ABBY FINKENAUER,D,,IA,1,D,2019-11-06,4,7900,0.554645,512.923497,NaT,,3,4,7900
19672,2020,N00040888,ABBY FINKENAUER,D,,IA,1,D,2019-11-14,2,1020,0.554645,512.923497,NaT,,3,2,1020
19673,2020,N00040888,ABBY FINKENAUER,D,,IA,1,D,2019-11-18,2,75,0.554645,512.923497,NaT,,3,4,1325
19674,2020,N00040888,ABBY FINKENAUER,D,,IA,1,D,2019-11-19,2,1250,0.554645,512.923497,NaT,,3,2,1250
19675,2020,N00040888,ABBY FINKENAUER,D,,IA,1,D,2019-11-25,3,-3050,0.554645,512.923497,NaT,,3,3,-3050


In [76]:
# QC for joining

test_date = "2020-02-07"
test_name = "SCHIFF"

dc_inbox_sum_df['dc_date'] = pd.to_datetime(dc_inbox_sum_df['dc_date'])
test_dcinbox_df = dc_inbox_sum_df[dc_inbox_sum_df["dc_date"] == test_date]
test_dcinbox_df = test_dcinbox_df[test_dcinbox_df["dc_name"].str.contains(test_name)]

cand_contribs_df['contrib_date'] = pd.to_datetime(cand_contribs_df['contrib_date'])
test_fec_df = cand_contribs_df[cand_contribs_df["contrib_date"] == test_date]
test_fec_df = test_fec_df[test_fec_df["fec_name"].str.contains(test_name)]

print(len(test_dcinbox_df))
print(len(test_fec_df))
# test_dcinbox_df.head()
test_fec_df.head()

1
1


Unnamed: 0,fec_cycle,fec_cid,fec_name,fec_party_original,no_pacs,fec_state,fec_district,fec_party,contrib_date,contrib_count,contrib_amount,avg_daily_contrib_count,avg_daily_contrib_amount
760,2020,N00009585,ADAM SCHIFF,D,,CA,28,D,2020-02-07,19,13968,5.920765,2720.027322


In [73]:
# Join summarized DC Inbox and window-column FEC into a new data frame (left join) by day & politician

print("window_col_df: ", len(window_col_df))
print("dc_inbox_sum_df: ", len(dc_inbox_sum_df))

# Use a small test set at first
# dc_inbox_sum_df = dc_inbox_sum_df[dc_inbox_sum_df["dc_name"].str.contains("FINKENAUER")].copy()
# dc_inbox_sum_df = dc_inbox_sum_df[dc_inbox_sum_df["dc_date"] >= "2020-01-30"].copy()

joined_all_df = dc_inbox_sum_df.merge(
# joined_all_df = test_dcinbox_df.merge(
    # cand_contribs_df,
    window_col_df.drop(columns=['fec_cycle', "fec_name", "fec_party", "fec_state", "fec_district", "dc_date"]),
    # test_fec_df,
    left_on = ['fec_cid', "dc_date"],
    right_on = ['fec_cid', "contrib_date"],
    how = 'left'
)

print("joined_all_df: ", len(joined_all_df))

# Drop the columns we don't need
# joined_all_df = joined_all_df.drop(columns=['State', 'District', 'Party', "append_attribute2", "append_attribute3"])

# Add score columns for count and amount columns
# Calculated as:
#    ( (counts/amounts in window) / (number of days in window) )
#                           divided by
#                 (average daily count/amount) 
# To get a score of how much better or worse a given day did than average
joined_all_df["contrib_count_score"] = (joined_all_df["window_contrib_count"] / joined_all_df["window_days"]) / joined_all_df["avg_daily_contrib_count"]
joined_all_df["contrib_amount_score"] = (joined_all_df["window_contrib_amount"] / joined_all_df["window_days"]) / joined_all_df["avg_daily_contrib_amount"]

# Fill missing values for quantitative columns & format them
joined_all_df['window_days'] = joined_all_df['window_days'].fillna(0).astype(int)
joined_all_df['contrib_count'] = joined_all_df['contrib_count'].fillna(0).astype(int)
joined_all_df['contrib_amount'] = joined_all_df['contrib_amount'].fillna(0).astype(float).map("{:.2f}".format)
joined_all_df['avg_daily_contrib_count'] = joined_all_df['avg_daily_contrib_count'].fillna(0).astype(int)
joined_all_df['avg_daily_contrib_amount'] = joined_all_df['avg_daily_contrib_amount'].fillna(0).astype(float).map("{:.2f}".format)
joined_all_df['window_contrib_count'] = joined_all_df['window_contrib_count'].fillna(0).astype(int)
joined_all_df['window_contrib_amount'] = joined_all_df['window_contrib_amount'].fillna(0).astype(float).map("{:.2f}".format)
joined_all_df['contrib_count_score'] = joined_all_df['contrib_count_score'].fillna(0).astype(float).map("{:.2f}".format)
joined_all_df['contrib_amount_score'] = joined_all_df['contrib_amount_score'].fillna(0).astype(float).map("{:.2f}".format)

joined_all_df.head()
# dc_inbox_sum_df.info()

window_col_df:  38565
dc_inbox_sum_df:  25785
joined_all_df:  25785


Unnamed: 0,dc_name,dc_date,newsletter_count,body_word_count_sum,total_newsletters_x,dc_congress,dc_gender,dc_state,dc_district,dc_chamber,...,contrib_count,contrib_amount,avg_daily_contrib_count,avg_daily_contrib_amount,total_newsletters_y,window_days,window_contrib_count,window_contrib_amount,contrib_count_score,contrib_amount_score
0,A. FERGUSON,2019-01-13,1,357,44,116,M,GA,3,House,...,0,0.0,0,0.0,,0,0,0.0,0.0,0.0
1,A. FERGUSON,2019-01-27,1,528,44,116,M,GA,3,House,...,0,0.0,0,0.0,,0,0,0.0,0.0,0.0
2,A. FERGUSON,2019-02-10,1,457,44,116,M,GA,3,House,...,0,0.0,0,0.0,,0,0,0.0,0.0,0.0
3,A. FERGUSON,2019-03-03,1,430,44,116,M,GA,3,House,...,0,0.0,0,0.0,,0,0,0.0,0.0,0.0
4,A. FERGUSON,2019-03-17,1,271,44,116,M,GA,3,House,...,0,0.0,0,0.0,,0,0,0.0,0.0,0.0


In [74]:
# Output the final joined file to a CSV file

joined_all_df.to_csv(
#     r"C:\Users\hefla\Documents\Work\IPS\Area 990\Data Sources\Politicking\cand_contrib_summary_2020.csv",
    "../data/matched/joined_all_2020_test.csv",
    index=False,  # no row numbers
    encoding="latin1"
)