# An Analysis of Political Contributions During the 2020 House of Representatives Election

Goal of this notebook is to:
1. Webscrape the Open Secrets pages detailing the contribution sources for each candidate. 
2. Answer the following questions:
    * What does the overall distribution of funding sources look like?
    * Is there any detectable difference in contribution sources between Democrat and Republican candidates?
    * Do the funding sources for either the winning candidate or incumbent candidate differ from the other candidates?

In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as BS
import time

In [2]:
## Prepare list of all URLs
# Initialize empty list for URLs
URLs = []

# Scrape table of state abbreviations and # of representatives per state from Britannica site
response = requests.get('https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120')
soup = BS(response.text)
states_reps = pd.read_html(str(soup.find('table')))[0].drop(50)

# Scrape abbreviation table from World Population Review site
response = requests.get('https://worldpopulationreview.com/states/state-abbreviations')
soup = BS(response.text)
states_abv = pd.read_html(str(soup.find('table')))[0]
states_abv = (states_abv.rename({'State':'state'},
                                axis='columns'))

# Merge the two read in dataframes
states = pd.merge(states_reps, states_abv, on='state').drop(columns=['Abbreviation','state'])

# Loop through states dataframe and create list of all URLs
for index in states.index:
    code = states.loc[index]['Code']
    districts = states.loc[index]['representatives']
    
    for dist in range(1, districts+1):
        URL = 'https://www.opensecrets.org/races/candidates?cycle=2020&id={}{}&spec=N'.format(code, str(dist).zfill(2))
        URLs.append(URL)

In [9]:
# Initialize empty dataframe to hold webscraped data
contributions = pd.DataFrame()

# Loop through lists of URLs and extract data from each
for URL in URLs:   
    response = requests.get(URL)
    time.sleep(0.2)
    soup = BS(response.text)
    strong = soup.findAll('strong')

    for i in range(len(strong)):
        # Intialize for loop variables
        cont_df = pd.DataFrame()
        temp_list = []

        # Format dataframe for current webpage
        cont_df = pd.read_html(str(soup.findAll('table')[1 + (3*i)]))[0].transpose()
        cont_df.columns = cont_df.iloc[0]
        cont_df = cont_df.drop(labels=['Type of Contribution', 'Percentage'], axis=0)
        cont_df = cont_df.reset_index(drop=True)

        # Split cand_info into separate elements
        cand_info = strong[i].text.split(' • ')

        # Assign candidate name to candidate column
        name = re.sub(r'\s\W\w\W', "", re.sub(r'\n{0,}\t{0,}', '', cand_info[0]))
        cont_df = cont_df.rename(index={0: name})

        # Create new column named party
        party = re.sub(r'\W', '', re.findall(r'\W\w\W', re.sub(r'\n', '', cand_info[0]))[0])
        cont_df.at[name, 'party'] =  party

        # Create column named incumbent and determine incumbent status
        for j in range(len(cand_info)):
            temp = re.sub(r'\s\W\w\W', "", re.sub(r'\n{0,}\t{0,}', '', cand_info[j]))
            temp_list.append(temp)

        if ('Incumbent' in temp_list):
            cont_df.at[name, 'incumbent'] = True
        else:
            cont_df.at[name, 'incumbent'] = False

        if ('Winner ' in temp_list):
            cont_df.at[name, 'winner'] = True
        else:
            cont_df.at[name, 'winner'] = False

        contributions = pd.concat([contributions, cont_df])

contributions

Type of Contribution,Small Individual Contributions (≤ $200),Large Individual Contributions,PAC Contributions*,Candidate self-financing,Other,party,incumbent,winner
Jerry Carl,"$44,580","$999,616","$387,000","$434,656","$105,470",R,False,True
James Averhart,"$12,895","$37,955",$0,"$29,245",$0,D,False,False
Barry Moore,"$62,208","$346,329","$230,282","$11,500",$489,R,False,True
Phyllis Harvey-Hall,"$15,307","$27,105","$2,032","$10,575","$1,030",D,False,False
Mike D Rogers,"$5,315","$438,054","$744,734",$0,"$5,009",D,True,True
...,...,...,...,...,...,...,...,...
Tricia Zunker,"$600,015","$566,860","$91,807",$0,"$3,276",D,False,False
Mike Gallagher,"$188,430","$2,040,299","$841,300",$0,"$132,876",R,True,True
Amanda Stuck,"$169,380","$221,453","$30,183",$0,"-$4,038",D,False,False
Liz Cheney,"$189,647","$980,349","$1,292,490",$0,"$541,398",R,True,True
