# An Analysis of Political Contributions During the 2020 House of Representatives Election

The goal of this notebook is to scrape data on the 2020 House of Representatives election from the OpenSecrets website.

In [1]:
import re
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [2]:
## Prepare list of all URLs
# Source pages for the per-state seat counts and the state postal codes,
# plus the template for one OpenSecrets race-summary page per district.
SEATS_URL = 'https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120'
ABBREV_URL = 'https://worldpopulationreview.com/states/state-abbreviations'
RACE_URL_TEMPLATE = 'https://www.opensecrets.org/races/summary?cycle=2020&id={}{}&spec=N'


def _first_table(url, timeout=30):
    """Download `url` and return the first HTML <table> on the page as a DataFrame.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page contains no <table> element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BS(response.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError('No <table> found at {}'.format(url))
    # read_html expects a file-like object; passing a literal HTML string is deprecated
    return pd.read_html(StringIO(str(table)))[0]


# Scrape table of states and # of representatives per state from Britannica site.
# Row label 50 is dropped -- presumably a totals row; verify against the live table.
states_reps = _first_table(SEATS_URL).drop(50)

# Scrape abbreviation table from World Population Review site
states_abv = _first_table(ABBREV_URL).rename({'State': 'state'}, axis='columns')

# The merge below is an inner join, so a state spelled differently in the two
# sources would silently vanish from the URL list; report any such states.
unmatched = sorted(set(states_reps['state']) - set(states_abv['state']))
if unmatched:
    print('Warning: no abbreviation found for: {}'.format(', '.join(map(str, unmatched))))

# Merge the two read in dataframes
states = pd.merge(states_reps, states_abv, on='state').drop(columns=['Abbreviation', 'state'])

# Build one URL per (state code, district number); districts are zero-padded to two digits
URLs = [
    RACE_URL_TEMPLATE.format(code, str(dist).zfill(2))
    for code, seats in zip(states['Code'], states['representatives'])
    for dist in range(1, int(seats) + 1)
]

In [3]:
## Scrape the data for all Districts for each US State
# Race ids look like `id=TX07`: two word characters (state) then two digits (district)
RACE_ID_PATTERN = re.compile(r'id=(\w\w)(\d\d)')
REQUEST_DELAY = 0.2  # seconds to pause after each request

# Collect one dataframe per district and concatenate once at the end;
# concatenating inside the loop re-copies all accumulated rows every iteration.
district_frames = []

# Loop through list of URLs and extract data from each
for URL in URLs:
    response = requests.get(URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    time.sleep(REQUEST_DELAY)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BS(response.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError('No <table> found at {}'.format(URL))
    # read_html expects a file-like object; passing a literal HTML string is deprecated
    loop_df = pd.read_html(StringIO(str(table)))[0]

    # Read the race id from the page's first <link> tag, falling back to the
    # requested URL if that tag does not carry one.
    race_id = RACE_ID_PATTERN.search(str(soup.find('link'))) or RACE_ID_PATTERN.search(URL)
    if race_id is None:
        raise ValueError('Could not determine state/district for {}'.format(URL))

    # Create state and district columns (same value for every row of this page)
    loop_df['state'] = race_id.group(1)
    loop_df['district'] = race_id.group(2)

    district_frames.append(loop_df)

# Combine data from webscraping into one dataframe
US = pd.concat(district_frames, ignore_index=True) if district_frames else pd.DataFrame()

In [4]:
# Write the combined scrape results to disk, leaving out the row index
scraped_csv_path = '../data/US_scraped.csv'
US.to_csv(path_or_buf=scraped_csv_path, index=False)