In [1]:
import requests
from bs4 import BeautifulSoup as BS
import numpy as np
import pandas as pd
import re
import time
from tqdm.notebook import tqdm

In [2]:
def candidate_info(df):
    """Extract candidate info from Open Secrets tables, then drop candidate column."""
    
    df['incumbent'] = df['candidate'].str.contains(r'Incumbent')
    df['winner'] = df['candidate'].str.contains(r'Winner')
    df['party'] = df['candidate'].apply(lambda x: re.search(r'\((.*?)\)',x).group(1))
    df['name'] = df['candidate'].str.split(r'\(.\)', expand = True)[0]
    df = df.drop(columns = 'candidate')[['district', 'name', 'party','incumbent', 'winner', 'raised', 'spent']]

    return df

def clean_money(df):
    """Clean and conver the money columns"""
    
    money_cols = ['raised', 'spent']
    for col in money_cols:
        df[col] = df[col].str.replace(r'\$|,', '', regex = True).astype('int')
    
    return df

def format_os_table(df):
    """lowercase column names, extract candidate info, rearrange columns, and clean the money columns"""
    
    df.columns = df.columns.str.lower()
    df = candidate_info(df)
    df = clean_money(df)
    
    return df

def get_format_os2020_table(i, abbrv):
    """Takes an index and state abbrreviation and returns a district dataframe corresponding to district <abbrv><i>"""
    
    # Build state-district combinations for URL and dataframe
    if i < 10:
        state_dist = f"{abbrv}0{str(i)}"
    else:
        state_dist = f"{abbrv}{str(i)}"

    # Get request
    URL = f'https://www.opensecrets.org/races/summary?cycle=2020&id={state_dist}&spec=N'
    response = requests.get(URL)
    soup = BS(response.text)

    # Make df
    district = pd.read_html(str(soup.find('table')))[0]

    # Add a district column
    district[['district', 'state']] = [state_dist, state]

    # Format
    district = format_os_table(district)

    return district

In [3]:
US = pd.DataFrame(columns = ['district', 'name', 'party','incumbent', 'winner', 'raised', 'spent'])

In [4]:
# Get request
URL = 'https://www.50states.com/abbreviations.htm'
response = requests.get(URL)
soup = BS(response.text)

abv = pd.read_html(str(soup.find('table')))[0][['US STATE', 'POSTAL ABBREVIATION']].rename(columns = {'US STATE':'state', 'POSTAL ABBREVIATION':'abbreviation'})

In [5]:
URL = 'https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120'
response = requests.get(URL)
soup = BS(response.text)

states = abv.merge(pd.read_html(str(soup.find('table')))[0])
states.to_csv('../data/state_reps_ref.csv', index = False)

In [6]:
# for ind, row in states.iterrows():
    
#     # Save current state information in cleaner variable names
#     state = row['state']
#     abbrv = row['abbreviation']
#     reps = row['representatives']

#     # Loop through each district of the current state
#     for i in range(1,reps+1):
                
#         # Build state-district combinations for URL and dataframe
#         if i < 10:
#             state_dist = f"{abbrv}0{str(i)}"
#         else:
#             state_dist = f"{abbrv}{str(i)}"
            
#         # Get request
#         URL = f'https://www.opensecrets.org/races/summary?cycle=2020&id={state_dist}&spec=N'
#         response = requests.get(URL)
#         soup = BS(response.text)
        
#         # Make df
#         district = pd.read_html(str(soup.find('table')))[0]

#         # Add a district column
#         district[['district', 'state']] = [state_dist, state]

#         # Format
#         district = format_os_table(district)

#         US = pd.concat([US, district])
        
#     time.sleep(3)

In [7]:
for ind, row in states.iterrows():
    
    # Save current state information in cleaner variable names
    state = row['state']
    abbrv = row['abbreviation']
    reps = row['representatives']

    # Loop through each district of the current state
    for i in range(1,reps+1):
                
        district = get_format_os2020_table(i, abbrv)
        district['state'] = state
        
        US = pd.concat([US, district])
        
    time.sleep(2)

In [8]:
US

Unnamed: 0,district,name,party,incumbent,winner,raised,spent,state
0,AL01,Jerry Carl,R,False,True,1971321,1859349,Alabama
1,AL01,James Averhart,D,False,False,80095,78973,Alabama
0,AL02,Barry Moore,R,False,True,650807,669368,Alabama
1,AL02,Phyllis Harvey-Hall,D,False,False,56050,55988,Alabama
0,AL03,Mike D Rogers,R,True,True,1193111,1218564,Alabama
...,...,...,...,...,...,...,...,...
1,WI07,Tricia Zunker,D,False,False,1261957,1232690,Wisconsin
0,WI08,Mike Gallagher,R,True,True,3202905,2841801,Wisconsin
1,WI08,Amanda Stuck,D,False,False,416978,399916,Wisconsin
0,WY01,Liz Cheney,R,True,True,3003883,3060167,Wyoming


In [9]:
US.to_csv('../data/us_rep_elections.csv', index = False)