In [82]:
import pandas as pd
from bs4 import BeautifulSoup as BS
import numpy as np 
import requests
import re
from time import sleep
from random import randint

In [83]:
#These below functions sift through the candidate header for information
def name_extractor(candidate):
            return re.findall(r'[A-Z]\w+ [A-Z]\w+',candidate)
def party_extractor(candidate):
            return re.findall(r'\(\w\)',candidate)[0]
def incumbent_finder(candidate):
            incumbent = re.findall(r'Incumbent',candidate)
            if len(incumbent)>0:
                return incumbent[0]
            else: 
                return 'N/a'
def winner_finder(candidate):
            winner = re.findall(r'Winner',candidate)
            if len(winner)>0:
                return winner[0]
            else:
                return 'N/a'
def percentage_vote_finder(candidate):
            return re.findall(r'(?<=\()\d+.\d',candidate)
#This will be based on the website you are on, there are different websites for each State,District combo. That's why soup is input
def state_and_district_finder(soup):
            string = str(soup.findAll('h1'))
            state = re.findall(r'(?<=>)[A-Z]\w+', string)
            district = re.findall(r'District \d{2}', string)
            return state, district; 

In [84]:
#read in State and District info to pull each state and district url with candidates
state_info = pd.read_csv('../webscraping_open_secrets-malted_milk_balls/Data/state_abr_districts.csv', dtype=str)

In [85]:
#create lists of state abbreviations and district numbers
state_info = state_info['Abbreviation'] + state_info['District Numbers']
state_info

0      AL01
1      AL02
2      AL03
3      AL04
4      AL05
       ... 
430    WI05
431    WI06
432    WI07
433    WI08
434    WY01
Length: 435, dtype: object

In [86]:
urls = [f'https://www.opensecrets.org/races/candidates?cycle=2020&id={i}&spec=N' for i in (state_info)]
urls

['https://www.opensecrets.org/races/candidates?cycle=2020&id=AL01&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AL02&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AL03&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AL04&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AL05&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AL06&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AL07&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AK01&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ01&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ02&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ03&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ04&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ05&spec=N',
 'https://www.opensecrets

In [87]:
# Run entire code through for loop
url_list = []

for url in urls: 
    page=requests.get(url)
    if page.status_code != 200:
        # print('Error! Request Denied. Retrying page request.')
        while page.status_code != 200:
            page=requests.get(url)

    soup=BS(page.content,'html.parser')
    # May or may not need this sleep timer
    #sleep(randint(2,5)) 

    state_and_district = state_and_district_finder(soup)

    candidates = []
    for candidate in soup.findAll('div', class_ = "Members--bio u-richtext"):
        candidates.append(candidate.text.strip()) 
        

    money = []
    for tag in soup.findAll('table', class_ = 'Members--table'):
        for anchor in tag.findAll('td', class_ = 'Members--number'):
            money.append(int(anchor.text.replace('$','').replace(',','')))
    
    money_table = np.reshape(money, (len(candidates),3))

    names = []
    party_affiliation = []
    incumbent_status = []
    winner_status = []
    percentage_vote = []
    for i in candidates:
        names.append(name_extractor(i))
        party_affiliation.append(party_extractor(i))
        incumbent_status.append(incumbent_finder(i))
        winner_status.append(winner_finder(i))
        percentage_vote.append(percentage_vote_finder(i))

    congressional_races = pd.DataFrame({'Name':names,
                                'Party':party_affiliation,
                                'State':state_and_district[0]*len(candidates),
                                'District Number':state_and_district[1]*len(candidates),
                                'Incumbent Status':incumbent_status,
                                'Winner Status':winner_status,
                                'Percentage of Vote':percentage_vote,
                                'Total Amount Raised':list(money_table[:,0]),
                                'Total Amount Spent':list(money_table[:,1])
                                })

#This line and the following should be the final merge code.    
    url_list.append(congressional_races)
    
candidates = pd.concat(url_list)
candidates = candidates.reset_index(drop=True)
candidates['Name'] = candidates['Name'].str.get(0)
candidates['Percentage of Vote'] = candidates['Percentage of Vote'].str.get(0)
candidates['Percentage of Vote'] = candidates['Percentage of Vote'].astype(float)
candidates

Unnamed: 0,Name,Party,State,District Number,Incumbent Status,Winner Status,Percentage of Vote,Total Amount Raised,Total Amount Spent
0,Jerry Carl,(R),Alabama,District 01,N/a,Winner,64.9,1971321,1859349
1,James Averhart,(D),Alabama,District 01,N/a,N/a,35.0,80095,78973
2,Barry Moore,(R),Alabama,District 02,N/a,Winner,65.3,650807,669368
3,Phyllis Harvey,(D),Alabama,District 02,N/a,N/a,34.6,56050,55988
4,,(R),Alabama,District 03,Incumbent,Winner,67.5,1193111,1218564
...,...,...,...,...,...,...,...,...,...
887,Tricia Zunker,(D),Wisconsin,District 07,N/a,N/a,39.2,1261957,1232690
888,Mike Gallagher,(R),Wisconsin,District 08,Incumbent,Winner,64.0,3202905,2841801
889,Amanda Stuck,(D),Wisconsin,District 08,N/a,N/a,36.0,416978,399916
890,Liz Cheney,(R),Wyoming,District 01,Incumbent,Winner,68.6,3003883,3060167
