In [205]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import numpy as np
import re

In [249]:
# Provided list of TN candidate urls
urls = ['https://www.opensecrets.org/races/candidates?cycle=2020&id=TN01&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN02&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN03&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN04&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN05&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN06&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN07&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN08&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN09&spec=N']

url_list = []

# Run entire code through for loop
for url in urls: 
    page=requests.get(url)
    soup=BS(page.content,'html.parser')
    
    def name_extractor(candidate):
                return re.findall(r'[A-Z]\w+ [A-Z]\w+',candidate)[0]

    def party_extractor(candidate):
                return re.findall(r'\(\w\)',candidate)[0]

    def incumbent_finder(candidate):
                incumbent = re.findall(r'Incumbent',candidate)
                if len(incumbent)>0:
                    return incumbent[0]
                else: 
                    return 'N/a'

    def winner_finder(candidate):
                winner = re.findall(r'Winner',candidate)
                if len(winner)>0:
                    return winner[0]
                else:
                    return 'N/a'

    def percentage_vote_finder(candidate):
                return re.findall(r'(?<=\()\d+.\d\%',candidate)

    #This will be based on the website you are on, there are different websites for each State,District combo. That's why soup is input
    def state_and_district_finder(soup):
                string = str(soup.findAll('h1'))
                state = re.findall(r'(?<=>)[A-Z]\w+', string)
                district = re.findall(r'District \d{2}', string)
                return state, district; 

    state_and_district = state_and_district_finder(soup)

    candidates = []
    for candidate in soup.findAll('div', class_ = "Members--bio u-richtext"):
               candidates.append(candidate.text.strip()) 

    money = []
    for tag in soup.findAll('table', class_ = 'Members--table'):
               for anchor in tag.findAll('td', class_ = 'Members--number'):
                  money.append(int(anchor.text.replace('$','').replace(',','')))
    money_table = np.reshape(money, (len(candidates),3))

    names = []
    party_affiliation = []
    incumbent_status = []
    winner_status = []
    percentage_vote = []
    for i in candidates:
               names.append(name_extractor(i))
               party_affiliation.append(party_extractor(i))
               incumbent_status.append(incumbent_finder(i))
               winner_status.append(winner_finder(i))
               percentage_vote.append(percentage_vote_finder(i))

    congressional_races = pd.DataFrame({'Name':names,
                                'Party':party_affiliation,
                                'State':state_and_district[0]*len(candidates),
                                'District Number':state_and_district[1]*len(candidates),
                                'Incumbent Status':incumbent_status,
                                'Winner Status':winner_status,
                                'Percentage of Vote':percentage_vote,
                                'Total Amount Raised':list(money_table[:,0]),
                                'Total Amount Spent':list(money_table[:,1])
                                })

#This line and the following should be the final merge code.    
    url_list.append(congressional_races)

#Run this outside of for loop.
tn_candidates = pd.concat(url_list)
tn_candidates = tn_candidates.reset_index(drop=True)
tn_candidates

Unnamed: 0,Name,Party,State,District Number,Incumbent Status,Winner Status,Percentage of Vote,Total Amount Raised,Total Amount Spent
0,Diana Harshbarger,(R),Tennessee,District 01,N/a,Winner,[74.8%],2126946,1869100
1,Blair Nicole,(D),Tennessee,District 01,N/a,N/a,[22.4%],140209,134995
2,Tim Burchett,(R),Tennessee,District 02,Incumbent,Winner,[67.7%],1336276,878488
3,Renee Hoyos,(D),Tennessee,District 02,N/a,N/a,[31.0%],812784,816793
4,Chuck Fleischmann,(R),Tennessee,District 03,Incumbent,Winner,[67.3%],1051653,381411
5,Meg Gorman,(D),Tennessee,District 03,N/a,N/a,[30.5%],85843,77760
6,Scott Desjarlais,(R),Tennessee,District 04,Incumbent,Winner,[66.7%],331464,392499
7,Christopher Hale,(D),Tennessee,District 04,N/a,N/a,[33.3%],308731,302996
8,Jim Cooper,(D),Tennessee,District 05,Incumbent,Winner,[100.0%],936569,1332131
9,John Rose,(R),Tennessee,District 06,Incumbent,Winner,[73.7%],1050429,625688


In [231]:
# Potential for loop replacement
results = []
state_list = x
district_list = x
for state in state_list & for district in district_list:
      url = f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state}{district}&spec=N'
      results.append(url)
      page = page + 1

['https://www.opensecrets.org/races/candidates?cycle=2020&id=TN01&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN02&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN03&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN04&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN05&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN06&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN07&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN08&spec=N',
 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN09&spec=N']

In [240]:
states= pd.read_csv('/Users/rachael.miller/Documents/NSS_Projects/webscraping_open_secrets-malted_milk_balls/state_abr_districts.csv')
states

Unnamed: 0,State,Abbreviation,Number of Districts
0,Alabama,AL,7
1,Alaska,AK,1
2,Arizona,AZ,9
3,Arkansas,AR,4
4,California,CA,53
5,Colorado,CO,7
6,Connecticut,CT,5
7,Delaware,DE,1
8,Florida,FL,27
9,Georgia,GA,14
