In [167]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import timeit
import re
from tqdm.notebook import tqdm
import time

In [183]:
def update_url(URL, numb_districts):
    district = numb_districts

    url = URL + f"{district}&spec=N"
    
    return url

def split_title(title):
    title = title.get_text()
    title = (
    title.replace(' 2020 Race • OpenSecrets', '')
         .replace('District ', '')
         .split(' ')
        )
    return title

def get_dataframe(state, district, election_results):
    regex_candidate = r"(\w+\s\w+)"
    incumbent = "Incumbent"
    regex_party = r"(\(+\w+\))"
    winner = "Winner"

    columns_extract= {'Candidate_name': regex_candidate,'Party':regex_party}
    columns_contains = {'Incumbent':incumbent, 'Winner':winner}

    for key, value in columns_extract.items():
        election_results[key] = election_results['Candidate'].str.extract(value, expand=True)

    for key, value in columns_contains.items():
        election_results[key] = election_results['Candidate'].str.contains(value)
        
    election_results['Raised'] = election_results['Raised'].map(lambda x: re.sub(r'\D', '', x))
    election_results['Spent'] = election_results['Spent'].map(lambda x: re.sub(r'\D', '', x))
    
    election_results['Raised'] = election_results['Raised'].apply(pd.to_numeric)
    election_results['Spent'] = election_results['Spent'].apply(pd.to_numeric)
    
    election_results['percent_raised'] = election_results['Raised'] / election_results['Raised'].sum() * 100
    election_results['percent_spent'] = election_results['Spent'] / election_results['Spent'].sum() * 100


    election_results['State'] = state 

    election_results['District'] = district

    election_results = election_results[['Candidate_name', 
                                         'Party', 
                                         'State',
                                         'District',
                                         'Incumbent', 
                                         'Winner', 
                                         'Raised', 
                                         'Spent',
                                         'percent_raised',
                                         'percent_spent']]
    return election_results
    
def generate_dataframe(response):
    soup = BS(response.text)
    soup.prettify()
    
    
    title = soup.find('title')
    title = split_title(title)
    state = (title[0])
    district = title[1]
    
    election_results = pd.read_html(str(soup.find('table')))[0]
    election_results = get_dataframe(state, district, election_results)
    return election_results

def state_parse(key, value):
    url = f"https://www.opensecrets.org/races/summary?cycle=2020&id={key}"
    numb_districts = value
    election_results = pd.DataFrame(columns=['Candidate_name', 
                                         'Party', 
                                         'State',
                                         'District',
                                         'Incumbent', 
                                         'Winner', 
                                         'Raised', 
                                         'Spent'])
    for i in range(1, numb_districts + 1):
        time.sleep(0.5)
        district = '{:0>2}'.format(i)
        URL = update_url(url, district)
        response = requests.get(URL)

        results = generate_dataframe(response)
        
        election_results = election_results.append(results)
       
    return election_results


In [184]:
#election_results = pd.DataFrame(columns=['Candidate_name', 
#                                         'Party', 
 #                                        'State',
  #                                       'District',
   #                                      'Incumbent', 
    #                                     'Winner', 
     #                                    'Raised', 
      #                                   'Spent'])

In [185]:
URL_abb = "https://www.scouting.org/resources/los/states/"

response_abb = requests.get(URL_abb)

soup_abb = BS(response_abb.text)

soup_abb

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

In [186]:
URL_states = "https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120"
URL_abb = "https://www.50states.com/abbreviations.htm"

response_states = requests.get(URL_states)
response_abb = requests.get(URL_abb)

soup = BS(response_states.text)

soup_abb = BS(response_abb.text)

abb = pd.read_html(str(soup_abb.find('table')))[0]
abb = abb.drop(columns='STANDARD ABBREVIATION')
abb = abb.rename(columns={'US STATE':'state', 'POSTAL ABBREVIATION':'state_abb'})


states = pd.read_html(str(soup.find('table')))[0]
states = states.drop([50])

state_districts = pd.merge(states, abb, on='state')
state_districts['representatives'] = state_districts['representatives'].apply(pd.to_numeric)
state_districts = state_districts[['state_abb', 'representatives']].set_index('state_abb').T.to_dict('records')

state_districts = state_districts[0]
state_districts

{'AL': 7,
 'AK': 1,
 'AZ': 9,
 'AR': 4,
 'CA': 53,
 'CO': 7,
 'CT': 5,
 'DE': 1,
 'FL': 27,
 'GA': 14,
 'HI': 2,
 'ID': 2,
 'IL': 18,
 'IN': 9,
 'IA': 4,
 'KS': 4,
 'KY': 6,
 'LA': 6,
 'ME': 2,
 'MD': 8,
 'MA': 9,
 'MI': 14,
 'MN': 8,
 'MS': 4,
 'MO': 8,
 'MT': 1,
 'NE': 3,
 'NV': 4,
 'NH': 2,
 'NJ': 12,
 'NM': 3,
 'NY': 27,
 'NC': 13,
 'ND': 1,
 'OH': 16,
 'OK': 5,
 'OR': 5,
 'PA': 18,
 'RI': 2,
 'SC': 7,
 'SD': 1,
 'TN': 9,
 'TX': 36,
 'UT': 4,
 'VT': 1,
 'VA': 11,
 'WA': 10,
 'WV': 3,
 'WI': 8,
 'WY': 1}

In [187]:
state_districts['AL']

7

In [188]:
  election_results = pd.DataFrame(columns=['Candidate_name', 
                                         'Party', 
                                         'State',
                                         'District',
                                         'Incumbent', 
                                         'Winner', 
                                         'Raised', 
                                         'Spent'])

for key in tqdm(state_districts.keys()):
    results = state_parse(key, state_districts[key])
    election_results = election_results.append(results)

election_results    

  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,Candidate_name,Party,State,District,Incumbent,Winner,Raised,Spent,percent_raised,percent_spent
0,Jerry Carl,(R),Alabama,01,False,True,1971321,1859349,96.095624,95.925703
1,James Averhart,(D),Alabama,01,False,False,80095,78973,3.904376,4.074297
0,Barry Moore,(R),Alabama,02,False,True,650807,669368,92.070532,92.281307
1,Phyllis Harvey,(D),Alabama,02,False,False,56050,55988,7.929468,7.718693
0,Mike D,(R),Alabama,03,True,True,1193111,1218564,95.956760,96.747133
...,...,...,...,...,...,...,...,...,...,...
1,Tricia Zunker,(D),Wisconsin,07,False,False,1261957,1232690,32.362718,32.894277
0,Mike Gallagher,(R),Wisconsin,08,True,True,3202905,2841801,88.480898,87.663451
1,Amanda Stuck,(D),Wisconsin,08,False,False,416978,399916,11.519102,12.336549
0,Liz Cheney,(R),Wyoming,01,True,True,3003883,3060167,95.711395,95.857821


In [189]:
election_results.to_csv("../webscraping_open_secrets-silver-ish-sharks/data/election_results.csv")

In [149]:
for key in state_districts:
    print(state_districts[key])

7
1
9
4
53
7
5
1
27
14
2
2
18
9
4
4
6
6
2
8
9
14
8
4
8
1
3
4
2
12
3
27
13
1
16
5
5
18
2
7
1
9
36
4
1
11
10
3
8
1
