In [263]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import aiohttp
import asyncio
import random
%run ../data/states_districts.py

In [264]:
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
]





In [265]:
async def scrape_candidate_data(state_district):
    headers = {
    'User-Agent': random.choice(user_agents)
    }
    endpoint = f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state_district}&spec=N'

    timeout = aiohttp.ClientTimeout(total=600)
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async with session.get(endpoint, headers=headers) as res:
            if res.status == 200:  # HTTP Status Code 200 means OK
                text = await res.text()
                soup = BeautifulSoup(text, 'html.parser')
            else:
                print(f"Failed to retrieve {endpoint}: {res.status}")
                return []  # Return an empty list or another suitable default value

    
    pattern_candidate = re.compile(r"(.+?) \((R|D|I)\)( •Incumbent•Winner)?(\(([\d.]+)% of vote\))?")
    pandas_data = []
    
    # Extract candidate data
    for element in soup.find_all('h2'):
        text = element.get_text(strip=True)
        match = pattern_candidate.match(text)
        if match:
            name, party, incumbent_winner, _, vote_percentage = match.groups()
            incumbent = incumbent_winner is not None
            winner = incumbent  # Assumes if they are incumbent they are also the winner
            pandas_data.append({
                "Name": name,
                "Party": party,
                "Incumbent": incumbent,
                "Winner": winner,
                "Vote Percentage": vote_percentage,
                "Raised": None,
                "Spent": None
            })

    financial_data = []

    # Loop through all the tables with class 'Members--table'
    for table in soup.find_all('table', class_='Members--table'):
        # Initialize a dictionary to store the raised and spent values for this table
        table_data = {}
        
        # Loop through all the tr elements in the table
        for row in table.find_all('tr'):
            # Get the text content of all td elements in the row
            cols = [col.get_text() for col in row.find_all('td')]
            
            # Check if the first column is 'Raised' or 'Spent', and if so, store the data
            if cols[0] == 'Raised:':
                table_data['Raised'] = cols[1]
            elif cols[0] == 'Spent:':
                table_data['Spent'] = cols[1]
        
        # Append the data dictionary to the financial_data list
        financial_data.append(table_data)

    # Check if lengths of pandas_data and financial_data match
    if len(pandas_data) == len(financial_data):
        # Update pandas_data with financial data
        for p_data, f_data in zip(pandas_data, financial_data):
            p_data.update(f_data)
    else:
        print(f"Mismatch in the number of entries between candidate data and financial data for {state_district}")

    return pandas_data


In [266]:
async def fetch_all_data():
    all_data = []
    tasks = []
    for state in congressional_districts:
        for district in congressional_districts[state]:
            webpage = state + district
            # print(webpage)
            tasks.append(scrape_candidate_data(webpage))
    all_data.extend(await asyncio.gather(*tasks))
    return all_data

# To run your async function in Jupyter, you could use:
all_scraped_data = await fetch_all_data()

Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ05&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ08&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ02&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AL06&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ07&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AL07&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ03&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AL02&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ04&spec=N: 429
Failed to retrieve https://www.opensecrets.org/races/candidates?cycle=2020&id=AZ01&spec=N: 429
Failed to retrieve https://www.opensecrets.org/rac

TimeoutError: 