In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [43]:
def convert_to_float(val):
    """
    Parse a number out of a decorated string such as a currency amount or a percentage.

    Every character other than the ASCII digits 0-9 and the period is discarded,
    and whatever remains is handed to ``float``.

    Args:
        val (str): Text containing the number, e.g. "$1,234.50" or "36.9%".

    Returns:
        float: The value spelled by the remaining digits and periods.

    Raises:
        ValueError: If the remaining text is not a valid float (for example it is
            empty, or it contains more than one period).

    Note:
        A minus sign is discarded like any other symbol, so "-5" yields 5.0.

    Example:
        convert_to_float("$123.45")  # Returns 123.45
    """
    digits_and_dots = re.compile(r"[^0-9\.]").sub("", val)
    return float(digits_and_dots)

In [101]:
def _split_party_tag(text):
    """
    Separate a parenthesised one-character party code, e.g. "(R)", from ``text``.

    Args:
        text (str): Candidate text that may contain a party tag.

    Returns:
        tuple: ``(text_without_tags, party_code)``. ``party_code`` is the character of
        the first tag found, or "" when the text holds no tag.
    """
    # The "+" sits inside the character class, so it is a literal plus sign rather
    # than a quantifier: a tag is exactly one word character (or "+") in parentheses.
    party_tag = r"\(([\w+])\)"
    found = re.search(party_tag, text)
    party = found.group(1) if found else ""
    return re.sub(party_tag, "", text), party


def get_candidate_info(member_div):
    """
    Extracts candidate information from a BeautifulSoup element (a div) representing a candidate.

    Fields that cannot be located in the markup keep their defaults instead of raising,
    so one incomplete card does not abort the scrape of a whole page.

    Args:
        member_div (BeautifulSoup): The BeautifulSoup element containing candidate information.

    Returns:
        dict: A dictionary with the keys
            "Name" (str), "Affiliation" (one-character party code, or "" if absent),
            "Incumbent" (1 or 0; only ever 1 when the card's link text contains "Incumbent"),
            "Winner" (1 if a ``span.winner`` is present, else 0),
            "Vote %" (float, or "" if absent),
            "Cash Raised" and "Cash Spent" (float, or "" if absent).

    Raises:
        ValueError: If a located cash cell contains no parseable number
            (propagated from ``convert_to_float``).

    Example:
        get_candidate_info(member_div)  # Returns a dictionary with candidate information.
    """
    results = {"Name": "", "Affiliation": "", "Incumbent": 0, "Winner": 0, "Vote %": "", "Cash Raised": "", "Cash Spent": ""}

    link = member_div.find("a")
    if link is not None:
        # Cards with a link keep the name, the party tag and the optional
        # "Incumbent" marker together in the link text.
        link_text = link.text
        results["Incumbent"] = 1 if "Incumbent" in link_text else 0
        candidate_text, affiliation = _split_party_tag(link_text)
        candidate_text = candidate_text.replace("Incumbent", "")
        # Drop separators/punctuation left around the name, then trim the
        # whitespace those removals leave behind.
        name = re.sub(r"[^0-9a-zA-Z\s]+", "", candidate_text).strip()
    else:
        # Cards without a link put the same text in <h2><strong>; only the first
        # newline/tab/plus-delimited segment holds the name and party tag.
        heading = member_div.find("h2")
        strong = heading.find("strong") if heading is not None else None
        txt = strong.text.strip() if strong is not None else ""
        segments = re.findall(r"[^\n\t+]+", txt)
        name, affiliation = _split_party_tag(segments[0] if segments else "")
        name = name.strip()

    results["Name"] = name
    results["Affiliation"] = affiliation

    # check if the candidate is a winner
    if member_div.find("span", attrs={'class': 'winner'}) is not None:
        results["Winner"] = 1

    # get the vote percentage; whole-number values such as "60%" are accepted too
    vote_span = member_div.find("span", attrs={'class': 'Members--vote-pct'})
    if vote_span is not None:
        pct = re.search(r"\d+(?:\.\d+)?%", vote_span.text)
        if pct:
            results["Vote %"] = convert_to_float(pct.group(0))

    # get the cash amounts: first numeric cell is raised, second is spent.
    # zip() stops early, so a missing table or a short row leaves the defaults in place.
    table = member_div.find("table", attrs={'class': 'Members--table'})
    cash_cells = table.find_all("td", attrs={'class': 'Members--number'}) if table is not None else []
    for key, cell in zip(("Cash Raised", "Cash Spent"), cash_cells):
        results[key] = convert_to_float(cell.text)

    return results




In [107]:
def get_all_candidates_for_district_url(url, timeout=30):
    """
    Retrieves and extracts information about all candidates for a specific district from a given URL.

    Args:
        url (str): The URL of the page containing candidate information for a specific district.
            It must contain a district id made of two capital letters followed by digits
            (e.g. "OR02"); the state and district number are taken from it.
        timeout (float): Seconds to wait for the server before giving up. Defaults to 30.

    Returns:
        list: A list of dictionaries, one per candidate (see ``get_candidate_info``), each
              extended with "State" (the two letters of the id) and "District Number"
              (the digits of the id as a string, exactly as written in the URL).
              Returns an empty list, after printing the reason, when the URL carries no
              district id, the request fails or times out, the server answers with an
              HTTP error status, or the page cannot be parsed.
    """
    try:
        # Validate the URL first so a malformed one fails before any network traffic.
        district_id = re.search(r'([A-Z]{2})(\d+)', url)
        if district_id is None:
            raise ValueError("no state/district id such as 'OR02' found")
        state, districtnum = district_id.groups()

        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for 4xx/5xx responses.

        # Name the parser explicitly so results do not depend on which optional
        # parser libraries happen to be installed.
        soup = BeautifulSoup(response.text, "html.parser")
        member_divs = soup.find_all("div", attrs={'class': 'Members--list-item'})

        candidate_list = []
        for div in member_divs:
            candidate = get_candidate_info(div)
            candidate["State"] = state
            candidate["District Number"] = districtnum
            candidate_list.append(candidate)
        return candidate_list
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        return []  # Best effort: an HTTP error status yields no candidates.
    except Exception as e:
        print(f"An error occurred: {e} for url {url}")
        return []  # Best effort: any other failure also yields no candidates.

    

In [110]:
# Try the scraper on a single race page (district id OR02, 2020 cycle).
URL = "https://www.opensecrets.org/races/candidates?cycle=2020&id=OR02&spec=N"
# One dict per candidate; the list is empty if the fetch failed or no candidate cards were found.
candidate_list = get_all_candidates_for_district_url(URL)
candidate_df = pd.DataFrame(candidate_list)
# Last expression of the cell, so the notebook renders the first rows as a table.
candidate_df.head()

Unnamed: 0,Name,Affiliation,Incumbent,Winner,Vote %,Cash Raised,Cash Spent,State,District Number
0,Cliff Bentz,R,0,1,60.0,1450780.0,1361727.0,OR,2
1,Alex Spenser,D,0,0,36.9,20640.0,5089.0,OR,2


In [None]:
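# Scratch code, intentionally commented out and not part of the pipeline:
# manual check of the affiliation regex against the second candidate card of the CO03 page.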
# URL = "https://www.opensecrets.org/races/candidates?cycle=2020&id=CO03&spec=N" 
# response = requests.get(URL)
# response.raise_for_status()  # Raise an exception if the response status code is not 200 (OK).
# soup = BeautifulSoup(response.text)
# member_divs = soup.findAll("div", attrs={'class': 'Members--list-item'})
# # print(len(member_divs))
# candidate_text = member_divs[1].find("h2").find("strong").text.strip()
# print(candidate_text)
# affiliation = re.findall(r"\([\w+]\)", candidate_text)
# print(affiliation[0][1])