In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
def convert_to_float(val):
    """
    Parse a number out of a decorated string such as a currency or percentage.

    Every character other than the ASCII digits 0-9 and the period is thrown
    away, and whatever is left is handed to ``float``.

    Args:
        val (str): Text containing the number, e.g. "$1,234.50" or "74.8%".

    Returns:
        float: The value parsed from the remaining digits and periods.

    Raises:
        ValueError: If the filtered text is not a valid float literal (for
            example nothing is left, or more than one period remains).

    Note:
        A minus sign is discarded along with the other symbols, so "-5"
        yields 5.0.

    Example:
        convert_to_float("$123.45")  # Returns 123.45
    """
    return float(re.sub(r"[^0-9\.]", "", val))

In [3]:
def _extract_affiliation(text):
    """Return the single-character party code inside parentheses in ``text`` (e.g. "R" from "(R)"), or "" if absent."""
    match = re.search(r"\(([\w+])\)", text)
    return match.group(1) if match else ""


def get_candidate_info(member_div):
    """
    Extracts candidate information from a BeautifulSoup element (a div) representing a candidate.

    Two layouts are handled: candidates whose div contains an ``<a>`` link (name, party and the
    "Incumbent" marker are read from the link text) and candidates without a link (name and party
    are read from the first line of the ``<h2><strong>`` text).

    Args:
        member_div (bs4.element.Tag): The element containing one candidate's markup.

    Returns:
        dict: Keys "Name", "Affiliation", "Incumbent" (0/1), "Winner" (0/1), "Vote %",
        "Cash Raised" and "Cash Spent". The last three are floats when the corresponding
        markup is present and parseable, otherwise they stay as the empty string "".
        "Affiliation" is "" when no single-character "(X)" tag is found.

    Raises:
        AttributeError: If the div has neither an ``<a>`` link nor an ``<h2><strong>`` element.
        ValueError: Propagated from convert_to_float() when a cash cell holds no parseable number.

    Example:
        get_candidate_info(member_div)  # Returns a dictionary with candidate information.
    """
    results = {"Name": "", "Affiliation": "", "Incumbent": 0, "Winner": 0, "Vote %": "", "Cash Raised": "", "Cash Spent": ""}

    link = member_div.find("a")
    if link is not None:
        # Linked candidate: everything we need is in the anchor text.
        candidate_text = link.text
        affiliation = _extract_affiliation(candidate_text)
        results["Incumbent"] = 1 if "Incumbent" in candidate_text else 0
        # Remove the party tag and the incumbent marker, leaving only the name.
        candidate_text = re.sub(r"\([\w+]\)", "", candidate_text)
        candidate_text = candidate_text.replace("Incumbent", "")
        # Keep only ASCII letters, digits and whitespace. This removes separator glyphs,
        # but it also drops punctuation such as hyphens, periods and apostrophes from names.
        name = re.sub(r"[^0-9a-zA-Z\s]+", "", candidate_text).strip()
    else:
        # Unlinked candidate: the first newline/tab/plus-delimited chunk holds "Name (X)".
        txt = member_div.find("h2").find("strong").text.strip()
        candidate_info = re.findall(r"[^\n\t+]+", txt)
        headline = candidate_info[0] if candidate_info else ""
        # A missing party tag yields "" instead of raising IndexError.
        affiliation = _extract_affiliation(headline)
        name = re.sub(r"\([\w+]\)", "", headline).strip()

    results["Name"] = name
    results["Affiliation"] = affiliation

    # A span with class "winner" marks the winning candidate.
    if member_div.find("span", attrs={'class': 'winner'}) is not None:
        results["Winner"] = 1

    # Vote percentage: accept both "74.8%" and whole numbers such as "100%".
    vote_span = member_div.find("span", attrs={'class': 'Members--vote-pct'})
    if vote_span is not None:
        pct_match = re.search(r"\d+(?:\.\d+)?%", vote_span.text)
        if pct_match:
            results["Vote %"] = convert_to_float(pct_match.group())

    # Cash amounts: first numeric cell is "raised", second is "spent".
    # Tolerate a missing table or a table with fewer than two numeric cells.
    cash_table = member_div.find("table", attrs={'class': 'Members--table'})
    cash_numbers = cash_table.find_all("td", attrs={'class': 'Members--number'}) if cash_table is not None else []
    if len(cash_numbers) > 0:
        results["Cash Raised"] = convert_to_float(cash_numbers[0].text)
    if len(cash_numbers) > 1:
        results["Cash Spent"] = convert_to_float(cash_numbers[1].text)
    return results




In [4]:
def get_all_candidates_for_district_url(url, timeout=30):
    """
    Retrieves and extracts information about all candidates for a specific district from a given URL.

    The state and district number are taken from the first occurrence in the URL of two
    uppercase letters followed by digits (e.g. "GA14"), and are added to every candidate
    record under the keys "State" and "District Number" (the digits are kept as a string,
    so leading zeros survive).

    Args:
        url (str): The URL of the page containing candidate information for a specific district.
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 30. Without a timeout a stalled connection would block forever.

    Returns:
        list: A list of dictionaries, one per candidate div found on the page.
              Returns an empty list (after printing a message) if the URL contains no
              state/district id, if the server answers with an HTTP error status, or if
              any other exception occurs while fetching or parsing.
    """
    # Fail fast, before any network traffic, when the URL cannot identify a district.
    id_match = re.search(r'([A-Z][A-Z])(\d+)', url)
    if id_match is None:
        print(f"An error occurred: no state/district id found for url {url}")
        return []
    state, districtnum = id_match.groups()

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for 4xx/5xx responses.
        # Name the parser explicitly so results do not depend on which parsers are installed.
        soup = BeautifulSoup(response.text, "html.parser")
        # Each candidate lives in its own div with this class.
        member_divs = soup.find_all("div", attrs={'class': 'Members--list-item'})
        candidate_list = [get_candidate_info(div) for div in member_divs]

        # Tag every record with the district it came from.
        for candidate in candidate_list:
            candidate["State"] = state
            candidate["District Number"] = districtnum
        return candidate_list
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        return []  # Best effort: a missing or failing page yields no candidates.
    except Exception as e:
        # Deliberate catch-all so one bad page does not abort a long scraping run.
        print(f"An error occurred: {e} for url {url}")
        return []

    

In [5]:
"""
This cell is just to test the get_all_candidates_for_district_url() here by giving it different urls for districts. 
Example urls:
URL = "https://www.opensecrets.org/races/candidates?cycle=2020&id=GA14&spec=N"
URL = "https://www.opensecrets.org/races/candidates?cycle=2020&id=CO0314&spec=N"
"""

# Smoke test: scrape a single district page and inspect the parsed records.
URL = "https://www.opensecrets.org/races/candidates?cycle=2020&id=GA14&spec=N"
candidate_list = get_all_candidates_for_district_url(URL)
# One row per candidate dictionary returned by the scraper.
candidate_df = pd.DataFrame(candidate_list)
candidate_df.head()

Unnamed: 0,Name,Affiliation,Incumbent,Winner,Vote %,Cash Raised,Cash Spent,State,District Number
0,Marjorie Taylor Greene,R,0,1,74.8,2596914.0,2237599.0,GA,14


In [6]:
def create_url(abbrev, reps, cycle=2020):
    """
    Creates a URL for retrieving candidate information for a specific state and district.

    Args:
        abbrev (str): A string representing the state abbreviation (e.g., 'CA' for California).
        reps (str): A string representing the district or representative identifier. It is
            inserted exactly as given, so any zero-padding (e.g. "04") must be done by the caller.
        cycle (int or str, optional): Election cycle year placed in the ``cycle`` query
            parameter. Defaults to 2020.

    Returns:
        str: A URL string for accessing candidate information for the specified state,
        district and election cycle.

    Example:
        create_url("CA", "12")              # California's 12th district, 2020 cycle.
        create_url("CA", "12", cycle=2018)  # Same district, 2018 cycle.
    """
    return f"https://www.opensecrets.org/races/candidates?cycle={cycle}&id={abbrev}{reps}&spec=N"

In [7]:
# Load the lookup table that maps full state names to their abbreviations.
# Forward slashes keep the relative path valid on Windows, macOS and Linux alike.
states_abbrev_df = pd.read_csv('../data/states.csv')


# Scrape the number of House seats per state; one district URL is generated per seat.
URL = "https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120"
response = requests.get(URL, timeout=30)  # never wait indefinitely on a stalled connection
response.raise_for_status()  # stop here with a clear HTTP error instead of parsing an error page
# Name the parser explicitly so results do not depend on which parsers are installed.
soup = BeautifulSoup(response.text, "html.parser")
# get all the table rows
table_rows = soup.find('tbody').find_all("tr")

# From each row take the state name and number of representatives, and expand them into
# one {"State", "Dist"} record per district. The final row is skipped
# (presumably a totals row -- TODO confirm against the page).
states_reps = []
for row in table_rows[:-1]:
    state_cell, numreps_cell = row.find_all("td")
    state = state_cell.text.strip()
    dists = int(numreps_cell.text.strip())
    for dist in range(1, dists+1):
        states_reps.append({"State": state, "Dist": str(dist)})

# convert to a dataframe
states_df = pd.DataFrame(states_reps)
# Attach abbreviations. The default inner join drops any state name missing from states.csv.
states_df = pd.merge(states_df, states_abbrev_df, on="State")
# District numbers are zero-padded to two digits in the URL id (e.g. "NV04").
states_df["URL"] = states_df.apply(lambda x: create_url(x["Abbreviation"], x["Dist"].zfill(2)), axis=1)
states_df.head()


Unnamed: 0,State,Dist,Abbreviation,URL
0,Alabama,1,AL,https://www.opensecrets.org/races/candidates?c...
1,Alabama,2,AL,https://www.opensecrets.org/races/candidates?c...
2,Alabama,3,AL,https://www.opensecrets.org/races/candidates?c...
3,Alabama,4,AL,https://www.opensecrets.org/races/candidates?c...
4,Alabama,5,AL,https://www.opensecrets.org/races/candidates?c...


In [8]:
# This cell is just to test a random url (239) from states_df
# Spot-check the scraper on one generated URL: the row labelled 239 in states_df.
url = states_df.loc[239, "URL"]
get_all_candidates_for_district_url(url)

[{'Name': 'Steven Horsford',
  'Affiliation': 'D',
  'Incumbent': 1,
  'Winner': 1,
  'Vote %': 50.7,
  'Cash Raised': 3492372.0,
  'Cash Spent': 3024427.0,
  'State': 'NV',
  'District Number': '04'},
 {'Name': 'Jim Marchant',
  'Affiliation': 'R',
  'Incumbent': 0,
  'Winner': 0,
  'Vote %': 45.8,
  'Cash Raised': 1462772.0,
  'Cash Spent': 1419405.0,
  'State': 'NV',
  'District Number': '04'},
 {'Name': 'Jonathan Royce Esteban',
  'Affiliation': 'L',
  'Incumbent': 0,
  'Winner': 0,
  'Vote %': 2.4,
  'Cash Raised': 2788.0,
  'Cash Spent': 1603.0,
  'State': 'NV',
  'District Number': '04'}]

In [9]:
# Scrape every district URL in turn, pooling the candidate records into candidate_data,
# then build a single DataFrame from the pooled records.
# Note: `count` indexes district URLs (one per row of states_df), despite the
# "state" wording in the progress message.
candidate_data = []
for count, url in enumerate(states_df["URL"]):
    candidate_data += get_all_candidates_for_district_url(url)
    # Report progress on every 50th URL.
    if not count % 50:
        print(f"processing state {count} of {states_df.shape[0]}")
print("Writing to dataframe...")
candidates_df = pd.DataFrame(candidate_data)
print("Done")

processing state 0 of 435
processing state 50 of 435
processing state 100 of 435
processing state 150 of 435
processing state 200 of 435
processing state 250 of 435
processing state 300 of 435
processing state 350 of 435
processing state 400 of 435
Writing to dataframe...
Done


In [10]:
# Persist the scraped candidates. No index argument is passed, so the DataFrame index is
# written as the first, unnamed CSV column (pandas' default behaviour).
candidates_df.to_csv("../data/candidates.csv")

In [13]:
# Preview the first rows of the combined candidate table.
candidates_df.head()

Unnamed: 0,Name,Affiliation,Incumbent,Winner,Vote %,Cash Raised,Cash Spent,State,District Number
0,Jerry Carl,R,0,1,64.9,1971321.0,1859349.0,AL,1
1,James Averhart,D,0,0,35.0,80095.0,78973.0,AL,1
2,Barry Moore,R,0,1,65.3,650807.0,669368.0,AL,2
3,Phyllis Harvey-Hall,D,0,0,34.6,56050.0,55988.0,AL,2
4,Mike D Rogers,R,1,1,67.5,1193111.0,1218564.0,AL,3
