In [83]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import sys

In [84]:
# Fucntion which recieves state and district in the form e.g. TN07 as arg to scrape page
def scrape_candidate_data(state_district):
    endpoint = f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state_district}&spec=N'
    res = requests.get(endpoint)
    soup = BeautifulSoup(res.text, 'html.parser')
    pandas_data = []
    bad_districts = []

    try:
        # Locate the main div containing all candidate blocks
        candidates_div = soup.find('div', class_="Members--list")
        
        if candidates_div:
            # Extract candidate blocks
            candidates_blocks = candidates_div.find_all('div', class_='Members--list-item')
            
            for block in candidates_blocks:
                # Extract the name, party, and other details
                candidate_name_party = block.find('h2').get_text(strip=True)
                pattern = re.compile(r"(.+?) \((R|D|I)\)( • <i>Incumbent</i>)?")
                match = pattern.match(candidate_name_party)
                
                name, party, incumbent = match.groups()
                incumbent = incumbent is not None

                # Check winner status via tag and class directly
                winner = block.find('span', class_='winner') is not None

                # Extract vote percentage by looking directly for the relevant span and class
                vote_pct_tag = block.find('span', class_='Members--vote-pct')
                vote_pct = None
                if vote_pct_tag:
                    vote_pct = vote_pct_tag.get_text(strip=True).strip("()").replace('% of vote', '').strip()

               # Loop through each 'td' with the label names
                    # Find the 'td' element containing the label name
                    raised_label_td = block.find('td', string="Raised:")
                    spent_label_td = block.find('td', string="Spent:")
                    
                    # If the label is found, find the next 'td' sibling (which should contain the dollar value)
                    if raised_label_td or spent_label_td:
                        raised = raised_label_td.find_next_sibling('td').get_text()
                        spent = spent_label_td.find_next_sibling('td').get_text()
                        print(raised)                

                        pandas_data.append({
                            "State": state_district[:2],
                            "District": state_district[2:],
                            "Name": name,
                            "Party": party,
                            "Incumbent": incumbent,
                            "Winner": winner,
                            "Vote Percentage": vote_pct,
                            "Raised": raised,
                            "Spent": spent
                        })
        else:
            print(f"No candidate data found for {state_district}")
            bad_districts.append(state_district)
            
    except Exception as e:
        print(f"An error occurred while processing {state_district}: {str(e)}")
        bad_districts.append(state_district)

    return pandas_data


In [86]:
data = scrape_candidate_data("CA20")
print(data)
df = pd.DataFrame(data)
df

$2,009,894
$67,634
[{'State': 'CA', 'District': '20', 'Name': 'Jimmy Panetta', 'Party': 'D', 'Incumbent': False, 'Winner': True, 'Vote Percentage': '76.8', 'Raised': '$2,009,894', 'Spent': '$1,592,671'}, {'State': 'CA', 'District': '20', 'Name': 'Jeff Gorman', 'Party': 'R', 'Incumbent': False, 'Winner': False, 'Vote Percentage': '23.2', 'Raised': '$67,634', 'Spent': '$64,947'}]


Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,CA,20,Jimmy Panetta,D,False,True,76.8,"$2,009,894","$1,592,671"
1,CA,20,Jeff Gorman,R,False,False,23.2,"$67,634","$64,947"
