In [319]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import sys
%run ../data/states_districts.py

In [320]:
# Function which receives a state and district code (e.g. TN07) as its argument and scrapes that race's page
def scrape_candidate_data(state_district, timeout=30):
    """Scrape the OpenSecrets 2020 candidate page for a single race.

    Candidate headings (``<h2>`` elements) are parsed for name, party,
    incumbent/winner status and vote share; each ``Members--table`` table is
    then read for its ``Raised:`` / ``Spent:`` rows and paired with the
    candidates *by position*.

    Parameters
    ----------
    state_district : str
        State + district code interpolated into the URL's ``id`` parameter,
        e.g. ``"TN07"``.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up,
        so one unresponsive request cannot hang a multi-district loop.

    Returns
    -------
    tuple[list[dict], list[str]]
        ``(candidates, bad_districts)``. ``candidates`` holds one dict per
        parsed heading with keys ``Name``, ``Party``, ``Incumbent``,
        ``Winner``, ``Vote Percentage``, ``Raised`` and ``Spent`` (values that
        were not found stay ``None``). ``bad_districts`` is ``[state_district]``
        when the page could not be fetched successfully or when the number of
        candidates differs from the number of financial tables; otherwise it
        is empty.
    """
    endpoint = f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state_district}&spec=N'
    res = requests.get(endpoint, timeout=timeout)
    if not res.ok:
        # An error page contains no candidate headings; flag the district
        # instead of silently reporting "zero candidates, nothing wrong".
        return [], [state_district]

    soup = BeautifulSoup(res.text, 'html.parser')

    # Extract candidate data from the headings that look like candidates.
    pandas_data = []
    for element in soup.find_all('h2'):
        candidate = _parse_candidate_heading(element.get_text(strip=True))
        if candidate is not None:
            pandas_data.append(candidate)

    financial_tables = soup.find_all('table', class_='Members--table')

    # Candidates and tables are matched purely by order. When the counts
    # differ the pairing may be shifted, which is why such districts are
    # reported in bad_districts below.
    for candidate, table in zip(pandas_data, financial_tables):
        candidate.update(_extract_financials(table))

    bad_districts = []
    if len(pandas_data) != len(financial_tables):
        bad_districts.append(state_district)

    return pandas_data, bad_districts


# Heading shape: "<name> (<party>)[ •Incumbent][ •Winner][(<pct>% of vote)]".
# The status markers are matched independently so that a non-incumbent winner
# or a losing incumbent is still parsed (including the vote share that follows).
# NOTE(review): only the party codes R, D and I are recognised; headings with
# any other code are skipped, which may be one source of candidate/table count
# mismatches — confirm against the live page before widening this.
_CANDIDATE_HEADING = re.compile(
    r"(.+?) \((R|D|I)\)"
    r"((?:\s*•\s*(?:Incumbent|Winner))*)"
    r"\s*(?:\(([\d.]+)% of vote\))?"
)

# Row label in a financial table -> key in the candidate record.
_FINANCIAL_LABELS = {'Raised:': 'Raised', 'Spent:': 'Spent'}


def _parse_candidate_heading(text):
    """Return a candidate record for a heading string, or None if it is not a candidate heading."""
    match = _CANDIDATE_HEADING.match(text)
    if match is None:
        return None
    name, party, status_markers, vote_percentage = match.groups()
    return {
        "Name": name,
        "Party": party,
        "Incumbent": "Incumbent" in status_markers,
        "Winner": "Winner" in status_markers,
        "Vote Percentage": vote_percentage,
        "Raised": None,
        "Spent": None
    }


def _extract_financials(table):
    """Return the 'Raised'/'Spent' values found in one financial table (only the keys present)."""
    found = {}
    for row in table.find_all('tr'):
        cols = [col.get_text() for col in row.find_all('td')]
        # Header rows (th only) and malformed rows have no label/value pair.
        if len(cols) < 2:
            continue
        key = _FINANCIAL_LABELS.get(cols[0].strip())
        if key is not None:
            found[key] = cols[1]
    return found

In [324]:
# scrape_candidate_data returns (candidates, bad_districts); only the candidate
# records belong in the DataFrame, so unpack the tuple before building it.
candidates, bad_districts = scrape_candidate_data("CA20")
df = pd.DataFrame(candidates)
df

Found 2 candidates and 2 financial data tables
Processing financial data for Jimmy Panetta
{'Name': 'Jimmy Panetta', 'Party': 'D', 'Incumbent': True, 'Winner': True, 'Vote Percentage': '76.8', 'Raised': '$2,009,894', 'Spent': '$1,592,671'}
Processing financial data for Jeff Gorman
{'Name': 'Jeff Gorman', 'Party': 'R', 'Incumbent': False, 'Winner': False, 'Vote Percentage': '23.2', 'Raised': '$67,634', 'Spent': '$64,947'}


Unnamed: 0,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,Jimmy Panetta,D,True,True,76.8,"$2,009,894","$1,592,671"
1,Jeff Gorman,R,False,False,23.2,"$67,634","$64,947"


In [322]:
# NOTE(review): this cell contains no executable logic, only a bare string literal.
# It appears to be a pasted log of districts whose candidate count did not match the
# number of financial tables in an earlier scraping run — confirm, and consider moving
# it to a markdown cell so it reads as narrative rather than code.
'''
Mismatch in the number of entries between candidate data and financial data for CA29
Mismatch in the number of entries between candidate data and financial data for CO06
Mismatch in the number of entries between candidate data and financial data for CO03
Mismatch in the number of entries between candidate data and financial data for CO05
Mismatch in the number of entries between candidate data and financial data for CT03
Mismatch in the number of entries between candidate data and financial data for GA08
Mismatch in the number of entries between candidate data and financial data for GA05
Mismatch in the number of entries between candidate data and financial data for GA13
Mismatch in the number of entries between candidate data and financial data for IL06
Mismatch in the number of entries between candidate data and financial data for HI02
Mismatch in the number of entries between candidate data and financial data for IN09
Mismatch in the number of entries between candidate data and financial data for IL07
Mismatch in the number of entries between candidate data and financial data for UT04
Mismatch in the number of entries between candidate data and financial data for KY02
Mismatch in the number of entries between candidate data and financial data for MI02
'''

'\nMismatch in the number of entries between candidate data and financial data for CA29\nMismatch in the number of entries between candidate data and financial data for CO06\nMismatch in the number of entries between candidate data and financial data for CO03\nMismatch in the number of entries between candidate data and financial data for CO05\nMismatch in the number of entries between candidate data and financial data for CT03\nMismatch in the number of entries between candidate data and financial data for GA08\nMismatch in the number of entries between candidate data and financial data for GA05\nMismatch in the number of entries between candidate data and financial data for GA13\nMismatch in the number of entries between candidate data and financial data for IL06\nMismatch in the number of entries between candidate data and financial data for HI02\nMismatch in the number of entries between candidate data and financial data for IN09\nMismatch in the number of entries between candidate