In [146]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import aiohttp
import asyncio
import random
import time
%run ../data/states_districts.py

In [147]:
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
]

In [148]:
async def scrape_candidate_data(state_district):
    headers = {
    'User-Agent': random.choice(user_agents)
    }
    endpoint = f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state_district}&spec=N'

    timeout = aiohttp.ClientTimeout(total=600)
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async with session.get(endpoint, headers=headers) as res:
            if res.status == 200:  # HTTP Status Code 200 means OK
                text = await res.text()
                soup = BeautifulSoup(text, 'html.parser')
            else:
                print(f"Failed to retrieve {endpoint}: {res.status}")
                return []  # Return an empty list or another suitable default value

    pandas_data = []
    bad_districts = []

    try:
        # Locate the main div containing all candidate blocks
        candidates_div = soup.find('div', class_="Members--list")
        
        if candidates_div:
            # Extract candidate blocks
            candidates_blocks = candidates_div.find_all('div', class_='Members--list-item')
            
            for block in candidates_blocks:
                # Extract the name, party, and other details
                candidate_h2_text = block.find('h2').get_text(strip=True)
                pattern = re.compile(r"(.+?) \((R|D|I)\)")
                match = pattern.match(candidate_h2_text)
                name, party = match.groups()
                incumbent = "Incumbent" in candidate_h2_text

                # Check winner status via tag and class directly
                winner = block.find('span', class_='winner') is not None

                # Extract vote percentage by looking directly for the relevant span and class
                vote_pct_tag = block.find('span', class_='Members--vote-pct')
                vote_pct = None
                if vote_pct_tag:
                    vote_pct = vote_pct_tag.get_text(strip=True).strip("()").replace('% of vote', '').strip()

               # Loop through each 'td' with the label names
                    # Find the 'td' element containing the label name
                    raised_label_td = block.find('td', string="Raised:")
                    spent_label_td = block.find('td', string="Spent:")
                    
                    # If the label is found, find the next 'td' sibling (which should contain the dollar value)
                    if raised_label_td or spent_label_td:
                        raised = raised_label_td.find_next_sibling('td').get_text()
                        spent = spent_label_td.find_next_sibling('td').get_text()

                        pandas_data.append({
                            "State": state_district[:2],
                            "District": state_district[2:],
                            "Name": name,
                            "Party": party,
                            "Incumbent": incumbent,
                            "Winner": winner,
                            "Vote Percentage": vote_pct,
                            "Raised": raised,
                            "Spent": spent
                        })
        else:
            print(f"No candidate data found for {state_district}")
            bad_districts.append(state_district)
            
    except Exception as e:
        print(f"An error occurred while processing {state_district}: {str(e)}")
        bad_districts.append(state_district)


    return pandas_data, bad_districts



In [149]:
def get_district_codes():
    district_codes = []
    for state in congressional_districts:
        for district in congressional_districts[state]:
            code = state + district
            district_codes.append(code)
    return district_codes

In [150]:
async def gather_data(district_codes):
    all_data = []
    bad_districts = []

    tasks = [scrape_candidate_data(code) for code in district_codes]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    for data, bad_district in results:
        all_data.extend(data)
        bad_districts.extend(bad_district)

    df = pd.DataFrame(all_data)

    return df, bad_districts

# Example usage in a Jupyter Notebook cell:

district_codes = get_district_codes() 
print(len(district_codes))
district_codes_1 = district_codes[:150]
time.sleep(5)
district_codes_2 = district_codes[151:300]
time.sleep(5)
district_codes_3 = district_codes[300:]

# Directly await the function in a notebook cell.
# df_1, bad_districts_1 = await gather_data(district_codes_1)
# df_2, bad_districts_2 = await gather_data(district_codes_2)
# df_3, bad_districts_3 = await gather_data(district_codes_3)



439


In [152]:
df_1, bad_districts_1 = await gather_data(district_codes_1)
print(bad_districts_1)
print(len(district_codes_1))
df_1

An error occurred while processing CA29: 'NoneType' object has no attribute 'groups'
An error occurred while processing CO06: 'NoneType' object has no attribute 'groups'
An error occurred while processing CT03: 'NoneType' object has no attribute 'groups'
An error occurred while processing CO05: 'NoneType' object has no attribute 'groups'
An error occurred while processing CO03: 'NoneType' object has no attribute 'groups'
An error occurred while processing GA05: 'NoneType' object has no attribute 'groups'
An error occurred while processing GA08: 'NoneType' object has no attribute 'groups'
An error occurred while processing HI02: 'NoneType' object has no attribute 'groups'
An error occurred while processing GA13: 'NoneType' object has no attribute 'groups'
An error occurred while processing IL07: 'NoneType' object has no attribute 'groups'
An error occurred while processing IL06: 'NoneType' object has no attribute 'groups'
['CA29', 'CO03', 'CO05', 'CO06', 'CT03', 'GA05', 'GA08', 'GA13', 

Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,AL,01,Jerry Carl,R,False,True,64.9,"$1,971,321","$1,859,349"
1,AL,01,James Averhart,D,False,False,35.0,"$80,095","$78,973"
2,AL,02,Barry Moore,R,False,True,65.3,"$650,807","$669,368"
3,AL,02,Phyllis Harvey-Hall,D,False,False,34.6,"$56,050","$55,988"
4,AL,03,Mike D Rogers,R,True,True,67.5,"$1,193,111","$1,218,564"
...,...,...,...,...,...,...,...,...,...
276,IL,15,Erika Weaver,D,False,False,26.5,"$58,689","$23,349"
277,IL,16,Adam Kinzinger,R,True,True,64.8,"$2,105,736","$1,588,550"
278,IL,16,Dani Brzozowski,D,False,False,35.2,"$459,344","$376,167"
279,IL,17,Cheri Bustos,D,True,True,52.0,"$4,975,192","$6,391,009"


In [153]:
df_2, bad_districts_2 = await gather_data(district_codes_2)
print(bad_districts_2)
df_2

An error occurred while processing IN09: 'NoneType' object has no attribute 'groups'
An error occurred while processing KY02: 'NoneType' object has no attribute 'groups'
An error occurred while processing MI02: 'NoneType' object has no attribute 'groups'
An error occurred while processing NV04: 'NoneType' object has no attribute 'groups'
An error occurred while processing MN05: 'NoneType' object has no attribute 'groups'
An error occurred while processing NY10: 'NoneType' object has no attribute 'groups'
An error occurred while processing NY02: 'NoneType' object has no attribute 'groups'
An error occurred while processing NY17: 'NoneType' object has no attribute 'groups'
An error occurred while processing NY27: 'NoneType' object has no attribute 'groups'
An error occurred while processing NC02: 'NoneType' object has no attribute 'groups'
An error occurred while processing NY25: 'NoneType' object has no attribute 'groups'
An error occurred while processing OH01: 'NoneType' object has no

Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,IN,01,Frank Mrvan Jr,D,False,True,56.6,"$578,635","$464,407"
1,IN,01,Mark Leyva,R,False,False,40.4,"$16,551","$15,343"
2,IN,02,Jackie Walorski,R,True,True,61.5,"$2,383,467","$1,698,202"
3,IN,02,Pat Hackett,D,False,False,38.5,"$881,888","$867,301"
4,IN,03,Jim Banks,R,True,True,67.8,"$1,202,509","$769,591"
...,...,...,...,...,...,...,...,...,...
260,NC,13,Scott Huffman,D,False,False,31.8,"$193,334","$189,967"
261,ND,01,Kelly Armstrong,R,True,True,69.0,"$1,269,346","$1,080,105"
262,ND,01,Zach Raknerud,D,False,False,27.6,"$28,047","$26,029"
263,OH,01,Steve Chabot,R,True,True,51.8,"$3,177,647","$2,861,464"


In [154]:
df_3, bad_districts_3 = await gather_data(district_codes_3)
print(bad_districts_3)
df_3

An error occurred while processing OH12: 'NoneType' object has no attribute 'groups'
An error occurred while processing OH07: 'NoneType' object has no attribute 'groups'
An error occurred while processing PA12: 'NoneType' object has no attribute 'groups'
An error occurred while processing TX12: 'NoneType' object has no attribute 'groups'
An error occurred while processing TX02: 'NoneType' object has no attribute 'groups'
An error occurred while processing UT03: 'NoneType' object has no attribute 'groups'
An error occurred while processing TX05: 'NoneType' object has no attribute 'groups'
An error occurred while processing TX17: 'NoneType' object has no attribute 'groups'
An error occurred while processing TX24: 'NoneType' object has no attribute 'groups'
An error occurred while processing UT02: 'NoneType' object has no attribute 'groups'
An error occurred while processing UT04: 'NoneType' object has no attribute 'groups'
An error occurred while processing PR00: 'NoneType' object has no

Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,OH,02,Brad Wenstrup,R,True,True,61.1,"$1,794,354","$1,757,078"
1,OH,02,Jaime Castle,D,False,False,38.9,"$284,197","$271,471"
2,OH,03,Joyce Beatty,D,True,True,70.8,"$2,237,887","$2,540,687"
3,OH,03,Nicholas Moss,I,False,False,0.0,"$1,258","$1,248"
4,OH,04,Jim Jordan,R,True,True,67.9,"$18,313,823","$12,945,651"
...,...,...,...,...,...,...,...,...,...
257,WI,07,Tricia Zunker,D,False,False,39.2,"$1,261,957","$1,232,690"
258,WI,08,Mike Gallagher,R,True,True,64.0,"$3,202,905","$2,841,801"
259,WI,08,Amanda Stuck,D,False,False,36.0,"$416,978","$399,916"
260,WY,01,Liz Cheney,R,True,True,68.6,"$3,003,883","$3,060,167"


In [155]:
# Bad districts whose finance data is hard to scrape
bad_districts = bad_districts_1 + bad_districts_2 + bad_districts_3
print(len(bad_districts))
print(bad_districts)
# This is data for all districts except for bad districts
df = pd.concat([df_1, df_2, df_3], ignore_index=True)
df.to_csv('../data/scraped_data.csv')

36
['CA29', 'CO03', 'CO05', 'CO06', 'CT03', 'GA05', 'GA08', 'GA13', 'HI02', 'IL06', 'IL07', 'IN09', 'KY02', 'MI02', 'MN05', 'NV04', 'NY02', 'NY10', 'NY17', 'NY25', 'NY27', 'NC02', 'OH01', 'OH07', 'OH12', 'PA12', 'TX02', 'TX05', 'TX12', 'TX17', 'TX24', 'UT02', 'UT03', 'UT04', 'VA11', 'PR00']


In [156]:
df.head(100)

Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,AL,01,Jerry Carl,R,False,True,64.9,"$1,971,321","$1,859,349"
1,AL,01,James Averhart,D,False,False,35.0,"$80,095","$78,973"
2,AL,02,Barry Moore,R,False,True,65.3,"$650,807","$669,368"
3,AL,02,Phyllis Harvey-Hall,D,False,False,34.6,"$56,050","$55,988"
4,AL,03,Mike D Rogers,R,True,True,67.5,"$1,193,111","$1,218,564"
...,...,...,...,...,...,...,...,...,...
95,CA,32,Joshua Scott,R,False,False,33.4,"$23,624","$14,248"
96,CA,33,Ted Lieu,D,True,True,67.6,"$1,661,436","$1,582,573"
97,CA,33,James Bradley,R,False,False,32.4,"$78,910","$79,035"
98,CA,34,Jimmy Gomez,D,True,True,53.0,"$1,401,166","$1,268,312"
