In [424]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import aiohttp
import asyncio
import random
import time
%run ../data/states_districts.py

In [425]:
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
]

In [426]:
async def scrape_candidate_data(state_district):
    headers = {
    'User-Agent': random.choice(user_agents)
    }
    endpoint = f'https://www.opensecrets.org/races/candidates?cycle=2020&id={state_district}&spec=N'

    timeout = aiohttp.ClientTimeout(total=600)
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async with session.get(endpoint, headers=headers) as res:
            if res.status == 200:  # HTTP Status Code 200 means OK
                text = await res.text()
                soup = BeautifulSoup(text, 'html.parser')
            else:
                print(f"Failed to retrieve {endpoint}: {res.status}")
                return []  # Return an empty list or another suitable default value

    try:
        pattern_candidate = re.compile(r"(.+?) \((R|D|I)\)( •Incumbent•Winner)?(\(([\d.]+)% of vote\))?")

        pandas_data = []
        bad_districts = []
        # Extract candidate data
        for element in soup.find_all('h2'):
            text = element.get_text(strip=True)
            match = pattern_candidate.match(text)
            if match:
                name, party, incumbent_winner, _, vote_percentage = match.groups()
                incumbent = incumbent_winner is not None
                winner = incumbent  # Assumes if they are incumbent they are also the winner
                pandas_data.append({
                    "State": state_district[:2],
                    "District": state_district[2:],
                    "Name": name,
                    "Party": party,
                    "Incumbent": incumbent,
                    "Winner": winner,
                    "Vote Percentage": vote_percentage,
                    "Raised": None,
                    "Spent": None
                })

        financial_tables = soup.find_all('table', class_='Members--table')
        
        # print(f"Found {len(pandas_data)} candidates and {len(financial_tables)} financial data tables")
        
        # Loop through all the tables with class 'Members--table'
        for candidate_data, table in zip(pandas_data, financial_tables):
            # print(f"Processing financial data for {candidate_data['Name']}")
            # Loop through all the tr elements in the table
            for row in table.find_all('tr'):
                # Get the text content of all td elements in the row
                cols = [col.get_text() for col in row.find_all('td')]
                
                # Check if the first column is 'Raised' or 'Spent', and if so, store the data
                if cols[0] == 'Raised:':
                    candidate_data['Raised'] = cols[1]
                elif cols[0] == 'Spent:':
                    candidate_data['Spent'] = cols[1]
            # print(candidate_data)
        
        # Additional step: Check the length mismatch between candidate data and financial tables
        if len(pandas_data) != len(financial_tables):
            # print(f"Data length mismatch in {state_district}: {len(pandas_data)} candidate entries and {len(financial_tables)} financial entries.")
            bad_districts.append(state_district)
    except Exception as e:
        print(f"An error occurred while processing {state_district}: {str(e)}")
        pandas_data = []
        bad_districts = [state_district]

    return pandas_data, bad_districts


In [427]:
def get_district_codes():
    district_codes = []
    for state in congressional_districts:
        for district in congressional_districts[state]:
            code = state + district
            district_codes.append(code)
    return district_codes

In [428]:
async def gather_data(district_codes):
    all_data = []
    bad_districts = []

    tasks = [scrape_candidate_data(code) for code in district_codes]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    for data, bad_district in results:
        all_data.extend(data)
        bad_districts.extend(bad_district)

    df = pd.DataFrame(all_data)

    return df, bad_districts

# Example usage in a Jupyter Notebook cell:

district_codes = get_district_codes() 
print(len(district_codes))
district_codes_1 = district_codes[:150]
time.sleep(5)
district_codes_2 = district_codes[151:300]
time.sleep(5)
district_codes_3 = district_codes[300:]

# Directly await the function in a notebook cell.
# df_1, bad_districts_1 = await gather_data(district_codes_1)
# df_2, bad_districts_2 = await gather_data(district_codes_2)
# df_3, bad_districts_3 = await gather_data(district_codes_3)



439


In [430]:
df_1, bad_districts_1 = await gather_data(district_codes_1)
print(bad_districts_1)
df_1

['CA29', 'CO03', 'CO05', 'CO06', 'CT03', 'GA05', 'GA08', 'GA13', 'HI02', 'IL06', 'IL07']


Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,AL,01,Jerry Carl,R,False,False,,"$1,971,321","$1,859,349"
1,AL,01,James Averhart,D,False,False,35.0,"$80,095","$78,973"
2,AL,02,Barry Moore,R,False,False,,"$650,807","$669,368"
3,AL,02,Phyllis Harvey-Hall,D,False,False,34.6,"$56,050","$55,988"
4,AL,03,Mike D Rogers,R,True,True,67.5,"$1,193,111","$1,218,564"
...,...,...,...,...,...,...,...,...,...
290,IL,15,Erika Weaver,D,False,False,26.5,"$58,689","$23,349"
291,IL,16,Adam Kinzinger,R,True,True,64.8,"$2,105,736","$1,588,550"
292,IL,16,Dani Brzozowski,D,False,False,35.2,"$459,344","$376,167"
293,IL,17,Cheri Bustos,D,True,True,52.0,"$4,975,192","$6,391,009"


In [432]:
df_2, bad_districts_2 = await gather_data(district_codes_2)
print(bad_districts_2)
df_2

['IN09', 'KY02', 'MI02', 'MN05', 'NV04', 'NY02', 'NY10', 'NY17', 'NY25', 'NY27', 'NC02', 'OH01']


Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,IN,01,Frank Mrvan Jr,D,False,False,,"$578,635","$464,407"
1,IN,01,Mark Leyva,R,False,False,40.4,"$16,551","$15,343"
2,IN,02,Jackie Walorski,R,True,True,61.5,"$2,383,467","$1,698,202"
3,IN,02,Pat Hackett,D,False,False,38.5,"$881,888","$867,301"
4,IN,03,Jim Banks,R,True,True,67.8,"$1,202,509","$769,591"
...,...,...,...,...,...,...,...,...,...
282,NC,13,Scott Huffman,D,False,False,31.8,"$193,334","$189,967"
283,ND,01,Kelly Armstrong,R,True,True,69.0,"$1,269,346","$1,080,105"
284,ND,01,Zach Raknerud,D,False,False,27.6,"$28,047","$26,029"
285,OH,01,Steve Chabot,R,True,True,51.8,"$3,177,647","$2,861,464"


In [433]:
df_3, bad_districts_3 = await gather_data(district_codes_3)
print(bad_districts_3)
df_3

['OH07', 'OH12', 'PA12', 'TX02', 'TX05', 'TX12', 'TX17', 'TX24', 'UT02', 'UT03', 'UT04', 'VA11', 'PR00']


Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,OH,02,Brad Wenstrup,R,True,True,61.1,"$1,794,354","$1,757,078"
1,OH,02,Jaime Castle,D,False,False,38.9,"$284,197","$271,471"
2,OH,03,Joyce Beatty,D,True,True,70.8,"$2,237,887","$2,540,687"
3,OH,03,Nicholas Moss,I,False,False,0.0,"$1,258","$1,248"
4,OH,04,Jim Jordan,R,True,True,67.9,"$18,313,823","$12,945,651"
...,...,...,...,...,...,...,...,...,...
274,GU,00,Robert A Underwood,D,False,False,,"$245,997","$245,868"
275,GU,00,William Mendiola Castro,R,False,False,,"$38,780","$43,890"
276,MP,00,Gregorio Sablan,I,False,False,,"$71,307","$56,637"
277,PR,00,Zayira Jordan,I,False,False,,"$1,306,012","$1,495,238"


In [436]:
# Bad districts whose finance data is hard to scrape
bad_districts = bad_districts_1 + bad_districts_2 + bad_districts_3
print(len(bad_districts))
print(bad_districts)
# This is data for all districts except for bad districts
df = pd.concat([df_1, df_2, df_3], ignore_index=True)
df.to_csv('../data/scraped_data.csv')



36
['CA29', 'CO03', 'CO05', 'CO06', 'CT03', 'GA05', 'GA08', 'GA13', 'HI02', 'IL06', 'IL07', 'IN09', 'KY02', 'MI02', 'MN05', 'NV04', 'NY02', 'NY10', 'NY17', 'NY25', 'NY27', 'NC02', 'OH01', 'OH07', 'OH12', 'PA12', 'TX02', 'TX05', 'TX12', 'TX17', 'TX24', 'UT02', 'UT03', 'UT04', 'VA11', 'PR00']


Unnamed: 0,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,AL,01,Jerry Carl,R,False,False,,"$1,971,321","$1,859,349"
1,AL,01,James Averhart,D,False,False,35.0,"$80,095","$78,973"
2,AL,02,Barry Moore,R,False,False,,"$650,807","$669,368"
3,AL,02,Phyllis Harvey-Hall,D,False,False,34.6,"$56,050","$55,988"
4,AL,03,Mike D Rogers,R,True,True,67.5,"$1,193,111","$1,218,564"
...,...,...,...,...,...,...,...,...,...
856,GU,00,Robert A Underwood,D,False,False,,"$245,997","$245,868"
857,GU,00,William Mendiola Castro,R,False,False,,"$38,780","$43,890"
858,MP,00,Gregorio Sablan,I,False,False,,"$71,307","$56,637"
859,PR,00,Zayira Jordan,I,False,False,,"$1,306,012","$1,495,238"
