Notebook created on 16 October 2021 by Chris Mulvey  
For NSS Data Science Bootcamp

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup as BS
from tqdm.notebook import tqdm
import time

### Working out a function to get all of the districts in a state

#### Reading in state and district info from a previous webscrape

In [2]:
districts = pd.read_csv('../data/district_info.csv')

districts.head()

Unnamed: 0,state,representatives,Code
0,Alabama,7,AL
1,Alaska,1,AK
2,Arizona,9,AZ
3,Arkansas,4,AR
4,California,53,CA


### The URL has state and district number in it `id=TN02`

In [None]:
url = 'https://www.opensecrets.org/races/summary?cycle=2020&id=TN02&spec=N'

In [None]:
url = 'https://www.opensecrets.org/races/summary?cycle=2020&id='+ state + district'&spec=N'

### Function to get candidate info for a single state and district

In [None]:
def get_district_candidates(state, district):
    url = 'https://www.opensecrets.org/races/summary?cycle=2020&id='+ state + district + '&spec=N'
    response = requests.get(url)
    soup = BS(response.text)
    try:
        candidates = pd.read_html(str(soup.find('table')))[0]
    except:
        print(response.headers) #prints out the header to help troubleshoot HTTP errors
    candidates['state'] = state
    candidates['district'] = district
    candidates['year'] = year
    
    return candidates

In [None]:
get_district_candidates('TN', str('0' + '3'))

### Now to bring in more states

In [None]:
state_dist = pd.DataFrame()

for index in tqdm(districts.index):
    state = districts['Code'][index]
    dist_no = districts['representatives'][index]
    
    if index == 10:
        time.sleep(70)
    elif index == 20:
        time.sleep(70)
    elif index == 30:
        time.sleep(70)
    elif index == 40:
        time.sleep(70)
    
    for i in range(1, dist_no + 1):
        if i < 10:
            i = '0' + str(i)
            
        state_dist = state_dist.append(get_district_candidates(str(state), str(i)),
                                      ignore_index = True)
            

In [None]:
state_dist.info()

In [None]:
state_dist.to_csv('../data/all_state.csv',
                 index = False)

### Troubleshooting error

In [None]:
districts.loc[43]

In [None]:
for i in range(1, 4):
    if i < 10:
        i = '0' + str(i)
        
    print(get_district_candidates('UT', str(i)))

In [None]:
i = 3

if i < 10:
    i = '0' + str(i)
    print(str(i))

## Building function to properly set up columns

In [None]:
def fix_columns(df):
    df['party'] = df['Candidate'].str.extract(r'\((\w)\)')
    df['incumbent'] = df['Candidate'].str.extract(r'([I]\w+)')
    df['winner'] = df['Candidate'].str.extract(r'([W]\w+)')
    df['candidate'] = df['Candidate'].str.extract(r'(^\w+\s\w+)')

    df = df.drop('Candidate',
                 axis = 1)
    
    df = df[['candidate',
             'party',
             'state',
             'district',
             'incumbent',
             'winner',
             'Raised',
             'Spent',
             'year']]
         
    df = df.rename(columns = {
        'Raised': 'raised',
        'Spent': 'spent'
    })
    
    return df

## Building a function to run it all and get data for other years as well

In [3]:
def get_all_reps_time(df, st_col, rep_col, year):
    """Takes in a DataFrame that holds the number of representatives per state and the year to
    pull data for from opersecrets.org and returns a dataframe of nationwide candidates and
    associated info.
    
    df: Variable name for the DataFrame that holds the state and number of Reps
    st_col: the DataFrame column that holds the two-digit state code
    rep_col: the DataFrame column that holds the number of reps
    year: Needs to be a string and 4 digits.
    """
    
    # Defining the function that will set up the columns in the final dataframe
    def fix_columns(df):
        df['party'] = df['Candidate'].str.extract(r'\((\w)\)')
        df['incumbent'] = df['Candidate'].str.extract(r'([I]\w+)')
        df['winner'] = df['Candidate'].str.extract(r'([W]\w+)')
        df['candidate'] = df['Candidate'].str.extract(r'(^\w+\s\w+)')

        df = df.drop('Candidate',
                 axis = 1)
    
        df = df[['candidate',
                 'party',
                 'state',
                 'district',
                 'incumbent',
                 'winner',
                 'Raised',
                 'Spent',
                 'year']]
         
        df = df.rename(columns = {
            'Raised': 'raised',
            'Spent': 'spent'
        })
    
        return df
    
    # Defining the function that will webscrape the candidate info
    def get_district_candidates(state, district, year):
        url = 'https://www.opensecrets.org/races/summary?cycle=' + year + '&id=' + state + district + '&spec=N'
        response = requests.get(url)
        soup = BS(response.text)
        try:
            candidates = pd.read_html(str(soup.find('table')))[0]
        except:
            print(response.headers) #prints out the header to help troubleshoot HTTP errors
        candidates['state'] = state
        candidates['district'] = district
        candidates['year'] = year
    
        return candidates
    
    # Setting up the for loop to get all states for a defined year
    
    state_dist = pd.DataFrame() # Empty DataFrame for the for loop

    for index in tqdm(df.index):
        state = df[st_col][index]
        dist_no = df[rep_col][index]
    
        if index == 10:
            time.sleep(70)
        elif index == 20:
            time.sleep(70)
        elif index == 30:
            time.sleep(70)
        elif index == 40:
            time.sleep(70)
    
        for i in range(1, dist_no + 1):
            if i < 10:
                i = '0' + str(i)
            
            state_dist = state_dist.append(get_district_candidates(str(year), str(state), str(i)),
                                          ignore_index = True)

In [4]:
get_all_reps_time(districts, 'Code', 'representatives', '2016')

  0%|          | 0/50 [00:00<?, ?it/s]

{'Date': 'Wed, 20 Oct 2021 02:29:07 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', 'Set-Cookie': '_opensecrets_session=713d6fGo8njyTJ5upR6olT3MTSP90A%2B86%2F%2BsA6%2FROBjCAgHsv0jA9dOwqzD5IsgBh7DBX%2FzoOyZZtLov52wv58ynPRcKQ9QALzaMauRdjc%2FvxRAgrCnS6SNK4dXE6VYkXDbicR9ZJfH%2BHban3Exy3dcPOgBzGox2CIBYttzipoiTf54g7TwKlt%2F18hsjapEs2YZwgSozB4sIM1m%2FPoaWhkn5kIiT45eE13L9csffj6Nje2FsnX68109o443MjKXpt2b6GtuQLVr%2FsZzJYtuFDbntMW%2FjautcUbczyg%3D%3D--P2KkauASi%2BtVjRrz--%2FgJF6E3r6uJqO4MkTjBYQQ%3D%3D; path=/; HttpOnly', 'Status': '404 Not Found', 'Strict-Transport-Security': 'max-age=30758400', 'X-Content-Type-Options': 'nosniff', 'X-Download-Options': 'noopen', 'X-Frame-Options': 'DENY', 'X-Permitted-Cross-Domain-Policies': 'none', 'X-Powered-By': 'Phusion Passenger(R) 6.0.8', 'X-RAILS-IP': '10.33.1.190', 'X-Request-Id': '35bce312-7984-4d29-8e8d-1ca861cab2b7', 'X-UA-Compatible': 'IE=edge', 'X-XSS-Protectio

UnboundLocalError: local variable 'candidates' referenced before assignment