In [2]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [3]:
# Get request
URL = 'https://www.opensecrets.org/races/summary?cycle=2020&id=TN02&spec=N'

# requests has no default timeout, so set one to keep the cell from hanging
# forever if the server never answers.
response = requests.get(URL, timeout = 30)

# Stop here on a 4xx/5xx status instead of parsing an error page downstream.
response.raise_for_status()
response

<Response [200]>

In [4]:
# Get soup
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits GuessedAtParserWarning, so the parse can
# differ between machines. 'html.parser' ships with Python.
soup = BS(response.text, 'html.parser')

In [5]:
# Read table as dataframe
# The first <table> on the page is serialised back to HTML and wrapped in
# StringIO, because passing a literal HTML string to read_html is deprecated.
TN02 = pd.read_html(StringIO(str(soup.find('table'))))[0]

In [6]:
# Make column titles lowercase
TN02.columns = TN02.columns.str.lower()

# Split candidate column on "(" and "•" and add named columns to TN02.
# NOTE: the pieces are positional. The assignment needs exactly four pieces in
# the widest row, and a row that has only one marker puts it under 'incumbent'
# whichever marker it is.
TN02[['name', 'party', 'incumbent', 'winner']] = TN02['candidate'].str.split(r'\(|•', expand = True)

# Drop candidate column and rearrange columns for readability
TN02 = TN02.drop(columns = 'candidate')[['name', 'party','incumbent', 'winner', 'raised', 'spent']]

# Clean the party column
TN02['party'] = TN02['party'].str.replace(r'\)', '', regex = True)

# The split keeps whatever spacing surrounded the delimiters, so trim every
# piece; otherwise values such as 'Tim Burchett ' fail equality checks and joins.
text_cols = ['name', 'party', 'incumbent', 'winner']
for col in text_cols:
    TN02[col] = TN02[col].str.strip()

# Clean and change types of raised and spent
# int64 is spelled out because plain 'int' maps to a platform-dependent width.
money_cols = ['raised', 'spent']
for col in money_cols:
    TN02[col] = TN02[col].str.replace(r'\$|,', '', regex = True).astype('int64')

In [7]:
TN02

Unnamed: 0,name,party,incumbent,winner,raised,spent
0,Tim Burchett,R,Incumbent,Winner,1336276,878488
1,Renee Hoyos,D,,,812784,816793


In [8]:
# Re-read the raw table; StringIO avoids the deprecated literal-HTML input.
TN02b = pd.read_html(StringIO(str(soup.find('table'))))[0]

# Make column titles lowercase
TN02b.columns = TN02b.columns.str.lower()

# Derive named columns from the candidate text and add them to TN02b.
# The flags are looked up by keyword, so they do not depend on position.
TN02b['incumbent'] = TN02b['candidate'].str.contains(r'Incumbent')
TN02b['winner'] = TN02b['candidate'].str.contains(r'Winner')

# str.extract gives NaN when a candidate has no "(party)" instead of raising,
# and the capture group already excludes the parentheses.
TN02b['party'] = TN02b['candidate'].str.extract(r'\((.*?)\)', expand = False)

# The name is everything before the first "(", whatever the length of the
# party code, with the trailing space removed.
TN02b['name'] = TN02b['candidate'].str.split('(', n = 1).str[0].str.strip()

# Drop candidate column and rearrange columns for readability
TN02b = TN02b.drop(columns = 'candidate')[['name', 'party','incumbent', 'winner', 'raised', 'spent']]

# Clean and change types of raised and spent
# int64 is spelled out because plain 'int' maps to a platform-dependent width.
money_cols = ['raised', 'spent']
for col in money_cols:
    TN02b[col] = TN02b[col].str.replace(r'\$|,', '', regex = True).astype('int64')

In [9]:
TN02b

Unnamed: 0,name,party,incumbent,winner,raised,spent
0,Tim Burchett,R,True,True,1336276,878488
1,Renee Hoyos,D,False,False,812784,816793


In [46]:
def candidate_info(df):
    """Extract candidate info from Open Secrets tables, then drop candidate column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column 'candidate' plus 'raised' and 'spent' columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with columns name, party, incumbent, winner, raised, spent.
        'party' is the text inside the first pair of parentheses (NaN when
        there is none), 'name' is the text before the first "(" with
        surrounding whitespace removed, and 'incumbent' / 'winner' flag
        whether those words appear in the candidate text.
    """
    candidate = df['candidate']

    # assign() works on the frame returned by drop(), so the caller's frame
    # never receives the derived columns.
    info = df.drop(columns = 'candidate').assign(
        # Split on the first "(" regardless of how long the party code is.
        name = candidate.str.split('(', n = 1).str[0].str.strip(),
        # extract returns NaN rather than raising when no "(...)" is present.
        party = candidate.str.extract(r'\((.*?)\)', expand = False),
        incumbent = candidate.str.contains(r'Incumbent'),
        winner = candidate.str.contains(r'Winner'),
    )

    return info[['name', 'party', 'incumbent', 'winner', 'raised', 'spent']]

def clean_money(df):
    """Clean and convert the money columns.

    Removes "$" and "," from the 'raised' and 'spent' columns and converts
    them to int64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'raised' and 'spent' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose 'raised' and 'spent' columns are int64.

    Raises
    ------
    ValueError
        If a value is not an integer once "$" and "," are removed.
    """
    money_cols = ['raised', 'spent']

    # Work on a copy so the caller's frame keeps its raw values.
    cleaned = df.copy()
    for col in money_cols:
        cleaned[col] = (
            cleaned[col]
            # Going through str lets already-converted integer columns pass,
            # so re-running the cleaning step does not fail on `.str`.
            .astype(str)
            .str.replace(r'\$|,', '', regex = True)
            # Explicit width: plain 'int' is platform-dependent.
            .astype('int64')
        )

    return cleaned

def format_os_table(df):
    """Lowercase column names, extract candidate info, rearrange columns, and clean the money columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table as read from the page. Its column labels are left as they
        are; the lowercasing is applied to a renamed copy.

    Returns
    -------
    pandas.DataFrame
        The result of candidate_info followed by clean_money on the
        lowercased table.
    """
    # rename() returns a new frame, so the caller's column labels are not
    # overwritten as they were with `df.columns = ...`.
    lowered = df.rename(columns = str.lower)
    with_info = candidate_info(lowered)

    return clean_money(with_info)

In [47]:
# Re-read the raw table (StringIO avoids the deprecated literal-HTML input)
# and run it through the full formatting pipeline.
TN02c = pd.read_html(StringIO(str(soup.find('table'))))[0]
format_os_table(TN02c)

Unnamed: 0,name,party,incumbent,winner,raised,spent
0,Tim Burchett,R,True,True,1336276,878488
1,Renee Hoyos,D,False,False,812784,816793
