In [1]:
import pandas as pd
from bs4 import BeautifulSoup as BS
import numpy as np 
import requests
import re

In [2]:
# Candidate listing page for a single race; the `id` query parameter selects
# the state/district (TN07 here) and `cycle` the election year.
URL = 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN07&spec=N'
# Alternative race kept for quick switching:
#URL = 'https://www.opensecrets.org/races/candidates?cycle=2020&id=TN02&spec=N'

# A timeout keeps a stalled connection from hanging the kernel indefinitely,
# and raise_for_status() stops the notebook before an HTTP error page is
# parsed as if it were candidate data.
response = requests.get(URL, timeout=30)
response.raise_for_status()

In [3]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree can differ between
# machines. 'html.parser' ships with Python and is always available.
soup = BS(response.text, 'html.parser')

Regex helper functions for extracting specific pieces of information from the page's HTML.

In [4]:
# The helpers below pick individual fields out of a candidate header string.
def name_extractor(candidate):
    """Return the first pair of adjacent capitalised words in ``candidate``.

    Each word must start with an uppercase A-Z letter followed by at least
    one more word character. Raises IndexError when no such pair exists.
    """
    capitalised_pairs = re.findall(r'[A-Z]\w+ [A-Z]\w+', candidate)
    return capitalised_pairs[0]

def party_extractor(candidate):
    """Return the first parenthesised single-character tag, e.g. ``'(R)'``.

    The parentheses are part of the returned string. Raises IndexError when
    ``candidate`` contains no such tag.
    """
    party_tags = re.findall(r'\(\w\)', candidate)
    return party_tags[0]

def incumbent_finder(candidate):
    """Return ``'Incumbent'`` if that word occurs in ``candidate``, else ``'N/a'``."""
    match = re.search(r'Incumbent', candidate)
    return match.group(0) if match else 'N/a'

def winner_finder(candidate):
    """Return ``'Winner'`` if that word occurs in ``candidate``, else ``'N/a'``."""
    match = re.search(r'Winner', candidate)
    return match.group(0) if match else 'N/a'

def percentage_vote_finder(candidate):
    """Return every vote percentage that directly follows a ``(`` in ``candidate``.

    The result is a list of strings such as ``['69.9%']``; it is empty when
    the header carries no percentage. Whole-number values ("70%") and values
    with any number of decimals ("69.93%") are recognised.
    """
    # The decimal point is escaped so it no longer matches arbitrary
    # characters, and the fractional part is optional so integer percentages
    # are not silently dropped.
    return re.findall(r'(?<=\()\d+(?:\.\d+)?%', candidate)

# The result depends on the page being parsed: every state/district
# combination has its own page, which is why the soup is passed in.
def state_and_district_finder(soup):
    """Extract state and district text from the page's ``<h1>`` elements.

    Parameters
    ----------
    soup : parsed page object exposing ``find_all``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``state``: every capitalised word that immediately follows a ``>``
        in the stringified ``<h1>`` list.
        ``district``: every ``'District NN'`` (two-digit) occurrence.
        Either list is empty when nothing matches.
    """
    # find_all is the supported spelling; findAll is a deprecated alias.
    headings = str(soup.find_all('h1'))
    state = re.findall(r'(?<=>)[A-Z]\w+', headings)
    district = re.findall(r'District \d{2}', headings)
    return state, district


Let's extract the state and district number for the page. These will be the same for all candidates on a page.

In [5]:
# Holds a (state matches, district matches) pair; each element is the list of
# regex matches found in the page's <h1> text.
state_and_district = state_and_district_finder(soup)

First extract the candidate headers from the page, then extract each candidate's info from those headers into separate lists. Then take the money information from each candidate's table and put it into an array where rows = candidates and columns = money categories (Raised, Spent, Cash on Hand).

In [6]:
# Header text for each candidate: one whitespace-stripped string per bio <div>.
candidates = [
    bio.text.strip()
    for bio in soup.find_all('div', class_="Members--bio u-richtext")
]

# Dollar figures from every finance table, flattened in page order, with the
# '$' sign and thousands separators removed before conversion to int.
money = [
    int(cell.text.replace('$', '').replace(',', ''))
    for table in soup.find_all('table', class_='Members--table')
    for cell in table.find_all('td', class_='Members--number')
]

# Each candidate is expected to contribute exactly three figures
# (Raised, Spent, Cash on Hand). Check this up front so a layout change
# produces a readable message rather than a bare reshape error.
MONEY_COLUMNS = 3
if len(money) != len(candidates) * MONEY_COLUMNS:
    raise ValueError(
        f"expected {MONEY_COLUMNS} money figures for each of "
        f"{len(candidates)} candidates, found {len(money)}"
    )
money_table = np.reshape(money, (len(candidates), MONEY_COLUMNS))

# One list per field, aligned index-for-index with `candidates`.
names = [name_extractor(header) for header in candidates]
party_affiliation = [party_extractor(header) for header in candidates]
incumbent_status = [incumbent_finder(header) for header in candidates]
winner_status = [winner_finder(header) for header in candidates]
percentage_vote = [percentage_vote_finder(header) for header in candidates]

In [7]:
# Show the element type of the first "raised" figure in the money table.
type(money_table[0, 0])

numpy.int32

Now let's merge all the data together into one DataFrame.

In [8]:
# Build one row per candidate. The state and district match lists are
# repeated once per candidate so they line up with the per-candidate lists.
n_candidates = len(candidates)
state_matches, district_matches = state_and_district

data_dictionary = {
    'Name': names,
    'Party': party_affiliation,
    'State': state_matches * n_candidates,
    'District Number': district_matches * n_candidates,
    'Incumbent Status': incumbent_status,
    'Winner Status': winner_status,
    'Percentage of Vote': percentage_vote,
    # Columns 0 and 1 of the money table hold the raised and spent totals.
    'Total Amount Raised': list(money_table[:, 0]),
    'Total Amount Spent': list(money_table[:, 1]),
}
congressional_races_TN_07_2020 = pd.DataFrame(data_dictionary)

In [9]:
# Display the dtype pandas assigned to each column of the assembled frame.
congressional_races_TN_07_2020.dtypes

Name                   object
Party                  object
State                  object
District Number        object
Incumbent Status       object
Winner Status          object
Percentage of Vote     object
Total Amount Raised     int32
Total Amount Spent      int32
dtype: object