# An Analysis of Political Contributions During the 2020 House of Representatives Election
Name: Habeeb Kotun Jr.<br>
Team: Silver Sloths<br>
Date: 10/14/2021<br>

In [1]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [2]:
# Scrape the candidate summary table for Tennessee's 2nd District from OpenSecrets
URL = 'https://www.opensecrets.org/races/summary?cycle=2020&id=TN02&spec=N'

# A timeout keeps the cell from hanging forever on an unresponsive server
response = requests.get(URL, timeout=30)
# Stop here on an HTTP error rather than trying to parse an error page below
response.raise_for_status()

# Name the parser explicitly so BeautifulSoup does not have to guess one
soup = BS(response.text, 'html.parser')

# The first <table> on the page is the one this notebook reads
summary_table = soup.find('table')
if summary_table is None:
    raise ValueError(f'No <table> element found at {URL}')

# Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
TN02 = pd.read_html(StringIO(str(summary_table)))[0]
TN02

Unnamed: 0,Candidate,Raised,Spent,Cash on Hand,Last Report
0,Tim Burchett (R) • Incumbent • Winner,"$1,336,276","$878,488","$593,678",12/31/2020
1,Renee Hoyos (D),"$812,784","$816,793",$210,12/31/2020


In [3]:
# Candidate labels look like "Tim Burchett (R) • Incumbent • Winner":
# a name, a party code in parentheses, then optional bullet-separated tags.
# Parsing by pattern (instead of by fixed token positions) handles names of
# any length and labels that carry zero, one, or two status tags.
labels = TN02['Candidate'].astype(str)
name_party = labels.str.extract(r'^\s*(?P<name>.+?)\s*\((?P<party>[^)]*)\)')

# Assign candidate name to candidate column; when a label has no "(party)"
# part, fall back to the text before the first bullet
TN02['Candidate'] = name_party['name'].fillna(
    labels.str.split('•').str[0].str.strip()
)

# Create new column named party (non-word characters stripped from the code)
TN02['party'] = name_party['party'].str.replace(r'\W', '', regex=True)

# Create column named incumbent: True when an "Incumbent" tag is present
TN02['incumbent'] = labels.str.contains(r'•\s*Incumbent\b')

# Create column named race: True when a "Winner" tag is present
TN02['race'] = labels.str.contains(r'•\s*Winner\b')

# Remove non-digit characters from raised and spent columns
# and convert them to an integer dtype
for money_column in ('Raised', 'Spent'):
    TN02[money_column] = (
        TN02[money_column]
        .astype(str)
        .str.replace(r'\D', '', regex=True)
        .astype(int)
    )

# Create state and district columns from the race id in the request URL
# (e.g. "id=TN02" -> state "TN", district "02"); computed once for all rows
race_id = re.search(r'id=(\w\w)(\d\d)', URL)
if race_id is None:
    raise ValueError(f'No race id of the form "id=TN02" found in {URL!r}')
TN02['state'] = race_id.group(1)
TN02['district'] = race_id.group(2)

In [4]:
# Discard the columns that are not needed, then give the remaining
# scraped headers lowercase names -- done as a single method chain
TN02 = (
    TN02
    .drop(columns=['Cash on Hand', 'Last Report'])
    .rename(columns={'Candidate': 'name', 'Raised': 'raised', 'Spent': 'spent'})
)

In [5]:
# Display the current contents of the TN02 table
TN02

Unnamed: 0,name,raised,spent,party,incumbent,race,state,district
0,Tim Burchett,1336276,878488,R,True,True,TN,2
1,Renee Hoyos,812784,816793,D,False,False,TN,2
