In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from IPython.core.display import HTML

<b>1\. Start by scraping the data from the summary page for Tennessee's 2nd District, which is available at https://www.opensecrets.org/races/summary?cycle=2020&id=TN02&spec=N.</b>
   * The data that we want is contained in the "Total Raised and Spent" table.
   * Make a DataFrame showing, for each candidate:
        * the candidate's name
        * the candidate's party
        * state
        * district number
        * whether the candidate was an incumbent
        * whether the candidate won the race
        * the total amount raised by that candidate (as a numeric variable)
        * the total amount spent by the candidate (as a numeric variable)

<b>2\. Once you have working code for Tennessee's 2nd District, expand on your code to capture all of Tennessee's districts.</b>

<b>3\. Once you have working code for all of Tennessee's districts, expand on it to capture all states and districts. The number of representatives each state has can be found in a table on this page: https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120</b>

In [2]:
def check_soup(state, district, year):
    """Report whether OpenSecrets serves a usable candidates page for a race.

    Parameters
    ----------
    state : str
        State part of the race id placed in the URL (e.g. 'TN').
    district : str
        District part of the race id placed in the URL (e.g. '02').
    year : str or int
        Election cycle placed in the URL; compared against the page title
        as a string.

    Returns
    -------
    bool
        True when the page has a <title> whose text contains ``year`` and
        the page holds at least one <table>; False otherwise.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not answer within the timeout.
    """
    URL = f'https://www.opensecrets.org/races/candidates?cycle={year}&id={state}{district}&spec=N'
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(URL, timeout=30)
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed;
    # consider pinning one for reproducible parsing.
    soup = BS(response.text)

    # A page without a <title> cannot be a valid race page; treat it as missing
    # rather than failing on ``None.text``.
    title = soup.find('title')
    if title is None:
        return False

    return str(year) in title.text and len(soup.find_all('table')) > 0

In [3]:
def get_election_info(state, district, year, data = None):
    """Scrape one race's candidates page and add one row per candidate to ``data``.

    Every <strong> element on the page is treated as a candidate heading, and
    the page's <table> elements are consumed three at a time, in document
    order, for each heading:

    1. a header-less two-column table of totals (label, amount),
    2. a table with 'Type of Contribution' and 'Amount' columns,
    3. a table with 'Type of Disclosure' and 'Amount' columns.

    Parameters
    ----------
    state : str
        State part of the race id placed in the URL (e.g. 'TN').
    district : str
        District part of the race id placed in the URL (e.g. '02').
    year : str
        Election cycle placed in the URL.
    data : pandas.DataFrame, optional
        Previously collected rows. It is not modified; when omitted an empty
        DataFrame is used.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by the newly scraped rows, or ``data`` itself when
        the page has no <strong> elements.

    Raises
    ------
    StopIteration
        If the page has fewer than three tables per <strong> element.
    requests.exceptions.RequestException
        If the request fails or does not answer within the timeout.
    """
    if data is None:
        data = pd.DataFrame()

    URL = f'https://www.opensecrets.org/races/candidates?cycle={year}&id={state}{district}&spec=N'
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(URL, timeout=30)
    soup = BS(response.text)

    candidates = soup.find_all('strong')
    tables = iter(soup.find_all('table'))

    # Collect plain dicts and build the frame once: DataFrame.append no longer
    # exists in pandas 2.x, and growing a frame row by row is quadratic.
    records = []

    for candidate in candidates:

        text = candidate.text.replace('\n', '').replace('\t', '')
        name = text.split(' (')[0]
        # The party is whatever sits between the first '(' and the first ')'.
        party = text[text.find('(') + 1:text.find(')')]

        qualifiers = [qualifier.text for qualifier in candidate.find_all('i')]
        incumbent = 'Incumbent' in qualifiers
        winner = 'Winner' in qualifiers

        # read_html wants a file-like object; passing literal HTML is deprecated.
        table_1 = pd.read_html(StringIO(str(next(tables))))[0]
        total_category = table_1[0].str.strip(':').to_list()
        total_amount = table_1[1].str.replace('[$,]', '', regex = True).astype(int).to_list()

        table_2 = pd.read_html(StringIO(str(next(tables))))[0]
        contribution_category = (table_2['Type of Contribution']
                                 .str.replace(r'[^A-Za-z\s\-]', '', regex = True)
                                 .str.strip()
                                 .to_list())
        contribution_amounts = table_2['Amount'].str.replace('[$,]', '', regex = True).astype(int).to_list()

        table_3 = pd.read_html(StringIO(str(next(tables))))[0]
        disclosure_category = table_3['Type of Disclosure'].to_list()
        disclosure_amount = table_3['Amount'].str.replace('[$,]', '', regex = True).astype(int).to_list()

        keys = (['Name', 'Party', 'State', 'District', 'Year', 'Incumbent', 'Winner'] +
                total_category + contribution_category + disclosure_category)

        values = ([name, party, state, district, year, incumbent, winner] +
                  total_amount + contribution_amounts + disclosure_amount)

        records.append(dict(zip(keys, values)))

    if not records:
        return data

    scraped = pd.DataFrame(records)

    if len(data) == 0:
        # Nothing to concatenate yet: keep the caller's column order, then add
        # any columns seen only in the scrape. This also avoids concatenating
        # an empty frame, which would blur the numeric/boolean dtypes.
        ordered = list(data.columns) + [col for col in scraped.columns if col not in data.columns]
        return scraped.reindex(columns = ordered)

    return pd.concat([data, scraped], ignore_index = True)

In [4]:
# State codes used to build race ids, in the order they will be scraped
# (50 states plus DC).
states = """
    AL AK AZ AR CA CO CT DC DE FL GA
    HI ID IL IN IA KS KY LA ME MD
    MA MI MN MS MO MT NE NV NH NJ
    NM NY NC ND OH OK OR PA RI SC
    SD TN TX UT VT VA WA WV WI WY
""".split()

# Zero-padded district ids '01' through '53'.
# NOTE(review): 53 is presumably the largest House delegation in the scraped cycle; confirm.
districts = [f'{number:02d}' for number in range(1, 54)]

# Column layout of the results frame.
column_names = [
    # candidate / race identifiers
    'Name', 'Party', 'State', 'District', 'Year', 'Incumbent', 'Winner',
    # totals
    'Raised', 'Spent', 'Cash on Hand',
    # contribution types
    'Small Individual Contributions', 'Large Individual Contributions',
    'PAC Contributions', 'Candidate self-financing', 'Other',
    # disclosure types
    'Full Disclosure', 'Incomplete Disclosure', 'No Disclosure',
]

# Empty frame that the scraping loop fills in.
data = pd.DataFrame(columns=column_names)

In [5]:
# Walk every state: first the two ids 'S1' and 'S2' (presumably the Senate
# seats - confirm), then the numbered districts.
for state in states:

    for senate_seat in ('S1', 'S2'):
        if check_soup(state, senate_seat, '2020'):
            data = get_election_info(state, senate_seat, '2020', data)

    # Stop at the first district whose page fails the check; later district
    # numbers for this state are not tried.
    for district in districts:
        if check_soup(state, district, '2020'):
            data = get_election_info(state, district, '2020', data)
        else:
            break

In [7]:
# Write the scraped results to CSV; index = False leaves the row index out of the file.
data.to_csv('../data/election_finances.csv', index = False)

<b>4\. Using your scraped data, investigate different relationships between candidates and the amount of money they raised. Here are some suggestions to get you started, but feel free to pose your own questions or do additional exploration:</b>

a. How often does the candidate who raised more money win a race?  
b. How often does the candidate who spent more money win a race?  
c. Does the difference between either money raised or money spent seem to influence the likelihood of a candidate winning a race?  
d. How often does the incumbent candidate win a race?  
e. Can you detect any relationship between amount of money raised and the incumbent status of a candidate?

### Bonus Questions:
If you complete all of the above, you can attempt these challenging bonus questions.

Open Secrets also gives a detailed breakdown of contributions by source. For example, for Tennessee's second district, this is located at https://www.opensecrets.org/races/candidates?cycle=2020&id=TN02&spec=N

Scrape these pages to get information on contributions by source. See if you can find anything interesting in terms of the source of contributions. Some examples to get you started:
* What does the overall distribution of funding sources look like?
* Is there any detectable difference in contribution sources between Democrat and Republican candidates?
* Do the funding sources for either the winning candidate or incumbent candidate differ from the other candidates?