### Bonus Questions:

If you complete all of the above, you can attempt these challenging bonus questions.

Open Secrets also gives a detailed breakdown of contributions by source. For example, for Tennessee's second district, this is located at https://www.opensecrets.org/races/candidates?cycle=2020&id=TN02&spec=N

Scrape these pages to get information on contributions by source. See if you can find anything interesting in terms of the source of contributions. Some examples to get you started:
* What does the overall distribution of funding sources look like?
* Is there any detectable difference in contribution sources between Democratic and Republican candidates?
* Do the funding sources for either the winning candidate or incumbent candidate differ from the other candidates?

In [None]:
import time
import pandas as pd
from bs4 import BeautifulSoup as BS
import re
from urllib.request import Request, urlopen
import warnings
warnings.filterwarnings('ignore')

In [None]:
def fundsource_df_creation(URL):
    """Scrape one race page and return its contribution-source tables as a dataframe.

    Args:
        URL: Address of the page to scrape. The state abbreviation and the
            district number are read from its ``id=`` query parameter
            (e.g. ``id=TN02`` gives state ``TN`` and district ``02``).

    Returns:
        A dataframe holding every "Type of Contribution" table found on the
        page, outer-merged together, with ``Name``, ``State`` and ``District``
        columns added, ``Amount`` cast to int64 and ``Percentage`` converted
        to a float fraction.

    Raises:
        ValueError: If the URL has no ``id=`` value, a candidate name cannot be
            parsed, the page has no contribution-source table, or there are
            more such tables than candidate names.
    """
    # Imported locally so the function does not depend on a new file-level import.
    from io import StringIO

    # Validate the URL before spending a request on it.
    race_id = re.search(r'id=([A-Z]+)(\d*)', URL)
    if race_id is None:
        raise ValueError(f"No 'id=' state/district value found in URL: {URL}")

    # Show that I'm human; will otherwise be blocked by captcha.
    request = Request(URL, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection once the page has been read.
    with urlopen(request) as response:
        webpage = response.read().decode('utf-8')
    soup = BS(webpage, 'lxml')

    # Pull the candidate names out of each candidate bio block on the page.
    name_pattern = re.compile(r'([A-Za-z]+\s*-*[A-Za-z]*\s*-*[A-Za-z]+)\s')
    list_of_candidates = []
    for bio in soup.find_all('div', attrs={'class': 'Members--bio u-richtext'}):
        match = name_pattern.search(bio.text)
        if match is None:
            raise ValueError(f"Could not parse a candidate name on {URL}: {bio.text!r}")
        list_of_candidates.append(match.group(1))

    # Reverse the list of names for order of appearance in tables on webpage.
    list_of_candidates.reverse()

    # Pull out *all* tables from the webpage; wrapping the markup in StringIO
    # avoids passing literal HTML to read_html, which newer pandas deprecates.
    all_tables_in_candidates_page = pd.read_html(StringIO(str(soup.find_all('table'))))

    # Keep only the contribution source tables (identified by their column name).
    list_of_tables = [
        table for table in all_tables_in_candidates_page
        if "Type of Contribution" in table
    ]
    if not list_of_tables:
        raise ValueError(f"No contribution-source tables found on {URL}")
    if len(list_of_tables) > len(list_of_candidates):
        raise ValueError(
            f"Found {len(list_of_tables)} contribution tables but only "
            f"{len(list_of_candidates)} candidate names on {URL}"
        )
    # NOTE(review): names are paired with tables purely by position; if a
    # candidate has no table the pairing may be off - confirm against the page.

    # Tag each table with its candidate's name and outer-merge them together.
    all_tables = None
    for name, table in zip(list_of_candidates, list_of_tables):
        table['Name'] = name
        if all_tables is None:
            all_tables = table
        else:
            all_tables = all_tables.merge(table, how='outer')

    # State abbreviation and district number come from the URL's id value.
    all_tables['State'] = race_id.group(1)
    all_tables['District'] = race_id.group(2)

    # Cast amount column as int (strip thousands separators and dollar sign).
    all_tables['Amount'] = (
        all_tables['Amount']
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .astype('int64')
    )

    # Cast percentage column as a float fraction.
    all_tables['Percentage'] = (
        all_tables['Percentage']
        .str.replace('%', '', regex=False)
        .astype('float64') / 100
    )

    # Reorder columns by position: the three columns added above first, then
    # the scraped columns at positions 1, 2 and 0.
    # Assumes the scraped table has exactly three columns - TODO confirm.
    all_tables = all_tables.iloc[:, [3, 4, 5, 1, 2, 0]]

    return all_tables

In [None]:
def pull_all_districts_info(URL_Base, reps_plus_abbreviations):
    """Scrape the contribution-source data for every district and combine it.

    Args:
        URL_Base: URL prefix to which each race id (state abbreviation plus
            two-digit district number, e.g. ``TN02``) is appended.
        reps_plus_abbreviations: Dataframe read by position: its second column
            is used as the state abbreviation and its third column as the
            number of districts to request for that state. To try a single
            state, filter this dataframe before calling.

    Returns:
        The outer merge of the dataframes returned by
        ``fundsource_df_creation`` for every requested page, or an empty
        dataframe when no page was requested.
    """
    all_districts = None

    # Cycle through all links for each district.
    for row in reps_plus_abbreviations.itertuples():
        abbrev = row[2]

        # DC is different: it has a single page with district number 00.
        if abbrev == 'DC':
            race_ids = ['DC00']
        else:
            # Districts are numbered from 1 and zero-padded to two digits.
            race_ids = [f"{abbrev}{k:02d}" for k in range(1, row[3] + 1)]

        for race_id in race_ids:
            district = fundsource_df_creation(URL_Base + race_id)

            # The first page scraped is the base into which all others merge,
            # whichever state (including DC) happens to come first.
            if all_districts is None:
                all_districts = district
            else:
                all_districts = all_districts.merge(district, how='outer')

            # Delay after every request, otherwise will get HTTP 429 from server.
            time.sleep(0.05)

    if all_districts is None:
        # Nothing was requested (e.g. empty input); return an empty frame.
        return pd.DataFrame()

    return all_districts

# Load the per-state table that drives the scrape.
reps_plus_abbreviations = pd.read_csv("Write_Data_Here/reps_plus_abbreviations.csv")

# Scrape the contribution-source information for every district.
all_contributions_by_source = pull_all_districts_info(
    'https://www.opensecrets.org/races/candidates?cycle=2020&id=',
    reps_plus_abbreviations,
)

# Load only the columns needed from the previously saved dataframe.
all_districts = pd.read_csv(
    "Write_Data_Here/all_districts.csv",
    usecols = ['Name', 'Party', 'Incumbent', 'Winner'],
)

# Attach party, incumbent and winner information via an outer merge on the
# columns the two dataframes share.
all_contributions_by_source = pd.merge(
    all_contributions_by_source,
    all_districts,
    how = 'outer',
)

# Keep the columns of interest, in the order they should be written out.
all_contributions_by_source = all_contributions_by_source[[
    'Name',
    'State',
    'District',
    'Party',
    'Incumbent',
    'Winner',
    'Amount',
    'Percentage',
    'Type of Contribution',
]]

# Save the result to csv for later use.
all_contributions_by_source.to_csv("Write_Data_Here/all_contributions_by_source.csv", index = False)