### Part 1: Data Gathering
1. Start by acquiring the data for Tennessee's 7th District, which is available at https://www.opensecrets.org/races/summary?cycle=2020&id=TN07&spec=N. Clicking the "Download .csv file" button on that page gives you a CSV for this district. However, we don't want to click this button by hand for every district, so we'll use Python to automate the process. Start by sending a GET request to the URL behind the download button, https://www.opensecrets.org/races/summary.csv?cycle=2020&id=TN07, and convert the result to a DataFrame.


In [2]:
# import os
# os.chdir('..')
# print(f'Current working directory is {os.getcwd()}')
# from folder.file import function_name
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import pandas as pd
import re
import requests
import plotly.express as px
from bs4 import BeautifulSoup, SoupStrainer
from IPython.core.display import HTML
from io import StringIO
from urllib.request import Request, urlopen

In [4]:
def parse_html(url, timeout=30):
    """Download a page and return its ``wikitable`` tables as one HTML string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The string representation of the list of every
        ``<table class="wikitable">`` element found in the page, suitable
        for handing to ``pd.read_html``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features='html.parser')
    # find_all is the current spelling of the deprecated findAll alias.
    tables = soup.find_all('table', {'class': 'wikitable'})
    return str(tables)

In [5]:
def get_wiki_url(URL):
    """Return the state / seat-count table scraped from the page at ``URL``.

    The ``wikitable`` markup of the page is parsed with ``pd.read_html`` and
    the second table in the result is used. Only its ``State`` and
    ``Total seats`` columns are kept, with ``Total seats`` relabelled
    ``Districts``.
    """
    html = parse_html(URL)
    parsed_tables = pd.read_html(StringIO(html))
    # Index 1 selects the second table parsed from the page.
    seat_table = parsed_tables[1]
    wanted = seat_table[['State', 'Total seats']]
    return wanted.rename(columns={'Total seats': 'Districts'})

In [6]:
def get_abbrev_url(URL):
    """Return the state-name / postal-code table scraped from the page at ``URL``.

    The ``wikitable`` markup of the page is parsed with ``pd.read_html`` and
    the second table in the result is used. Only its ``Name`` and ``USPS``
    columns are kept; the labels ``Name`` and ``Unnamed: 5_level_1`` are
    renamed to ``State`` and ``Code``.
    """
    html = parse_html(URL)
    parsed_tables = pd.read_html(StringIO(html))
    # Index 1 selects the second table parsed from the page.
    abbrev_table = parsed_tables[1]
    wanted = abbrev_table[['Name', 'USPS']]
    # NOTE(review): 'Unnamed: 5_level_1' looks like the auto-generated
    # sub-header label under 'USPS' — confirm against the live page.
    new_labels = {'Name': 'State', 'Unnamed: 5_level_1': 'Code'}
    return wanted.rename(columns=new_labels)

In [7]:
def wiki_states_merge():
    """Build a table of state name, district count and state code.

    The abbreviation table and the 2020 House-election seat table are both
    scraped from Wikipedia and merged on their shared column(s). The
    outermost column level is then dropped and rows whose ``Code`` is
    ``'NB'`` are removed.

    Returns:
        The merged, filtered DataFrame.
    """
    abbrev_page = 'https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_abbreviations'
    seats_page = 'https://en.wikipedia.org/wiki/2020_United_States_House_of_Representatives_elections'

    codes = get_abbrev_url(abbrev_page)
    seats = get_wiki_url(seats_page)

    merged = pd.merge(seats, codes)
    # Drop the outer header level so only the inner column labels remain.
    flat = merged.droplevel(0, axis=1)

    # NOTE(review): 'NB' is presumably a superseded or duplicate code entry
    # in the abbreviation table — confirm before relying on this filter.
    keep = flat['Code'] != 'NB'
    return flat[keep]

Unnamed: 0,State,Districts,Code
0,Alabama,7,AL
1,Alaska,1,AK
2,Arizona,9,AZ
3,Arkansas,4,AR
4,California,53,CA
5,Colorado,7,CO
6,Connecticut,5,CT
7,Delaware,1,DE
8,Florida,27,FL
9,Georgia,14,GA


In [8]:
def get_tennessee_districts(URL_start, num_districts=9, timeout=30):
    """Download one summary CSV per district and stack them into one DataFrame.

    Args:
        URL_start: Download URL up to (but not including) the two-digit
            district number, which is appended for each request.
        num_districts: How many districts to fetch, numbered from 1. The
            default of 9 matches the previously hard-coded loop bound.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        The per-district frames concatenated in district order, each with a
        leading ``District`` column holding the zero-padded district number.
        Row indexes are kept as read from each CSV, so they restart at 0 for
        every district. If ``num_districts`` is 0 or less, an empty frame
        with only a ``District`` column is returned.

    Raises:
        requests.HTTPError: If any download answers with a 4xx/5xx status.
    """
    frames = []
    for num in range(1, num_districts + 1):
        district = str(num).zfill(2)
        response = requests.get(URL_start + district, timeout=timeout)
        # Stop on an error status rather than parsing an error page as CSV.
        response.raise_for_status()
        district_df = pd.read_csv(StringIO(response.text))
        district_df.insert(0, 'District', district)
        frames.append(district_df)

    if not frames:
        # pd.concat raises on an empty list; hand back an empty table instead.
        return pd.DataFrame(columns=['District'])
    return pd.concat(frames)

# Fetch the 2020-cycle summary CSVs for the Tennessee districts; the returned
# DataFrame is what the notebook renders below this cell.
get_tennessee_districts('https://www.opensecrets.org/races/summary.csv?cycle=2020&id=TN')

Unnamed: 0,District,cid,FirstLastP,Rcpts,Spent,PACs,Indivs,Cand,Other,EndCash,...,Result,CRPICO,State,IncCID,Incumbent,primarydate,DistIDCurr,capeye,sort,SmLgIndivsNote
0,1,N00046688,Diana Harshbarger (R),2126945.6,1869099.77,222800.0,359728.5,1461293.0,83124.1,257845.83,...,W,O,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
1,1,N00046686,Blair Nicole Walsingham (D),140209.14,134994.55,1520.0,138689.14,0.0,0.0,5214.59,...,L,O,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
2,1,N00047760,Steve Holder (I),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,O,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
0,2,N00041594,Tim Burchett (R),1336275.75,878487.63,269535.0,1072845.61,0.0,-6104.86,593677.72,...,W,I,Tennessee,,,2020-08-06 00:00:00 +0000,TN02,0,1,N
1,2,N00041699,Renee Hoyos (D),812783.86,816793.15,3100.0,807459.01,0.0,2224.85,209.82,...,L,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
2,2,N00047761,Matthew Campbell (I),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
0,3,N00030815,Chuck Fleischmann (R),1051653.39,381411.2,453858.46,603344.93,0.0,-5550.0,1880341.32,...,W,I,Tennessee,,,2020-08-06 00:00:00 +0000,TN03,0,1,N
1,3,N00046911,Meg Gorman (D),85843.21,77759.83,2671.6,81271.61,2000.0,-100.0,8083.38,...,L,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
2,3,N00046589,Nancy Baxley (I),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
3,3,N00047762,Amber Hysell (I),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N


In [10]:
def create_states_df(URL_start, dataframe, timeout=30):
    """Download the summary CSV for every district of every listed state.

    Args:
        URL_start: Download URL up to (but not including) the state code and
            two-digit district number, which are appended for each request.
        dataframe: Table with a ``Districts`` column (number of districts per
            state) and a ``Code`` column (state code used in the URL).
        timeout: Seconds to wait for each response before giving up.

    Returns:
        One DataFrame with all districts stacked and a fresh 0..n-1 index.
        ``District`` (zero-padded number) is the first column and the
        ``State`` column from each downloaded CSV is moved to second place.
        If no district is requested at all, an empty frame with just those
        two columns is returned.

    Raises:
        requests.HTTPError: If any download answers with a 4xx/5xx status.
        KeyError: If a downloaded CSV has no ``State`` column.
    """
    frames = []
    for district_count, code in zip(dataframe['Districts'], dataframe['Code']):
        # A missing count means there is nothing to fetch for this state.
        if pd.isna(district_count):
            continue
        for num in range(1, int(district_count) + 1):
            district = str(num).zfill(2)
            response = requests.get(URL_start + code + district, timeout=timeout)
            # Stop on an error status rather than parsing an error page as CSV.
            response.raise_for_status()
            district_df = pd.read_csv(StringIO(response.text))
            district_df.insert(0, 'District', district)
            # Move State next to District so the identifying columns lead.
            district_df.insert(1, 'State', district_df.pop('State'))
            frames.append(district_df)

    if not frames:
        # pd.concat raises on an empty list; hand back an empty table instead.
        return pd.DataFrame(columns=['District', 'State'])
    return pd.concat(frames, ignore_index=True)

# Download the 2020-cycle summary for every district of every state returned
# by wiki_states_merge(); the combined DataFrame is rendered below this cell.
create_states_df('https://www.opensecrets.org/races/summary.csv?cycle=2020&id=', wiki_states_merge())

Unnamed: 0,District,State,cid,FirstLastP,Rcpts,Spent,PACs,Indivs,Cand,Other,...,VotePercent,Result,CRPICO,IncCID,Incumbent,primarydate,DistIDCurr,capeye,sort,SmLgIndivsNote
0,01,Alabama,N00044245,Jerry Carl (R),1971321.50,1859348.91,387000.00,1044195.95,434655.50,105470.05,...,64.88,W,O,,,2020-03-03 00:00:00 +0000,,0,2,N
1,01,Alabama,N00044750,James Averhart (D),80094.95,78973.24,0.00,50849.95,29245.00,0.00,...,35.04,L,O,,,2020-03-03 00:00:00 +0000,,0,2,N
2,02,Alabama,N00041295,Barry Moore (R),650806.75,669367.70,230281.65,408536.20,11500.00,488.90,...,65.30,W,O,,,2020-03-03 00:00:00 +0000,,0,2,N
3,02,Alabama,N00045944,Phyllis Harvey-Hall (D),56049.68,55988.07,2032.00,42411.95,10575.41,1030.32,...,34.61,L,O,,,2020-03-03 00:00:00 +0000,,0,2,N
4,02,Alabama,N00045631,John Page (L),0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,,O,,,2020-03-03 00:00:00 +0000,,0,2,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259,01,Wyoming,N00035504,Liz Cheney (R),3003883.34,3060166.78,1292490.00,1169995.46,0.00,541397.88,...,68.60,W,I,,,2020-08-18 00:00:00 +0000,WY01,0,1,N
1260,01,Wyoming,N00047272,Lynnette Grey Bull (D),134597.32,132234.75,2800.00,130197.32,0.00,1600.00,...,24.60,L,C,,,2020-08-18 00:00:00 +0000,,0,2,N
1261,01,Wyoming,N00047207,Zoilo Adalia (3),0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,,C,,,2020-08-18 00:00:00 +0000,,0,2,N
1262,01,Wyoming,N00035139,Richard Brubaker (L),0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,,C,,,2020-08-18 00:00:00 +0000,,0,2,N
