In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

In [2]:
# Download the OpenSecrets 2020 race summary (CSV) for Tennessee's 7th district
# and load it into a DataFrame.
URL = 'https://www.opensecrets.org/races/summary.csv?cycle=2020&id=TN07'
# Without a timeout, requests waits indefinitely on an unresponsive server,
# which would leave this cell hanging forever.
response = requests.get(URL, timeout=30)
print(type(response))
if response.status_code == requests.codes.ok:
    print('Request is okay!')
else:
    # Raises requests.HTTPError for 4xx/5xx responses.
    response.raise_for_status()
# The response body is CSV text; wrap it in a file-like object for pandas.
TN_district7 = pd.read_csv(StringIO(response.text), sep=',')
TN_district7

<class 'requests.models.Response'>
Request is okay!


Unnamed: 0,cid,FirstLastP,Rcpts,Spent,PACs,Indivs,Cand,Other,EndCash,LgIndivs,...,Result,CRPICO,State,IncCID,Incumbent,primarydate,DistIDCurr,capeye,sort,SmLgIndivsNote
0,N00041873,Mark Green (R),1194960.47,935486.67,171900.0,819151.42,0.0,203909.05,287888.55,819151.42,...,W,I,Tennessee,,,2020-08-06 00:00:00 +0000,TN07,0,1,N
1,N00045536,Kiran Sreepada (D),206644.28,207190.98,4000.0,202644.28,0.0,0.0,0.0,179129.75,...,L,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
2,N00047077,Ronald Brown (I),1750.0,0.0,0.0,1750.0,0.0,0.0,9006.0,300.0,...,L,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
3,N00046592,Scott Vieira Jr (I),655.47,1048.51,10.0,45.0,35.0,565.47,-196.52,0.0,...,L,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
4,N00045535,Benjamin Estes (3),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N


In [3]:
# Build a {state name: abbreviation} lookup from a table on Wikipedia.
URL = 'https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_abbreviations'
# pd.read_html returns a list with one DataFrame per table found on the page;
# [1] picks the second one.
# NOTE(review): selection is purely positional and will silently pick a
# different table if the page layout changes — confirm it is still the right one.
states_df = pd.read_html(URL)[1]
# Keep only element [1] of each column label. Presumably the header is
# two-level (the 'Unnamed: N_level_1' names below suggest so) — TODO confirm.
states_df.columns = states_df.columns.map(lambda x: x[1])
states_df = (
    states_df
    # reset_index() adds an 'index' column, which the drop() below removes again
    # together with the columns that are not needed for the lookup.
    .reset_index()
    .drop(columns = ['index', 'Status of region', 'Unnamed: 2_level_1', 'Unnamed: 4_level_1', 'Unnamed: 5_level_1', 'Unnamed: 6_level_1', 'GPO', 'AP', 'Other abbreviations'])
    # Discard rows that have a missing value in any remaining column.
    .dropna()
    .rename(columns = {'Name': 'State', 'Unnamed: 3_level_1': 'Abbreviation'})
    # Remove the row labelled 0, then renumber the remaining rows from 0.
    # NOTE(review): row 0 is presumably a non-state entry — verify against the table.
    .drop(0).reset_index(drop=True)
)
# Mapping of 'State' -> 'Abbreviation'; read by retrieve_2020_state_district_data.
states_abr_dict = states_df.set_index('State')['Abbreviation'].to_dict()
states_abr_dict

{'Alabama': 'AL',
 'Alaska': 'AK',
 'Arizona': 'AZ',
 'Arkansas': 'AR',
 'California': 'CA',
 'Colorado': 'CO',
 'Connecticut': 'CT',
 'Delaware': 'DE',
 'District of Columbia': 'DC',
 'Florida': 'FL',
 'Georgia': 'GA',
 'Hawaii': 'HI',
 'Idaho': 'ID',
 'Illinois': 'IL',
 'Indiana': 'IN',
 'Iowa': 'IA',
 'Kansas': 'KS',
 'Kentucky': 'KY',
 'Louisiana': 'LA',
 'Maine': 'ME',
 'Maryland': 'MD',
 'Massachusetts': 'MA',
 'Michigan': 'MI',
 'Minnesota': 'MN',
 'Mississippi': 'MS',
 'Missouri': 'MO',
 'Montana': 'MT',
 'Nebraska': 'NE',
 'Nevada': 'NV',
 'New Hampshire': 'NH',
 'New Jersey': 'NJ',
 'New Mexico': 'NM',
 'New York': 'NY',
 'North Carolina': 'NC',
 'North Dakota': 'ND',
 'Ohio': 'OH',
 'Oklahoma': 'OK',
 'Oregon': 'OR',
 'Pennsylvania': 'PA',
 'Rhode Island': 'RI',
 'South Carolina': 'SC',
 'South Dakota': 'SD',
 'Tennessee': 'TN',
 'Texas': 'TX',
 'Utah': 'UT',
 'Vermont': 'VT',
 'Virginia': 'VA',
 'Washington': 'WA',
 'West Virginia': 'WV',
 'Wisconsin': 'WI',
 'Wyoming': 'WY

In [4]:
# Scrape a table of per-state seat totals from Wikipedia's 2020 House elections
# page and attach each state's abbreviation from states_df.
URL = 'https://en.wikipedia.org/wiki/2020_United_States_House_of_Representatives_elections'
# A timeout stops the cell from hanging forever on an unresponsive server.
response = requests.get(URL, timeout=30)
# Raise requests.HTTPError on 4xx/5xx, then always build `soup` from THIS
# response. Previously a non-200, non-error status skipped the assignment,
# leaving `soup` undefined (or stale from an earlier run of the cell).
response.raise_for_status()
soup = BeautifulSoup(response.text, features="html.parser")
# Only hand the tables with class 'wikitable' to pandas.
tables_html = str(soup.find_all('table', attrs={'class' : 'wikitable'}))
# [1] selects the second parsed table; missing cells are filled with '-'.
# NOTE(review): positional selection depends on the current page layout.
all_states_df = pd.read_html(StringIO(tables_html))[1].fillna('-')
# Keep only element [1] of each column label (presumably the lower level of a
# two-level header — TODO confirm).
all_states_df.columns = all_states_df.columns.map(lambda x: x[1])
all_states_df = all_states_df.drop(columns=['Seats', 'Change'])
# Join on the state name so every row also carries its abbreviation; states
# missing from either frame are dropped (default inner merge).
state_representatives_df = pd.merge(left=all_states_df, right=states_df, on='State')
state_representatives_df

Unnamed: 0,State,Total seats,Abbreviation
0,Alabama,7,AL
1,Alaska,1,AK
2,Arizona,9,AZ
3,Arkansas,4,AR
4,California,53,CA
5,Colorado,7,CO
6,Connecticut,5,CT
7,Delaware,1,DE
8,Florida,27,FL
9,Georgia,14,GA


In [5]:
def ensure_two_digits(num):
    """Return ``num`` as a string, left-padded with '0' to at least two characters."""
    as_text = str(num)
    return as_text.zfill(2)
    
def retrieve_2020_state_district_data(state: str, district: int, state_abbreviations=False, timeout=30):
    """Download the OpenSecrets 2020 race summary for one House district.

    Parameters
    ----------
    state : str
        Full state name (looked up in the module-level ``states_abr_dict``), or
        the state abbreviation itself when ``state_abbreviations`` is True.
    district : int
        District number; zero-padded to two characters for the request id.
    state_abbreviations : bool, default False
        Set to True when ``state`` is already an abbreviation such as ``'TN'``.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the caller forever.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, with ``'State_Abbreviation'`` and ``'District'`` (the
        zero-padded string) inserted as the first two columns.

    Raises
    ------
    KeyError
        If ``state`` is a full name that is not in ``states_abr_dict``.
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    base_url = 'https://www.opensecrets.org/races/summary.csv?cycle=2020&id='
    # Resolve the abbreviation once; it is needed for both the URL and the column.
    if state_abbreviations:
        state_code = state
    else:
        try:
            state_code = states_abr_dict[state]
        except KeyError:
            raise KeyError(
                f"Unknown state name {state!r}; pass a name present in "
                "states_abr_dict or use state_abbreviations=True"
            ) from None
    district_num = ensure_two_digits(district)
    state_district_url = base_url + state_code + district_num
    response = requests.get(state_district_url, timeout=timeout)
    # No-op for successful responses; raises requests.HTTPError on 4xx/5xx.
    response.raise_for_status()
    state_distr_df = pd.read_csv(StringIO(response.text), sep=',')
    state_distr_df.insert(0, 'State_Abbreviation', state_code)
    state_distr_df.insert(1, 'District', district_num)
    return state_distr_df

# Try the helper with a full state name: Connecticut, district 2.
retrieve_2020_state_district_data(state='Connecticut', district=2, state_abbreviations=False)


Unnamed: 0,State_Abbreviation,District,cid,FirstLastP,Rcpts,Spent,PACs,Indivs,Cand,Other,...,Result,CRPICO,State,IncCID,Incumbent,primarydate,DistIDCurr,capeye,sort,SmLgIndivsNote
0,CT,2,N00024842,Joe Courtney (D),964731.32,999359.87,587250.0,363656.46,105.1,13719.76,...,W,I,Connecticut,,,2020-08-11 00:00:00 +0000,CT02,0,1,N
1,CT,2,N00045015,Justin Anderson (R),80227.41,78767.12,0.0,27837.71,52089.7,300.0,...,L,C,Connecticut,,,2020-08-11 00:00:00 +0000,,0,2,N
2,CT,2,N00047360,Cassandra Martineau (3),0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Connecticut,,,2020-08-11 00:00:00 +0000,,0,2,N
3,CT,2,N00029657,Daniel Reale (L),0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Connecticut,,,2020-08-11 00:00:00 +0000,,0,2,N


In [None]:
import tqdm

def get_all_data(state_representatives_df):
    """Download the 2020 race summary for every district of every listed state.

    Parameters
    ----------
    state_representatives_df : pandas.DataFrame
        Must contain an ``'Abbreviation'`` column (state abbreviation) and a
        ``'Total seats'`` column (number of districts to request, 1..N).

    Returns
    -------
    pandas.DataFrame
        All per-district frames stacked in request order. Each frame keeps its
        own row index, so index labels repeat across districts. Empty when no
        district was requested.
    """
    state_district_dict = state_representatives_df.set_index('Abbreviation')['Total seats'].to_dict()
    data_list = []
    # One progress-bar tick per state; one HTTP request per district.
    for state, seat_count in tqdm.tqdm(state_district_dict.items()):
        for district in range(1, seat_count + 1):
            data_list.append(retrieve_2020_state_district_data(state, district, state_abbreviations=True))
    if not data_list:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # Concatenate once: growing a frame inside the loop re-copies every
    # previously collected row on each iteration.
    return pd.concat(data_list)

# Collect the race summaries for every state/district in the merged seat table.
all_representatives_df = get_all_data(state_representatives_df=state_representatives_df)
all_representatives_df

{'AL': 7, 'AK': 1, 'AZ': 9, 'AR': 4, 'CA': 53, 'CO': 7, 'CT': 5, 'DE': 1, 'FL': 27, 'GA': 14, 'HI': 2, 'ID': 2, 'IL': 18, 'IN': 9, 'IA': 4, 'KS': 4, 'KY': 6, 'LA': 6, 'ME': 2, 'MD': 8, 'MA': 9, 'MI': 14, 'MN': 8, 'MS': 4, 'MO': 8, 'MT': 1, 'NE': 3, 'NV': 4, 'NH': 2, 'NJ': 12, 'NM': 3, 'NY': 27, 'NC': 13, 'ND': 1, 'OH': 16, 'OK': 5, 'OR': 5, 'PA': 18, 'RI': 2, 'SC': 7, 'SD': 1, 'TN': 9, 'TX': 36, 'UT': 4, 'VT': 1, 'VA': 11, 'WA': 10, 'WV': 3, 'WI': 8, 'WY': 1}


100%|██████████| 50/50 [01:10<00:00,  1.40s/it]


Unnamed: 0,State_Abbreviation,District,cid,FirstLastP,Rcpts,Spent,PACs,Indivs,Cand,Other,...,Result,CRPICO,State,IncCID,Incumbent,primarydate,DistIDCurr,capeye,sort,SmLgIndivsNote
0,AL,01,N00044245,Jerry Carl (R),1971321.50,1859348.91,387000.00,1044195.95,434655.50,105470.05,...,W,O,Alabama,,,2020-03-03 00:00:00 +0000,,0,2,N
1,AL,01,N00044750,James Averhart (D),80094.95,78973.24,0.00,50849.95,29245.00,0.00,...,L,O,Alabama,,,2020-03-03 00:00:00 +0000,,0,2,N
0,AL,02,N00041295,Barry Moore (R),650806.75,669367.70,230281.65,408536.20,11500.00,488.90,...,W,O,Alabama,,,2020-03-03 00:00:00 +0000,,0,2,N
1,AL,02,N00045944,Phyllis Harvey-Hall (D),56049.68,55988.07,2032.00,42411.95,10575.41,1030.32,...,L,O,Alabama,,,2020-03-03 00:00:00 +0000,,0,2,N
2,AL,02,N00045631,John Page (L),0.00,0.00,0.00,0.00,0.00,0.00,...,,O,Alabama,,,2020-03-03 00:00:00 +0000,,0,2,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WY,01,N00035504,Liz Cheney (R),3003883.34,3060166.78,1292490.00,1169995.46,0.00,541397.88,...,W,I,Wyoming,,,2020-08-18 00:00:00 +0000,WY01,0,1,N
1,WY,01,N00047272,Lynnette Grey Bull (D),134597.32,132234.75,2800.00,130197.32,0.00,1600.00,...,L,C,Wyoming,,,2020-08-18 00:00:00 +0000,,0,2,N
2,WY,01,N00047207,Zoilo Adalia (3),0.00,0.00,0.00,0.00,0.00,0.00,...,,C,Wyoming,,,2020-08-18 00:00:00 +0000,,0,2,N
3,WY,01,N00035139,Richard Brubaker (L),0.00,0.00,0.00,0.00,0.00,0.00,...,,C,Wyoming,,,2020-08-18 00:00:00 +0000,,0,2,N


In [12]:
# Show only the rows whose state abbreviation is 'TN'.
all_representatives_df.loc[lambda frame: frame['State_Abbreviation'] == 'TN']

Unnamed: 0,State_Abbreviation,District,cid,FirstLastP,Rcpts,Spent,PACs,Indivs,Cand,Other,...,Result,CRPICO,State,IncCID,Incumbent,primarydate,DistIDCurr,capeye,sort,SmLgIndivsNote
0,TN,1,N00046688,Diana Harshbarger (R),2126945.6,1869099.77,222800.0,359728.5,1461293.0,83124.1,...,W,O,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
1,TN,1,N00046686,Blair Nicole Walsingham (D),140209.14,134994.55,1520.0,138689.14,0.0,0.0,...,L,O,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
2,TN,1,N00047760,Steve Holder (I),0.0,0.0,0.0,0.0,0.0,0.0,...,,O,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
0,TN,2,N00041594,Tim Burchett (R),1336275.75,878487.63,269535.0,1072845.61,0.0,-6104.86,...,W,I,Tennessee,,,2020-08-06 00:00:00 +0000,TN02,0,1,N
1,TN,2,N00041699,Renee Hoyos (D),812783.86,816793.15,3100.0,807459.01,0.0,2224.85,...,L,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
2,TN,2,N00047761,Matthew Campbell (I),0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
0,TN,3,N00030815,Chuck Fleischmann (R),1051653.39,381411.2,453858.46,603344.93,0.0,-5550.0,...,W,I,Tennessee,,,2020-08-06 00:00:00 +0000,TN03,0,1,N
1,TN,3,N00046911,Meg Gorman (D),85843.21,77759.83,2671.6,81271.61,2000.0,-100.0,...,L,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
2,TN,3,N00046589,Nancy Baxley (I),0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
3,TN,3,N00047762,Amber Hysell (I),0.0,0.0,0.0,0.0,0.0,0.0,...,,C,Tennessee,,,2020-08-06 00:00:00 +0000,,0,2,N
