In [1]:
import json
import re
import urllib
import urllib.parse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Scrape configuration: the congresses to query (111 through 116 inclusive)
# and the value sent as the search's pageSize parameter.
congress_numbers = list(range(111, 117))
page_size = 250

In [3]:
def get_congress_url(congress_numbers, page_size, page):
    """Build the congress.gov member-search URL for one page of results.

    Args:
        congress_numbers: Iterable of congress numbers (e.g. ``[111, 112]``).
            Each one is sent to the site as a quoted string inside the
            JSON-style ``q`` filter.
        page_size: Value for the ``pageSize`` query parameter.
        page: Value for the ``page`` query parameter.

    Returns:
        The complete search URL as a string.
    """
    # Let json.dumps build the filter so values are always quoted and escaped
    # correctly. Compact separators keep the text identical to the
    # hand-formatted form: {"congress":["111","112"]}.
    congress_q_str = json.dumps(
        {"congress": [str(num) for num in congress_numbers]},
        separators=(",", ":"),
        ensure_ascii=False,
    )
    params = {
        "q": congress_q_str,
        "pageSize": page_size,
        "page": page,
        "searchResultViewType": "expanded",
        "KWICView": "true",
    }
    # The JSON punctuation is marked safe so braces, brackets and colons stay
    # literal in the query string; quotes and commas are still percent-encoded.
    actual_params = urllib.parse.urlencode(params, safe='{}:[]')
    return "https://www.congress.gov/members?{}".format(actual_params)

def get_num_pages(soup):
    """Return the number of result pages reported by a member-search page.

    Looks up the ``span.results-number`` nested inside ``div.pagination``
    inside ``div.basic-search-tune-number`` and returns the first
    whitespace-separated token of its text that consists only of digits.

    Args:
        soup: Parsed search-results page (any object offering
            ``find(tag, attrs)`` the way BeautifulSoup does).

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: If any of the expected elements is missing or the
            element's text contains no all-digit token.
    """
    element = soup
    # Walk the nesting one level at a time so a layout change is reported with
    # the element that went missing, not an AttributeError on None.
    for tag, css_class in (
        ("div", "basic-search-tune-number"),
        ("div", "pagination"),
        ("span", "results-number"),
    ):
        element = element.find(tag, {"class": css_class})
        if element is None:
            raise ValueError(
                "Could not find <{} class=\"{}\"> in the search results page".format(tag, css_class)
            )

    digit_tokens = [token for token in element.text.split() if token.isdigit()]
    if not digit_tokens:
        raise ValueError(
            "No page count found in pagination text: {!r}".format(element.text)
        )
    return int(digit_tokens[0])


In [4]:
def extract_member_name_url(entry):
    """Pull a member's name and profile link out of one search-result entry.

    The name is the text of the entry's ``span.result-heading``. If that text
    contains "Representative", the word is removed and the result is
    whitespace-stripped. Otherwise the same is done for "Senator". Text
    containing neither title is returned untouched.

    Args:
        entry: One search-result element exposing ``find(tag, attrs)``.

    Returns:
        A two-item list ``[member_name, url]`` where ``url`` is the ``href``
        of the heading's anchor.
    """
    heading = entry.find("span", {"class": "result-heading"})
    name = heading.text

    # Only the first title that matches is handled, in this order.
    for title in ("Representative", "Senator"):
        if title in name:
            name = name.replace(title, "").strip()
            break

    link = heading.a['href']
    return [name, link]

def extract_congress_members(congress_list):
    """Convert search-result entries into ``{"name": ..., "url": ...}`` records.

    Args:
        congress_list: Iterable of result entries accepted by
            ``extract_member_name_url``.

    Returns:
        A list with one dict per entry, in the same order as the input.
    """
    def to_record(entry):
        # Exactly two values are expected back: the name, then the URL.
        name, url = extract_member_name_url(entry)
        return {"name": name, "url": url}

    return [to_record(entry) for entry in congress_list]

In [5]:
# Fetch the first page of search results to find out how many pages exist.
congress_url = get_congress_url(congress_numbers, page_size, 1)
print("Congress URL: ", congress_url)

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# parsing failure further down.
first_page = requests.get(congress_url, timeout=30)
first_page.raise_for_status()
response = first_page.text
soup = BeautifulSoup(response, "html.parser")

num_pages = get_num_pages(soup)
print("Number of result pages: ", num_pages)

Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=1&searchResultViewType=expanded&KWICView=true
Number of result pages:  5


In [6]:
# Now that we know how many pages we'll scrape, we can start scraping
congress_url = get_congress_url(congress_numbers, page_size, 1)
print("Congress URL: ", congress_url)

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns HTTP errors into an immediate, explicit failure.
page_response = requests.get(congress_url, timeout=30)
page_response.raise_for_status()
response = page_response.text
soup = BeautifulSoup(response, "html.parser")

# We need the number of pages we will need to scrape
num_pages = get_num_pages(soup)
print("Number of result pages: ", num_pages)

all_members = []
for page in range(1, num_pages + 1):
    # Page 1 was already downloaded above to read the page count, so its soup
    # is reused instead of requesting the same URL a second time.
    if page > 1:
        congress_url = get_congress_url(congress_numbers, page_size, page)
        print("Congress URL: ", congress_url)

        page_response = requests.get(congress_url, timeout=30)
        page_response.raise_for_status()
        response = page_response.text
        soup = BeautifulSoup(response, "html.parser")

    results_list = soup.find("ol", {"class": "basic-search-results-lists"})
    if results_list is None:
        # Fail with the page and URL instead of an AttributeError on None.
        raise RuntimeError(
            "No results list found on page {} ({})".format(page, congress_url)
        )
    congress_list = results_list.find_all("li", {"class": "expanded"})

    print("Page Size: {}, Members found: {}".format(page_size, len(congress_list)))

    members = extract_congress_members(congress_list)
    print("Extracted {} congressional entries from Page {}".format(len(members), page))
    all_members.extend(members)

Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=1&searchResultViewType=expanded&KWICView=true
Page Size: 250, Members found: 250
Extracted 250 congressional entries from Page 1
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=2&searchResultViewType=expanded&KWICView=true
Page Size: 250, Members found: 250
Extracted 250 congressional entries from Page 2
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=3&searchResultViewType=expanded&KWICView=true
Page Size: 250, Members found: 250
Extracted 250 congressional entries from Page 3
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pag

In [15]:
# Turn the scraped name/URL records into a table, drop repeated rows and
# renumber the remaining rows from zero, then preview the first few.
congress_members_urls = pd.DataFrame(all_members).drop_duplicates(ignore_index=True)
congress_members_urls.head()

Unnamed: 0,name,url
0,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...
1,"Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...
2,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...
3,"Adams, Alma S.",https://www.congress.gov/member/alma-adams/A00...
4,"Adams, Sandy",https://www.congress.gov/member/sandy-adams/A0...


In [23]:
def _parse_term_cells(cells):
    """Turn the non-empty cell texts of one term row into
    ``[state, district, chamber, start, end]``.

    Raises ``ValueError`` when the row does not have the expected shape.
    """
    state = cells[0]
    if len(cells) == 2:
        # No district cell (the original code labelled this case "senate").
        district, service = "N/A", cells[1]
    elif len(cells) == 3:
        district, service = cells[1], cells[2]
    else:
        # Never fall through with values left over from a previous row.
        raise ValueError("Unexpected term row with {} cells: {!r}".format(len(cells), cells))

    if district == "--":
        district = 0

    # The service text is expected to look like "<chamber>: ... (<years>)".
    info = service.split(":")
    if len(info) < 2:
        raise ValueError("Term description has no 'chamber:' prefix: {!r}".format(service))
    chamber = info[0]

    years_match = re.search(r"\(([^)]*)\)", info[1])
    if years_match is None:
        raise ValueError("Term description has no '(years)' part: {!r}".format(service))
    years = years_match.group(1).split("-")

    if len(years) == 1:
        start = end = years[0]
    elif len(years) == 2:
        start, end = years
    else:
        raise ValueError("Cannot interpret term years: {!r}".format(years_match.group(1)))

    if end == 'Present':
        # An open-ended term is encoded as infinity.
        end = np.inf
    return [state, district, chamber, start, end]


def parse_profile(soup):
    """Extract every term of service from a parsed member profile page.

    The party is the stripped text of the first ``td`` in the last row of the
    ``table.standard01.nomargin`` that has one. Each row of
    ``table.standard01.lateral01`` with at least one non-empty ``td``
    contributes one term.

    Args:
        soup: Parsed profile page (any object offering ``find``/``find_all``
            the way BeautifulSoup does).

    Returns:
        A list of ``[party, state, district, chamber, start, end]`` lists.
        ``district`` is ``"N/A"`` for two-cell rows and ``0`` when the page
        shows ``"--"``. ``start``/``end`` are the year strings from the
        parentheses, except that an ``end`` of ``"Present"`` becomes
        ``numpy.inf``.

    Raises:
        ValueError: If the profile section, either table, the party cell, or
            the expected structure of a term row is missing.
    """
    profile = soup.find("div", {"class": "overview-member-column-profile member_profile"})
    if profile is None:
        raise ValueError("Member profile section not found in page")

    term_table = profile.find("table", {"class": "standard01 lateral01"})
    party_table = profile.find("table", {"class": "standard01 nomargin"})
    if term_table is None or party_table is None:
        raise ValueError("Member profile is missing the terms table or the party table")

    # The party is taken from the last row of the party table that has a cell.
    party_cells = [row.find('td') for row in party_table.find_all("tr")]
    party_values = [cell.text.strip() for cell in party_cells if cell is not None]
    if not party_values:
        raise ValueError("Party table contains no data cells")
    party = party_values[-1]

    results = []
    for row in term_table.find_all("tr"):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        cells = [text for text in cells if text]
        if cells:  # rows without any non-empty <td> are skipped
            results.append([party] + _parse_term_cells(cells))
    return results


In [20]:
# Now we will fetch the profile of each representative and senator and parse it for the information we need
# Takes a bit of time (one HTTP request per member)
columns = ['name', 'url', 'party', 'state', 'district', 'chamber', 'start', 'end']

# Rows are collected in a plain list and the DataFrame is built once at the
# end: DataFrame.append no longer exists in pandas 2.x, and growing a frame
# one row at a time copies the whole frame on every iteration.
member_rows = []
for name, url in zip(congress_members_urls['name'], congress_members_urls['url']):
    print(name)
    # A timeout keeps one stalled request from blocking the whole scrape, and
    # raise_for_status() stops on HTTP errors instead of parsing an error page.
    profile_response = requests.get(url, timeout=30)
    profile_response.raise_for_status()
    response = profile_response.text
    soup = BeautifulSoup(response, "html.parser")
    terms = parse_profile(soup)
    for term in terms:
        # One output row per term: the member's name and URL, then the term fields.
        member_rows.append([name, url] + list(term))

congress_member_info = pd.DataFrame(member_rows, columns=columns)


Abercrombie, Neil
Abraham, Ralph Lee
Ackerman, Gary L.
Adams, Alma S.
Adams, Sandy
Aderholt, Robert B.
Adler, John H.
Aguilar, Pete
Akaka, Daniel K.
Akin, W. Todd
Alexander, Lamar
Alexander, Rodney
Allen, Rick W.
Allred, Colin Z.
Altmire, Jason
Amash, Justin
Amodei, Mark E.
Andrews, Robert E.
Arcuri, Michael A.
Armstrong, Kelly
Arrington, Jodey C.
Ashford, Brad
Austria, Steve
Axne, Cynthia
Ayotte, Kelly
Babin, Brian
Baca, Joe
Bachmann, Michele
Bachus, Spencer
Bacon, Don
Baird, Brian
Baird, James R.
Balderson, Troy
Baldwin, Tammy
Banks, Jim
Barber, Ron
Barletta, Lou
Barr, Andy
Barragan, Nanette Diaz
Barrasso, John
Barrett, J. Gresham
Barrow, John
Bartlett, Roscoe G.
Barton, Joe
Bass, Charles F.
Bass, Karen
Baucus, Max
Bayh, Evan
Bean, Melissa L.
Beatty, Joyce
Becerra, Xavier
Begich, Mark
Benishek, Dan
Bennet, Michael F.
Bennett, Robert F.
Bentivolio, Kerry L.
Bera, Ami
Berg, Rick
Bergman, Jack
Berkley, Shelley
Berman, Howard L.
Berry, Marion
Beyer, Donald S., Jr.
Biden, Joseph R., Jr.
B

Kind, Ron
King, Angus S., Jr.
King, Peter T.
King, Steve
Kingston, Jack
Kinzinger, Adam
Kirk, Mark Steven
Kirk, Paul Grattan, Jr.
Kirkpatrick, Ann
Kissell, Larry
Klein, Ron
Kline, John
Klobuchar, Amy
Knight, Stephen
Kohl, Herb
Kosmas, Suzanne M.
Kratovil, Frank,  Jr.
Krishnamoorthi, Raja
Kucinich, Dennis J.
Kuster, Ann M.
Kustoff, David
Kyl, Jon
Labrador, Raul R.
LaHood, Darin
LaMalfa, Doug
Lamb, Conor
Lamborn, Doug
Lance, Leonard
Landrieu, Mary L.
Landry, Jeffrey M.
Langevin, James R.
Lankford, James
Larsen, Rick
Larson, John B.
Latham, Tom
LaTourette, Steven C.
Latta, Robert E.
Lautenberg, Frank R.
Lawrence, Brenda L.
Lawson, Al, Jr.
Leahy, Patrick J.
Lee, Barbara
Lee, Christopher J.
Lee, Mike
Lee, Susie
LeMieux, George S.
Lesko, Debbie
Levin, Andy
Levin, Carl
Levin, Mike
Levin, Sander M.
Lewis, Jason
Lewis, Jerry
Lewis, John
Lieberman, Joseph I.
Lieu, Ted
Lincoln, Blanche L.
Linder, John
Lipinski, Daniel
LoBiondo, Frank A.
Loebsack, David
Loeffler, Kelly
Lofgren, Zoe
Long, Billy
Lou

In [21]:
# Quick look at the data: leaving the frame as the cell's last expression
# makes the notebook display it as the cell output.
congress_member_info

Unnamed: 0,name,url,party,state,district,chamber,start,end
0,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Democratic,Hawaii,1,House,1991,2011
1,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Democratic,Hawaii,1,House,1985,1987
2,"Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Republican,Louisiana,5,House,2015,inf
3,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...,Democratic,New York,5,House,1993,2013
4,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...,Democratic,New York,7,House,1983,1993
...,...,...,...,...,...,...,...,...
1333,"Young, Don",https://www.congress.gov/member/don-young/Y000033,Republican,Alaska,At Large,House,1973,inf
1334,"Young, Todd",https://www.congress.gov/member/todd-young/Y00...,Republican,Indiana,0,Senate,2017,inf
1335,"Young, Todd",https://www.congress.gov/member/todd-young/Y00...,Republican,Indiana,9,House,2011,2017
1336,"Zeldin, Lee M.",https://www.congress.gov/member/lee-zeldin/Z00...,Republican,New York,1,House,2015,inf


In [22]:
# Finally save what we scraped as a CSV file. The path is relative, so the
# file is written to the current working directory.
congress_member_info.to_csv("congress_111_116.csv")