In [5]:
import json
import re
import requests
import pandas as pd 
import urllib
from bs4 import BeautifulSoup

In [6]:
# Number of results requested per search page.
page_size = 50
# Congresses to scrape; every number listed here is added to the search filter.
congress_numbers = [116]

In [7]:
def get_congress_url(congress_numbers, page_size, page):
    """Build the congress.gov member-search URL for one page of results.

    Args:
        congress_numbers: Iterable of congress numbers (e.g. ``[115, 116]``);
            each one is sent as a string inside the ``congress`` filter.
        page_size: Number of results requested per page.
        page: Page number to request (callers in this notebook start at 1).

    Returns:
        The full search URL as a string.
    """
    # Imported explicitly: a bare `import urllib` does not guarantee that the
    # `parse` submodule has been loaded.
    from urllib.parse import urlencode

    # Compact separators reproduce the filter exactly as the site expects it,
    # e.g. {"congress":["115","116"]}, without hand-building the JSON text.
    congress_q_str = json.dumps(
        {"congress": [str(num) for num in congress_numbers]},
        separators=(",", ":"),
    )
    params = {
        "q": congress_q_str,
        "pageSize": page_size,
        "page": page,
        "searchResultViewType": "expanded",
        "KWICView": "true",
    }
    # Braces, brackets and colons are left unescaped; quotes and commas are
    # still percent-encoded.
    actual_params = urlencode(params, safe='{}:[]')
    return "https://www.congress.gov/members?{}".format(actual_params)

def get_num_pages(soup):
    """Read the number of result pages from a parsed search-results page.

    Walks down to the pagination summary element and returns the first
    whitespace-separated token of its text that consists only of digits.
    Raises IndexError when the summary contains no such token.
    """
    # Nested elements leading to the pagination summary, outermost first.
    selector_path = (
        ("div", "basic-search-tune-number"),
        ("div", "pagination"),
        ("span", "results-number"),
    )
    node = soup
    for tag_name, css_class in selector_path:
        node = node.find(tag_name, {"class": css_class})

    numeric_tokens = list(map(int, filter(str.isdigit, node.text.split())))
    return numeric_tokens[0]


In [8]:
def extract_member_name_url(entry):
    """Return ``[name, url]`` for one search-result entry.

    The name is the text of the entry's result heading. If it contains
    "Representative" (checked first) or "Senator", that word is removed and
    the remainder is stripped of surrounding whitespace; otherwise the text is
    returned untouched. The url is the ``href`` of the heading's link.
    """
    heading = entry.find("span", {"class": "result-heading"})
    name = heading.text

    # Only the first matching title is removed, mirroring an if/elif chain.
    for title in ("Representative", "Senator"):
        if title in name:
            name = name.replace(title, "").strip()
            break

    return [name, heading.a['href']]

def extract_terms_served(raw_terms):
    """Parse the "Served" list of a member into one dict per term.

    Each ``<li>`` in ``raw_terms`` is expected to read ``<chamber>: <ranges>``,
    where ``<ranges>`` is a comma-separated list of year ranges, e.g.::

        House: 1983-1995, 2003-2005
        Senate: 2005-2009
        House: 1999
        House: 1999-Present

    Args:
        raw_terms: Parsed element exposing ``find_all("li")``; each item must
            expose its text as ``.text``.

    Returns:
        A list of dicts with keys ``chamber``, ``term-start`` and ``term-end``.
        A single year is used as both start and end; an end of "Present"
        (any casing) becomes ``None``.

    Raises:
        ValueError: If an item does not contain exactly one ``:`` separator.
    """
    congressional_terms = []
    for raw_term in raw_terms.find_all("li"):
        parts = raw_term.text.split(":")
        if len(parts) != 2:
            # Same exception type as a failed unpacking, but with a message
            # that shows the offending text.
            raise ValueError(
                "Expected '<chamber>: <years>' but got {!r}".format(raw_term.text)
            )
        chamber = parts[0].strip()

        for year_pairs in parts[1].split(","):
            # A trailing or doubled comma leaves a blank fragment; skip it
            # rather than emitting a term with empty start/end years.
            if not year_pairs.strip():
                continue

            period = [year.strip() for year in year_pairs.split("-")]
            start_year = period[0]
            end_year = period[1] if len(period) > 1 else start_year

            congressional_terms.append({
                "chamber": chamber,
                "term-start": start_year,
                "term-end": None if end_year.lower() == "present" else end_year,
            })
    return congressional_terms

def extract_congress_members(congress_list):
    """Flatten search-result entries into one record per member per term.

    For every entry the name/url are taken from the heading and each
    "result-item" span contributes a lower-cased field (label without colons)
    to the member's profile, except the "served" item, which is parsed into
    terms. One record is emitted per term: the profile merged with the term.
    Senate records always get district "N/A"; House records without a
    district get "Missing". Entries with no "served" item emit no records.
    """
    records = []
    for entry in congress_list:
        name, url = extract_member_name_url(entry)
        profile = {"name": name, "url": url}

        terms = []
        for item in entry.find_all("span", {"class": "result-item"}):
            label = item.find("strong").text.replace(":", "").strip().lower()
            content = item.find("span")
            if label == 'served':
                terms = extract_terms_served(content)
            else:
                profile[label] = content.text.strip()

        for term in terms:
            # Profile keys first, then term keys, so column order is stable.
            record = {**profile, **term}
            chamber = record["chamber"]
            if chamber == "Senate":
                record["district"] = "N/A"
            elif chamber == "House" and "district" not in record:
                record["district"] = "Missing"
            records.append(record)
    return records


In [9]:
# Fetch the first results page only to learn how many pages the search spans.
congress_url = get_congress_url(congress_numbers, page_size, 1)
print("Congress URL: ", congress_url)

# The timeout (seconds) keeps the cell from hanging forever on a stalled
# connection; raise_for_status() surfaces HTTP errors instead of letting an
# error page be parsed as if it were search results.
first_page = requests.get(congress_url, timeout=30)
first_page.raise_for_status()
response = first_page.text
soup = BeautifulSoup(response, "html.parser")

num_pages = get_num_pages(soup)
print("Number of result pages: ", num_pages)



Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22116%22]}&pageSize=50&page=1&searchResultViewType=expanded&KWICView=true
Number of result pages:  11


In [16]:
# Walk every results page and accumulate one record per member per term served.
all_members = []
for page in range(1, num_pages+1):
    congress_url = get_congress_url(congress_numbers, page_size, page)
    print("Congress URL: ", congress_url)

    # The timeout (seconds) avoids hanging on a stalled connection, and the
    # status check stops the loop on an HTTP error rather than parsing it.
    page_response = requests.get(congress_url, timeout=30)
    page_response.raise_for_status()
    response = page_response.text
    soup = BeautifulSoup(response, "html.parser")

    # find() returns None when the element is absent; fail with a message that
    # names the page instead of an AttributeError on None.
    results_list = soup.find("ol", {"class": "basic-search-results-lists"})
    if results_list is None:
        raise RuntimeError(
            "No member results list found on page {} ({})".format(page, congress_url)
        )
    congress_list = results_list.find_all("li", {"class": "expanded"})

    print("Page Size: {}, Members found: {}".format(page_size, len(congress_list)))

    # A member with several terms yields several records, so this count can
    # exceed the number of members found on the page.
    members = extract_congress_members(congress_list)
    print("Extracted {} congressional entries from Page {}".format(len(members), page))
    all_members.extend(members)

Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=1&searchResultViewType=expanded&KWICView=true
Page Size: 250, Members found: 250
Extracted 282 congressional entries from Page 1
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=2&searchResultViewType=expanded&KWICView=true
Page Size: 250, Members found: 250
Extracted 281 congressional entries from Page 2
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=3&searchResultViewType=expanded&KWICView=true
Page Size: 250, Members found: 250
Extracted 275 congressional entries from Page 3
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pag

In [17]:
# Build the table from the records accumulated by the scraping loop
# (one record per member per term served).
congress = pd.DataFrame(all_members)
# Bare last expression so the notebook displays the frame.
congress

Unnamed: 0,name,url,state,district,party,chamber,term-start,term-end
0,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Hawaii,1,Democratic,House,1985,1987
1,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Hawaii,1,Democratic,House,1991,2011
2,"Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Louisiana,5,Republican,House,2015,
3,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...,New York,5,Democratic,House,1983,2013
4,"Adams, Alma S.",https://www.congress.gov/member/alma-adams/A00...,North Carolina,12,Democratic,House,2014,
...,...,...,...,...,...,...,...,...
1122,"Young, Don",https://www.congress.gov/member/don-young/Y000033,Alaska,Missing,Republican,House,1973,
1123,"Young, Todd",https://www.congress.gov/member/todd-young/Y00...,Indiana,,Republican,Senate,2017,
1124,"Young, Todd",https://www.congress.gov/member/todd-young/Y00...,Indiana,Missing,Republican,House,2011,2017
1125,"Zeldin, Lee M.",https://www.congress.gov/member/lee-zeldin/Z00...,New York,1,Republican,House,2015,


In [18]:
# House rows whose district was filled with the "Missing" placeholder during
# extraction; these need manual checking.
fix_me = congress.loc[congress['chamber'].eq('House') & congress['district'].eq('Missing')]
print("There are {} House members with no district that need checking/fixing".format(len(fix_me)))

There are 106 House members with no district that need checking/fixing


In [19]:
# Persist the scraped table to a CSV file in the current working directory.
# NOTE(review): index=False is not passed, so the row index is presumably
# written as an extra leading column — confirm that is wanted.
congress.to_csv("congress_list.csv")