## Crawling Congressional Bill Metadata

In [1]:
import json
import time
import urllib
import urllib.parse
import urllib.request

import pandas as pd

Congress.gov only serves CSV downloads for URLs that carry a user-specific string. To get one, open a search page on Congress.gov, click "Download", and paste the resulting URL into `template_url` below.

In [2]:
# Template URL, copied from the "Download" link of a Congress.gov search page.
# Only its `q` argument is rewritten per request; every other query argument
# (page size and the trailing hex key/value pair) is carried over unchanged.
template_url = "https://www.congress.gov/search?pageSize=25&q=%7B%22source%22%3A%22legislation%22%2C%22bill-status%22%3A%22law%22%2C%22type%22%3A%22bills%22%2C%22congress%22%3A%22110%22%7D&1ddcb92ade31c8fbd370001f9b29a7d9=628cb5675ff524f3e719b7aa2e88fe3f"
template_url = urllib.parse.urlparse(template_url)


def construct_url(params, template_url=template_url):
    """Build a search URL from the template with ``params`` as the query.

    Parameters
    ----------
    params : dict
        Search filters; serialised with ``json.dumps`` and stored in the
        ``q`` query argument, replacing whatever the template had there.
    template_url : urllib.parse.ParseResult or str, optional
        URL whose scheme, host, path and remaining query arguments are
        reused. A plain string is parsed first. Defaults to the module-level
        template.

    Returns
    -------
    str
        The re-encoded URL.
    """
    if isinstance(template_url, str):
        template_url = urllib.parse.urlparse(template_url)
    # Change args and reencode
    args = dict(urllib.parse.parse_qsl(template_url.query))
    args['q'] = json.dumps(params)
    # Swap the query component by name rather than by positional index.
    rebuilt = template_url._replace(query=urllib.parse.urlencode(args))
    return urllib.parse.urlunparse(rebuilt)

def url_to_df(url, add_cols=None):
    """Download a CSV export and load it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV export (e.g. one built by ``construct_url``).
    add_cols : iterable of (name, value) pairs or dict, optional
        Constant columns to append: every row gets ``value`` under ``name``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the first three lines of the download skipped
        and any ``add_cols`` columns appended.
    """
    # Need header else rejected
    headers = {"User-Agent": 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
    request = urllib.request.Request(url, None, headers)
    # Close the response once parsing is done instead of leaking the connection.
    with urllib.request.urlopen(request) as response:
        # Skip info in csv before the data
        df = pd.read_csv(response, skiprows=3)
    if add_cols:
        pairs = add_cols.items() if isinstance(add_cols, dict) else add_cols
        for colname, val in pairs:
            # Repeat explicitly so a list-valued `val` lands in every row as-is.
            df[colname] = [val] * len(df)
    return df
  
# Values available for each search filter. Nested lists group several values
# into a single entry of the option list.
opts = {
    'type': [
        ['bills'],
        ['resolutions', 'joint-resolutions', 'concurrent-resolutions'],
    ],
    'chamber': ["House", "Senate"],
    'party': [
        ['Democratic'],
        ['Republican', 'Independent', 'Independent Democrat'],
    ],
    # One entry per subject filter; the crawl loop requests each in turn.
    'subject': [
        'Taxation',
        'Government Operations and Politics',
        'Health',
        'Congress',
        'Armed Forces and National Security',
        'Foreign Trade and International Finance',
        'International Affairs',
        'Public Lands and Natural Resources',
        'Crime and Law Enforcement',
        'Transportation and Public Works',
        'Social Welfare',
        'Education',
        'Energy',
        'Agriculture and Food',
        'Economics and Public Finance',
        'Labor and Employment',
        'Environmental Protection',
        'Finance and Financial Sector',
        'Commerce',
        'Science, Technology, Communications',
        'Immigration',
        'Law',
        'Housing and Community Development',
        'Water Resources Development',
        'Native Americans',
        'Civil Rights and Liberties, Minority Issues',
        'Emergency Management',
        'Families',
        'Animals',
        'Arts, Culture, Religion',
        'Sports and Recreation',
        'Social Sciences and History',
    ],
}

Crawl order:
  - Fixed filters: type: bills, source: legislation, bill-status: passed-both
  - One request per subject
     - For the three subjects that return more than 1000 rows, split the request further by party

In [3]:
base_params = {'bill-status': 'passed-both', 'source': 'legislation', 'type': 'bills'}
# Other potential statuses: 'law' denotes just the bills that became law, 'introduced'
# gets everything. Currently using a medium-wide net with 'passed-both':
# bills that passed both chambers

# A response with this many rows or more is treated as incomplete and the query
# is split by party instead (presumably the export is capped here; TODO confirm).
ROW_LIMIT = 1000
# Pause after every request, in seconds.
WAIT_SECONDS = 10


def wait_between_requests(seconds=WAIT_SECONDS):
    """Sleep in one-second steps, which makes the crawl easier to interrupt."""
    for _ in range(seconds):
        time.sleep(1)


all_dfs = []
for subject in opts['subject']:
    print('Subject:', subject)

    params = base_params.copy()
    params['subject'] = subject
    url = construct_url(params, template_url)
    df = url_to_df(url, add_cols=[('Subject', subject)])
    if len(df) < ROW_LIMIT:
        all_dfs.append(df)
        wait_between_requests()
    else:
        for party in opts['party']:
            # The over-limit request still counts: wait before sending the
            # next one instead of firing it immediately.
            wait_between_requests()
            print("\tParty", party)
            params['party'] = party
            url = construct_url(params, template_url)
            df = url_to_df(url, add_cols=[('Subject', subject)])
            if len(df) < ROW_LIMIT:
                all_dfs.append(df)
            else:
                # Nothing is stored for this slice; it needs a finer split.
                print("Still too long:", subject, party)
        wait_between_requests()

alldf = pd.concat(all_dfs)
alldf.sample(3)

Subject: Taxation
Subject: Government Operations and Politics
	Party ['Democratic']


KeyboardInterrupt: 

In [7]:
# Stack the frames collected so far into one table (display only, nothing is assigned).
pd.concat(objs=all_dfs)

Unnamed: 0,Legislation Number,URL,Congress,Amends Bill,Title,Sponsor,Date of Introduction,Date Offered,Date Submitted,Date Proposed,Number of Cosponsors,Committees,Latest Action Date,Latest Action,Subject
0,H.R. 3823,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,Disaster Tax Relief and Airport and Airway Ext...,"Rep. Brady, Kevin [R-TX-8]",09/25/2017,,,,2,"House - Ways and Means, Transportation and Inf...",09/29/2017,Became Public Law No: 115-63.,Taxation
1,H.R. 1,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,An Act to provide for reconciliation pursuant ...,"Rep. Brady, Kevin [R-TX-8]",11/02/2017,,,,24,"House - Ways and Means, Ways and Means",12/22/2017,Became Public Law No: 115-97.,Taxation
2,H.R. 5946,https://www.congress.gov/bill/114th-congress/h...,114th Congress (2015-2016),,United States Appreciation for Olympians and P...,"Rep. Dold, Robert J. [R-IL-10]",09/07/2016,,,,12,"House - Ways and Means, Ways and Means",10/07/2016,Became Public Law No: 114-239.,Taxation
3,H.R. 3209,https://www.congress.gov/bill/114th-congress/h...,114th Congress (2015-2016),,Recovering Missing Children Act,"Rep. Paulsen, Erik [R-MN-3]",07/23/2015,,,,30,"House - Ways and Means, Ways and Means | Senat...",06/30/2016,Became Public Law No: 114-184.,Taxation
4,H.R. 1527,https://www.congress.gov/bill/114th-congress/h...,114th Congress (2015-2016),,Slain Officer Family Support Act of 2015,"Rep. Jeffries, Hakeem S. [D-NY-8]",03/23/2015,,,,7,House - Ways and Means,04/01/2015,Became Public Law No: 114-7.,Taxation
5,H.R. 606,https://www.congress.gov/bill/114th-congress/h...,114th Congress (2015-2016),,Don't Tax Our Fallen Public Safety Heroes Act,"Rep. Paulsen, Erik [R-MN-3]",01/28/2015,,,,25,House - Ways and Means,05/22/2015,Became Public Law No: 114-14.,Taxation
6,H.R. 5771,https://www.congress.gov/bill/113th-congress/h...,113th Congress (2013-2014),,To amend the Internal Revenue Code of 1986 to ...,"Rep. Camp, Dave [R-MI-4]",12/01/2014,,,,0,"House - Ways and Means, Education and the Work...",12/19/2014,Became Public Law No: 113-295.,Taxation
7,H.R. 3771,https://www.congress.gov/bill/113th-congress/h...,113th Congress (2013-2014),,Philippines Charitable Giving Assistance Act,"Rep. Swalwell, Eric [D-CA-15]",12/12/2013,,,,35,"House - Ways and Means, Budget",03/25/2014,Became Public Law No: 113-92.,Taxation
8,H.R. 3458,https://www.congress.gov/bill/113th-congress/h...,113th Congress (2013-2014),,Fallen Firefighters Assistance Tax Clarificati...,"Rep. Slaughter, Louise McIntosh [D-NY-25]",11/12/2013,,,,3,House - Ways and Means,12/20/2013,Became Public Law No: 113-63.,Taxation
9,H.R. 3043,https://www.congress.gov/bill/113th-congress/h...,113th Congress (2013-2014),,Tribal General Welfare Exclusion Act of 2014,"Rep. Nunes, Devin [R-CA-22]",08/02/2013,,,,61,House - Ways and Means,09/26/2014,Became Public Law No: 113-168.,Taxation


**Note**: Some bills appear to have no subject area. At the time of writing, congress.gov lists 11,078 Legislation/Bill/Passed Both results, but we only scraped 9,011. The counts per category (e.g. "Government Operations and Politics": 1408) all match the site, which suggests that the missing 2,067 results (roughly 2,000) have no subject assigned.

In [None]:
# Number of rows in the combined frame.
alldf.shape[0]

In [None]:

# Rows per subject, most frequent first.
alldf['Subject'].value_counts()

Government Operations and Politics             1408
Public Lands and Natural Resources             1305
Armed Forces and National Security              678
Economics and Public Finance                    490
Transportation and Public Works                 469
Native Americans                                382
Health                                          378
International Affairs                           367
Crime and Law Enforcement                       319
Agriculture and Food                            266
Commerce                                        266
Finance and Financial Sector                    247
Energy                                          234
Water Resources Development                     206
Foreign Trade and International Finance         191
Environmental Protection                        190
Education                                       181
Law                                             180
Science, Technology, Communications             179
Taxation    

In [8]:
# Save the combined crawl result. `df` is only the last frame fetched by the
# crawl loop, so writing it would drop every other subject.
alldf.to_csv('./bill-meta.csv')

In [9]:
# NOTE(review): `df` is the last frame assigned inside the crawl loop, not the
# combined `alldf` — confirm this is the table meant to be inspected here.
df

Unnamed: 0,Legislation Number,URL,Congress,Amends Bill,Title,Sponsor,Date of Introduction,Date Offered,Date Submitted,Date Proposed,Number of Cosponsors,Committees,Latest Action Date,Latest Action,Subject
0,H.R. 4840,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,To designate the facility of the United States...,"Rep. Murphy, Stephanie N. [D-FL-7]",01/18/2018,,,,26,House - Oversight and Government Reform | Sena...,07/24/2018,Became Public Law No: 115-217.,Government Operations and Politics
1,H.R. 4685,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,To designate the facility of the United States...,"Rep. Cicilline, David N. [D-RI-1]",12/19/2017,,,,1,House - Oversight and Government Reform | Sena...,07/24/2018,Became Public Law No: 115-215.,Government Operations and Politics
2,H.R. 4574,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,To designate the facility of the United States...,"Rep. Krishnamoorthi, Raja [D-IL-8]",12/06/2017,,,,17,House - Oversight and Government Reform | Sena...,07/24/2018,Became Public Law No: 115-213.,Government Operations and Politics
3,H.R. 4463,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,To designate the facility of the United States...,"Rep. Velazquez, Nydia M. [D-NY-7]",11/28/2017,,,,29,House - Oversight and Government Reform | Sena...,07/24/2018,Became Public Law No: 115-212.,Government Operations and Politics
4,H.R. 4406,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,To designate the facility of the United States...,"Rep. Espaillat, Adriano [D-NY-13]",11/15/2017,,,,26,House - Oversight and Government Reform | Sena...,07/24/2018,Became Public Law No: 115-211.,Government Operations and Politics
5,H.R. 4042,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,To designate the facility of the United States...,"Rep. Soto, Darren [D-FL-9]",10/12/2017,,,,26,House - Oversight and Government Reform | Sena...,03/23/2018,Became Public Law No: 115-154.,Government Operations and Politics
6,H.R. 3638,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,To designate the facility of the United States...,"Rep. Lawson, Al, Jr. [D-FL-5]",08/01/2017,,,,26,House - Oversight and Government Reform | Sena...,03/23/2018,Became Public Law No: 115-150.,Government Operations and Politics
7,H.R. 3243,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,FITARA Enhancement Act of 2017,"Rep. Connolly, Gerald E. [D-VA-11]",07/14/2017,,,,3,"House - Oversight and Government Reform, Overs...",11/21/2017,Became Public Law No: 115-88.,Government Operations and Politics
8,H.R. 3031,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,TSP Modernization Act of 2017,"Rep. Cummings, Elijah E. [D-MD-7]",06/23/2017,,,,8,"House - Oversight and Government Reform, Overs...",11/17/2017,Became Public Law No: 115-84.,Government Operations and Politics
9,H.R. 2873,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,To designate the facility of the United States...,"Rep. Boyle, Brendan F. [D-PA-13]",06/12/2017,,,,17,House - Oversight and Government Reform | Sena...,03/23/2018,Became Public Law No: 115-147.,Government Operations and Politics
