## Downloading Metadata for Congressional Bills

In [1]:
import pandas as pd
import time
import urllib
import json

You need a user string to download CSVs. Go to a search page on Congress.gov, click "Download", and copy the
resulting URL into `template_url` below.

In [2]:
# Template URL, parsed once so that only its query string needs to be swapped out later.
# NOTE(review): the trailing hex key/value pair looks like the per-user download string
# mentioned in the text above -- confirm before sharing this notebook.
template_url = urllib.parse.urlparse(
    "https://www.congress.gov/search?pageSize=25&q=%7B%22source%22%3A%22legislation%22%2C%22bill-status%22%3A%22law%22%2C%22type%22%3A%22bills%22%2C%22congress%22%3A%22110%22%7D&1ddcb92ade31c8fbd370001f9b29a7d9=628cb5675ff524f3e719b7aa2e88fe3f"
)


def construct_url(params, template_url=template_url):
    """Build a download URL by replacing the template's ``q`` argument.

    Parameters
    ----------
    params : JSON-serialisable object
        Search parameters; they are JSON-encoded and stored under ``q``.
    template_url : urllib.parse.ParseResult
        Parsed URL whose other query arguments are carried over unchanged.

    Returns
    -------
    str
        The re-assembled URL.
    """
    query_args = {key: value for key, value in urllib.parse.parse_qsl(template_url.query)}
    query_args['q'] = json.dumps(params)
    encoded_query = urllib.parse.urlencode(list(query_args.items()))

    # Unpack the six URL components and substitute only the query string.
    scheme, netloc, path, path_params, _old_query, fragment = template_url
    return urllib.parse.urlunparse(
        (scheme, netloc, path, path_params, encoded_query, fragment)
    )

def url_to_df(url, add_cols = None):
    """Download a CSV export and load it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV download.
    add_cols : iterable of (column name, value) pairs, or dict, optional
        Constant-valued columns to add to the result; each value is
        repeated on every row.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV (its first three lines are skipped), plus any
        columns requested through ``add_cols``.
    """
    # A bare `import urllib` does not guarantee the `request` submodule is
    # loaded, so import it explicitly instead of relying on side effects.
    import urllib.request

    # Need header else rejected
    headers = {"User-Agent": 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
    request = urllib.request.Request(url, None, headers)

    # The context manager closes the connection even if parsing raises.
    with urllib.request.urlopen(request) as response:
        # Skip info in csv before the data
        df = pd.read_csv(response, skiprows=3)

    if add_cols:
        pairs = add_cols.items() if isinstance(add_cols, dict) else add_cols
        for colname, val in pairs:
            # Build an explicit list so that a list-valued `val` is stored
            # per row rather than being interpreted as the column itself.
            df[colname] = [val] * len(df)
    return df
  
# Values available for each search facet. Nested lists group several values
# into a single query (they are passed to the search as one list).
opts = {
    'type': [['bills'], ['resolutions', 'joint-resolutions', 'concurrent-resolutions']],
    'chamber': ["House", "Senate"],
    'party': [['Democratic'], ['Republican', 'Independent', 'Independent Democrat']],
    'subject': [
        'Taxation',
        'Government Operations and Politics',
        'Health',
        'Congress',
        'Armed Forces and National Security',
        'Foreign Trade and International Finance',
        'International Affairs',
        'Public Lands and Natural Resources',
        'Crime and Law Enforcement',
        'Transportation and Public Works',
        'Social Welfare',
        'Education',
        'Energy',
        'Agriculture and Food',
        'Economics and Public Finance',
        'Labor and Employment',
        'Environmental Protection',
        'Finance and Financial Sector',
        'Commerce',
        'Science, Technology, Communications',
        'Immigration',
        'Law',
        'Housing and Community Development',
        'Water Resources Development',
        'Native Americans',
        'Civil Rights and Liberties, Minority Issues',
        'Emergency Management',
        'Families',
        'Animals',
        'Arts, Culture, Religion',
        'Sports and Recreation',
        'Social Sciences and History',
    ],
}

Order:
  - type:bills, source:legislation, bill-status: passed-both
  - subject
     - (for three cases where >1000 rows) party

In [3]:
base_params = {'bill-status': 'passed-both', 'source': 'legislation', 'type': 'bills'}
# Other potential statuses: 'law' denotes just the bills that became law, 'introduced'
# gets everything. Currently using a medium-wide net with 'passed-both':
# bills that passed both chambers

# A download with this many rows (or more) is treated as truncated and is
# re-queried in smaller slices.
MAX_ROWS = 1000
# Seconds to wait after every request.
PAUSE_SECONDS = 10


def _polite_pause(seconds=PAUSE_SECONDS):
    """Wait `seconds` seconds in one-second steps (easier to interrupt this way)."""
    for _ in range(seconds):
        time.sleep(1)


def _fetch(params, subject):
    """Download one result set tagged with its subject, then pause.

    The pause happens after *every* request -- including oversized ones that
    trigger per-party follow-up requests -- so requests are never sent
    back-to-back.
    """
    url = construct_url(params, template_url)
    df = url_to_df(url, add_cols=[('Subject', subject)])
    _polite_pause()
    return df


all_dfs = []
for subject in opts['subject']:
    print('Subject:', subject)

    params = base_params.copy()
    params['subject'] = subject
    df = _fetch(params, subject)
    if len(df) < MAX_ROWS:
        all_dfs.append(df)
        continue

    # Too many rows for one download: split the subject by sponsor party.
    for party in opts['party']:
        print("\tParty", party)
        params['party'] = party
        df = _fetch(params, subject)
        if len(df) < MAX_ROWS:
            all_dfs.append(df)
        else:
            # Nothing is stored for this slice; it would need a finer split.
            print("Still too long:", subject, party)

alldf = pd.concat(all_dfs)
alldf.sample(3)

Subject: Taxation
Subject: Government Operations and Politics
	Party ['Democratic']
	Party ['Republican', 'Independent', 'Independent Democrat']
Subject: Health
Subject: Congress
Subject: Armed Forces and National Security
Subject: Foreign Trade and International Finance
Subject: International Affairs
Subject: Public Lands and Natural Resources
	Party ['Democratic']
	Party ['Republican', 'Independent', 'Independent Democrat']
Subject: Crime and Law Enforcement
Subject: Transportation and Public Works
Subject: Social Welfare
Subject: Education
Subject: Energy
Subject: Agriculture and Food
Subject: Economics and Public Finance
Subject: Labor and Employment
Subject: Environmental Protection
Subject: Finance and Financial Sector
Subject: Commerce
Subject: Science, Technology, Communications
Subject: Immigration
Subject: Law
Subject: Housing and Community Development
Subject: Water Resources Development
Subject: Native Americans
Subject: Civil Rights and Liberties, Minority Issues
Subject: 

Unnamed: 0,Legislation Number,URL,Congress,Amends Bill,Title,Sponsor,Date of Introduction,Date Offered,Date Submitted,Date Proposed,Number of Cosponsors,Committees,Latest Action Date,Latest Action,Subject
207,H.R. 3547,https://www.congress.gov/bill/111th-congress/h...,111th Congress (2009-2010),,To designate the facility of the United States...,"Rep. Chaffetz, Jason [R-UT-3]",09/10/2009,,,,2,House - Oversight and Government Reform | Sena...,11/30/2009,Became Public Law No: 111-108.,Government Operations and Politics
264,H.R. 1426,https://www.congress.gov/bill/99th-congress/ho...,99th Congress (1985-1986),,Indian Health Care Amendments of 1986,"Rep. Udall, Morris K. [D-AZ-2]",03/05/1985,,,,32,"House - Energy and Commerce, Interior and Insu...",10/18/1986,Senate concurred in the House amendments to th...,Native Americans
125,H.R. 355,https://www.congress.gov/bill/102nd-congress/h...,102nd Congress (1991-1992),,Reclamation States Emergency Drought Relief Ac...,"Rep. Lehman, Richard H. [D-CA-18]",01/03/1991,,,,6,"House - Interior and Insular Affairs, Merchant...",03/05/1992,Became Public Law No: 102-250.,Water Resources Development


**Note**: There seem to be bills without subject areas. At the time of writing, congress.gov lists 11,078 Legislation/Bill/Passed Both results, but we only scraped 9,011. The counts per category (e.g. "Government Operations and Politics": 1408) are all correct, suggesting that the remaining ~2,000 results (11,078 − 9,011 = 2,067) have no subject.

## Cleaning and parsing metadata

In [14]:
# Reload the previously saved metadata so this section can be run without
# repeating the download above.
import pandas as pd
alldf = pd.read_csv('bill-meta.csv')
# Display the column index (this is the expression the cell's output shows).
alldf.columns

Index(['legislation_number', 'url', 'congress', 'amends_bill', 'title',
       'sponsor', 'date_of_introduction', 'date_offered', 'date_submitted',
       'date_proposed', 'number_of_cosponsors', 'committees',
       'latest_action_date', 'latest_action', 'subject', 'sponsor_party',
       'id'],
      dtype='object')

In [24]:
# Add metadata
# Normalise column names: lower-case, with spaces replaced by underscores.
alldf.columns = alldf.columns.str.lower().str.replace(' ', '_')
# Single-letter party code taken from the bracketed sponsor tag, e.g. "[R-TX-8]" -> "R".
# Raw string: `\[` and `\w` are invalid escape sequences in a normal string literal.
alldf['sponsor_party'] = alldf.sponsor.str.extract(r'\[(\w)-')[0]
# Use the leading digits of the congress label rather than its first three
# characters, so "99th Congress (1985-1986)" gives "99" instead of "99t".
alldf['id'] = alldf.congress.str.extract(r'^(\d+)')[0] + ' ' + alldf.legislation_number
alldf.head(2)

Unnamed: 0,legislation_number,url,congress,amends_bill,title,sponsor,date_of_introduction,date_offered,date_submitted,date_proposed,number_of_cosponsors,committees,latest_action_date,latest_action,subject,sponsor_party,id
0,H.R. 3823,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,Disaster Tax Relief and Airport and Airway Ext...,"Rep. Brady, Kevin [R-TX-8]",09/25/2017,,,,2,"House - Ways and Means, Transportation and Inf...",09/29/2017,Became Public Law No: 115-63.,Taxation,R,115 H.R. 3823
1,H.R. 88,https://www.congress.gov/bill/115th-congress/h...,115th Congress (2017-2018),,Shiloh National Military Park Boundary Adjustm...,"Rep. Blackburn, Marsha [R-TN-7]",01/03/2017,,,,0,House - Natural Resources | Senate - Energy an...,12/21/2018,Message on House action received in Senate and...,Taxation,R,115 H.R. 88


## Save Data

In [25]:
# Write the frame to disk without the index column.
alldf.to_csv(path_or_buf='bill-meta.csv', index=False)