In [1]:
import numpy as np
import pandas as pd
import os
import requests
import json
from bs4 import BeautifulSoup
import collections
collections.Callable = collections.abc.Callable
import time

In [2]:
propublica_token = os.environ['propublicatoken']

useragent_url = 'https://httpbin.org/user-agent'
r = requests.get(useragent_url)
useragent = json.loads(r.text)['user-agent']

KeyError: 'propublicatoken'

In [3]:
headers = {'X-API-Key': propublica_token,
          'User-Agent': useragent,
          'From': 'tby8aj@virginia.edu'}

## Goal: Get the text of all bills sponsored by Bob Good in the 117th Congress
### Step 1: Get Bob Good's ID number from the propublica members API

In [4]:
root = 'https://api.propublica.org'
congress = '117'
chamber = 'house'
endpoint = '/congress/v1/{congress}/{chamber}/members.json'.format(congress = congress, chamber = chamber)

r = requests.get(root+endpoint,
                headers = headers)
myjson = json.loads(r.text)
membersdf = pd.json_normalize(myjson,record_path=['results','members'])
membersdf.head(3).T

Unnamed: 0,0,1,2
id,A000370,A000055,A000371
title,Representative,Representative,Representative
short_title,Rep.,Rep.,Rep.
api_uri,https://api.propublica.org/congress/v1/members...,https://api.propublica.org/congress/v1/members...,https://api.propublica.org/congress/v1/members...
first_name,Alma,Robert,Pete
middle_name,,B.,
last_name,Adams,Aderholt,Aguilar
suffix,,,
date_of_birth,1946-05-27,1965-07-22,1979-06-19
gender,F,M,M


In [10]:
bobgood = membersdf.query("last_name=='Good'")

In [11]:
bobgood['id']
bobgoodid = bobgood.reset_index()['id'][0]

## Step 2: User BG's ID to query the bills API

In [12]:
endpoint = '/congress/v1/members/{memberid}/bills/{billtype}.json'.format(memberid = bobgoodid, billtype = 'introduced')
r = requests.get(root+endpoint,
                headers = headers)
myjson = json.loads(r.text)
bgbills1 = pd.json_normalize(myjson, record_path = ['results', 'bills'])

In [13]:
r = requests.get(root+endpoint,
                headers = headers, params = {'offset': 20})
myjson = json.loads(r.text)
bgbills2 = pd.json_normalize(myjson, record_path = ['results', 'bills'])

In [14]:
bgbills = pd.concat([bgbills1, bgbills2], ignore_index=True)

In [15]:
bgbills.head(3).T

Unnamed: 0,0,1,2
congress,117,117,117
bill_id,hr8935-117,hr8767-117,hres1297-117
bill_type,hr,hr,hres
number,H.R.8935,H.R.8767,H.RES.1297
bill_uri,https://api.propublica.org/congress/v1/117/bil...,https://api.propublica.org/congress/v1/117/bil...,https://api.propublica.org/congress/v1/117/bil...
title,To amend the Labor-Management Reporting and Di...,To establish a private right of action for par...,"Designating the week beginning November 7, 202..."
short_title,To amend the Labor-Management Reporting and Di...,Empowering Parents Act,"Designating the week beginning November 7, 202..."
sponsor_title,Rep.,Rep.,Rep.
sponsor_id,G000595,G000595,G000595
sponsor_name,Robert Good,Robert Good,Robert Good


In [21]:
bgbills['congressdotgov_url'][11] 

'https://www.congress.gov/bill/117th-congress/house-bill/5731/text?format=txt'

In [22]:
urltoscrape = bgbills['congressdotgov_url'][10] + '/text?format=txt'

In [23]:
r = requests.get(urltoscrape,
                headers = {'User-Agent': useragent,
                          'From': 'tby8aj@virginia.edu'})
myhtml = BeautifulSoup(r.text,'html.parser')

In [36]:
myhtml.find_all('h3', "currentVersion")[0].text

'Shown Here:Introduced in House (11/05/2021)'

In [40]:
print(myhtml.find_all('pre')[0].text)

[Congressional Bills 117th Congress]
[From the U.S. Government Publishing Office]
[H.R. 5901 Introduced in House (IH)]








117th CONGRESS
  1st Session
                                H. R. 5901

 To amend title 38, United States Code, to provide for the elimination 
 of delimiting dates under the educational assistance programs of the 
        Department of Veterans Affairs, and for other purposes.


_______________________________________________________________________


                    IN THE HOUSE OF REPRESENTATIVES

                            November 5, 2021

    Mr. Good of Virginia (for himself, Mr. Newhouse, Mrs. Miller of 
 Illinois, Mr. Posey, and Mr. Murphy of North Carolina) introduced the 
   following bill; which was referred to the Committee on Veterans' 
                                Affairs

_______________________________________________________________________

                                 A BILL


 
 To amend title 38, United States Code, to provide

In [43]:
for i in range(0,10): 
    print('https://www.congress.gov/bill/117th-congress/house-bill/573{i}/text?format=txt'.format(i=i))

https://www.congress.gov/bill/117th-congress/house-bill/5730/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5731/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5732/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5733/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5734/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5735/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5736/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5737/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5738/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5739/text?format=txt


In [62]:
def scrape_one_bill(url):
    time.sleep(2.5)
    print('Now getting the text from ' + url)
    r = requests.get(url, headers = {'User-Agent': useragent, 'From': 'tby8aj@virginia.edu'})
    myhtml = BeautifulSoup(r.text,'html.parser')
    try:
        billtext = myhtml.find_all('pre')[0].text
        return billtext
    except: pass


In [63]:
urllist = ['https://www.congress.gov/bill/117th-congress/house-bill/{i}/text?format=txt'.format(i=i) for i in range(8,13)]

In [64]:
bills = [scrape_one_bill(u) for u in urllist]

Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/8/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/9/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/10/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/11/text?format=txt
Now getting the text from https://www.congress.gov/bill/117th-congress/house-bill/12/text?format=txt


In [77]:
print(bills[3])

None
