# Public Commenting in a Pandemic

## Retrieving Comments

January through May for each year (2012 to 2020)

## References

API documentation: https://regulationsgov.github.io/developers/basics/

In [None]:
# import packages
import pandas as pd
import numpy as np
import requests
import urllib
import json
import time
import os

# datetime package too: https://docs.python.org/3/library/datetime.html
from datetime import datetime
from datetime import timedelta

In [None]:
# Folder where the retrieved data are saved. The author's original location is
# kept as the default; set the COMMENTS_DATA_DIR environment variable to use a
# different folder without editing the notebook.
filePath = os.environ.get(
    "COMMENTS_DATA_DIR",
    "C:/Users/mark/Box Sync/_MF/Assignments/Insights/Public Commenting and COVID-19/Data/API/")
# Later cells build file names with filePath+'<name>.json', so the folder must
# end with a path separator.
if not filePath.endswith(("/", "\\")):
    filePath += "/"

# general variables for setting parameters
# SECURITY: an API key committed in a notebook is visible to everyone who can
# read the file. Prefer supplying it through the REGULATIONS_GOV_API_KEY
# environment variable; the literal below is only the backward-compatible
# fallback and should be rotated and removed.
APIkey = os.environ.get("REGULATIONS_GOV_API_KEY", "fYTx9mVjuwc2ZSsdqmbgdtSqx7HGUd3aCRkiH6bC")
rpp = 1000               # value sent as the 'rpp' query parameter (results per page)
pageIndex = 0            # zero-based index of the page to request
po = pageIndex * rpp     # value sent as the 'po' query parameter (0 for the first page)

## Endpoint: documents

### Public Comments: Jan 2020 - May 2020

In [None]:
# Documents-endpoint URL; the query string fixes dct=PS (Public Submission),
# so=ASC and sb=postedDate for every request in this section.
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# Posted-date window for this section (Jan 2020 - May 2020); the two ends are
# joined as 'pdStart-pdEnd' when the request parameters are built.
pdStart, pdEnd = '01/01/20', '05/31/20'

In [None]:
%%time

# set parameters to retrieve PS documents:
# offset, page size, API key and the posted-date window 'start-end'
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
# RL_remaining keeps the rate-limit allowance reported by the response header;
# the paging loop in the next cell reads it before its first request.
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+str(dcts_response.request.url)+'\n',sep='\n')

# nested list: separate 'documents' from 'totalNumRecords'
# confirm total requested and number of documents retrieved
# (the body is parsed once and both fields are read from the same object)
firstPageBody = dcts_response.json()
numPS = firstPageBody['totalNumRecords']
dctsPS = firstPageBody['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2020Jan01_2020May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # Ceiling division: when numPS is an exact multiple of rpp no extra page is
    # needed. This is the same formula the paging loop uses for totalNumPages,
    # so the estimate printed here agrees with what the loop will request.
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(rpp * pagesNeeded)+' records'
          ' -- a margin of '+str(rpp * pagesNeeded - numPS)+' records.',sep='\n')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes whose request failed, so they can be re-fetched
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# Slot i of nextAllowableTime holds the earliest time at which the request that
# maps to slot i (pageIndex % pagesPerHour) may be sent; every slot starts at "now".
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # Back off when the allowance reported by the last response is nearly used up.
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # Wait until this slot is free again (one hour after the request that last used it).
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # Still too early after sleeping: stop rather than exceed the limit.
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # Keep the previous value if the header is absent (e.g. on an error response).
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            # An error response carries no documents; record the page and move on
            # instead of trying to parse it.
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.exceptions.RequestException, ValueError, KeyError) as err:
        # Network failure, unparsable body/header, or a body without 'documents'.
        # Only these are caught so that KeyboardInterrupt and programming errors
        # still surface.
        print('missing page: '+str(pageIndex))
        print('cause: '+type(err).__name__+': '+str(err))
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that failed and still need to be retrieved: '+str(missingPages))

In [None]:
# Write the combined pages to disk only when their count equals the total the API reported.
retrievedEverything = (len(dctsPS_all) == numPS)
if not retrievedEverything:
    print('Export unsuccessful. Check your code.')
else:
    exportFile = filePath+'endpoint_documents_PS_2020Jan01_2020May31.json'
    with open(exportFile, mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2019 - May 2019

In [None]:
# Start again from the first page: page size 1000, page index 0, so po is 0.
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# Documents-endpoint URL; the query string fixes dct=PS (Public Submission),
# so=ASC and sb=postedDate for every request in this section.
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# Posted-date window for this section (Jan 2019 - May 2019).
pdStart, pdEnd = '01/01/19', '05/31/19'

In [None]:
%%time

# Query parameters for the first page of PS documents: offset, page size,
# API key and the posted-date window 'start-end'.
params = dict(po=po, rpp=rpp, api_key=APIkey, pd=pdStart+'-'+pdEnd)

# Send the GET request, keep the rate-limit allowance reported by the response
# header, and echo the status code and the URL that was requested.
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
statusLine = "Status Code: "+str(dcts_response.status_code)
urlLine = 'Request URL: '+str(dcts_response.request.url)+'\n'
print(statusLine, urlLine, sep='\n')

# The body holds the total number of matching records and this page's documents.
firstPageBody = dcts_response.json()
numPS = firstPageBody['totalNumRecords']
dctsPS = firstPageBody['documents']
print('\n'.join(['Total number of records requested: '+str(numPS),
                 'Number retrieved: '+str(len(dctsPS))]))

# One page was not enough: report how many pages are needed (rounded up).
# Otherwise the single page is the whole result and is exported as JSON.
if len(dctsPS) != numPS:
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    pageCapacity = rpp * pagesNeeded
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(pageCapacity)+' records'
          ' -- a margin of '+str(pageCapacity - numPS)+' records.',sep='\n')
else:
    with open(filePath+'endpoint_documents_PS_2019Jan01_2019May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes whose request failed, so they can be re-fetched
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# Slot i of nextAllowableTime holds the earliest time at which the request that
# maps to slot i (pageIndex % pagesPerHour) may be sent; every slot starts at "now".
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # Back off when the allowance reported by the last response is nearly used up.
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # Wait until this slot is free again (one hour after the request that last used it).
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # Still too early after sleeping: stop rather than exceed the limit.
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # Keep the previous value if the header is absent (e.g. on an error response).
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            # An error response carries no documents; record the page and move on
            # instead of trying to parse it.
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.exceptions.RequestException, ValueError, KeyError) as err:
        # Network failure, unparsable body/header, or a body without 'documents'.
        # Only these are caught so that KeyboardInterrupt and programming errors
        # still surface.
        print('missing page: '+str(pageIndex))
        print('cause: '+type(err).__name__+': '+str(err))
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that failed and still need to be retrieved: '+str(missingPages))

In [None]:
# Write the combined pages to disk only when their count equals the total the API reported.
retrievedEverything = (len(dctsPS_all) == numPS)
if not retrievedEverything:
    print('Export unsuccessful. Check your code.')
else:
    exportFile = filePath+'endpoint_documents_PS_2019Jan01_2019May31.json'
    with open(exportFile, mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2018 - May 2018

In [None]:
# Start again from the first page: page size 1000, page index 0, so po is 0.
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# Documents-endpoint URL; the query string fixes dct=PS (Public Submission),
# so=ASC and sb=postedDate for every request in this section.
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# Posted-date window for this section (Jan 2018 - May 2018).
pdStart, pdEnd = '01/01/18', '05/31/18'

In [None]:
%%time

# Query parameters for the first page of PS documents: offset, page size,
# API key and the posted-date window 'start-end'.
params = dict(po=po, rpp=rpp, api_key=APIkey, pd=pdStart+'-'+pdEnd)

# Send the GET request, keep the rate-limit allowance reported by the response
# header, and echo the status code and the URL that was requested.
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
statusLine = "Status Code: "+str(dcts_response.status_code)
urlLine = 'Request URL: '+str(dcts_response.request.url)+'\n'
print(statusLine, urlLine, sep='\n')

# The body holds the total number of matching records and this page's documents.
firstPageBody = dcts_response.json()
numPS = firstPageBody['totalNumRecords']
dctsPS = firstPageBody['documents']
print('\n'.join(['Total number of records requested: '+str(numPS),
                 'Number retrieved: '+str(len(dctsPS))]))

# One page was not enough: report how many pages are needed (rounded up).
# Otherwise the single page is the whole result and is exported as JSON.
if len(dctsPS) != numPS:
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    pageCapacity = rpp * pagesNeeded
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(pageCapacity)+' records'
          ' -- a margin of '+str(pageCapacity - numPS)+' records.',sep='\n')
else:
    with open(filePath+'endpoint_documents_PS_2018Jan01_2018May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes whose request failed, so they can be re-fetched
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# Slot i of nextAllowableTime holds the earliest time at which the request that
# maps to slot i (pageIndex % pagesPerHour) may be sent; every slot starts at "now".
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # Back off when the allowance reported by the last response is nearly used up.
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # Wait until this slot is free again (one hour after the request that last used it).
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # Still too early after sleeping: stop rather than exceed the limit.
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # Keep the previous value if the header is absent (e.g. on an error response).
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            # An error response carries no documents; record the page and move on
            # instead of trying to parse it.
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.exceptions.RequestException, ValueError, KeyError) as err:
        # Network failure, unparsable body/header, or a body without 'documents'.
        # Only these are caught so that KeyboardInterrupt and programming errors
        # still surface.
        print('missing page: '+str(pageIndex))
        print('cause: '+type(err).__name__+': '+str(err))
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that failed and still need to be retrieved: '+str(missingPages))

In [None]:
# Write the combined pages to disk only when their count equals the total the API reported.
retrievedEverything = (len(dctsPS_all) == numPS)
if not retrievedEverything:
    print('Export unsuccessful. Check your code.')
else:
    exportFile = filePath+'endpoint_documents_PS_2018Jan01_2018May31.json'
    with open(exportFile, mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2017 - May 2017

In [None]:
# Start again from the first page: page size 1000, page index 0, so po is 0.
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# Documents-endpoint URL; the query string fixes dct=PS (Public Submission),
# so=ASC and sb=postedDate for every request in this section.
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# Posted-date window for this section (Jan 2017 - May 2017).
pdStart, pdEnd = '01/01/17', '05/31/17'

In [None]:
%%time

# Query parameters for the first page of PS documents: offset, page size,
# API key and the posted-date window 'start-end'.
params = dict(po=po, rpp=rpp, api_key=APIkey, pd=pdStart+'-'+pdEnd)

# Send the GET request, keep the rate-limit allowance reported by the response
# header, and echo the status code and the URL that was requested.
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
statusLine = "Status Code: "+str(dcts_response.status_code)
urlLine = 'Request URL: '+str(dcts_response.request.url)+'\n'
print(statusLine, urlLine, sep='\n')

# The body holds the total number of matching records and this page's documents.
firstPageBody = dcts_response.json()
numPS = firstPageBody['totalNumRecords']
dctsPS = firstPageBody['documents']
print('\n'.join(['Total number of records requested: '+str(numPS),
                 'Number retrieved: '+str(len(dctsPS))]))

# One page was not enough: report how many pages are needed (rounded up).
# Otherwise the single page is the whole result and is exported as JSON.
if len(dctsPS) != numPS:
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    pageCapacity = rpp * pagesNeeded
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(pageCapacity)+' records'
          ' -- a margin of '+str(pageCapacity - numPS)+' records.',sep='\n')
else:
    with open(filePath+'endpoint_documents_PS_2017Jan01_2017May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes whose request failed, so they can be re-fetched
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# Slot i of nextAllowableTime holds the earliest time at which the request that
# maps to slot i (pageIndex % pagesPerHour) may be sent; every slot starts at "now".
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # Back off when the allowance reported by the last response is nearly used up.
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # Wait until this slot is free again (one hour after the request that last used it).
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # Still too early after sleeping: stop rather than exceed the limit.
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # Keep the previous value if the header is absent (e.g. on an error response).
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            # An error response carries no documents; record the page and move on
            # instead of trying to parse it.
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.exceptions.RequestException, ValueError, KeyError) as err:
        # Network failure, unparsable body/header, or a body without 'documents'.
        # Only these are caught so that KeyboardInterrupt and programming errors
        # still surface.
        print('missing page: '+str(pageIndex))
        print('cause: '+type(err).__name__+': '+str(err))
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that failed and still need to be retrieved: '+str(missingPages))

In [None]:
# Write the combined pages to disk only when their count equals the total the API reported.
retrievedEverything = (len(dctsPS_all) == numPS)
if not retrievedEverything:
    print('Export unsuccessful. Check your code.')
else:
    exportFile = filePath+'endpoint_documents_PS_2017Jan01_2017May31.json'
    with open(exportFile, mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2016 - May 2016

In [None]:
# Start again from the first page: page size 1000, page index 0, so po is 0.
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# Documents-endpoint URL; the query string fixes dct=PS (Public Submission),
# so=ASC and sb=postedDate for every request in this section.
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# Posted-date window for this section (Jan 2016 - May 2016).
pdStart, pdEnd = '01/01/16', '05/31/16'

In [None]:
%%time

# Query parameters for the first page of PS documents: offset, page size,
# API key and the posted-date window 'start-end'.
params = dict(po=po, rpp=rpp, api_key=APIkey, pd=pdStart+'-'+pdEnd)

# Send the GET request, keep the rate-limit allowance reported by the response
# header, and echo the status code and the URL that was requested.
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
statusLine = "Status Code: "+str(dcts_response.status_code)
urlLine = 'Request URL: '+str(dcts_response.request.url)+'\n'
print(statusLine, urlLine, sep='\n')

# The body holds the total number of matching records and this page's documents.
firstPageBody = dcts_response.json()
numPS = firstPageBody['totalNumRecords']
dctsPS = firstPageBody['documents']
print('\n'.join(['Total number of records requested: '+str(numPS),
                 'Number retrieved: '+str(len(dctsPS))]))

# One page was not enough: report how many pages are needed (rounded up).
# Otherwise the single page is the whole result and is exported as JSON.
if len(dctsPS) != numPS:
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    pageCapacity = rpp * pagesNeeded
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(pageCapacity)+' records'
          ' -- a margin of '+str(pageCapacity - numPS)+' records.',sep='\n')
else:
    with open(filePath+'endpoint_documents_PS_2016Jan01_2016May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes whose request failed, so they can be re-fetched
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# Slot i of nextAllowableTime holds the earliest time at which the request that
# maps to slot i (pageIndex % pagesPerHour) may be sent; every slot starts at "now".
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # Back off when the allowance reported by the last response is nearly used up.
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # Wait until this slot is free again (one hour after the request that last used it).
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # Still too early after sleeping: stop rather than exceed the limit.
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # Keep the previous value if the header is absent (e.g. on an error response).
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            # An error response carries no documents; record the page and move on
            # instead of trying to parse it.
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.exceptions.RequestException, ValueError, KeyError) as err:
        # Network failure, unparsable body/header, or a body without 'documents'.
        # Only these are caught so that KeyboardInterrupt and programming errors
        # still surface.
        print('missing page: '+str(pageIndex))
        print('cause: '+type(err).__name__+': '+str(err))
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that failed and still need to be retrieved: '+str(missingPages))

In [None]:
# Write the combined pages to disk only when their count equals the total the API reported.
retrievedEverything = (len(dctsPS_all) == numPS)
if not retrievedEverything:
    print('Export unsuccessful. Check your code.')
else:
    exportFile = filePath+'endpoint_documents_PS_2016Jan01_2016May31.json'
    with open(exportFile, mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2015 - May 2015

In [None]:
# Start again from the first page: page size 1000, page index 0, so po is 0.
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# Documents-endpoint URL; the query string fixes dct=PS (Public Submission),
# so=ASC and sb=postedDate for every request in this section.
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# Posted-date window for this section (Jan 2015 - May 2015).
pdStart, pdEnd = '01/01/15', '05/31/15'

In [None]:
%%time

# Query parameters for the first page of PS documents: offset, page size,
# API key and the posted-date window 'start-end'.
params = dict(po=po, rpp=rpp, api_key=APIkey, pd=pdStart+'-'+pdEnd)

# Send the GET request, keep the rate-limit allowance reported by the response
# header, and echo the status code and the URL that was requested.
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
statusLine = "Status Code: "+str(dcts_response.status_code)
urlLine = 'Request URL: '+str(dcts_response.request.url)+'\n'
print(statusLine, urlLine, sep='\n')

# The body holds the total number of matching records and this page's documents.
firstPageBody = dcts_response.json()
numPS = firstPageBody['totalNumRecords']
dctsPS = firstPageBody['documents']
print('\n'.join(['Total number of records requested: '+str(numPS),
                 'Number retrieved: '+str(len(dctsPS))]))

# One page was not enough: report how many pages are needed (rounded up).
# Otherwise the single page is the whole result and is exported as JSON.
if len(dctsPS) != numPS:
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    pageCapacity = rpp * pagesNeeded
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(pageCapacity)+' records'
          ' -- a margin of '+str(pageCapacity - numPS)+' records.',sep='\n')
else:
    with open(filePath+'endpoint_documents_PS_2015Jan01_2015May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes whose request failed, so they can be re-fetched
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# Slot i of nextAllowableTime holds the earliest time at which the request that
# maps to slot i (pageIndex % pagesPerHour) may be sent; every slot starts at "now".
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # Back off when the allowance reported by the last response is nearly used up.
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # Wait until this slot is free again (one hour after the request that last used it).
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # Still too early after sleeping: stop rather than exceed the limit.
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # Keep the previous value if the header is absent (e.g. on an error response).
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            # An error response carries no documents; record the page and move on
            # instead of trying to parse it.
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.exceptions.RequestException, ValueError, KeyError) as err:
        # Network failure, unparsable body/header, or a body without 'documents'.
        # Only these are caught so that KeyboardInterrupt and programming errors
        # still surface.
        print('missing page: '+str(pageIndex))
        print('cause: '+type(err).__name__+': '+str(err))
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that failed and still need to be retrieved: '+str(missingPages))

In [None]:
# Write the combined pages to disk only when their count equals the total the API reported.
retrievedEverything = (len(dctsPS_all) == numPS)
if not retrievedEverything:
    print('Export unsuccessful. Check your code.')
else:
    exportFile = filePath+'endpoint_documents_PS_2015Jan01_2015May31.json'
    with open(exportFile, mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2014 - May 2014

In [None]:
# Start again from the first page: page size 1000, page index 0, so po is 0.
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# Documents-endpoint URL; the query string fixes dct=PS (Public Submission),
# so=ASC and sb=postedDate for every request in this section.
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# Posted-date window for this section (Jan 2014 - May 2014).
pdStart, pdEnd = '01/01/14', '05/31/14'

In [None]:
%%time

# Query parameters for the first page of PS documents: offset, page size,
# API key and the posted-date window 'start-end'.
params = dict(po=po, rpp=rpp, api_key=APIkey, pd=pdStart+'-'+pdEnd)

# Send the GET request, keep the rate-limit allowance reported by the response
# header, and echo the status code and the URL that was requested.
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
statusLine = "Status Code: "+str(dcts_response.status_code)
urlLine = 'Request URL: '+str(dcts_response.request.url)+'\n'
print(statusLine, urlLine, sep='\n')

# The body holds the total number of matching records and this page's documents.
firstPageBody = dcts_response.json()
numPS = firstPageBody['totalNumRecords']
dctsPS = firstPageBody['documents']
print('\n'.join(['Total number of records requested: '+str(numPS),
                 'Number retrieved: '+str(len(dctsPS))]))

# One page was not enough: report how many pages are needed (rounded up).
# Otherwise the single page is the whole result and is exported as JSON.
if len(dctsPS) != numPS:
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    pageCapacity = rpp * pagesNeeded
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(pageCapacity)+' records'
          ' -- a margin of '+str(pageCapacity - numPS)+' records.',sep='\n')
else:
    with open(filePath+'endpoint_documents_PS_2014Jan01_2014May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes that could not be retrieved; reported after the loop
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)  # ceiling of numPS / rpp
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# page p uses slot p % pagesPerHour; a slot is stamped with "now + 1 hour" when it is used,
# so the same slot cannot be used again until that hour has passed
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times (every slot is usable immediately)
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the quota reported by the last response is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot becomes usable again
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    if nextAllowableTime[nextAllowableTimeIndex] <= datetime.now():
        nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

        try:
            po = pageIndex * rpp
            params.update({'po': po})
            temp_response = requests.get(baseURL_PS, params=params)
            # keep the previous value when the header is absent so a usable page is not discarded
            RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
            if temp_response.status_code != 200: ## status code = 429 means over rate limit
                print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                      temp_response.text, sep='\n')
                # skip explicitly instead of relying on a KeyError from the error payload
                print('missing page: '+str(pageIndex))
                missingPages.append(pageIndex)
                continue

            data_this_page = temp_response.json()['documents']
        except Exception as err:
            # best effort: note the page and move on, but say why it failed;
            # KeyboardInterrupt is not an Exception, so the cell can still be interrupted
            print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
            missingPages.append(pageIndex)
            continue

        dctsPS_all.extend(data_this_page)
        if pageIndex%100 == 0:
            print("request made (pageIndex = " + str(pageIndex) + ")")
            print('Retrieved: '+str(len(dctsPS_all)),'\n')

    else:
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that could not be retrieved: '+str(missingPages))

In [None]:
# Only write the 2014 file when the number of collected documents equals numPS
# (the 'totalNumRecords' value from the first request); otherwise report the mismatch.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')

else:
    jsonPath = filePath+'endpoint_documents_PS_2014Jan01_2014May31.json'
    with open(jsonPath, 'w', encoding='utf-8') as jsonFile:
        # ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 file
        json.dump(dctsPS_all, jsonFile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2013 - May 2013

In [None]:
# start over at the first page for the 2013 window
# (po is the record offset of the current page: pageIndex * rpp)
pageIndex = 0
rpp = 1000
po = rpp * pageIndex

# documents endpoint with document type = Public Submission (dct=PS)
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# posted-date window, written as MM/DD/YY
pdStart, pdEnd = '01/01/13', '05/31/13'

In [None]:
%%time

# set parameters to retrieve PS documents
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve the first page of comments using Requests library and report the GET request response
dcts_response = requests.get(baseURL_PS, params=params)
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+str(dcts_response.request.url)+'\n',sep='\n')

# stop here with a clear HTTPError (e.g., 429 = over rate limit) rather than
# failing further down with a KeyError when the body has no 'totalNumRecords'
dcts_response.raise_for_status()

# remaining request quota reported by the API; read by the paging cell that follows
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# nested list: separate 'documents' from 'totalNumRecords' (body parsed only once)
# confirm total requested and number of documents retrieved
firstPage = dcts_response.json()
numPS = firstPage['totalNumRecords']
dctsPS = firstPage['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2013Jan01_2013May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # number of rpp-sized pages needed to cover numPS records (ceiling division)
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(rpp * pagesNeeded)+' records'
          ' -- a margin of '+str(rpp * pagesNeeded - numPS)+' records.',sep='\n')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes that could not be retrieved; reported after the loop
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)  # ceiling of numPS / rpp
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# page p uses slot p % pagesPerHour; a slot is stamped with "now + 1 hour" when it is used,
# so the same slot cannot be used again until that hour has passed
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times (every slot is usable immediately)
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the quota reported by the last response is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot becomes usable again
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    if nextAllowableTime[nextAllowableTimeIndex] <= datetime.now():
        nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

        try:
            po = pageIndex * rpp
            params.update({'po': po})
            temp_response = requests.get(baseURL_PS, params=params)
            # keep the previous value when the header is absent so a usable page is not discarded
            RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
            if temp_response.status_code != 200: ## status code = 429 means over rate limit
                print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                      temp_response.text, sep='\n')
                # skip explicitly instead of relying on a KeyError from the error payload
                print('missing page: '+str(pageIndex))
                missingPages.append(pageIndex)
                continue

            data_this_page = temp_response.json()['documents']
        except Exception as err:
            # best effort: note the page and move on, but say why it failed;
            # KeyboardInterrupt is not an Exception, so the cell can still be interrupted
            print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
            missingPages.append(pageIndex)
            continue

        dctsPS_all.extend(data_this_page)
        if pageIndex%100 == 0:
            print("request made (pageIndex = " + str(pageIndex) + ")")
            print('Retrieved: '+str(len(dctsPS_all)),'\n')

    else:
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that could not be retrieved: '+str(missingPages))

In [None]:
# Only write the 2013 file when the number of collected documents equals numPS
# (the 'totalNumRecords' value from the first request); otherwise report the mismatch.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')

else:
    jsonPath = filePath+'endpoint_documents_PS_2013Jan01_2013May31.json'
    with open(jsonPath, 'w', encoding='utf-8') as jsonFile:
        # ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 file
        json.dump(dctsPS_all, jsonFile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2012 - May 2012

In [None]:
# start over at the first page for the 2012 window
# (po is the record offset of the current page: pageIndex * rpp)
pageIndex = 0
rpp = 1000
po = rpp * pageIndex

# documents endpoint with document type = Public Submission (dct=PS)
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# posted-date window, written as MM/DD/YY
pdStart, pdEnd = '01/01/12', '05/31/12'

In [None]:
%%time

# set parameters to retrieve PS documents
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve the first page of comments using Requests library and report the GET request response
dcts_response = requests.get(baseURL_PS, params=params)
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+str(dcts_response.request.url)+'\n',sep='\n')

# stop here with a clear HTTPError (e.g., 429 = over rate limit) rather than
# failing further down with a KeyError when the body has no 'totalNumRecords'
dcts_response.raise_for_status()

# remaining request quota reported by the API; read by the paging cell that follows
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# nested list: separate 'documents' from 'totalNumRecords' (body parsed only once)
# confirm total requested and number of documents retrieved
firstPage = dcts_response.json()
numPS = firstPage['totalNumRecords']
dctsPS = firstPage['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2012Jan01_2012May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # number of rpp-sized pages needed to cover numPS records (ceiling division)
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(rpp * pagesNeeded)+' records'
          ' -- a margin of '+str(rpp * pagesNeeded - numPS)+' records.',sep='\n')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
missingPages = []  # page indexes that could not be retrieved; reported after the loop
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)  # ceiling of numPS / rpp
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
# page p uses slot p % pagesPerHour; a slot is stamped with "now + 1 hour" when it is used,
# so the same slot cannot be used again until that hour has passed
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times (every slot is usable immediately)
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the quota reported by the last response is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot becomes usable again
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    if nextAllowableTime[nextAllowableTimeIndex] <= datetime.now():
        nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

        try:
            po = pageIndex * rpp
            params.update({'po': po})
            temp_response = requests.get(baseURL_PS, params=params)
            # keep the previous value when the header is absent so a usable page is not discarded
            RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
            if temp_response.status_code != 200: ## status code = 429 means over rate limit
                print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                      temp_response.text, sep='\n')
                # skip explicitly instead of relying on a KeyError from the error payload
                print('missing page: '+str(pageIndex))
                missingPages.append(pageIndex)
                continue

            data_this_page = temp_response.json()['documents']
        except Exception as err:
            # best effort: note the page and move on, but say why it failed;
            # KeyboardInterrupt is not an Exception, so the cell can still be interrupted
            print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
            missingPages.append(pageIndex)
            continue

        dctsPS_all.extend(data_this_page)
        if pageIndex%100 == 0:
            print("request made (pageIndex = " + str(pageIndex) + ")")
            print('Retrieved: '+str(len(dctsPS_all)),'\n')

    else:
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that could not be retrieved: '+str(missingPages))

In [None]:
# Only write the 2012 file when the number of collected documents equals numPS
# (the 'totalNumRecords' value from the first request); otherwise report the mismatch.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')

else:
    jsonPath = filePath+'endpoint_documents_PS_2012Jan01_2012May31.json'
    with open(jsonPath, 'w', encoding='utf-8') as jsonFile:
        # ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 file
        json.dump(dctsPS_all, jsonFile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')