# Public Commenting in a Pandemic

## Retrieving Comments

January through May for each year (2012 to 2020)

## References

API documentation: https://regulationsgov.github.io/developers/basics/

In [1]:
# import packages
import pandas as pd
import numpy as np
import requests
import urllib
import json
import time
import os

# datetime package too: https://docs.python.org/3/library/datetime.html
from datetime import datetime
from datetime import timedelta

In [2]:
# Folder where the raw API responses (JSON) are saved. Set the
# PUBLIC_COMMENTS_API_DIR environment variable to run on another machine;
# the original location is kept as the default.
filePath = os.environ.get(
    "PUBLIC_COMMENTS_API_DIR",
    "C:/Users/mark/Box Sync/_MF/Assignments/Insights/Public Commenting and COVID-19/Data/API/")

# general variables for setting parameters
# The api.data.gov key is a secret, so it is read from the environment rather
# than stored in the notebook. DEMO_KEY is api.data.gov's shared exploration
# key; it works but has a much lower rate limit than a personal key.
APIkey = os.environ.get("DATA_GOV_API_KEY", "DEMO_KEY")
if APIkey == "DEMO_KEY":
    print("DATA_GOV_API_KEY is not set -- using DEMO_KEY, which has a much lower rate limit.")

rpp = 1000              # value sent as the 'rpp' request parameter (records per page)
pageIndex = 0           # zero-based page counter
po = pageIndex * rpp    # value sent as the 'po' request parameter (offset of the page's first record)

## Endpoint: documents

### Public Comments: Jan 2020 - May 2020

In [3]:
# Documents endpoint, pre-filtered to document type PS (Public Submission)
# and sorted by posted date in ascending order.
baseURL_PS = ("https://api.data.gov/regulations/v3/documents.json"
              "?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate")

# posted-date window for this query, MM/DD/YY
pdStart, pdEnd = '01/01/20', '05/31/20'

In [4]:
%%time

# Request the first page of Public Submission (PS) documents for the posted-date window.
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
dcts_response = requests.get(baseURL_PS, params=params)

# mask the key so the secret is not stored in the notebook's saved output
request_url = str(dcts_response.request.url)
if APIkey:
    request_url = request_url.replace(str(APIkey), 'REDACTED')
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+request_url+'\n',sep='\n')

# stop here with a clear HTTP error instead of a KeyError further down
dcts_response.raise_for_status()
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# parse the body once: 'totalNumRecords' is the size of the whole result set,
# 'documents' holds only the records returned on this page
first_page = dcts_response.json()
numPS = first_page['totalNumRecords']
dctsPS = first_page['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2020Jan01_2020May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: an extra page is needed only when there is a remainder
    pages_needed = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pages_needed),
          'That would be enough to retrieve '+str(rpp * pages_needed)+' records'
          ' -- a margin of '+str(rpp * pages_needed - numPS)+' records.',sep='\n')

Status Code: 200
Request URL: https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate&po=0&rpp=1000&api_key=REDACTED&pd=01%2F01%2F20-05%2F31%2F20

Total number of records requested: 1368244
Number retrieved: 1000

Determine how many pages of records need to be combined via the extend method...
Start with: 1369
That would be enough to retrieve 1369000 records -- a margin of 756 records.
Wall time: 15.7 s


In [5]:
%%time

# Collect every page of the query into one list while throttling requests to
# stay under the hourly rate limit.
dctsPS_all = []
missingPages = []  # page indexes that could not be retrieved, so gaps are never silent
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# Throttle bookkeeping: request k reuses slot k % pagesPerHour, and a slot is
# stamped "now + 1 hour" whenever it is used, so request k + pagesPerHour is
# never sent less than an hour after request k.
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
initialNextTime = datetime.now()
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve each page of documents and extend the accumulator
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the server reports that the quota is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot is allowed to be used again
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # safety net: stop if the slot is still in the future after waiting
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # keep the previous reading when the header is absent rather than losing the page
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.RequestException, KeyError, ValueError) as err:
        # network failure, unparsable body, or unexpected payload: record it and move on;
        # anything else (including KeyboardInterrupt) is allowed to propagate
        print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

if missingPages:
    print('Pages that still need to be retrieved: '+str(missingPages))
print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))

Initial length of data: 0
Time array length: 1000
nextAllowableTimeIndex = 0  nextAllowableTime = 2020-06-24 14:20:25.635644  currentTime = 2020-06-24 14:20:25.635644
request made (pageIndex = 0)
Retrieved: 1000 

nextAllowableTimeIndex = 100  nextAllowableTime = 2020-06-24 14:20:25.635644  currentTime = 2020-06-24 14:28:28.349666
request made (pageIndex = 100)
Retrieved: 101000 

nextAllowableTimeIndex = 200  nextAllowableTime = 2020-06-24 14:20:25.635644  currentTime = 2020-06-24 14:32:11.166921
request made (pageIndex = 200)
Retrieved: 201000 

nextAllowableTimeIndex = 300  nextAllowableTime = 2020-06-24 14:20:25.635644  currentTime = 2020-06-24 14:35:20.872248
request made (pageIndex = 300)
Retrieved: 301000 

nextAllowableTimeIndex = 400  nextAllowableTime = 2020-06-24 14:20:25.635644  currentTime = 2020-06-24 14:37:50.102452
request made (pageIndex = 400)
Retrieved: 401000 

nextAllowableTimeIndex = 500  nextAllowableTime = 2020-06-24 14:20:25.635644  currentTime = 2020-06-24 14:

In [6]:
# Write the combined pages to disk only when the number of documents collected
# equals the total reported by the API; otherwise nothing is written.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')
else:
    with open(filePath+'endpoint_documents_PS_2020Jan01_2020May31.json',
              mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

Exported as JSON!


In [13]:
%%time

# Linear scan of the retrieved documents: print each record whose
# documentId matches the one being checked.
for document in dctsPS_all:
    if document['documentId'] == 'FAA-2019-1100-14848':
        print(document)

print('done')

{'agencyAcronym': 'FAA', 'allowLateComment': False, 'attachmentCount': 0, 'commentDueDate': None, 'commentStartDate': None, 'commentText': 'This nprm will effectively destroy the fpv and rc aircraft hobby along with a vibrant stem community. Strongly oppose!', 'docketId': 'FAA-2019-1100', 'docketTitle': 'Remote Identification of Unmanned Aircraft Systems ', 'docketType': 'Rulemaking', 'documentId': 'FAA-2019-1100-14848', 'documentStatus': 'Posted', 'documentType': 'Public Submission', 'numberOfCommentsReceived': 1, 'openForComment': False, 'postedDate': '2020-02-14T00:00:00-05:00', 'rin': '2120-AL31', 'submitterName': 'Chi Woodruff', 'title': 'Comment from Chi Woodruff'}
done
Wall time: 1min 18s


In [22]:
# Spot-check a few records: each label is printed on its own line, followed by
# the record at that position, with blank lines separating the entries.
print('\n\n\n'.join(
    label + '\n' + str(dctsPS_all[position])
    for label, position in (('0', 0),
                            ('123456', 123456),
                            ('1234567', 1234567),
                            ('last', -1))))

0
{'agencyAcronym': 'DOJ', 'allowLateComment': False, 'attachmentCount': 0, 'commentDueDate': None, 'commentStartDate': None, 'commentText': 'Forced DNA collection from immigrants in detention without their consent would have wide repercussions for everyone in this country -- not only those in immigrant detention sites. In effect, this would constitute a big step toward a mass database for full population surveillance. And it would be achieved by miscasting the hundreds and thousands of children and adults in immigration detention as threats to the countrys safety. With this vast amount of sensitive information in the governments hands, the potential for abuse is too great. ', 'docketId': 'DOJ-OAG-2019-0004', 'docketTitle': 'DNA-Sample Collection From Immigration Detainees', 'docketType': 'Rulemaking', 'documentId': 'DOJ-OAG-2019-0004-18252', 'documentStatus': 'Posted', 'documentType': 'Public Submission', 'numberOfCommentsReceived': 1, 'openForComment': False, 'organization': 'America

In [14]:
%%time

# Build a DataFrame with one row per retrieved document, then summarize its
# columns, non-null counts and dtypes.
df2020 = pd.DataFrame(data=dctsPS_all)
df2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1368244 entries, 0 to 1368243
Data columns (total 19 columns):
 #   Column                    Non-Null Count    Dtype 
---  ------                    --------------    ----- 
 0   agencyAcronym             1368244 non-null  object
 1   allowLateComment          1368244 non-null  bool  
 2   attachmentCount           1368244 non-null  int64 
 3   commentDueDate            0 non-null        object
 4   commentStartDate          0 non-null        object
 5   commentText               1368150 non-null  object
 6   docketId                  1368244 non-null  object
 7   docketTitle               1368244 non-null  object
 8   docketType                1368244 non-null  object
 9   documentId                1368244 non-null  object
 10  documentStatus            1368244 non-null  object
 11  documentType              1368244 non-null  object
 12  numberOfCommentsReceived  1368244 non-null  int64 
 13  openForComment            1368244 non-null

In [15]:
# column names of the 2020 frame as a plain Python list
dfColumns = list(df2020.columns)
print(dfColumns)

['agencyAcronym', 'allowLateComment', 'attachmentCount', 'commentDueDate', 'commentStartDate', 'commentText', 'docketId', 'docketTitle', 'docketType', 'documentId', 'documentStatus', 'documentType', 'numberOfCommentsReceived', 'openForComment', 'organization', 'postedDate', 'rin', 'submitterName', 'title']


In [17]:
# Columns written to the annual CSV (a subset of the DataFrame's columns).
write_columns = ['agencyAcronym','attachmentCount','commentDueDate','commentStartDate','docketId',
                 'docketType','documentId','numberOfCommentsReceived','openForComment','postedDate',
                 'submitterName','title','organization']

savePath = 'C:/Users/mark/Box Sync/_MF/Assignments/Insights/Public Commenting and COVID-19/Data/Annual/'
saveFile = 'endpoint_documents_PS_2020.csv'

# write to csv, reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# newline='' stops the text layer from translating '\n' into '\r\n' on Windows,
# which would override the line terminator requested below.
with open(savePath+saveFile, 'w', encoding='utf-8', newline='') as outfile:
    try:
        # keyword spelling used by pandas >= 1.5 (the old one was removed in 2.0)
        df2020.to_csv(outfile, index_label='index', lineterminator='\n', columns=write_columns)
    except TypeError:
        # older pandas only accepts the previous spelling; the failed call above
        # is rejected before anything is written to the file
        df2020.to_csv(outfile, index_label='index', line_terminator='\n', columns=write_columns)

### Public Comments: Jan 2019 - May 2019

In [None]:
# Start the paging state over for a new query.
pageIndex = 0
rpp = 1000
po = rpp * pageIndex

# Documents endpoint, pre-filtered to document type PS (Public Submission)
# and sorted by posted date in ascending order.
baseURL_PS = ("https://api.data.gov/regulations/v3/documents.json"
              "?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate")

# posted-date window for this query, MM/DD/YY
pdStart, pdEnd = '01/01/19', '05/31/19'

In [None]:
%%time

# Request the first page of Public Submission (PS) documents for the posted-date window.
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
dcts_response = requests.get(baseURL_PS, params=params)

# mask the key so the secret is not stored in the notebook's saved output
request_url = str(dcts_response.request.url)
if APIkey:
    request_url = request_url.replace(str(APIkey), 'REDACTED')
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+request_url+'\n',sep='\n')

# stop here with a clear HTTP error instead of a KeyError further down
dcts_response.raise_for_status()
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# parse the body once: 'totalNumRecords' is the size of the whole result set,
# 'documents' holds only the records returned on this page
first_page = dcts_response.json()
numPS = first_page['totalNumRecords']
dctsPS = first_page['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2019Jan01_2019May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: an extra page is needed only when there is a remainder
    pages_needed = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pages_needed),
          'That would be enough to retrieve '+str(rpp * pages_needed)+' records'
          ' -- a margin of '+str(rpp * pages_needed - numPS)+' records.',sep='\n')

In [None]:
%%time

# Collect every page of the query into one list while throttling requests to
# stay under the hourly rate limit.
dctsPS_all = []
missingPages = []  # page indexes that could not be retrieved, so gaps are never silent
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# Throttle bookkeeping: request k reuses slot k % pagesPerHour, and a slot is
# stamped "now + 1 hour" whenever it is used, so request k + pagesPerHour is
# never sent less than an hour after request k.
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
initialNextTime = datetime.now()
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve each page of documents and extend the accumulator
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the server reports that the quota is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot is allowed to be used again
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # safety net: stop if the slot is still in the future after waiting
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # keep the previous reading when the header is absent rather than losing the page
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.RequestException, KeyError, ValueError) as err:
        # network failure, unparsable body, or unexpected payload: record it and move on;
        # anything else (including KeyboardInterrupt) is allowed to propagate
        print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

if missingPages:
    print('Pages that still need to be retrieved: '+str(missingPages))
print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))

In [None]:
# Write the combined pages to disk only when the number of documents collected
# equals the total reported by the API; otherwise nothing is written.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')
else:
    with open(filePath+'endpoint_documents_PS_2019Jan01_2019May31.json',
              mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2018 - May 2018

In [None]:
# Start the paging state over for a new query.
pageIndex = 0
rpp = 1000
po = rpp * pageIndex

# Documents endpoint, pre-filtered to document type PS (Public Submission)
# and sorted by posted date in ascending order.
baseURL_PS = ("https://api.data.gov/regulations/v3/documents.json"
              "?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate")

# posted-date window for this query, MM/DD/YY
pdStart, pdEnd = '01/01/18', '05/31/18'

In [None]:
%%time

# Request the first page of Public Submission (PS) documents for the posted-date window.
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
dcts_response = requests.get(baseURL_PS, params=params)

# mask the key so the secret is not stored in the notebook's saved output
request_url = str(dcts_response.request.url)
if APIkey:
    request_url = request_url.replace(str(APIkey), 'REDACTED')
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+request_url+'\n',sep='\n')

# stop here with a clear HTTP error instead of a KeyError further down
dcts_response.raise_for_status()
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# parse the body once: 'totalNumRecords' is the size of the whole result set,
# 'documents' holds only the records returned on this page
first_page = dcts_response.json()
numPS = first_page['totalNumRecords']
dctsPS = first_page['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2018Jan01_2018May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: an extra page is needed only when there is a remainder
    pages_needed = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pages_needed),
          'That would be enough to retrieve '+str(rpp * pages_needed)+' records'
          ' -- a margin of '+str(rpp * pages_needed - numPS)+' records.',sep='\n')

In [None]:
%%time

# Collect every page of the query into one list while throttling requests to
# stay under the hourly rate limit.
dctsPS_all = []
missingPages = []  # page indexes that could not be retrieved, so gaps are never silent
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# Throttle bookkeeping: request k reuses slot k % pagesPerHour, and a slot is
# stamped "now + 1 hour" whenever it is used, so request k + pagesPerHour is
# never sent less than an hour after request k.
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
initialNextTime = datetime.now()
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve each page of documents and extend the accumulator
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the server reports that the quota is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot is allowed to be used again
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # safety net: stop if the slot is still in the future after waiting
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # keep the previous reading when the header is absent rather than losing the page
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.RequestException, KeyError, ValueError) as err:
        # network failure, unparsable body, or unexpected payload: record it and move on;
        # anything else (including KeyboardInterrupt) is allowed to propagate
        print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

if missingPages:
    print('Pages that still need to be retrieved: '+str(missingPages))
print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))

In [None]:
# Write the combined pages to disk only when the number of documents collected
# equals the total reported by the API; otherwise nothing is written.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')
else:
    with open(filePath+'endpoint_documents_PS_2018Jan01_2018May31.json',
              mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2017 - May 2017

In [None]:
# Start the paging state over for a new query.
pageIndex = 0
rpp = 1000
po = rpp * pageIndex

# Documents endpoint, pre-filtered to document type PS (Public Submission)
# and sorted by posted date in ascending order.
baseURL_PS = ("https://api.data.gov/regulations/v3/documents.json"
              "?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate")

# posted-date window for this query, MM/DD/YY
pdStart, pdEnd = '01/01/17', '05/31/17'

In [None]:
%%time

# Request the first page of Public Submission (PS) documents for the posted-date window.
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
dcts_response = requests.get(baseURL_PS, params=params)

# mask the key so the secret is not stored in the notebook's saved output
request_url = str(dcts_response.request.url)
if APIkey:
    request_url = request_url.replace(str(APIkey), 'REDACTED')
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+request_url+'\n',sep='\n')

# stop here with a clear HTTP error instead of a KeyError further down
dcts_response.raise_for_status()
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# parse the body once: 'totalNumRecords' is the size of the whole result set,
# 'documents' holds only the records returned on this page
first_page = dcts_response.json()
numPS = first_page['totalNumRecords']
dctsPS = first_page['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2017Jan01_2017May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: an extra page is needed only when there is a remainder
    pages_needed = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pages_needed),
          'That would be enough to retrieve '+str(rpp * pages_needed)+' records'
          ' -- a margin of '+str(rpp * pages_needed - numPS)+' records.',sep='\n')

In [None]:
%%time

# Collect every page of the query into one list while throttling requests to
# stay under the hourly rate limit.
dctsPS_all = []
missingPages = []  # page indexes that could not be retrieved, so gaps are never silent
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# Throttle bookkeeping: request k reuses slot k % pagesPerHour, and a slot is
# stamped "now + 1 hour" whenever it is used, so request k + pagesPerHour is
# never sent less than an hour after request k.
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
initialNextTime = datetime.now()
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve each page of documents and extend the accumulator
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the server reports that the quota is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot is allowed to be used again
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # safety net: stop if the slot is still in the future after waiting
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # keep the previous reading when the header is absent rather than losing the page
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.RequestException, KeyError, ValueError) as err:
        # network failure, unparsable body, or unexpected payload: record it and move on;
        # anything else (including KeyboardInterrupt) is allowed to propagate
        print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

if missingPages:
    print('Pages that still need to be retrieved: '+str(missingPages))
print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))

In [None]:
# Write the combined pages to disk only when the number of documents collected
# equals the total reported by the API; otherwise nothing is written.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')
else:
    with open(filePath+'endpoint_documents_PS_2017Jan01_2017May31.json',
              mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2016 - May 2016

In [None]:
# Start the paging state over for a new query.
pageIndex = 0
rpp = 1000
po = rpp * pageIndex

# Documents endpoint, pre-filtered to document type PS (Public Submission)
# and sorted by posted date in ascending order.
baseURL_PS = ("https://api.data.gov/regulations/v3/documents.json"
              "?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate")

# posted-date window for this query, MM/DD/YY
pdStart, pdEnd = '01/01/16', '05/31/16'

In [None]:
%%time

# Request the first page of Public Submission (PS) documents for the posted-date window.
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
dcts_response = requests.get(baseURL_PS, params=params)

# mask the key so the secret is not stored in the notebook's saved output
request_url = str(dcts_response.request.url)
if APIkey:
    request_url = request_url.replace(str(APIkey), 'REDACTED')
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+request_url+'\n',sep='\n')

# stop here with a clear HTTP error instead of a KeyError further down
dcts_response.raise_for_status()
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# parse the body once: 'totalNumRecords' is the size of the whole result set,
# 'documents' holds only the records returned on this page
first_page = dcts_response.json()
numPS = first_page['totalNumRecords']
dctsPS = first_page['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2016Jan01_2016May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: an extra page is needed only when there is a remainder
    pages_needed = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pages_needed),
          'That would be enough to retrieve '+str(rpp * pages_needed)+' records'
          ' -- a margin of '+str(rpp * pages_needed - numPS)+' records.',sep='\n')

In [None]:
%%time

# Collect every page of the query into one list while throttling requests to
# stay under the hourly rate limit.
dctsPS_all = []
missingPages = []  # page indexes that could not be retrieved, so gaps are never silent
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# Throttle bookkeeping: request k reuses slot k % pagesPerHour, and a slot is
# stamped "now + 1 hour" whenever it is used, so request k + pagesPerHour is
# never sent less than an hour after request k.
pagesPerHour = 1000 ## regulations.gov rate limit of 1000
initialNextTime = datetime.now()
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve each page of documents and extend the accumulator
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the server reports that the quota is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot is allowed to be used again
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # safety net: stop if the slot is still in the future after waiting
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params)
        # keep the previous reading when the header is absent rather than losing the page
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue
        data_this_page = temp_response.json()['documents']
    except (requests.RequestException, KeyError, ValueError) as err:
        # network failure, unparsable body, or unexpected payload: record it and move on;
        # anything else (including KeyboardInterrupt) is allowed to propagate
        print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
        missingPages.append(pageIndex)
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

if missingPages:
    print('Pages that still need to be retrieved: '+str(missingPages))
print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))

In [None]:
# Write the combined pages to disk only when the number of documents collected
# equals the total reported by the API; otherwise nothing is written.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')
else:
    with open(filePath+'endpoint_documents_PS_2016Jan01_2016May31.json',
              mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

### Public Comments: Jan 2015 - May 2015

In [None]:
# Start the paging state over for a new query.
pageIndex = 0
rpp = 1000
po = rpp * pageIndex

# Documents endpoint, pre-filtered to document type PS (Public Submission)
# and sorted by posted date in ascending order.
baseURL_PS = ("https://api.data.gov/regulations/v3/documents.json"
              "?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate")

# posted-date window for this query, MM/DD/YY
pdStart, pdEnd = '01/01/15', '05/31/15'

In [None]:
%%time

# Request the first page of Public Submission (PS) documents for the posted-date window.
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
dcts_response = requests.get(baseURL_PS, params=params)

# mask the key so the secret is not stored in the notebook's saved output
request_url = str(dcts_response.request.url)
if APIkey:
    request_url = request_url.replace(str(APIkey), 'REDACTED')
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+request_url+'\n',sep='\n')

# stop here with a clear HTTP error instead of a KeyError further down
dcts_response.raise_for_status()
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# parse the body once: 'totalNumRecords' is the size of the whole result set,
# 'documents' holds only the records returned on this page
first_page = dcts_response.json()
numPS = first_page['totalNumRecords']
dctsPS = first_page['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2015Jan01_2015May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: an extra page is needed only when there is a remainder
    pages_needed = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pages_needed),
          'That would be enough to retrieve '+str(rpp * pages_needed)+' records'
          ' -- a margin of '+str(rpp * pages_needed - numPS)+' records.',sep='\n')

In [None]:
%%time

# Retrieve every page of results for the current query and collect the records in
# dctsPS_all. Pages that cannot be retrieved are recorded in missingPages.
# Uses numPS, rpp, params, baseURL_PS and RL_remaining from earlier cells.

# define empty object to put extended data
dctsPS_all = []
missingPages = []  ## page indexes that could not be retrieved (reported after the loop)
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)  ## ceiling division
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times: one slot per request, all usable immediately
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # back off when the quota reported by the previous response is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    # slots are reused cyclically; a slot may be reused one hour after its last use
    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        # the slot is still not free even after waiting: stop the loop
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    po = pageIndex * rpp
    params.update({'po': po})
    try:
        temp_response = requests.get(baseURL_PS, params=params)
        RL_remaining = int(temp_response.headers['X-RateLimit-Remaining'])
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            # do not try to parse an error body; record the page as missing
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue

        data_this_page = temp_response.json()['documents']
        dctsPS_all.extend(data_this_page)
    except (requests.exceptions.RequestException, KeyError, ValueError, TypeError) as err:
        # only request/parsing problems are treated as a missing page;
        # KeyboardInterrupt and unrelated errors are no longer swallowed
        print('missing page: '+str(pageIndex), repr(err), sep='\n')
        missingPages.append(pageIndex)
        continue

    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that could not be retrieved: '+str(missingPages))

In [None]:
# export the combined records only when their count equals numPS;
# otherwise report that the export did not happen
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')

else:
    with open(filePath+'endpoint_documents_PS_2015Jan01_2015May31.json', mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, indent=4, ensure_ascii=False)
    print('Exported as JSON!')

### Public Comments: Jan 2014 - May 2014

In [None]:
# start over at the first page: records per page, page index and the resulting page offset
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# documents endpoint limited to document type = Public Submission (dct=PS)
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# posted-date window for this section (Jan 2014 - May 2014)
pdStart, pdEnd = '01/01/14', '05/31/14'

In [None]:
%%time

# Request the first page of Public Submission documents for the current date window.
# The response reports the total number of matching records (numPS), which decides
# whether this single page can be exported as-is or whether paging is required.
# Uses po, rpp, APIkey, baseURL_PS, pdStart, pdEnd and filePath from earlier cells.

# set parameters to retrieve PS documents
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
# NOTE: the printed request URL includes the api_key query parameter
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+str(dcts_response.request.url)+'\n',sep='\n')

# stop here with the HTTP error (e.g. 429 when over the rate limit) instead of
# failing below with a KeyError on the missing payload keys
dcts_response.raise_for_status()

# nested list: separate 'documents' from 'totalNumRecords' (body is parsed once)
# confirm total requested and number of documents retrieved
payload = dcts_response.json()
numPS = payload['totalNumRecords']
dctsPS = payload['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2014Jan01_2014May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: number of rpp-sized pages needed to cover numPS records
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(rpp * pagesNeeded)+' records'
          ' -- a margin of '+str(rpp * pagesNeeded - numPS)+' records.',sep='\n')

In [None]:
%%time

# Retrieve every page of results for the current query and collect the records in
# dctsPS_all. Pages that cannot be retrieved are recorded in missingPages.
# Uses numPS, rpp, params, baseURL_PS and RL_remaining from earlier cells.

# define empty object to put extended data
dctsPS_all = []
missingPages = []  ## page indexes that could not be retrieved (reported after the loop)
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)  ## ceiling division
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times: one slot per request, all usable immediately
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # back off when the quota reported by the previous response is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    # slots are reused cyclically; a slot may be reused one hour after its last use
    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        # the slot is still not free even after waiting: stop the loop
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    po = pageIndex * rpp
    params.update({'po': po})
    try:
        temp_response = requests.get(baseURL_PS, params=params)
        RL_remaining = int(temp_response.headers['X-RateLimit-Remaining'])
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            # do not try to parse an error body; record the page as missing
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue

        data_this_page = temp_response.json()['documents']
        dctsPS_all.extend(data_this_page)
    except (requests.exceptions.RequestException, KeyError, ValueError, TypeError) as err:
        # only request/parsing problems are treated as a missing page;
        # KeyboardInterrupt and unrelated errors are no longer swallowed
        print('missing page: '+str(pageIndex), repr(err), sep='\n')
        missingPages.append(pageIndex)
        continue

    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that could not be retrieved: '+str(missingPages))

In [None]:
# export the combined records only when their count equals numPS;
# otherwise report that the export did not happen
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')

else:
    with open(filePath+'endpoint_documents_PS_2014Jan01_2014May31.json', mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, indent=4, ensure_ascii=False)
    print('Exported as JSON!')

### Public Comments: Jan 2013 - May 2013

In [None]:
# start over at the first page: records per page, page index and the resulting page offset
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# documents endpoint limited to document type = Public Submission (dct=PS)
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# posted-date window for this section (Jan 2013 - May 2013)
pdStart, pdEnd = '01/01/13', '05/31/13'

In [None]:
%%time

# Request the first page of Public Submission documents for the current date window.
# The response reports the total number of matching records (numPS), which decides
# whether this single page can be exported as-is or whether paging is required.
# Uses po, rpp, APIkey, baseURL_PS, pdStart, pdEnd and filePath from earlier cells.

# set parameters to retrieve PS documents
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
# NOTE: the printed request URL includes the api_key query parameter
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+str(dcts_response.request.url)+'\n',sep='\n')

# stop here with the HTTP error (e.g. 429 when over the rate limit) instead of
# failing below with a KeyError on the missing payload keys
dcts_response.raise_for_status()

# nested list: separate 'documents' from 'totalNumRecords' (body is parsed once)
# confirm total requested and number of documents retrieved
payload = dcts_response.json()
numPS = payload['totalNumRecords']
dctsPS = payload['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2013Jan01_2013May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: number of rpp-sized pages needed to cover numPS records
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(rpp * pagesNeeded)+' records'
          ' -- a margin of '+str(rpp * pagesNeeded - numPS)+' records.',sep='\n')

In [None]:
%%time

# Retrieve every page of results for the current query and collect the records in
# dctsPS_all. Pages that cannot be retrieved are recorded in missingPages.
# Uses numPS, rpp, params, baseURL_PS and RL_remaining from earlier cells.

# define empty object to put extended data
dctsPS_all = []
missingPages = []  ## page indexes that could not be retrieved (reported after the loop)
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)  ## ceiling division
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times: one slot per request, all usable immediately
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # back off when the quota reported by the previous response is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    # slots are reused cyclically; a slot may be reused one hour after its last use
    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        # the slot is still not free even after waiting: stop the loop
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    po = pageIndex * rpp
    params.update({'po': po})
    try:
        temp_response = requests.get(baseURL_PS, params=params)
        RL_remaining = int(temp_response.headers['X-RateLimit-Remaining'])
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            # do not try to parse an error body; record the page as missing
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue

        data_this_page = temp_response.json()['documents']
        dctsPS_all.extend(data_this_page)
    except (requests.exceptions.RequestException, KeyError, ValueError, TypeError) as err:
        # only request/parsing problems are treated as a missing page;
        # KeyboardInterrupt and unrelated errors are no longer swallowed
        print('missing page: '+str(pageIndex), repr(err), sep='\n')
        missingPages.append(pageIndex)
        continue

    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that could not be retrieved: '+str(missingPages))

In [None]:
# export the combined records only when their count equals numPS;
# otherwise report that the export did not happen
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')

else:
    with open(filePath+'endpoint_documents_PS_2013Jan01_2013May31.json', mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, indent=4, ensure_ascii=False)
    print('Exported as JSON!')

### Public Comments: Jan 2012 - May 2012

In [None]:
# start over at the first page: records per page, page index and the resulting page offset
rpp, pageIndex = 1000, 0
po = rpp * pageIndex

# documents endpoint limited to document type = Public Submission (dct=PS)
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate"
# posted-date window for this section (Jan 2012 - May 2012)
pdStart, pdEnd = '01/01/12', '05/31/12'

In [None]:
%%time

# Request the first page of Public Submission documents for the current date window.
# The response reports the total number of matching records (numPS), which decides
# whether this single page can be exported as-is or whether paging is required.
# Uses po, rpp, APIkey, baseURL_PS, pdStart, pdEnd and filePath from earlier cells.

# set parameters to retrieve PS documents
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
# NOTE: the printed request URL includes the api_key query parameter
dcts_response = requests.get(baseURL_PS, params=params)
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+str(dcts_response.request.url)+'\n',sep='\n')

# stop here with the HTTP error (e.g. 429 when over the rate limit) instead of
# failing below with a KeyError on the missing payload keys
dcts_response.raise_for_status()

# nested list: separate 'documents' from 'totalNumRecords' (body is parsed once)
# confirm total requested and number of documents retrieved
payload = dcts_response.json()
numPS = payload['totalNumRecords']
dctsPS = payload['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    with open(filePath+'endpoint_documents_PS_2012Jan01_2012May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # ceiling division: number of rpp-sized pages needed to cover numPS records
    pagesNeeded = numPS//rpp + (1 if (numPS%rpp>0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(rpp * pagesNeeded)+' records'
          ' -- a margin of '+str(rpp * pagesNeeded - numPS)+' records.',sep='\n')

In [None]:
%%time

# Retrieve every page of results for the current query and collect the records in
# dctsPS_all. Pages that cannot be retrieved are recorded in missingPages.
# Uses numPS, rpp, params, baseURL_PS and RL_remaining from earlier cells.

# define empty object to put extended data
dctsPS_all = []
missingPages = []  ## page indexes that could not be retrieved (reported after the loop)
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)  ## ceiling division
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times: one slot per request, all usable immediately
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(0, totalNumPages): ## remember range is non-inclusive

    # back off when the quota reported by the previous response is nearly used up
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25 == 0:
        print('Rate Limit remaining: '+str(RL_remaining))

    # slots are reused cyclically; a slot may be reused one hour after its last use
    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        # the slot is still not free even after waiting: stop the loop
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    po = pageIndex * rpp
    params.update({'po': po})
    try:
        temp_response = requests.get(baseURL_PS, params=params)
        RL_remaining = int(temp_response.headers['X-RateLimit-Remaining'])
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            # do not try to parse an error body; record the page as missing
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            print('missing page: '+str(pageIndex))
            missingPages.append(pageIndex)
            continue

        data_this_page = temp_response.json()['documents']
        dctsPS_all.extend(data_this_page)
    except (requests.exceptions.RequestException, KeyError, ValueError, TypeError) as err:
        # only request/parsing problems are treated as a missing page;
        # KeyboardInterrupt and unrelated errors are no longer swallowed
        print('missing page: '+str(pageIndex), repr(err), sep='\n')
        missingPages.append(pageIndex)
        continue

    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that could not be retrieved: '+str(missingPages))

In [None]:
# export the combined records only when their count equals numPS;
# otherwise report that the export did not happen
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')

else:
    with open(filePath+'endpoint_documents_PS_2012Jan01_2012May31.json', mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, indent=4, ensure_ascii=False)
    print('Exported as JSON!')