# Public Commenting in a Pandemic

## References

API documentation: https://regulationsgov.github.io/developers/basics/

In [1]:
# import packages
import pandas as pd
import numpy as np
import requests
import urllib
import json
import time
import os

# datetime package too: https://docs.python.org/3/library/datetime.html
from datetime import datetime
from datetime import timedelta

In [None]:
# test datetime -- CAN SKIP
# Scratch check of the datetime / timedelta / time.sleep pattern: record a start
# time, compute a target 5 seconds later, and poll once per second until it passes.
now = datetime.now()
print("recorded time is " + str(now))

# one timestamp slot per index, every slot initialised to the same instant
timeIndices = []
totalIndices = 5
for index in range(totalIndices):  # use the named count instead of repeating the literal 5
    timeIndices.append(now)

later = now + timedelta(seconds = 5)
print("later time is " + str(later))

# poll until the target time has been reached
currTime = datetime.now()
while currTime < later:
    print("not yet")
    time.sleep(1)
    currTime = datetime.now()
print("finally waited long enough")

In [None]:
# test retrieving chunk time -- CAN SKIP
# Linearly extrapolate a total duration: if c1 units took t1 seconds, cx units
# take tx = t1 * cx / c1 seconds.
# NOTE(review): t1/c1 look like a timed benchmark of 20000 records -- confirm.
t1 = 1*60+58   # 118 seconds
c1 = 20000
try:
    cx = numPS
except NameError:
    # numPS is only assigned by the first documents request further down, so on a
    # fresh kernel this cell runs before it exists; fall back to the benchmark size.
    cx = c1
    print('numPS is not defined yet; estimating for '+str(c1)+' records instead')
tx = t1*cx/c1

# Round once, then split with divmod so the remainder can never display as 60.
totalSeconds = round(tx)
wholeMinutes, leftoverSeconds = divmod(totalSeconds, 60)
wholeHours, leftoverMinutes = divmod(round(tx/60), 60)

print(str(totalSeconds)+' seconds')
print(str(wholeMinutes)+' minutes, '+str(leftoverSeconds)+' seconds')
print(str(wholeHours)+' hours, '+str(leftoverMinutes)+' minutes')

In [2]:
# Specify the path of the folder where the data are saved.
# Set the PUBLIC_COMMENTS_DATA_DIR environment variable to override it per machine;
# when it is unset the original location is used. os.path.join(..., "") guarantees a
# trailing separator, because later cells build file names as filePath + '<name>'.
filePath = os.path.join(
    os.environ.get(
        "PUBLIC_COMMENTS_DATA_DIR",
        "C:/Users/mark/Box Sync/_MF/Assignments/Insights/Public Commenting and COVID-19/Data/"),
    "")

# general variables for setting parameters
# The API key is a credential, so it is read from the environment rather than being
# stored in the notebook. Set REGULATIONS_GOV_API_KEY before starting the kernel.
APIkey = os.environ.get("REGULATIONS_GOV_API_KEY")
if not APIkey:
    # NOTE(review): api.data.gov documents a shared "DEMO_KEY" with much lower rate
    # limits -- fine for a smoke test, not for the full retrieval. Confirm it is
    # accepted by this endpoint.
    APIkey = "DEMO_KEY"
    print("REGULATIONS_GOV_API_KEY is not set; falling back to DEMO_KEY.")

rpp = 1000              # results per page requested from the API
pageIndex = 0           # zero-based page number
po = pageIndex * rpp    # page offset = index of the first record on that page

## Endpoint: documents

### Public Comments: Jan 2020 - May 2020

In [3]:
# Query settings for the documents endpoint, fixed to document type PS (Public Submission).
# The base URL already carries the constant query options; the page offset, page size,
# API key and posted-date range are supplied separately when the request is made.
baseURL_PS = (
    "https://api.data.gov/regulations/v3/documents.json"
    "?encoded=1"
    "&countsOnly=0"
    "&dct=PS"
    "&so=ASC"
    "&sb=postedDate"
)

# Posted-date window; the two ends are joined with '-' to form the 'pd' parameter.
pdStart, pdEnd = '01/01/20', '05/31/20'

In [4]:
%%time

# Request the first page of PS documents for the posted-date window, report how many
# records exist in total, and either export them (if one page was enough) or print
# how many pages the follow-up retrieval needs.
# Reads from earlier cells: po, rpp, APIkey, baseURL_PS, pdStart, pdEnd, filePath.
# Defines for later cells:  params, dcts_response, RL_remaining, numPS, dctsPS.

# set parameters to retrieve PS documents
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': pdStart+'-'+pdEnd}

# retrieve comments using Requests library and check GET request response
# (timeout so a stalled connection cannot hang the cell indefinitely)
dcts_response = requests.get(baseURL_PS, params=params, timeout=60)

# Mask the key before printing so it is not stored in the notebook's saved output.
safeURL = str(dcts_response.request.url).replace(str(APIkey), 'API_KEY_REDACTED')
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+safeURL+'\n',sep='\n')

# Fail with a clear HTTPError on a 4xx/5xx answer instead of a KeyError further down.
dcts_response.raise_for_status()
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# nested list: separate 'documents' from 'totalNumRecords'
# confirm total requested and number of documents retrieved
firstPage = dcts_response.json()  # parse the body once
numPS = firstPage['totalNumRecords']
dctsPS = firstPage['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# if requested == retrieved, then export as JSON
if len(dctsPS)==numPS:
    # The posted dates contain '/', which would be read as directory separators in a
    # file name. Reformat them, e.g. '01/01/20' -> '2020Jan01' (month abbreviation
    # follows the current locale).
    startLabel = datetime.strptime(pdStart, '%m/%d/%y').strftime('%Y%b%d')
    endLabel = datetime.strptime(pdEnd, '%m/%d/%y').strftime('%Y%b%d')
    with open(filePath+'endpoint_documents_PS_'+startLabel+'_'+endLabel+'.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # Ceiling division: only add a page when there is a partial last page.
    pagesNeeded = numPS // rpp + (1 if (numPS % rpp > 0) else 0)
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pagesNeeded),
          'That would be enough to retrieve '+str(rpp * pagesNeeded)+' records'
          ' -- a margin of '+str(rpp * pagesNeeded - numPS)+' records.',sep='\n')

Status Code: 200
Request URL: https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dct=PS&so=ASC&sb=postedDate&po=0&rpp=1000&api_key=fYTx9mVjuwc2ZSsdqmbgdtSqx7HGUd3aCRkiH6bC&pd=01%2F01%2F20-05%2F31%2F20

Total number of records requested: 1368250
Number retrieved: 1000

Determine how many pages of records need to be combined via the extend method...
Start with: 1369
That would be enough to retrieve 1369000 records -- a margin of 750 records.
Wall time: 2.89 s


In [6]:
%%time

# Page through every PS record and collect the documents in dctsPS_all, pacing the
# requests so that no more than pagesPerHour are sent within any one-hour window.
# Reads from earlier cells: numPS, rpp, params, baseURL_PS, RL_remaining.
# Defines: dctsPS_all (combined records) and missingPages (page indices that could
# not be retrieved, so they can be re-requested instead of being silently lost).

# define empty object to put extended data
dctsPS_all = []
missingPages = []
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
initialNextTime = datetime.now()
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times
# Slot i holds the earliest moment at which the next request mapped to slot i
# (pageIndex % pagesPerHour) may be sent; every slot starts out as "now".
nextAllowableTime = [initialNextTime] * pagesPerHour
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range(totalNumPages): ## remember range is non-inclusive

    # back off when the server-reported allowance is nearly exhausted
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif RL_remaining <= 100 and RL_remaining%25==0:
        print('Rate Limit remaining: '+str(RL_remaining))

    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    # wait until this slot's previous request is at least an hour old
    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)

    # safety net: if the slot is somehow still in the future, stop rather than
    # send a request that would exceed the hourly budget
    if nextAllowableTime[nextAllowableTimeIndex] > datetime.now():
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

    nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

    try:
        po = pageIndex * rpp
        params.update({'po': po})
        temp_response = requests.get(baseURL_PS, params=params, timeout=60)
        # keep the previous allowance if the header is absent from this response
        RL_remaining = int(temp_response.headers.get('X-RateLimit-Remaining', RL_remaining))
        if temp_response.status_code != 200: ## status code = 429 means over rate limit
            print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                  temp_response.text, sep='\n')
            # an error body has no usable documents; record the gap and move on
            missingPages.append(pageIndex)
            print('missing page: '+str(pageIndex))
            continue

        data_this_page = temp_response.json()['documents']
    except (requests.exceptions.RequestException, ValueError, KeyError) as err:
        # network failure/timeout, undecodable JSON, or a body without 'documents';
        # anything else (including KeyboardInterrupt) is allowed to propagate
        missingPages.append(pageIndex)
        print('missing page: '+str(pageIndex)+' ('+type(err).__name__+': '+str(err)+')')
        continue

    dctsPS_all.extend(data_this_page)
    if pageIndex%100 == 0:
        print("request made (pageIndex = " + str(pageIndex) + ")")
        print('Retrieved: '+str(len(dctsPS_all)),'\n')

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))
if missingPages:
    print('Pages that still need to be retrieved: '+str(missingPages))

Initial length of data: 0
Time array length: 1000
nextAllowableTimeIndex = 0  nextAllowableTime = 2020-06-16 22:35:09.451395  currentTime = 2020-06-16 22:35:09.451395
request made (pageIndex = 0)
Retrieved: 1000 

nextAllowableTimeIndex = 100  nextAllowableTime = 2020-06-16 22:35:09.451395  currentTime = 2020-06-16 22:37:36.397942
request made (pageIndex = 100)
Retrieved: 101000 

nextAllowableTimeIndex = 200  nextAllowableTime = 2020-06-16 22:35:09.451395  currentTime = 2020-06-16 22:39:33.201408
request made (pageIndex = 200)
Retrieved: 201000 

nextAllowableTimeIndex = 300  nextAllowableTime = 2020-06-16 22:35:09.451395  currentTime = 2020-06-16 22:43:15.629961
request made (pageIndex = 300)
Retrieved: 301000 

nextAllowableTimeIndex = 400  nextAllowableTime = 2020-06-16 22:35:09.451395  currentTime = 2020-06-16 22:45:28.055897
request made (pageIndex = 400)
Retrieved: 401000 

nextAllowableTimeIndex = 500  nextAllowableTime = 2020-06-16 22:35:09.451395  currentTime = 2020-06-16 22:

In [18]:
# Write the combined records to disk only when the number collected equals the
# total the API reported; otherwise report that nothing was exported.
retrievedEverything = (len(dctsPS_all) == numPS)

if not retrievedEverything:
    print('Export unsuccessful. Check your code.')
else:
    exportPath = filePath + 'endpoint_documents_PS_2020Jan01_2020May31.json'
    with open(exportPath, mode='w', encoding='utf-8') as outfile:
        json.dump(dctsPS_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

Exported as JSON!


### Public Comments: Jan 2019 - May 2019