# EPA Rulemaking Documents

In [None]:
# import packages
import pandas as pd
import numpy as np
import requests
import urllib
import json
import time
import os

# datetime package too: https://docs.python.org/3/library/datetime.html
from datetime import datetime
from datetime import timedelta

## API Retrieval

### Agency Documents: Jan 2020 - June 2020

In [None]:
# create variables for setting parameters
# document type = Notices, Proposed Rules, Final Rules, Other
# Rulemaking dockets only
# range: postedDate

APIkey = "INSERT_API_KEY"  # placeholder: substitute a real API key before running
rpp = 1000       # sent as the 'rpp' query parameter (presumably results per page -- confirm in API docs)
pageIndex = 0
po = pageIndex * rpp  # sent as the 'po' query parameter; 0 for the first page
agency = 'EPA'

# Prefix for every JSON file written and re-read by the retrieval and combine
# steps. It is joined to file names by plain string concatenation, so it must
# end with a path separator (or be an empty string for the working directory).
filePath = 'DESIGNATE_FILE_PATH'

# 'dkt=R' and the sort options are fixed in the base URL; the remaining
# query parameters are supplied through `params`
baseURL_rules = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dkt=R&so=ASC&sb=postedDate"
dct_N = 'N'
dct_PR = 'PR'
dct_FR = 'FR'
dct_O = 'O'

dateRangeStart = '01/01/20'
dateRangeEnd = '06/30/20'

# set parameters
# 'dct' starts as Notices and is overwritten before each document-type request
params = {'po': po,
          'rpp': rpp,
          'api_key': APIkey,
          'pd': dateRangeStart+'-'+dateRangeEnd, 
          'dct': dct_N, 
          'a': agency}

In [None]:
# ----- RETRIEVE DOCUMENTS ----- #
def _retrieve_and_export(dctCode):
    """Request one document type and export it as JSON when fully retrieved.

    Sets ``params['dct']`` to ``dctCode`` (mutating the shared ``params``
    dict), sends one GET request to ``baseURL_rules``, and prints the status
    code, the request URL, and the reported/retrieved record counts.

    If the number of documents in the response equals ``totalNumRecords``,
    the documents are written to
    ``filePath + 'EPA_endpoint_documents_<dctCode>_2020Jan_2020Jun.json'``.
    Otherwise nothing is written and a hint is printed with the number of
    pages that would need to be requested and combined.

    Parameters
    ----------
    dctCode : str
        Value sent as the 'dct' query parameter and embedded in the export
        file name (e.g. 'N', 'PR', 'FR', 'O').

    Returns
    -------
    tuple
        ``(response, rateLimitRemaining, numRecords, documents)``: the
        ``requests`` response, the integer value of its
        'X-RateLimit-Remaining' header, the 'totalNumRecords' value, and the
        'documents' list from the response body.

    Raises
    ------
    KeyError
        If the response lacks the 'X-RateLimit-Remaining' header or the body
        lacks 'totalNumRecords' / 'documents'.
    """
    params.update({'dct': dctCode})

    # retrieve documents using Requests library and check GET request response
    response = requests.get(baseURL_rules, params=params)
    rateLimitRemaining = int(response.headers['X-RateLimit-Remaining'])
    print("Status Code: "+str(response.status_code),
          'Request URL: '+str(response.request.url)+'\n',sep='\n')

    # nested list: separate 'documents' from 'totalNumRecords'
    # (parse the body once and reuse it for both fields)
    payload = response.json()
    numRecords = payload['totalNumRecords']
    documents = payload['documents']

    # confirm total requested and number of documents retrieved
    print('Total number of records requested: '+str(numRecords),
          'Number retrieved: '+str(len(documents)), sep='\n')

    # if requested == retrieved, then export as JSON
    if len(documents)==numRecords:
        exportFile = 'EPA_endpoint_documents_'+dctCode+'_2020Jan_2020Jun.json'
        with open(filePath+exportFile, 'w', encoding='utf-8') as outfile:
            json.dump(documents, outfile, ensure_ascii=False, indent=4)
        print('Document Type '+str(params['dct'])+' exported as JSON!')
    else:
        # a single page was not enough; report how many pages would cover the total
        pagesNeeded = numRecords // rpp + 1
        print('\n''Determine how many pages of records need to be combined via the extend method...',
              'Start with: '+str(pagesNeeded),
              'That would be enough to retrieve '+str(rpp * pagesNeeded)+' records'
              ' -- a margin of '+str(rpp * pagesNeeded - numRecords)+' records.',sep='\n')

    return response, rateLimitRemaining, numRecords, documents


# The per-type counts (numN, numPR, numFR, numO) are kept as module-level
# names because the combine step sums them to validate the merged object.

# 1) Notices -------------------------------------------------------------------------------------------
print('Retrieve Notices') ## e.g., agency information collection activities, EIS availability, etc.
dcts_response, RL_remaining, numN, dctsN = _retrieve_and_export(dct_N)


# 2) Proposed Rules -------------------------------------------------------------------------------------
print('\n','Retrieve Proposed Rules')
dcts_response, RL_remaining, numPR, dctsPR = _retrieve_and_export(dct_PR)


# 3) Final Rules ---------------------------------------------------------------------------------------
print('\n','Retrieve Final Rules')
dcts_response, RL_remaining, numFR, dctsFR = _retrieve_and_export(dct_FR)


# 4) Other -----------------------------------------------------------------------------------------
print('\n','Retrieve Other documents') ## e.g., comment extension letter, pesticide registration review, etc.
dcts_response, RL_remaining, numO, dctsO = _retrieve_and_export(dct_O)

In [None]:
# merge the per-type JSON exports into a single list
dctType_list = ['N', 'PR', 'FR', 'O']

dctsType_all = []
print('Initial length of object: ', len(dctsType_all), sep='')

for dctType in dctType_list:
    print(dctType)
    chunkPath = filePath+'EPA_endpoint_documents_'+dctType+'_2020Jan_2020Jun.json'
    with open(chunkPath, 'r', encoding='utf-8') as jf:
        dctsChunk = json.load(jf)
    # in-place extension, so the same list object keeps growing
    dctsType_all += dctsChunk
    print('Current length of object: ', len(dctsType_all), sep='')

# expected size: sum of the record counts reported for each document type
# (agency documents only, public submissions are not requested)
numAll_Agency = sum((numN, numPR, numFR, numO))

print('Final length of object: '+str(len(dctsType_all)),
      'Should equal: '+str(numAll_Agency), sep='\n')

# only export when the merged list matches the expected total
if len(dctsType_all) != numAll_Agency:
    print('Export unsuccessful. Check your code.')
else:
    dataFile = 'EPA_endpoint_documents_AGENCYDCTS_2020Jan_2020Jun.json'
    with open(filePath+dataFile, 'w', encoding='utf-8') as outfile:
        json.dump(dctsType_all, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

In [None]:
# load the combined document list into a pandas DataFrame and summarize it
df2020AgencyDcts = pd.DataFrame(data=dctsType_all)
df2020AgencyDcts.info()

In [None]:
# add a 'documentURL' column: fixed URL prefix followed by each row's documentId
urlPrefix = "https://www.regulations.gov/document?D="
df2020AgencyDcts['documentURL'] = urlPrefix+df2020AgencyDcts['documentId']

# spot-check the rows labelled 0 and 1
firstURL, secondURL = (df2020AgencyDcts.loc[rowLabel,'documentURL'] for rowLabel in (0, 1))
print(firstURL, secondURL, sep='\n')

In [None]:
# list the available column names (useful for choosing which ones to export)
dfColumns = list(df2020AgencyDcts.columns)
print(dfColumns)

In [None]:
# subset (and order) of columns written to the CSV export
write_columns = ['agencyAcronym','docketId','docketType','rin',
                 'documentId','documentType','frNumber','commentDueDate','commentStartDate','openForComment',
                 'numberOfCommentsReceived','postedDate',
                 'title','attachmentCount','documentURL']

# savePath is joined to the file name by plain concatenation,
# so it must end with a path separator
savePath = 'DESIGNATE_FILE_PATH'
saveFile = 'EPA_endpoint_documents_AGENCYDCTS_2020.csv'

# write to csv, reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# - 'lineterminator' replaces the 'line_terminator' keyword, which was
#   deprecated in pandas 1.5 and removed in pandas 2.0
# - newline='' stops the text-mode handle from translating the explicit '\n'
#   into the platform line ending, so the file has LF endings everywhere
with open(savePath+saveFile, 'w', encoding='utf-8', newline='') as outfile:
    df2020AgencyDcts.to_csv(outfile, index_label='index', lineterminator='\n', columns=write_columns)

print('Saved as CSV!')

## Analyze Comment Period Extensions

In [None]:
# read the exported Agency Documents CSV back into a DataFrame
savePath = 'DESIGNATE_FILE_PATH'
fileName = 'EPA_endpoint_documents_AGENCYDCTS_2020.csv'

csvLocation = savePath+fileName
with open(csvLocation, mode='r', encoding='utf-8') as loadfile:
    # the file's 'index' column becomes the DataFrame index
    dfAgencyDcts = pd.read_csv(loadfile, index_col='index')
dfAgencyDcts.info()

In [None]:
# create bool array for rules with extension in title
    # reference for regex: https://docs.python.org/3/howto/regex.html
# na=False: rows with a missing title count as non-matches; without it the
# mask would contain NaN and the boolean indexing below would raise ValueError
bool_extension = dfAgencyDcts.loc[:,'title'].str.contains('extension of comment|period extension', regex=True, case=False, na=False)
print(bool_extension.value_counts(),'\n')

# keep only the matching rows; the final expression displays the key columns
# when run as the last statement of a notebook cell
dfExtension = dfAgencyDcts[bool_extension]
dfExtension.loc[:,['title','docketId','commentDueDate','documentURL']]

In [None]:
# print each keyword-search hit (index label, title, URL) for manual review
extensionList = dfExtension.index.tolist()

separator = ' -- '
for dct in extensionList:
    entryTitle = dfExtension.loc[dct,'title']
    entryURL = dfExtension.loc[dct,'documentURL']
    print(dct, separator, entryTitle, separator, entryURL, '\n')
print('Done.')