# EPA Comments

In [None]:
# import packages
import pandas as pd
import numpy as np
import requests
import urllib
import json
import time
import os

# datetime package too: https://docs.python.org/3/library/datetime.html
from datetime import datetime
from datetime import timedelta

## API Retrieval

In [None]:
# Folder prefix that is prepended to the JSON file names written further down
filePath = "DESIGNATE_FILE_PATH"

# Shared request settings for the API calls
APIkey = "INSERT_API_KEY"
agency = 'EPA'
rpp = 1000            # number of records requested per page
pageIndex = 0         # zero-based page counter
po = rpp * pageIndex  # value sent as the 'po' request parameter (records to skip)

### Public Submissions: Jan 2020 - June 2020

In [None]:
# Query settings for this pull:
#   - document type 'PS' (Public Submission)
#   - rulemaking dockets only (dkt=R is part of the base URL)
#   - the start/end dates are sent together as the 'rd' parameter
baseURL_PS = "https://api.data.gov/regulations/v3/documents.json?encoded=1&countsOnly=0&dkt=R&so=ASC&sb=postedDate"
dctType = 'PS'
dateRangeStart = '01/01/20'
dateRangeEnd = '06/30/20'

# query-string parameters used to retrieve the PS documents
params = dict(
    po=po,
    rpp=rpp,
    api_key=APIkey,
    rd='-'.join([dateRangeStart, dateRangeEnd]),
    dct=dctType,
    a=agency,
)

In [None]:
# ----- RETRIEVE COMMENTS ----- #
# Request the first page of documents and report how the GET request went.
dcts_response = requests.get(baseURL_PS, params=params)
print("Status Code: "+str(dcts_response.status_code),
      'Request URL: '+str(dcts_response.request.url)+'\n',sep='\n')

# Stop here with a clear HTTP error instead of failing later on a missing JSON key.
dcts_response.raise_for_status()
RL_remaining = int(dcts_response.headers['X-RateLimit-Remaining'])

# The payload is a dict: split 'documents' from 'totalNumRecords'.
# Parse the body once and reuse it for both lookups.
first_page = dcts_response.json()
numPS = first_page['totalNumRecords']
dctsPS = first_page['documents']
print('Total number of records requested: '+str(numPS), 'Number retrieved: '+str(len(dctsPS)), sep='\n')

# If one page already holds every record, export it as JSON right away.
if len(dctsPS)==numPS:
    # NOTE(review): the file name says "2020May31" while dateRangeEnd is 06/30/20 -- confirm which is intended.
    with open(filePath+'endpoint_documents_PS_2020Jan01_2020May31.json', 'w', encoding='utf-8') as outfile:
        json.dump(dctsPS, outfile, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

else:
    # Otherwise report how many pages must be combined to cover all records.
    pages_needed = numPS // rpp + 1
    print('\n''Determine how many pages of records need to be combined via the extend method...',
          'Start with: '+str(pages_needed),
          'That would be enough to retrieve '+str(rpp * pages_needed)+' records'
          ' -- a margin of '+str(rpp * pages_needed - numPS)+' records.',sep='\n')

In [None]:
%%time

# define empty object to put extended data
dctsPS_all = []
totalNumPages = numPS//rpp + (1 if (numPS%rpp>0) else 0)
print('Initial length of data: '+str(len(dctsPS_all)))

# define time objects for avoiding rate limit
initialNextTime = datetime.now()
nextAllowableTime = []
pagesPerHour = 1000 ## regulations.gov rate limit of 1000

# fill array of allowable times
for index in range(0,pagesPerHour):
    nextAllowableTime.append(initialNextTime)
print('Time array length: '+str(len(nextAllowableTime)))

# retrieve additional pages of documents and extend object
for pageIndex in range (0,totalNumPages): ## remember range is non-inclusive
    
    if RL_remaining < 10:
        print('Rate Limit remaining: '+str(RL_remaining),
              "sleeping 5 minutes...", sep='\n')
        time.sleep(300)
    elif (RL_remaining <= 100) & (RL_remaining%25==0):
        print('Rate Limit remaining: '+str(RL_remaining))
    
    nextAllowableTimeIndex = pageIndex % pagesPerHour
    currentTime = datetime.now()
    if pageIndex%100 == 0:
        print("nextAllowableTimeIndex = "+str(nextAllowableTimeIndex),
              "nextAllowableTime = "+str(nextAllowableTime[nextAllowableTimeIndex]),
              "currentTime = "+str(currentTime), sep="  ")

    if currentTime < nextAllowableTime[nextAllowableTimeIndex]:
        waitTime = nextAllowableTime[nextAllowableTimeIndex] - currentTime
        print("sleeping " + str(waitTime.total_seconds()) + " seconds...")
        time.sleep(waitTime.total_seconds() + 0.01)
    
    if nextAllowableTime[nextAllowableTimeIndex] <= datetime.now():
        nextAllowableTime[nextAllowableTimeIndex] = datetime.now() + timedelta(seconds = 3600) ## add one hour to nextAllowableTime

        try:
            po = pageIndex * rpp
            params.update({'po': po})
            temp_response = requests.get(baseURL_PS, params=params)
            RL_remaining = int(temp_response.headers['X-RateLimit-Remaining'])
            if temp_response.status_code != 200: ## status code = 429 means over rate limit
                print('code '+str(temp_response.status_code)+' for page #'+str(pageIndex),
                      temp_response.text, sep='\n')

            data_this_page = temp_response.json()['documents']
            dctsPS_all.extend(data_this_page)
            if pageIndex%100 == 0:
                print("request made (pageIndex = " + str(pageIndex) + ")")
                print('Retrieved: '+str(len(dctsPS_all)),'\n')
        except:
            print('missing page: '+str(pageIndex))
            continue

    else:
        print("request failed")
        print("too soon -- breaking (pageIndex = "+str(pageIndex)+")")
        break

print('If this works, we should have retrieved all the requested documents: '+str(len(dctsPS_all)))

In [None]:
# Write the combined records to JSON only when the number collected
# matches the total reported by the API; otherwise just report the mismatch.
if len(dctsPS_all) != numPS:
    print('Export unsuccessful. Check your code.')
else:
    dataFile = 'EPA_endpoint_documents_PS_2020Jan_2020Jun.json'
    with open(filePath+dataFile, 'w', encoding='utf-8') as json_out:
        json.dump(dctsPS_all, json_out, ensure_ascii=False, indent=4)
    print('Exported as JSON!')

In [None]:
# Show the first and the last collected record as a quick sanity check.
first_record, last_record = dctsPS_all[0], dctsPS_all[-1]
print('0', first_record, '\n', 'last', last_record, sep='\n')

In [None]:
# Build a DataFrame from the list of collected records and summarize its columns.
df2020PS = pd.DataFrame(data=dctsPS_all)
df2020PS.info()

In [None]:
# Add a column holding each document's URL: fixed prefix + documentId.
df2020PS['documentURL'] = "https://www.regulations.gov/document?D=" + df2020PS['documentId']

# Print the URLs of the rows labelled 0 and 1 as a spot check.
print(*[df2020PS.loc[row_label, 'documentURL'] for row_label in (0, 1)], sep='\n')

In [None]:
# List the available columns to decide which ones to write to CSV.
dfColumns = list(df2020PS.columns)
print(dfColumns)

In [None]:
# Columns of df2020PS that are written to the CSV, in output order.
write_columns = ['agencyAcronym','docketId','docketType','rin',
                 'documentId','documentType','numberOfCommentsReceived','postedDate',
                 'title','commentText','attachmentCount','documentURL']

savePath = 'DESIGNATE_FILE_PATH'
saveFile = 'EPA_endpoint_documents_PS_2020.csv'

# write to csv, reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# The path is handed to pandas directly so it opens the file itself and ends
# rows with the platform line separator. This avoids the `line_terminator`
# keyword, which newer pandas releases no longer accept.
df2020PS.to_csv(savePath+saveFile, encoding='utf-8', index_label='index', columns=write_columns)

print('Saved as CSV!')

## Data Cleaning

### Public Submissions data

In [None]:
# Location of the Public Submissions CSV to read back in.
savePath = 'DESIGNATE_FILE_PATH'
fileName = 'EPA_endpoint_documents_PS_2020.csv'

# Read through an explicit UTF-8 handle; the 'index' column becomes the row index.
csv_location = savePath + fileName
with open(csv_location, mode='r', encoding='utf-8') as csv_in:
    dfPS = pd.read_csv(csv_in, index_col='index')
dfPS.info()

In [None]:
# Every row is one posted document, so a constant 1 lets later sums count postings.
dfPS['commentsPosted'] = 1

# Shorter name for the received-comments count.
dfPS = dfPS.rename(columns={'numberOfCommentsReceived': 'commentsReceived'})

# Keep only the first 10 characters of postedDate as the string to parse.
# NOTE(review): presumably postedDate is an ISO-style timestamp, so these
# characters are YYYY-MM-DD -- confirm against the retrieved data.
dfPS['postedDateNaive'] = dfPS['postedDate'].str.slice(start=0,stop=10)

# Convert to datetime. `infer_datetime_format` is omitted: it is deprecated in
# current pandas and was only a parsing-speed hint, and the receivedDate
# conversion later in this notebook does not use it either.
dfPS['dtPosted'] = pd.to_datetime(dfPS['postedDateNaive'])

# generate year and month columns
dfPS['postedMonth'] = dfPS['dtPosted'].dt.month
dfPS['postedYear'] = dfPS['dtPosted'].dt.year
dfPS.loc[:,['dtPosted','postedMonth','postedYear','commentsPosted','commentsReceived']]

#### Get receivedDate for Top 30 Dockets
Source: the regulations.gov "Export Docket Folder" download for each docket (export all as CSV).

In [None]:
# Per docket: total postings, total comments received, and the latest posted
# month; keep the 30 dockets with the most comments received.
# Aggregations are named by string, which pandas resolves to its own
# sum/max implementations.
Top30Received_by_Docket = (
    pd.pivot_table(dfPS,
                   values=['commentsPosted','commentsReceived','postedMonth'],
                   index=['docketId'],
                   aggfunc={'commentsPosted': 'sum',
                            'commentsReceived': 'sum',
                            'postedMonth': 'max'})
    .sort_values('commentsReceived', ascending=False)
    .head(30)
)

# URLs built from the docket ids held in the index.
Top30Received_by_Docket['docketURL'] = 'https://www.regulations.gov/docket?D='+Top30Received_by_Docket.index
Top30Received_by_Docket['exportURL'] = 'https://www.regulations.gov/exportdocket?docketId='+Top30Received_by_Docket.index
top30DktList = Top30Received_by_Docket.index.tolist()

# Number of postings covered by these dockets. Only the numeric column is
# summed, so the URL string columns are left out of the computation.
rdSample = Top30Received_by_Docket['commentsPosted'].sum()
print(rdSample)
Top30Received_by_Docket

In [None]:
# Print the export URL of each top docket with its rank (0-based).
# Iterating over the column values avoids integer lookups on a Series whose
# index holds docket ids, which pandas deprecates as a positional fallback.
for n, export_url in enumerate(Top30Received_by_Docket['exportURL']):
    print('Docket '+str(n)+': '+export_url)

In [None]:
# Accumulators that outlive the loop: one list per output column.
savePath = 'DESIGNATE_FILE_PATH'
docId, docType, docSub, rDate, pmDate = [], [], [], [], []

# Header used in the exported docket files -> column name used in this notebook.
export_columns = {'Document ID': 'documentId',
                  'Document Type': 'documentType',
                  'Document SubType': 'documentSubType',
                  'Received Date': 'receivedDate',
                  'Post Mark Date': 'postmarkDate'}

# Renamed column -> the list that collects its values across dockets.
collected = {'documentId': docId,
             'documentType': docType,
             'documentSubType': docSub,
             'receivedDate': rDate,
             'postmarkDate': pmDate}

for dktId, docket in enumerate(top30DktList):
    # file exported for this docket
    dktFile = 'DOCKET_' + str(docket) + '.csv'

    # read only the needed columns, skipping the first five lines of the file
    with open(savePath+dktFile, 'r', encoding='utf-8') as docket_csv:
        dfTopDkt = pd.read_csv(docket_csv, skiprows=list(range(0,5)),
                               usecols=list(export_columns),
                               dtype={'Document ID': 'str', 'Document Type': 'str', 'Document SubType': 'str'})

    # number of documents found for this docket
    print(str(docket)+': '+str(len(dfTopDkt)))

    # switch to the notebook's column names, then append every column to its list
    dfTopDkt = dfTopDkt.rename(columns=export_columns)
    for column, bucket in collected.items():
        bucket.extend(dfTopDkt[column].tolist())

    # running length of the longest list
    print(max([len(bucket) for bucket in collected.values()]),'\n')

dfTopDktcombo = pd.DataFrame(zip(docId, docType, docSub, rDate, pmDate),
                             columns=list(collected))

# Drop rows without a documentId (e.g., "withdrawn" documents); print length before and after.
print(len(dfTopDktcombo))
dfTopDktcombo = dfTopDktcombo.loc[dfTopDktcombo['documentId'].notna()]
print(len(dfTopDktcombo))

# Left-merge onto the Public Submissions data by documentId.
print(rdSample,'\n') ## compare length to the postings counted for the top 30 dockets
merge_options = dict(how='left', on=['documentId'], indicator=True, validate="1:1")
dfPSrd = dfPS.merge(dfTopDktcombo, **merge_options)
print(dfPSrd['_merge'].value_counts(),'\n')
dfPSrd = dfPSrd.rename(columns={'_merge': '_mergeTop30'}) ## rename _merge column
dfPSrd.info()

In [None]:
# Compare the two documentType columns produced by the merge ...
print(*[dfPSrd[suffixed].value_counts() for suffixed in ('documentType_x', 'documentType_y')],
      sep='\n')

# ... then discard the one that came from the docket files (no error if absent).
dfPSrd = dfPSrd.drop(columns=['documentType_y'], errors='ignore')

#### Get receivedDate for remaining comments
Source: the API document endpoint.

In [None]:
# Count, per posted month, the distinct documents that still lack a receivedDate.
missing_rd_rows = dfPSrd.loc[dfPSrd['receivedDate'].isna(),
                             ['documentId','postedDate','postedMonth']]
missing_rd_rows.groupby('postedMonth')['documentId'].nunique()

In [None]:
# Flag rows without a receivedDate and collect their document ids for the API requests.
bool_missingRD = dfPSrd['receivedDate'].isna()
missingRD = dfPSrd['documentId'][bool_missingRD].tolist()

# flagged vs. unflagged row counts, then the length of the id list
print(bool_missingRD.value_counts())
print(len(missingRD))

In [None]:
# ----- Create new DataFrame for Remaining Comments ----- #

# ----- Retrieve receivedDates for comments ----- #
import requests

# general variables for setting parameters
APIkey = "INSERT_API_KEY"
baseURL = "https://api.data.gov:443/regulations/v3/document.json?"
dctId = ""

# set parameters
params = {'api_key': APIkey,
          'documentId': dctId}

rangeRD = len(missingRD)
listRD = []    # receivedDate per entry of missingRD, same order; None when retrieval failed
failedRD = []  # document ids whose receivedDate could not be retrieved

# Request each document individually. Exactly one value is appended to listRD
# per id, including on failure, so zip(missingRD, listRD) below stays aligned.
for d, dctId in enumerate(missingRD):
    params.update({'documentId': dctId})
    this_receivedDate = None

    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        dct_response = requests.get(baseURL, params=params, timeout=120)

        if dct_response.status_code != 200:
            print('code '+str(dct_response.status_code)+' for document '+str(dctId),
                  dct_response.text, sep='\n')
        else:
            this_receivedDate = dct_response.json().get('receivedDate')

        # Pause when the server reports that few requests remain.
        rl_header = dct_response.headers.get('X-RateLimit-Remaining')
        if rl_header is not None:
            RL_remaining = int(rl_header)
            if RL_remaining < 10:
                print('Rate Limit remaining: '+str(RL_remaining),
                      "sleeping 1 minute...", sep='\n')
                time.sleep(60)
    except (requests.RequestException, ValueError) as err:
        # Network failure or a body that is not valid JSON.
        print('request failed for document '+str(dctId)+': '+repr(err))

    if this_receivedDate is None:
        failedRD.append(dctId)
    listRD.append(this_receivedDate)

    if d%100==0:
        print("Number of comments retrieved: "+str(d))

print('Length of receivedDate list is '+str(len(listRD)))
if failedRD:
    print('Documents without a retrieved receivedDate: '+str(len(failedRD)))

# ----- Generate df from the lists ----- #
remainingList = list(zip(missingRD, listRD))
dfRemaining = pd.DataFrame(remainingList, columns = ['documentId', 'receivedDate'])
dfRemaining.info()

#### Concatenate and Merge DataFrames

In [None]:
# Stack the docket-file rows on top of the API rows so that a single
# DataFrame carries receivedDate; the row index is rebuilt from scratch.
dfRD = pd.concat(objs=[dfTopDktcombo, dfRemaining],
                 axis=0, join='outer', ignore_index=True, verify_integrity=True)

# Left-merge onto the Public Submissions data by documentId; validate="1:1"
# makes pandas raise if documentId repeats on either side.
merge_options = dict(how='left', on=['documentId'], indicator=True, validate="1:1")
dfPS2020 = dfPS.merge(dfRD, **merge_options)
print(dfPS2020['_merge'].value_counts(),'\n')
dfPS2020 = dfPS2020.rename(columns={'_merge': '_mergeRD'})
dfPS2020.info()

In [None]:
# Keep the first 10 characters of receivedDate and parse them as a datetime.
dfPS2020['receivedDateNaive'] = dfPS2020['receivedDate'].str[:10]
dfPS2020['dtReceived'] = pd.to_datetime(dfPS2020['receivedDateNaive'])

# month number of the received date
dfPS2020['receivedMonth'] = dfPS2020['dtReceived'].dt.month
dfPS2020[['receivedDate','dtReceived','receivedMonth']]

In [None]:
# Cross-tabulate postings: received month (rows) by posted month (columns), with totals.
pd.pivot_table(
    dfPS2020,
    values=['commentsPosted'],
    index=['receivedMonth'],
    columns=['postedMonth'],
    aggfunc={'commentsPosted': 'sum'},
    fill_value=0,
    margins=True,
)

#### Drop Select Documents

In [None]:
# Collect the index labels of the rows to discard.

### rows posted before 1 Jan 2020
posted_too_early = dfPS2020['dtPosted'] < datetime(2020,1,1)
dropPS = dfPS2020.loc[posted_too_early].index.tolist()
print(len(dropPS),'\n')

### rows received on or after 1 Jul 2020
received_too_late = dfPS2020['dtReceived'] >= datetime(2020,7,1)
dropPS.extend(dfPS2020.loc[received_too_late].index.tolist())
print(len(dropPS),'\n')

# Drop the collected rows and show the row count before and after.
print(len(dfPS2020))
dfPS2020 = dfPS2020.drop(index=dropPS, errors='ignore') ## ignore → only existing labels are dropped
print(len(dfPS2020))

#### Results: Clean Public Submissions Dataset

In [None]:
# Totals of postings and comments received per received month, plus an overall row.
pd.pivot_table(
    dfPS2020,
    values=['commentsPosted','commentsReceived'],
    index=['receivedMonth'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

In [None]:
# Distinct dockets per posted month, then per received month.
# https://stackoverflow.com/questions/15411158/pandas-countdistinct-equivalent
print(*[dfPS2020.groupby(month_column)['docketId'].nunique()
        for month_column in ('postedMonth', 'receivedMonth')],
      sep='\n')

In [None]:
savePath = 'DESIGNATE_FILE_PATH'
saveFile = 'EPA_cleaned_PS_2020.csv'

# write to csv, reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# The path is handed to pandas directly so it opens the file itself and ends
# rows with the platform line separator. This avoids the `line_terminator`
# keyword, which newer pandas releases no longer accept.
dfPS2020.to_csv(savePath+saveFile, encoding='utf-8', index_label='index')

print('Saved as CSV!')

## Data for Analysis

In [None]:
# Location of the cleaned Public Submissions CSV.
savePath = 'DESIGNATE_FILE_PATH'
fileName = 'EPA_cleaned_PS_2020.csv'

# Read through an explicit UTF-8 handle; the 'index' column becomes the row index.
csv_location = savePath + fileName
with open(csv_location, mode='r', encoding='utf-8') as csv_in:
    dfPS2020 = pd.read_csv(csv_in, index_col='index')
dfPS2020.info()

In [None]:
# Mark rows of docket EPA-HQ-OA-2018-0259 (the "Science Rule") with 1 and every other row with 0.
bool_science = dfPS2020['docketId'].eq('EPA-HQ-OA-2018-0259')
for flag, row_mask in ((1, bool_science), (0, ~bool_science)):
    dfPS2020.loc[row_mask, 'scienceRule'] = flag
dfPS2020['scienceRule'].value_counts()

### Public Submissions per Month

In [None]:
# Two ways of counting (received month, docket) combinations -- the numbers should agree.
month_docket_groups = dfPS2020.groupby(['receivedMonth','docketId'])
print(len(month_docket_groups))

dockets_per_month = dfPS2020.groupby('receivedMonth')['docketId'].nunique()
print(sum(dockets_per_month.tolist()))

#### Include all dockets

In [None]:
# Monthly totals, grouped by the month in which comments were received.
monthly_groups = dfPS2020.groupby('receivedMonth')
dfCommentsMonthly = monthly_groups[['commentsPosted','commentsReceived']].sum()

# Distinct dockets receiving comments per month. Both results are still
# indexed by receivedMonth here, so the values are matched by month.
dfCommentsMonthly['docketsUnique'] = monthly_groups['docketId'].nunique()
dfCommentsMonthly = dfCommentsMonthly.reset_index()

# Month labels are derived from the month numbers actually present, so the
# labels stay correct (and the lengths match) whichever months have data.
month_labels = ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec')
dfCommentsMonthly.insert(1,'labelMonth',
                         [month_labels[int(month) - 1] for month in dfCommentsMonthly['receivedMonth']])

# rename columns
dfCommentsMonthly = dfCommentsMonthly.rename(columns={'commentsPosted': 'commentsUnique', 'commentsReceived': 'commentsAll'})

# calculate two new columns: comments per unique dockets
dfCommentsMonthly['unq_per_dkts'] = dfCommentsMonthly['commentsUnique']/dfCommentsMonthly['docketsUnique']
dfCommentsMonthly['all_per_dkts'] = dfCommentsMonthly['commentsAll']/dfCommentsMonthly['docketsUnique']

# view returned df
dfCommentsMonthly

In [None]:
savePath = 'DESIGNATE_FILE_PATH'
saveFile = 'data_for_analysis_monthly.csv'

# write to csv, reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# The path is handed to pandas directly so it opens the file itself and ends
# rows with the platform line separator. This avoids the `line_terminator`
# keyword, which newer pandas releases no longer accept.
dfCommentsMonthly.to_csv(savePath+saveFile, encoding='utf-8', index=False)

print('Saved as CSV!')

#### Science rule filter

In [None]:
# Monthly totals, split by the scienceRule flag and the received month.
sr_monthly_groups = dfPS2020.groupby(['scienceRule','receivedMonth'])
dfCommentsMonthly_filterSR = sr_monthly_groups[['commentsPosted','commentsReceived']].sum()

# Distinct dockets per (scienceRule, month). Both results are still indexed
# by the grouping keys here, so the values are matched by key.
dfCommentsMonthly_filterSR['docketsUnique'] = sr_monthly_groups['docketId'].nunique()
dfCommentsMonthly_filterSR = dfCommentsMonthly_filterSR.reset_index()

# Month labels are derived from the month numbers actually present, so they
# stay correct whichever (scienceRule, month) combinations have data.
month_labels = ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec')
dfCommentsMonthly_filterSR.insert(2,'labelMonth',
                                  [month_labels[int(month) - 1] for month in dfCommentsMonthly_filterSR['receivedMonth']])

# rename columns
dfCommentsMonthly_filterSR = dfCommentsMonthly_filterSR.rename(columns={'commentsPosted': 'commentsUnique', 'commentsReceived': 'commentsAll'})

# Comments per unique docket, filled in only for rows with scienceRule == 0;
# the remaining rows are left as NaN.
exclude_science_rule = dfCommentsMonthly_filterSR['scienceRule']==0
dfCommentsMonthly_filterSR.loc[exclude_science_rule,'unq_per_dkts'] = dfCommentsMonthly_filterSR['commentsUnique']/dfCommentsMonthly_filterSR['docketsUnique']
dfCommentsMonthly_filterSR.loc[exclude_science_rule,'all_per_dkts'] = dfCommentsMonthly_filterSR['commentsAll']/dfCommentsMonthly_filterSR['docketsUnique']

# view returned df
dfCommentsMonthly_filterSR

In [None]:
savePath = 'DESIGNATE_FILE_PATH'
saveFile = 'data_for_analysis_monthly_filterSR.csv'

# write to csv, reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# The path is handed to pandas directly so it opens the file itself and ends
# rows with the platform line separator. This avoids the `line_terminator`
# keyword, which newer pandas releases no longer accept.
dfCommentsMonthly_filterSR.to_csv(savePath+saveFile, encoding='utf-8', index=False)

print('Saved as CSV!')

### Public Submissions per Day

In [None]:
# For each grouping (by day, then by scienceRule and day) print the number of
# groups twice: once from the comment columns, once from the unique-docket counts.
for group_keys in ('dtReceived', ['scienceRule','dtReceived']):
    print(len(dfPS2020.groupby(group_keys)[['commentsPosted','commentsReceived']]))
    print(len(dfPS2020.groupby(group_keys)['docketId'].nunique().tolist()))
    if group_keys == 'dtReceived':
        print()

#### Include all dockets

In [None]:
# Daily totals: sum of postings and of comments received per received date.
daily_sums = dfPS2020.groupby('dtReceived')[['commentsPosted','commentsReceived']].sum()
dfCommentsDaily = daily_sums.reset_index()

# distinct dockets receiving comments on each day (same group order as above)
dockets_per_day = dfPS2020.groupby('dtReceived')['docketId'].nunique()
dfCommentsDaily['docketsUnique'] = dockets_per_day.tolist()

# rename columns
new_names = {'commentsPosted': 'commentsUnique', 'commentsReceived': 'commentsAll'}
dfCommentsDaily = dfCommentsDaily.rename(columns=new_names)

# comments per unique docket, for both comment counts
for ratio_column, count_column in (('unq_per_dkts', 'commentsUnique'), ('all_per_dkts', 'commentsAll')):
    dfCommentsDaily[ratio_column] = dfCommentsDaily[count_column]/dfCommentsDaily['docketsUnique']

# view returned df
dfCommentsDaily

In [None]:
savePath = 'DESIGNATE_FILE_PATH'
saveFile = 'data_for_analysis_daily.csv'

# write to csv, reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# The path is handed to pandas directly so it opens the file itself and ends
# rows with the platform line separator. This avoids the `line_terminator`
# keyword, which newer pandas releases no longer accept.
dfCommentsDaily.to_csv(savePath+saveFile, encoding='utf-8', index=False)

print('Saved as CSV!')

#### Science rule filter

In [None]:
# Daily totals split by the scienceRule flag: sums per (scienceRule, received date).
sr_daily_sums = dfPS2020.groupby(['scienceRule','dtReceived'])[['commentsPosted','commentsReceived']].sum()
dfCommentsDaily_filterSR = sr_daily_sums.reset_index()

# distinct dockets per (scienceRule, day) (same group order as above)
sr_dockets_per_day = dfPS2020.groupby(['scienceRule','dtReceived'])['docketId'].nunique()
dfCommentsDaily_filterSR['docketsUnique'] = sr_dockets_per_day.tolist()

# rename columns
new_names = {'commentsPosted': 'commentsUnique', 'commentsReceived': 'commentsAll'}
dfCommentsDaily_filterSR = dfCommentsDaily_filterSR.rename(columns=new_names)

# Comments per unique docket, filled in only for rows with scienceRule == 0;
# the remaining rows are left as NaN.
exclude_science_rule = dfCommentsDaily_filterSR['scienceRule']==0
for ratio_column, count_column in (('unq_per_dkts', 'commentsUnique'), ('all_per_dkts', 'commentsAll')):
    dfCommentsDaily_filterSR.loc[exclude_science_rule, ratio_column] = dfCommentsDaily_filterSR[count_column]/dfCommentsDaily_filterSR['docketsUnique']

# view returned df
dfCommentsDaily_filterSR

In [None]:
savePath = 'DESIGNATE_FILE_PATH'
saveFile = 'data_for_analysis_daily_filterSR.csv'

# write to csv, reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# The path is handed to pandas directly so it opens the file itself and ends
# rows with the platform line separator. This avoids the `line_terminator`
# keyword, which newer pandas releases no longer accept.
dfCommentsDaily_filterSR.to_csv(savePath+saveFile, encoding='utf-8', index=False)

print('Saved as CSV!')

## Docket Analysis

In [None]:
# Location of the cleaned Public Submissions CSV.
savePath = 'DESIGNATE_FILE_PATH'
fileName = 'EPA_cleaned_PS_2020.csv'

# Read through an explicit UTF-8 handle; the 'index' column becomes the row index.
csv_location = savePath + fileName
with open(csv_location, mode='r', encoding='utf-8') as csv_in:
    dfPS2020 = pd.read_csv(csv_in, index_col='index')
dfPS2020.info()

### Top 5 Dockets

In [None]:
# Sum postings and comments received per docket, then keep the five dockets
# with the most comments received.
docket_totals = pd.pivot_table(
    dfPS2020,
    values=['commentsPosted','commentsReceived'],
    index=['docketId'],
    aggfunc='sum',
    fill_value=0,
)
dfTop5Dkts = docket_totals.sort_values('commentsReceived', ascending=False).head(5)
dfTop5Dkts

In [None]:
# Monthly postings and comments received for the top 5 dockets only.
select_dockets = dfTop5Dkts.index.tolist()

# one True/False per row: does the row belong to a selected docket?
bool_select = dfPS2020['docketId'].isin(select_dockets).tolist()
print(bool_select.count(True))

pd.pivot_table(
    dfPS2020[bool_select],
    values=['commentsPosted','commentsReceived'],
    index=['docketId','receivedMonth'],
    aggfunc='sum',
    fill_value=0,
)