# Public Commenting in a Pandemic

## Cleaning & Analysis

In [None]:
# import packages
import pandas as pd
import numpy as np
import json
import time
import os
import datetime

In [None]:
# Specify the path of the folder where the data are saved.
# The PUBLIC_COMMENTING_DATA_DIR environment variable, when set, overrides the
# default below so the notebook can run on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator (filePath is later
# concatenated directly with a file name) and leaves the default unchanged.
filePath = os.path.join(
    os.environ.get(
        "PUBLIC_COMMENTING_DATA_DIR",
        "C:/Users/mark/Box Sync/_MF/Assignments/Insights/Public Commenting and COVID-19/Data/Annual/",
    ),
    "",
)

## 2020: Cleaning

In [None]:
# read the 2020 documents CSV into a DataFrame, using its 'index' column as the row index
fileName = 'endpoint_documents_PS_2020.csv'
with open(filePath + fileName, mode='r', encoding='utf-8') as loadfile:
    df2020 = pd.read_csv(filepath_or_buffer=loadfile, index_col='index')

# print a summary of the loaded frame (columns, dtypes, non-null counts)
df2020.info()

In [None]:
# give the received-comments column a shorter name and add a constant
# 'commentsPosted' column of 1s (summing it in a pivot then counts rows)
df2020 = (
    df2020
    .rename(columns={'numberOfCommentsReceived': 'commentsReceived'})
    .assign(commentsPosted=1)
)

# show the rows that report more than one received comment
df2020[['commentsPosted', 'commentsReceived']].query('commentsReceived > 1')

In [None]:
# start an empty list that later cells fill with the documentId's of entries to clean
cleaning_list = list()
type(cleaning_list)

### Dates and Months

In [None]:
# create new columns for year and month by slicing the postedDate string
# NOTE(review): the slices assume postedDate starts with "YYYY-MM" (e.g. "2020-03-15...") - confirm
df2020['postedYear'] = df2020['postedDate'].str.slice(start=0,stop=4)
# the two month digits sit at positions 5 and 6; slicing 6:7 kept only the last
# digit, so October/November/December were recorded as months 0/1/2
df2020['postedMonth'] = df2020['postedDate'].str.slice(start=5,stop=7)

# convert to integers ("03" -> 3)
df2020['postedYear'] = pd.to_numeric(df2020['postedYear'])
df2020['postedMonth'] = pd.to_numeric(df2020['postedMonth'])

# return new columns
print(df2020.loc[:,['postedYear','postedMonth']].dtypes)
df2020.loc[:,['postedYear','postedMonth']]

In [None]:
# add a column holding postedDate parsed as timezone-aware (UTC) datetimes
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
df2020 = df2020.assign(postedDatetime=pd.to_datetime(df2020['postedDate'], utc=True))

# compare the raw string with the parsed value
df2020[['postedDate', 'postedDatetime']]

In [None]:
# pivot by month: rows are postedMonth, columns are (value, postedYear) pairs,
# cells are the summed commentsPosted / commentsReceived
# aggfunc is the string 'sum' rather than np.sum: passing the NumPy callable
# raises a FutureWarning in pandas >= 2.1, and the string keeps the same result
by_YearMonth = pd.pivot_table(df2020,
                              values=['commentsPosted','commentsReceived'],
                              columns=['postedYear'],
                              index=['postedMonth'],
                              aggfunc='sum', margins=False)

by_YearMonth

In [None]:
# select the entries whose postedMonth equals 6, keeping only the columns needed to inspect them
month_is_six = df2020['postedMonth'] == 6
queries = df2020.loc[month_is_six, ['postedMonth', 'postedDate', 'documentId']]
queries

In [None]:
# add to cleaning list: record the documentId's from `queries` under the 'fix_month' label
docs_to_add = {'fix_month':
               queries.loc[:,'documentId'].tolist()}
# guard so that re-running this cell does not append the same entry a second time
if docs_to_add not in cleaning_list:
    cleaning_list.append(docs_to_add)
print(cleaning_list)

### Agency groupings

In [None]:
# pivot by agency: one row per agencyAcronym with summed commentsPosted / commentsReceived
# aggfunc is the string 'sum' rather than np.sum: passing the NumPy callable
# raises a FutureWarning in pandas >= 2.1, and the string keeps the same result
by_Agency = pd.pivot_table(df2020,
                           values=['commentsPosted','commentsReceived'],
                           index=['agencyAcronym'],
                           aggfunc='sum', margins=False)
# number of distinct agency acronyms in the pivot
print(len(by_Agency))
by_Agency

In [None]:
# collect the agency acronyms (the index of the by_Agency pivot) as a plain list
agency_list = by_Agency.index.to_list()

# print the number of acronyms, then the acronyms themselves
print(len(agency_list), '\n')
print(agency_list)

In [None]:
# create dictionary for Branch to Agency lookups
# (the Executive list is laid out one department group per row; order is significant
#  only in that it is preserved from the original flat list)
branch_dict = {
    'Judicial': ['USC'],
    'Legislative': ['LOC', 'COLC'],
    'Independent': [
        'AID', 'ATBCB', 'CFPB', 'CNCS', 'CPSC', 'CSB', 'EAC', 'EEOC', 'FRTIB', 'FTC',
        'GSA', 'NARA', 'NCUA', 'NLRB', 'NRC', 'NTSB', 'OPM', 'PBGC', 'SBA', 'SSA',
    ],
    'Executive': [
        'DHS', 'CISA', 'FEMA', 'TSA', 'USCBP', 'USCG', 'USCIS',
        'DOC', 'BIS', 'ITA', 'NIST', 'NOAA', 'PTO', 'USBC',
        'DOD', 'COE', 'DARS', 'USA', 'USAF',
        'DOE', 'EERE',
        'DOI', 'BIA', 'BLM', 'BOR', 'BSEE', 'FWS', 'NPS', 'OSM',
        'DOJ', 'BOP', 'DEA', 'EOIR',
        'DOL', 'ETA', 'LMSO', 'MSHA', 'OFCCP', 'OSHA', 'WCPO',
        'DOS',
        'DOT', 'FAA', 'FHWA', 'FMCSA', 'FRA', 'FTA', 'MARAD', 'NHTSA', 'PHMSA',
        'ED',
        'EOP', 'CEQ', 'OMB', 'USTR',
        'EPA',
        'FAR',
        'HHS', 'ATSDR', 'CDC', 'CMS', 'FDA', 'HHSIG', 'HRSA',
        'HUD',
        'TREAS', 'FINCEN', 'FISCAL', 'IRS', 'OCC', 'TTB',
        'USDA', 'AMS', 'APHIS', 'CCC', 'FCIC', 'FNS', 'FS', 'FSA', 'FSIS',
        'NRCS', 'RBS', 'RHS', 'RUS',
        'VA',
    ],
}

# number of branches
print(len(branch_dict))
print(branch_dict['Independent'])
# total number of acronyms across all branches, less two for 'LOC' and 'EOP'
# NOTE(review): presumably those two are excluded so the figure can be compared
# with the length of agency_list - confirm
print(sum(len(members) for members in branch_dict.values()) - len(['LOC', 'EOP']))

In [None]:
%%time

# references:
    # https://stackoverflow.com/questions/49161120/pandas-python-set-value-of-one-column-based-on-value-in-another-column
    # https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html

# take the acronym column once instead of rebuilding a list for every branch
acronyms = df2020['agencyAcronym']

# create boolean arrays for each branch (plain lists with one True/False per row;
# Series.isin replaces the per-row `True if item in list else False` scan)
bool_jud = acronyms.isin(branch_dict['Judicial']).tolist()
bool_leg = acronyms.isin(branch_dict['Legislative']).tolist()
bool_ind = acronyms.isin(branch_dict['Independent']).tolist()
bool_exe = acronyms.isin(branch_dict['Executive']).tolist()

# create new column for branch; rows matching no branch keep the empty string
df2020['agencyBranch'] = ''

# use boolean arrays to fill new column
df2020.loc[bool_jud,'agencyBranch'] = 'Judicial'
df2020.loc[bool_leg,'agencyBranch'] = 'Legislative'
df2020.loc[bool_ind,'agencyBranch'] = 'Independent'
df2020.loc[bool_exe,'agencyBranch'] = 'Executive'

df2020.loc[:,['agencyAcronym','agencyBranch']]

In [None]:
# show the rows whose agencyBranch is still the empty string
df2020[df2020['agencyBranch'] == '']

In [None]:
# show the rows that belong to either the Legislative or the Judicial branch
df2020[df2020['agencyBranch'].isin(['Legislative', 'Judicial'])]

In [None]:
# create dict for Parent Agencies: parent acronym -> acronyms grouped under it
# (each list begins with the parent's own acronym)
parent_dict = {
    'LOC': ['LOC', 'COLC'],
    'DHS': ['DHS', 'CISA', 'FEMA', 'TSA', 'USCBP', 'USCG', 'USCIS'],
    'DOC': ['DOC', 'BIS', 'ITA', 'NIST', 'NOAA', 'PTO', 'USBC'],
    'DOD': ['DOD', 'COE', 'DARS', 'USA', 'USAF'],
    'DOE': ['DOE', 'EERE'],
    'DOI': ['DOI', 'BIA', 'BLM', 'BOR', 'BSEE', 'FWS', 'NPS', 'OSM'],
    'DOJ': ['DOJ', 'BOP', 'DEA', 'EOIR'],
    'DOL': ['DOL', 'ETA', 'LMSO', 'MSHA', 'OFCCP', 'OSHA', 'WCPO'],
    'DOS': ['DOS'],
    'DOT': ['DOT', 'FAA', 'FHWA', 'FMCSA', 'FRA', 'FTA', 'MARAD', 'NHTSA', 'PHMSA'],
    'ED': ['ED'],
    'EOP': ['EOP', 'CEQ', 'OMB', 'USTR'],
    'EPA': ['EPA'],
    'FAR': ['FAR'],
    'HHS': ['HHS', 'ATSDR', 'CDC', 'CMS', 'FDA', 'HHSIG', 'HRSA'],
    'HUD': ['HUD'],
    'TREAS': ['TREAS', 'FINCEN', 'FISCAL', 'IRS', 'OCC', 'TTB'],
    'USDA': ['USDA', 'AMS', 'APHIS', 'CCC', 'FCIC', 'FNS', 'FS', 'FSA', 'FSIS',
             'NRCS', 'RBS', 'RHS', 'RUS'],
    'VA': ['VA'],
}

# spot-check one entry by position: print its key, then its list of acronyms
x = 1
print(list(parent_dict)[x])
print(list(parent_dict.values())[x])

In [None]:
%%time

# create new column for parent agency; rows matched by nothing below keep the empty string
df2020['agencyParent'] = ''

# parent==acronym for judicial & independent agencies
# (bool_jud / bool_ind are the per-row branch masks built in the agencyBranch cell)
df2020.loc[bool_jud,'agencyParent'] = df2020.loc[bool_jud,'agencyAcronym']
df2020.loc[bool_ind,'agencyParent'] = df2020.loc[bool_ind,'agencyAcronym']

# set parent for executive & legislative agencies:
# every row whose acronym is listed under a parent in parent_dict gets that parent's key
for parent, members in parent_dict.items():
    print(parent)
    # every row selected by the mask belongs to `parent`, so a scalar assignment is enough
    df2020.loc[df2020['agencyAcronym'].isin(members), 'agencyParent'] = parent

df2020.loc[:,['agencyAcronym','agencyParent','agencyBranch']]

In [None]:
# show the rows whose agencyParent is still the empty string
df2020.loc[df2020['agencyParent'] == '', ['agencyAcronym', 'agencyParent', 'agencyBranch']]

In [None]:
# number of rows in the Independent branch plus the number in the Judicial branch
# (a row has exactly one agencyBranch value, so counting membership in either is the same sum)
print(int(df2020['agencyBranch'].isin(['Independent', 'Judicial']).sum()))

In [None]:
# pivot by branch and parent agency: rows are (agencyBranch, agencyParent) pairs,
# cells are the summed commentsPosted / commentsReceived
# aggfunc is the string 'sum' rather than np.sum: passing the NumPy callable
# raises a FutureWarning in pandas >= 2.1, and the string keeps the same result
by_AgencyParent = pd.pivot_table(df2020,
                                 values=['commentsPosted','commentsReceived'],
                                 index=['agencyBranch','agencyParent'],
                                 aggfunc='sum', margins=False)
print(len(by_AgencyParent))
# filter on the agencyBranch index level; the table built above is by_AgencyParent
# (the previous name, by_AgencyBranch, is not defined and raised a NameError)
by_AgencyParent.query('agencyBranch == "Executive"')

###### break

In [None]:
# columns to display when inspecting individual documents
lookup = ['documentId', 'title', 'organization', 'attachmentCount', 'commentsReceived', 'agencyAcronym']

# CEQ rows that report exactly one received comment
one_comment_ceq = (df2020['commentsReceived'] == 1) & (df2020['agencyAcronym'] == 'CEQ')
df2020.loc[one_comment_ceq, lookup]

In [None]:
# bar chart of the summed commentsPosted per postedMonth, one bar series per postedYear
# by_YearMonth has no 'documentCount' column (its value columns are 'commentsPosted'
# and 'commentsReceived'), so the 'commentsPosted' block of the pivot is plotted
by_YearMonth['commentsPosted'].plot(kind='bar')

In [None]:
# pivot by month and agency, with an 'All' totals row (margins=True)
# values use the column names that exist at this point: 'numberOfCommentsReceived'
# was renamed to 'commentsReceived' earlier, and the per-row count column is
# 'commentsPosted' (the other pivots in this notebook use the same two columns)
# aggfunc is the string 'sum' rather than np.sum: passing the NumPy callable
# raises a FutureWarning in pandas >= 2.1, and the string keeps the same result
by_MonthAgency = pd.pivot_table(df2020,
                                values=['commentsPosted','commentsReceived'],
                                index=['postedMonth','agencyAcronym'],
                                aggfunc='sum', margins=True)

# monthly totals for a single agency
by_MonthAgency.query('agencyAcronym == "CEQ"')