# SI 608 Project – Workspace
<span style="font-size: 18px;">General scratchpad workspace that preloads all the dataframes.</span>
<br>See <code>./modules</code> to review how libraries are installed and imported, as well as where the data is loaded, cleaned, and formatted. This notebook is only here as a helpful tool: make a copy and do whatever you'd like, or don't use it at all if that's preferable.

[OpenSecrets Data Dictionary Index](../../docs/open_source_data_dictionary.md)
<br><small><em>(View the index with markdown preview)</em></small>

## Environment
Run these cells first to load global functions and variables, augment the environment, and produce and load the primary datasets used throughout this project.

#### Settings
Configure frequently modified behaviors in this notebook.

In [None]:
# Notebook-wide behavior toggles. Set these before running the cells below.
# TODO: PUT THIS IN .env?
# NOTE(review): showdf() and to_csv() are not defined in this notebook; they
# presumably come from modules/init.ipynb and read these flags — confirm there.
DISPLAY_DF = True # for showdf() -> df.head()
SAVE_DF = True # for to_csv() -> pd.to_csv()

#### Init globals
Init file contains global variables and helper functions used throughout the project.

In [None]:
# Run the shared init notebook. Names used below but never defined in this
# notebook (e.g. pd, showdf, load_env_vars, install_if_needed) presumably come
# from it — confirm in modules/init.ipynb. Must run before any later cell.
%run modules/init.ipynb

#### Local env
Contains things like API keys that should be excluded from the public repository.

In [None]:
# Load local, non-committed settings. load_env_vars() is a project helper that is
# not defined in this notebook (presumably provided by modules/init.ipynb); the
# next cell indexes its result by string key, so it is expected to be dict-like.
env_vars = load_env_vars()

In [None]:
# Pull the Congress API settings out of the local environment mapping.
# Indexing (rather than .get) makes a missing entry fail here, at configuration
# time, instead of later when a request is built.
CONGRESS_API_KEY = env_vars['CONGRESS_API_KEY']
CONGRESS_API_URL = env_vars['CONGRESS_API_URL']

# Confirm the settings loaded WITHOUT echoing the secret: cell output is stored
# inside the .ipynb file, so a printed key would be saved (and committed) with
# the notebook. Only report whether a non-empty key is present.
print(f"CONGRESS_API_KEY: {'<set>' if CONGRESS_API_KEY else '<missing>'}, CONGRESS_API_URL: {CONGRESS_API_URL}")

In [None]:
# Congress number used by this project. Presumably the 116th U.S. Congress, to be
# passed to the Congress API configured above — not referenced in the cells
# visible here, so confirm against the code that consumes it.
CONGRESS_NUM = 116

## Data
From the OpenSecrets.org bulk data collection. Contains data from the IRS and the FEC.

### 527 data

#### cmtes527

In [None]:
# NOTE: This loader is currently disabled (every line is commented out), so
# df_cmtes527 is NOT defined unless this cell is uncommented and run.
# When enabled it reads the headerless, '|'-quoted cmtes527.txt file once and
# skips the read on re-runs if df_cmtes527 already exists in the kernel.
# # OpenSecrets Data Definition: 527 Committees
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20527%20Cmtes.htm
# try:
#     df_cmtes527.head()
# except NameError:
#     print("Creating dataframe...")
#     df_cmtes527 = pd.read_csv('../../data/open_secrets/527/cmtes527.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['cycle__cmtes527', 'rpt__cmtes527', 'ein__cmtes527', 'crp527name__cmtes527', 'affiliate__cmtes527', 'ultorg__cmtes527', 
#                                       'recipcode__cmtes527', 'cmteid__cmtes527', 'cid__cmtes527', 'eccmteid__cmtes527', 'party__cmtes527', 
#                                       'primcode__cmtes527', 'source__cmtes527', 'ffreq__cmtes527', 'ctype__cmtes527', 'csource__cmtes527', 'viewpt__cmtes527',
#                                       'comments__cmtes527', 'state__cmtes527'])
#     print("Finished.")

# showdf(df_cmtes527)

In [None]:
# Disabled preview: df_cmtes527 only exists when the (commented-out) cmtes527
# loader cell has been enabled and run.
# showdf(df_cmtes527)

#### expends527

In [None]:
# NOTE: This loader is currently disabled (every line is commented out), so
# df_expends527 is NOT defined unless this cell is uncommented and run.
# When enabled it reads the headerless, '|'-quoted expends527.txt file once and
# skips the read on re-runs if df_expends527 already exists in the kernel.
# # OpenSecrets Data Dictionary 527 Expenditure Data - from IRS Form 8872B
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20527%20Expenditures.htm
# try:
#     df_expends527.head()
# except NameError:
#     print("Creating dataframe...")
#     df_expends527 = pd.read_csv('../../data/open_secrets/527/expends527.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['rpt__expends527', 'formid__expends527', 'schbid__expends527', 'orgname__expends527', 'ein__expends527', 'recipient__expends527', 
#                                       'recipientcrp__expends527', 'amount__expends527', 'date__expends527', 'expcode__expends527', 'source__expends527', 
#                                       'purpose__expends527', 'addr1__expends527', 'addr2__expends527', 'city__expends527', 'state__expends527', 'zip__expends527',
#                                       'employer__expends527', 'occupation__expends527'])
#     print("Finished.")

# showdf(df_expends527)

#### rcpts527

In [None]:
# NOTE: This loader is currently disabled (every line is commented out), so
# df_rcpts527 is NOT defined unless this cell is uncommented and run.
# When enabled it reads the headerless, '|'-quoted rcpts527.txt file once and
# skips the read on re-runs if df_rcpts527 already exists in the kernel.
# # OpenSecrets Data Dictionary 527 Contribution Data - from IRS Form 8872A
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20527%20Receipts.htm
# try:
#     df_rcpts527.head()
# except NameError:
#     print("Creating dataframe...")
#     df_rcpts527 = pd.read_csv('../../data/open_secrets/527/rcpts527.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['id__rcpts527', 'rpt__rcpts527', 'formid__rcpts527', 'schaid__rcpts527', 'contribid__rcpts527', 'contrib__rcpts527', 
#                                       'amount__rcpts527', 'date__rcpts527', 'orgname__rcpts527', 'ultorg__rcpts527', 'realcode__rcpts527', 
#                                       'recipid__rcpts527', 'recipcode__rcpts527', 'party__rcpts527', 'recipient__rcpts527', 'city__rcpts527', 'state__rcpts527',
#                                       'zip__rcpts527', 'zip4__rcpts527', 'pmsa__rcpts527', 'employer__rcpts527', 'occupation__rcpts527', 'ytd__rcpts527', 'gender__rcpts527', 'source__rcpts527'])
#     print("Finished.")

# showdf(df_rcpts527)

### Campaign Finance 18 data

#### cands18

In [None]:
# Candidates table (CampaignFin18 bulk file).
# Field reference (OpenSecrets Data Definition: Candidates):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20Candidates%20Data.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_cands18.head()
except NameError:
    print("Creating dataframe...")
    df_cands18 = pd.read_csv(
        '../../data/open_secrets/CampaignFin18/cands18.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'cycle__cands18',
            'feccandid__cands18',
            'cid__cands18',
            'firstlastp__cands18',
            'party__cands18',
            'distidrunfor__cands18',
            'distidcurr__cands18',
            'currcand__cands18',
            'cyclecand__cands18',
            'crpico__cands18',
            'recipcode__cands18',
            'nopacs__cands18',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")


def _strip_party_label(name):
    """Return `name` with the " (X)" party tags removed; non-strings pass through unchanged."""
    if not isinstance(name, str):
        return name
    # Tags are removed one after another, in this fixed order.
    for tag in (" (3)", " (R)", " (D)", " (I)", " (L)", " (U)", " (i)"):
        name = name.replace(tag, "")
    return name


# Derived column: candidate name without the party label. Recomputed on every run.
df_cands18['firstlast__cands18'] = df_cands18['firstlastp__cands18'].apply(_strip_party_label)

showdf(df_cands18)

#### cmtes18

In [None]:
# Committees table (CampaignFin18 bulk file).
# Field reference (OpenSecrets Table Definition: Committee table):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20Cmtes.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_cmtes18.head()
except NameError:
    print("Creating dataframe...")
    df_cmtes18 = pd.read_csv(
        '../../data/open_secrets/CampaignFin18/cmtes18.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'cycle__cmtes18',
            'cmteid__cmtes18',
            'pacshort__cmtes18',
            'affiliate__cmtes18',
            'ultorg__cmtes18',
            'recipid__cmtes18',
            'recipcode__cmtes18',
            'feccandid__cmtes18',
            'party__cmtes18',
            'primcode__cmtes18',
            'source__cmtes18',
            'sensitive__cmtes18',
            'foreign__cmtes18',
            'active__cmtes18',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_cmtes18)

#### pac_other18

In [None]:
# PAC-to-PAC table (CampaignFin18 bulk file, "Pac_other").
# Field reference (OpenSecrets Data Definition for PAC to PAC Data):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20PAC%20to%20PAC%20Data.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_pac_other18.head()
except NameError:
    print("Creating dataframe...")
    df_pac_other18 = pd.read_csv(
        '../../data/open_secrets/CampaignFin18/pac_other18.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'cycle__pac_other18',
            'fecrecno__pac_other18',
            'filerid__pac_other18',
            'donorcmte__pac_other18',
            'contriblendtrans__pac_other18',
            'city__pac_other18',
            'state__pac_other18',
            'zip__pac_other18',
            'fecoccemp__pac_other18',
            'primcode__pac_other18',
            'date__pac_other18',
            'amount__pac_other18',
            'recipid__pac_other18',
            'party__pac_other18',
            'otherid__pac_other18',
            'recipcode__pac_other18',
            'recipprimcode__pac_other18',
            'amend__pac_other18',
            'report__pac_other18',
            'pg__pac_other18',
            'microfilm__pac_other18',
            'type__pac_other18',
            'realcode__pac_other18',
            'source__pac_other18',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_pac_other18)

#### pacs18

In [None]:
# PACs-to-candidates table (CampaignFin18 bulk file).
# Field reference (OpenSecrets Data Definition: PAC table):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20PAC%20to%20Cands%20Data.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_pacs18.head()
except NameError:
    print("Creating dataframe...")
    df_pacs18 = pd.read_csv(
        '../../data/open_secrets/CampaignFin18/pacs18.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'cycle__pacs18',
            'fecrecno__pacs18',
            'pacid__pacs18',
            'cid__pacs18',
            'amount__pacs18',
            'date__pacs18',
            'realcode__pacs18',
            'type__pacs18',
            'di__pacs18',
            'feccandid__pacs18',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_pacs18)

#### indivs18

In [None]:
# Individual contributions table (CampaignFin18 bulk file).
# Field reference (OpenSecrets Data Definition: Individual Contribution Data):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20Individual%20Contribution%20Data.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_indivs18.head()
except NameError:
    print("Creating dataframe...")
    df_indivs18 = pd.read_csv(
        '../../data/open_secrets/CampaignFin18/indivs18.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'cycle__indivs18',
            'fectransid__indivs18',
            'contribid__indivs18',
            'contrib__indivs18',
            'recipid__indivs18',
            'orgname__indivs18',
            'ultorg__indivs18',
            'realcode__indivs18',
            'date__indivs18',
            'amount__indivs18',
            'street__indivs18',
            'city__indivs18',
            'state__indivs18',
            'zip__indivs18',
            'recipcode__indivs18',
            'type__indivs18',
            'cmteid__indivs18',
            'otherid__indivs18',
            'gender__indivs18',
            'microfilm__indivs18',
            'occupation__indivs18',
            'employer__indivs18',
            'source__indivs18',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_indivs18)

### Expends18 data

#### expends18

In [None]:
# NOTE: This loader is currently disabled (every line is commented out), so
# df_expends18 is NOT defined unless this cell is uncommented and run.
# When enabled it reads the headerless, '|'-quoted expends18.txt file once and
# skips the read on re-runs if df_expends18 already exists in the kernel.
# # OpenSecrets Data Dictionary for Expenditure Data - from FEC electronic filings
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20Expenditures.htm
# try:
#     df_expends18.head()
# except NameError:
#     print("Creating dataframe...")
#     df_expends18 = pd.read_csv('../../data/open_secrets/Expend18/expends18.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['cycle__expends18', 'id__expends18', 'transid__expends18', 'crpfilerid__expends18', 
#                                       'recipcode__expends18', 'pacshort__expends18', 'crprecipname__expends18', 
#                                       'expcode__expends18', 'amount__expends18', 'date__expends18', 'city__expends18', 'state__expends18', 
#                                       'zip__expends18', 'cmteid_ef__expends18', 'candid__expends18', 'type__expends18',
#                                       'descrip__expends18', 'pg__expends18', 'elecother__expends18', 'enttype__expends18',
#                                       'source__expends18'])
#     print("Finished.")
    
# showdf(df_expends18)

### Lobby data

#### lob_agency

In [None]:
# Lobbying data: agencies lobbied.
# Field reference (OpenSecrets Data Definition for Lobbying Data: Lobby agencies):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_agency.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_lob_agency.head()
except NameError:
    print("Creating dataframe...")
    df_lob_agency = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_agency.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=['uniqid__lob_agency', 'agencyid__lob_agency', 'agency__lob_agency'],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_lob_agency)

#### lob_bills

In [None]:
# Lobbying data: bills lobbied on.
# OpenSecrets Data Definition for Lobbying Data: Lobby bills
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_bills.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read (and the one-time trim below) is skipped.
    df_lob_bills.head()
except NameError:
    print("Creating dataframe...")
    df_lob_bills = pd.read_csv('../../data/open_secrets/Lobby/lob_bills.txt',
                               quotechar='|',
                               sep=',',
                               encoding='ISO-8859-1',
                               header=None,  # no header row in the file; names are supplied here
                               names=['b_id__lob_bills',
                                      'si_id__lob_bills',
                                      'congno__lob_bills',
                                      'bill_name__lob_bills'])
    # Drop the last two characters of every bill name (presumably trailing
    # delimiter residue in the raw file — confirm against the data).
    # The vectorized .str slice leaves missing values as NaN, whereas slicing
    # inside a Python lambda raises TypeError on the float NaN pandas uses for
    # an empty field. This stays inside the except branch so that re-running
    # the cell never trims the names a second time.
    df_lob_bills['bill_name__lob_bills'] = df_lob_bills['bill_name__lob_bills'].str[:-2]
    print("Finished.")

showdf(df_lob_bills)

#### lob_indus

In [None]:
# Lobbying data: industries.
# Field reference (OpenSecrets Data Definition for Lobbying Data: Lobby industries):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_indus.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_lob_indus.head()
except NameError:
    print("Creating dataframe...")
    df_lob_indus = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_indus.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'client__lob_indus', 'sub__lob_indus', 'total__lob_indus',
            'year__lob_indus', 'catcode__lob_indus',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_lob_indus)

#### lob_issue

In [None]:
# Lobbying data: issues (including the specific-issue text).
# Field reference (OpenSecrets Data Definition for Lobbying Data: Lobby issues):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_issues.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_lob_issue.head()
except NameError:
    print("Creating dataframe...")
    df_lob_issue = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_issue.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'si_id__lob_issue', 'uniqid__lob_issue', 'issueid__lob_issue',
            'issue__lob_issue', 'specificissue__lob_issue', 'year__lob_issue',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_lob_issue)

#### lob_issue_no_specific

In [None]:
# Lobbying data: issues, from the file variant without the specific-issue column.
# Field reference (OpenSecrets Data Definition for Lobbying Data: Lobby issues):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_issues.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_lob_issue_no_specific.head()
except NameError:
    print("Creating dataframe...")
    df_lob_issue_no_specific = pd.read_csv(
        # "NoSpecficIssue" is spelled this way in the path and the column suffixes.
        '../../data/open_secrets/Lobby/lob_issue_NoSpecficIssue.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'si_id__lob_issue_NoSpecficIssue',
            'uniqid__lob_issue_NoSpecficIssue',
            'issueid__lob_issue_NoSpecficIssue',
            'issue__lob_issue_NoSpecficIssue',
            'year__lob_issue_NoSpecficIssue',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_lob_issue_no_specific)

#### lob_lobbying

In [None]:
# Lobbying data: lobbying records.
# Field reference (OpenSecrets Data Definitions for Lobbying Data: Lobbying):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_lobbying.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_lob_lobbying.head()
except NameError:
    print("Creating dataframe...")
    df_lob_lobbying = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_lobbying.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'uniqid__lob_lobbying',
            'registrant_raw__lob_lobbying',
            'registrant__lob_lobbying',
            'isfirm__lob_lobbying',
            'client_raw__lob_lobbying',
            'client__lob_lobbying',
            'ultorg__lob_lobbying',
            'amount__lob_lobbying',
            'catcode__lob_lobbying',
            'source__lob_lobbying',
            'self__lob_lobbying',
            'includensfs__lob_lobbying',
            'use__lob_lobbying',
            'ind__lob_lobbying',
            'year__lob_lobbying',
            'type__lob_lobbying',
            'typelong__lob_lobbying',
            'affiliate__lob_lobbying',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_lob_lobbying)

#### lob_lobbyist

In [None]:
# Lobbying data: individual lobbyists.
# Field reference (OpenSecrets Data Definition for Lobbyists):
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_lobbyists.htm
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_lob_lobbyist.head()
except NameError:
    print("Creating dataframe...")
    df_lob_lobbyist = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_lobbyist.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'uniqid__lob_lobbyist',
            'lobbyist_lastname_std__lob_lobbyist',
            'lobbyist_firstname_std__lob_lobbyist',
            'lobbyist_lastname_raw__lob_lobbyist',
            'lobbyist_firstname_raw__lob_lobbyist',
            'lobbyist_id__lob_lobbyist',
            'year__lob_lobbyist',
            'officialposition__lob_lobbyist',
            'cid__lob_lobbyist',
            'formercongmem__lob_lobbyist',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_lob_lobbyist)

#### lob_rpt

In [None]:
# Lobbying data: report types (long name / short code pairs).
# No data dictionary page is provided for this table on OpenSecrets.org.
try:
    # Touching the frame raises NameError on a fresh kernel; on a re-run it
    # succeeds and the file read below is skipped.
    df_lob_rpt.head()
except NameError:
    print("Creating dataframe...")
    df_lob_rpt = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_rpt.txt',
        header=None,  # the file carries no header row; names are supplied here
        names=[
            'typelong__lob_rpt',
            'typeshort__lob_rpt',
        ],
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
    )
    print("Finished.")

showdf(df_lob_rpt)

### IDs and categories

#### CRP_ID

In [None]:
# xlrd is the reader pandas uses for legacy .xls workbooks such as CRP_IDs.xls,
# which is loaded in the next cell. install_if_needed() is a project helper not
# defined in this notebook (presumably from modules/init.ipynb).
# The module is not referenced by name in the cells visible here; the import
# acts as a check that the package is importable.
install_if_needed('xlrd')
import xlrd

In [None]:
# Candidate ids.
# Unlike the delimited text files, this source is an Excel workbook, so it is
# loaded with its own logic (and re-read on every run of this cell).
#
# Position -> column name. With header=None the sheet's columns are labelled
# 0, 1, 2, ...; label 0 is dropped below, so the placeholder at position 0 exists
# only to keep the remaining names lined up with labels 1..5.
columns_crp_ids = {
    position: column_name
    for position, column_name in enumerate((
        'blank_excel_column__crp_ids',
        'cid__crp_ids',
        'crpname__crp_ids',
        'party__crp_ids',
        'distidrunfor__crp_ids',
        'feccandid__crp_ids',
    ))
}

df_crp_ids = (
    # The first 15 rows of the sheet are skipped before reading the table.
    pd.read_excel('../../data/open_secrets/CRP_IDs.xls', header=None, skiprows=15)
    .pipe(lambda sheet: sheet.drop(columns=sheet.columns[0]))  # remove the first sheet column
    .rename(columns=columns_crp_ids)
)

In [None]:
# Preview the candidate-id table via the project's showdf() helper (not defined
# in this notebook).
showdf(df_crp_ids)

#### CRP_Categories

In [None]:
from io import StringIO

# CRP category codes. The text file has free-form lines before the actual
# tab-separated table, so locate the header row first and parse from there.
crp_filepath = '../../data/open_secrets/CRP_Categories.txt'
with open(crp_filepath, 'r') as file:
    lines = list(file)

# The table starts at the first line beginning with 'Catcode'
# (StopIteration is raised if no such line exists).
header_line_index = next(
    position for position, text in enumerate(lines) if text.startswith('Catcode')
)
table_data = ''.join(lines[header_line_index:])
df_crp_cats = pd.read_csv(StringIO(table_data), sep='\t')

# Normalise headers: lower-case, spaces -> underscores, then tag with the table suffix.
df_crp_cats.columns = [
    header.lower().replace(' ', '_') + '__crp_cats' for header in df_crp_cats.columns
]

In [None]:
# Preview the category table via the project's showdf() helper (not defined in
# this notebook).
showdf(df_crp_cats)