# SI 608 Project – Workspace
<span style="font-size: 18px;">General scratchpad workspace that preloads all the dataframes.</span>
<br>See <code>./modules</code> to review how libraries are installed and imported, as well as where the data is loaded, cleaned, and formatted. This is only here as a helpful tool, make a copy and do whatever you'd like. Or don't use this at all if that's preferable.

[OpenSecrets Data Dictionary Index](../../docs/open_source_data_dictionary.md)
<br><small><em>(View the index with markdown preview)</em></small>

## Environment
Run these cells first to load the global functions and variables, set up the environment, and build and load the primary datasets used throughout this project.

#### Settings
Configure frequently modified behaviors in this notebook.

In [None]:
# TODO: consider moving these toggles into .env.

# Toggle for showdf() -> df.head()
DISPLAY_DF = True

# Toggle for to_csv() -> pd.to_csv()
SAVE_DF = True

#### Init globals
Init file contains global variables and helper functions used throughout the project.

In [None]:
# Execute the shared init notebook in this kernel so that its global variables and
# helper functions become available to the cells below.
%run modules/init.ipynb

#### Local env
Contains things like API keys that should be excluded from the public repository.

In [None]:
# load_env_vars() is provided by the init notebook; the result is indexed by variable
# name in the next cell (e.g. env_vars['CONGRESS_API_KEY']).
env_vars = load_env_vars()

In [None]:
# Congress.gov API credentials and base URL, read from the local (untracked) env vars.
CONGRESS_API_KEY = env_vars['CONGRESS_API_KEY']
CONGRESS_API_URL = env_vars['CONGRESS_API_URL']

# Never echo the key itself: cell output is saved inside the notebook file and would
# leak the secret into the repository. Only report whether a key was loaded.
congress_api_key_status = "<set>" if CONGRESS_API_KEY else "<missing>"
print(f"CONGRESS_API_KEY: {congress_api_key_status}, CONGRESS_API_URL: {CONGRESS_API_URL}")

In [None]:
# Congress number interpolated into the Congress.gov "member/congress/{CONGRESS_NUM}"
# endpoint further down.
CONGRESS_NUM = 116

## Data
From OpenSecrets.org bulk data collection. Contains data from IRS and the FEC.

### 527 data

#### cmtes527

In [None]:
# # OpenSecrets Data Definition: 527 Committees
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20527%20Cmtes.htm
# try:
#     df_cmtes527.head()
# except NameError:
#     print("Creating dataframe...")
#     df_cmtes527 = pd.read_csv('../../data/open_secrets/527/cmtes527.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['cycle__cmtes527', 'rpt__cmtes527', 'ein__cmtes527', 'crp527name__cmtes527', 'affiliate__cmtes527', 'ultorg__cmtes527', 
#                                       'recipcode__cmtes527', 'cmteid__cmtes527', 'cid__cmtes527', 'eccmteid__cmtes527', 'party__cmtes527', 
#                                       'primcode__cmtes527', 'source__cmtes527', 'ffreq__cmtes527', 'ctype__cmtes527', 'csource__cmtes527', 'viewpt__cmtes527',
#                                       'comments__cmtes527', 'state__cmtes527'])
#     print("Finished.")

# showdf(df_cmtes527)

In [None]:
# showdf(df_cmtes527)

#### expends527

In [None]:
# # OpenSecrets Data Dictionary 527 Expenditure Data - from IRS Form 8872B
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20527%20Expenditures.htm
# try:
#     df_expends527.head()
# except NameError:
#     print("Creating dataframe...")
#     df_expends527 = pd.read_csv('../../data/open_secrets/527/expends527.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['rpt__expends527', 'formid__expends527', 'schbid__expends527', 'orgname__expends527', 'ein__expends527', 'recipient__expends527', 
#                                      'recipientcrp__expends527', 'amount__expends527', 'date__expends527', 'expcode__expends527', 'source__expends527', 
#                                      'purpose__expends527', 'addr1__expends527', 'addr2__expends527', 'city__expends527', 'state__expends527', 'zip__expends527',
#                                      'employer__expends527', 'occupation__expends527'])
#     print("Finished.")

# showdf(df_expends527)

#### rcpts527

In [None]:
# # OpenSecrets Data Dictionary 527 Contribution Data - from IRS Form 8872A
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20527%20Receipts.htm
# try:
#     df_rcpts527.head()
# except NameError:
#     print("Creating dataframe...")
#     df_rcpts527 = pd.read_csv('../../data/open_secrets/527/rcpts527.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['id__rcpts527', 'rpt__rcpts527', 'formid__rcpts527', 'schaid__rcpts527', 'contribid__rcpts527', 'contrib__rcpts527', 
#                                       'amount__rcpts527', 'date__rcpts527', 'orgname__rcpts527', 'ultorg__rcpts527', 'realcode__rcpts527', 
#                                       'recipid__rcpts527', 'recipcode__rcpts527', 'party__rcpts527', 'recipient__rcpts527', 'city__rcpts527', 'state__rcpts527',
#                                       'zip__rcpts527', 'zip4__rcpts527', 'pmsa__rcpts527', 'employer__rcpts527', 'occupation__rcpts527', 'ytd__rcpts527', 'gender__rcpts527', 'source__rcpts527'])
#     print("Finished.")

# showdf(df_rcpts527)

### Campaign Finance 18 data

#### cands18

In [None]:
# OpenSecrets Data Definition: Candidates
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20Candidates%20Data.htm
cands18_fields = (
    'cycle', 'feccandid', 'cid', 'firstlastp', 'party', 'distidrunfor',
    'distidcurr', 'currcand', 'cyclecand', 'crpico', 'recipcode', 'nopacs',
)

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_cands18.head()
except NameError:
    print("Creating dataframe...")
    df_cands18 = pd.read_csv(
        '../../data/open_secrets/CampaignFin18/cands18.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        # Every column carries the "__cands18" suffix so it stays unique after joins.
        names=[f"{field}__cands18" for field in cands18_fields],
    )
    print("Finished.")

# Party labels that may be appended to a candidate name.
party_labels = (" (3)", " (R)", " (D)", " (I)", " (L)", " (U)", " (i)")


def strip_party_label(name):
    """Return `name` with every known party label removed; non-strings pass through."""
    if not isinstance(name, str):
        return name
    for label in party_labels:
        name = name.replace(label, "")
    return name


df_cands18['firstlast__cands18'] = df_cands18['firstlastp__cands18'].apply(strip_party_label)

showdf(df_cands18)

#### cmtes18

In [None]:
# OpenSecrets Table Definition: Committee table
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20Cmtes.htm
cmtes18_fields = (
    'cycle', 'cmteid', 'pacshort', 'affiliate', 'ultorg', 'recipid',
    'recipcode', 'feccandid', 'party', 'primcode', 'source', 'sensitive',
    'foreign', 'active',
)

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_cmtes18.head()
except NameError:
    print("Creating dataframe...")
    df_cmtes18 = pd.read_csv(
        '../../data/open_secrets/CampaignFin18/cmtes18.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        names=[f"{field}__cmtes18" for field in cmtes18_fields],
    )
    print("Finished.")

showdf(df_cmtes18)

#### pac_other18

In [None]:
# # OpenSecrets Data Definition for PAC to PAC Data (Pac_other table)
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20PAC%20to%20PAC%20Data.htm
# try:
#     df_pac_other18.head()
# except NameError:
#     print("Creating dataframe...")
#     df_pac_other18 = pd.read_csv('../../data/open_secrets/CampaignFin18/pac_other18.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['cycle__pac_other18', 'fecrecno__pac_other18', 'filerid__pac_other18', 'donorcmte__pac_other18', 'contriblendtrans__pac_other18', 'city__pac_other18', 'state__pac_other18', 
#                                       'zip__pac_other18', 'fecoccemp__pac_other18', 'primcode__pac_other18', 'date__pac_other18', 'amount__pac_other18', 'recipid__pac_other18', 'party__pac_other18', 'otherid__pac_other18',
#                                       'recipcode__pac_other18', 'recipprimcode__pac_other18', 'amend__pac_other18', 'report__pac_other18', 'pg__pac_other18', 'microfilm__pac_other18', 'type__pac_other18',
#                                       'realcode__pac_other18', 'source__pac_other18'])
#     print("Finished.")

# showdf(df_pac_other18)

#### pacs18

In [None]:
# OpenSecrets Data Definition: PAC table (PACs to Candidates)
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20PAC%20to%20Cands%20Data.htm
pacs18_fields = (
    'cycle', 'fecrecno', 'pacid', 'cid', 'amount', 'date', 'realcode',
    'type', 'di', 'feccandid',
)

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_pacs18.head()
except NameError:
    print("Creating dataframe...")
    df_pacs18 = pd.read_csv(
        '../../data/open_secrets/CampaignFin18/pacs18.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        names=[f"{field}__pacs18" for field in pacs18_fields],
    )
    print("Finished.")

showdf(df_pacs18)

#### indivs18

In [None]:
# # OpenSecrets Data Definition: Individual Contribution Data
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20Individual%20Contribution%20Data.htm
# try:
#     df_indivs18.head()
# except NameError:
#     print("Creating dataframe...")
#     df_indivs18 = pd.read_csv('../../data/open_secrets/CampaignFin18/indivs18.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['cycle__indivs18', 'fectransid__indivs18', 'contribid__indivs18', 'contrib__indivs18', 'recipid__indivs18', 'orgname__indivs18', 
#                                       'ultorg__indivs18', 'realcode__indivs18', 'date__indivs18', 'amount__indivs18', 'street__indivs18', 'city__indivs18', 'state__indivs18',
#                                       'zip__indivs18', 'recipcode__indivs18', 'type__indivs18', 'cmteid__indivs18', 'otherid__indivs18', 'gender__indivs18', 'microfilm__indivs18',
#                                       'occupation__indivs18', 'employer__indivs18', 'source__indivs18'])
#     print("Finished.")

# showdf(df_indivs18)

### Expends18 data

#### expends18

In [None]:
# # OpenSecrets Data Dictionary for Expenditure Data - from FEC electronic filings
# # https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20Expenditures.htm
# try:
#     df_expends18.head()
# except NameError:
#     print("Creating dataframe...")
#     df_expends18 = pd.read_csv('../../data/open_secrets/Expend18/expends18.txt', 
#                                quotechar='|', 
#                                sep=',', 
#                                encoding='ISO-8859-1',
#                                header=None,
#                                names=['cycle__expends18', 'id__expends18', 'transid__expends18', 'crpfilerid__expends18', 
#                                       'recipcode__expends18', 'pacshort__expends18', 'crprecipname__expends18', 
#                                       'expcode__expends18', 'amount__expends18', 'date__expends18', 'city__expends18', 'state__expends18', 
#                                       'zip__expends18', 'cmteid_ef__expends18', 'candid__expends18', 'type__expends18',
#                                       'descrip__expends18', 'pg__expends18', 'elecother__expends18', 'enttype__expends18',
#                                       'source__expends18'])
#     print("Finished.")
    
# showdf(df_expends18)

### Lobby data

#### lob_agency

In [None]:
# OpenSecrets Data Definition for Lobbying Data: Lobby agencies
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_agency.htm
lob_agency_fields = ('uniqid', 'agencyid', 'agency')

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_lob_agency.head()
except NameError:
    print("Creating dataframe...")
    df_lob_agency = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_agency.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        names=[f"{field}__lob_agency" for field in lob_agency_fields],
    )
    print("Finished.")

showdf(df_lob_agency)

#### lob_bills

In [None]:
# OpenSecrets Data Definition for Lobbying Data: Lobby bills
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_bills.htm
try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_lob_bills.head()
except NameError:
    print("Creating dataframe...")
    df_lob_bills = pd.read_csv('../../data/open_secrets/Lobby/lob_bills.txt', 
                               quotechar='|', 
                               sep=',', 
                               encoding='ISO-8859-1',
                               header=None,
                               names=['b_id__lob_bills',
                                      'si_id__lob_bills', 
                                      'congno__lob_bills', 
                                      'bill_name__lob_bills'])
    # Drop the last two characters of every bill name. The vectorised .str slice
    # leaves missing values as NaN, whereas slicing inside a lambda raises
    # TypeError as soon as one row has no bill name.
    df_lob_bills['bill_name__lob_bills'] = df_lob_bills['bill_name__lob_bills'].str[:-2]
    print("Finished.")

showdf(df_lob_bills)

#### lob_indus

In [None]:
# OpenSecrets Data Definition for Lobbying Data: Lobby industries
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_indus.htm
lob_indus_fields = ('client', 'sub', 'total', 'year', 'catcode')

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_lob_indus.head()
except NameError:
    print("Creating dataframe...")
    df_lob_indus = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_indus.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        names=[f"{field}__lob_indus" for field in lob_indus_fields],
    )
    print("Finished.")

showdf(df_lob_indus)

#### lob_issue

In [None]:
# OpenSecrets Data Definition for Lobbying Data: Lobby issues
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_issues.htm
lob_issue_fields = ('si_id', 'uniqid', 'issueid', 'issue', 'specificissue', 'year')

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_lob_issue.head()
except NameError:
    print("Creating dataframe...")
    df_lob_issue = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_issue.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        names=[f"{field}__lob_issue" for field in lob_issue_fields],
    )
    print("Finished.")

showdf(df_lob_issue)

#### lob_issue_no_specific

In [None]:
# OpenSecrets Data Definition for Lobbying Data: Lobby issues (no specific issue)
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_issues.htm
lob_issue_no_specific_fields = ('si_id', 'uniqid', 'issueid', 'issue', 'year')

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_lob_issue_no_specific.head()
except NameError:
    print("Creating dataframe...")
    df_lob_issue_no_specific = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_issue_NoSpecficIssue.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        # The column suffix mirrors the source file name, including its spelling.
        names=[f"{field}__lob_issue_NoSpecficIssue" for field in lob_issue_no_specific_fields],
    )
    print("Finished.")

showdf(df_lob_issue_no_specific)

#### lob_lobbying

In [None]:
# OpenSecrets Data Definitions for Lobbying Data: Lobbying
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_lobbying.htm
lob_lobbying_fields = (
    'uniqid', 'registrant_raw', 'registrant', 'isfirm', 'client_raw', 'client',
    'ultorg', 'amount', 'catcode', 'source', 'self', 'includensfs', 'use',
    'ind', 'year', 'type', 'typelong', 'affiliate',
)

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_lob_lobbying.head()
except NameError:
    print("Creating dataframe...")
    df_lob_lobbying = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_lobbying.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        names=[f"{field}__lob_lobbying" for field in lob_lobbying_fields],
    )
    print("Finished.")

showdf(df_lob_lobbying)

#### lob_lobbyist

In [None]:
# OpenSecrets Data Definition for Lobbyists
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20lob_lobbyists.htm
lob_lobbyist_fields = (
    'uniqid', 'lobbyist_lastname_std', 'lobbyist_firstname_std', 'lobbyist_lastname_raw',
    'lobbyist_firstname_raw', 'lobbyist_id', 'year', 'officialposition', 'cid',
    'formercongmem',
)

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_lob_lobbyist.head()
except NameError:
    print("Creating dataframe...")
    df_lob_lobbyist = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_lobbyist.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        names=[f"{field}__lob_lobbyist" for field in lob_lobbyist_fields],
    )
    print("Finished.")

showdf(df_lob_lobbyist)

#### lob_rpt

In [None]:
# OpenSecrets Data Definitions for Lobbying Data: Report types
# No documentation provided on OpenSecrets.org
lob_rpt_fields = ('typelong', 'typeshort')

try:
    # Reuse the dataframe if an earlier run of this cell already created it.
    df_lob_rpt.head()
except NameError:
    print("Creating dataframe...")
    df_lob_rpt = pd.read_csv(
        '../../data/open_secrets/Lobby/lob_rpt.txt',
        sep=',',
        quotechar='|',
        encoding='ISO-8859-1',
        header=None,
        names=[f"{field}__lob_rpt" for field in lob_rpt_fields],
    )
    print("Finished.")

showdf(df_lob_rpt)

### IDs and categories

#### CRP_ID

In [None]:
# Make sure xlrd is available before the next cell reads CRP_IDs.xls with
# pd.read_excel — presumably pandas uses it as the engine for legacy .xls files.
install_if_needed('xlrd')
import xlrd

In [None]:
# Candidate ids
# This workbook is laid out differently from the bulk text files, so it is loaded on its own.
crp_id_names = [
    'blank_excel_column__crp_ids',  # Placeholder for the blank first column; keeps positions aligned.
    'cid__crp_ids',
    'crpname__crp_ids',
    'party__crp_ids',
    'distidrunfor__crp_ids',
    'feccandid__crp_ids',
]
# Map positional column labels (0, 1, 2, ...) to their names.
columns_crp_ids = {position: name for position, name in enumerate(crp_id_names)}

crp_ids_raw = pd.read_excel('../../data/open_secrets/CRP_IDs.xls', header=None, skiprows=15)
df_crp_ids = (
    crp_ids_raw
    .drop(crp_ids_raw.columns[0], axis=1)  # discard the blank first column
    .rename(columns=columns_crp_ids)
)
showdf(df_crp_ids)

#### CRP_Categories

In [None]:
from io import StringIO

crp_filepath = '../../data/open_secrets/CRP_Categories.txt'
with open(crp_filepath, 'r') as file:
    lines = file.readlines()

# The table starts at the first line beginning with "Catcode"; everything before it is preamble.
header_line_index = next(
    position for position, text in enumerate(lines) if text.startswith('Catcode')
)
table_data = ''.join(lines[header_line_index:])

df_crp_cats = pd.read_csv(StringIO(table_data), sep='\t')
# Normalise headers: lower-case, underscores for spaces, plus the "__crp_cats" suffix.
normalised_headers = df_crp_cats.columns.str.lower().str.replace(' ', '_')
df_crp_cats.columns = [f"{header}__crp_cats" for header in normalised_headers]

showdf(df_crp_cats)

## Ways and Means Network
*Toy network & full network of member campaign contributions from the 2018 election, resulting in the 116th Congress*

#### Dataframe of committee members

In [None]:
# Read every row of the Democratic member list.
with open('../../data/wm_members_dem.csv', 'r', encoding='utf-8') as file:
    wm_dem_members = list(csv.reader(file))

# Only the first CSV row is used for matching, so the file is presumably a single
# line listing every member — TODO confirm.
dem_member_names = wm_dem_members[0]
is_dem_member = df_cands18['firstlast__cands18'].isin(dem_member_names)
df_wm_dem_members = df_cands18[is_dem_member]

In [None]:
# Read every row of the Republican member list.
with open('../../data/wm_members_rep.csv', 'r', encoding='utf-8') as file:
    wm_rep_members = list(csv.reader(file))

# Only the first CSV row is used for matching, so the file is presumably a single
# line listing every member — TODO confirm.
rep_member_names = wm_rep_members[0]
is_rep_member = df_cands18['firstlast__cands18'].isin(rep_member_names)
df_wm_rep_members = df_cands18[is_rep_member]

In [None]:
# Stack both parties, then keep the first row per candidate name
# (some candidates appear more than once; safe to remove).
df_wm_members = (
    pd.concat([df_wm_dem_members, df_wm_rep_members])
    .drop_duplicates(subset='firstlast__cands18', keep='first')
)
showdf(df_wm_members)

#### Pre-defined members for toy network

Richard E Neal (D), chairman

In [None]:
# Candidate id compared against cid__cands18 when the toy network is built.
member_cid_1 = 'N00000153'

Kevin Brady (R), ranking member

In [None]:
# Candidate id compared against cid__cands18 when the toy network is built.
member_cid_2 = 'N00005883'

Lloyd Doggett (D)

In [None]:
# Candidate id compared against cid__cands18 when the toy network is built.
member_cid_3 = 'N00006023'

Devin Nunes (R)

In [None]:
# Candidate id compared against cid__cands18 when the toy network is built.
member_cid_4 = 'N00007248'

John B Larson (D)

In [None]:
# Candidate id compared against cid__cands18 when the toy network is built.
member_cid_5 = 'N00000575'

Vernon Buchanan (R) 	

In [None]:
# Candidate id compared against cid__cands18 when the toy network is built.
member_cid_6 = 'N00027626'

#### Network initialization

In [None]:
# Toy network: limited to the six pre-defined members above.
toy_member_cids = [
    member_cid_1,
    member_cid_2,
    member_cid_3,
    member_cid_4,
    member_cid_5,
    member_cid_6,
]
# isin() is equivalent to OR-ing one equality test per id, and stays readable
# when members are added or removed.
df_toy_network = df_wm_members[df_wm_members['cid__cands18'].isin(toy_member_cids)]

# No limits for the full network.
df_full_network = df_wm_members.copy()

showdf(df_toy_network)

#### Candidate pacs
Extract candidate pacs/cmtes from the others and augment column names.

In [None]:
# Only candidate committees: their recipient id starts with "N".
# na=False treats a missing recipid as "not a candidate committee"; without it the mask
# would contain NaN and boolean indexing would raise.
is_cand_cmte = df_cmtes18['recipid__cmtes18'].str.startswith('N', na=False)
df_cand_cmtes18 = df_cmtes18[is_cand_cmte].copy()

# Rename "...__cmtes18" columns to "...__cand_cmtes18" so they stay distinguishable
# after this frame is joined with other committee-derived frames.
df_cand_cmtes18.columns = df_cand_cmtes18.columns.str.replace(r"(cmtes18)", r"cand_\1", regex=True)

#### Non-candidate pacs
While we're at it, extract all the non-partisan/candidate pacs and augment column names for later steps.

In [None]:
# Committees with no party value. Excludes party, joint fundraising, leadership, or
# candidate committees.
# .copy() makes this an independent frame, so the column assignment below does not
# trigger SettingWithCopyWarning or depend on whether pandas returned a view.
df_noncand_cmtes18 = df_cmtes18[df_cmtes18['party__cmtes18'].isna()].copy()

# Upper-case the "sensitive" flag; non-string values (e.g. NaN) are left untouched.
df_noncand_cmtes18['sensitive__cmtes18'] = df_noncand_cmtes18['sensitive__cmtes18'].apply(
    lambda x: x.upper() if isinstance(x, str) else x
)

# Rename "...__cmtes18" columns to "...__noncand_cmtes18" for the later joins.
df_noncand_cmtes18.columns = df_noncand_cmtes18.columns.str.replace(r"(cmtes18)", r"noncand_\1", regex=True)

#### Join candidates and candidate pacs

In [None]:
# Attach each candidate's committee rows (candidate cid == committee recipient id).
# Both networks are reassigned from themselves, so run this cell once per kernel session.
cand_cmte_join = dict(left_on='cid__cands18', right_on='recipid__cand_cmtes18', how='inner')

df_toy_network = df_toy_network.merge(df_cand_cmtes18, **cand_cmte_join)
df_full_network = df_full_network.merge(df_cand_cmtes18, **cand_cmte_join)
showdf(df_toy_network)

#### Join inflows for each candidate pac

In [None]:
# Exclude outflows and small contributions: keep only amounts strictly above 500.
is_inflow = df_pacs18['amount__pacs18'].gt(500)
df_inflow_pacs18 = df_pacs18[is_inflow]

# Attach the qualifying PAC contributions to each candidate by cid.
inflow_join = dict(left_on='cid__cands18', right_on='cid__pacs18', how='inner')

df_toy_network = df_toy_network.merge(df_inflow_pacs18, **inflow_join)
df_full_network = df_full_network.merge(df_inflow_pacs18, **inflow_join)
showdf(df_toy_network)

#### Join sources of inflows

In [None]:
# Resolve the contributing PAC id to its non-candidate committee record.
source_join = dict(left_on='pacid__pacs18', right_on='cmteid__noncand_cmtes18', how='inner')

df_toy_network = df_toy_network.merge(df_noncand_cmtes18, **source_join)
df_full_network = df_full_network.merge(df_noncand_cmtes18, **source_join)
showdf(df_toy_network)

#### Join source's industry category codes

In [None]:
# Look up the category record for each source committee's primary code.
category_join = dict(left_on='primcode__noncand_cmtes18', right_on='catcode__crp_cats', how='inner')

df_toy_network = df_toy_network.merge(df_crp_cats, **category_join)
df_full_network = df_full_network.merge(df_crp_cats, **category_join)
showdf(df_toy_network)

#### Join industry details of category codes

In [None]:
# Restrict the lobbying-industry table to 2018 rows.
is_2018 = df_lob_indus['year__lob_indus'].eq(2018)
df_lob_indus_2018 = df_lob_indus[is_2018]

# Left join: keep every network row even when its ultimate org has no lobbying-industry match.
industry_join = dict(left_on='ultorg__noncand_cmtes18', right_on='client__lob_indus', how='left')

df_toy_network = df_toy_network.merge(df_lob_indus_2018, **industry_join)
df_full_network = df_full_network.merge(df_lob_indus_2018, **industry_join)
showdf(df_toy_network)

#### Final cleanup

In [None]:
# Columns removed from both networks, grouped by the table they came from.
drop_columns = [
    # cands18
    'cycle__cands18', 'feccandid__cands18', 'firstlastp__cands18', 'distidrunfor__cands18',
    'distidcurr__cands18', 'currcand__cands18', 'cyclecand__cands18', 'recipcode__cands18',
    'nopacs__cands18',
    # cand_cmtes18
    'cycle__cand_cmtes18', 'pacshort__cand_cmtes18', 'affiliate__cand_cmtes18',
    'recipid__cand_cmtes18', 'recipcode__cand_cmtes18', 'feccandid__cand_cmtes18',
    'party__cand_cmtes18', 'primcode__cand_cmtes18', 'source__cand_cmtes18',
    'sensitive__cand_cmtes18', 'foreign__cand_cmtes18', 'active__cand_cmtes18',
    # pacs18
    'cycle__pacs18', 'fecrecno__pacs18', 'pacid__pacs18', 'cid__pacs18', 'realcode__pacs18',
    'type__pacs18', 'di__pacs18', 'feccandid__pacs18',
    # noncand_cmtes18
    'cycle__noncand_cmtes18', 'pacshort__noncand_cmtes18', 'affiliate__noncand_cmtes18',
    'recipid__noncand_cmtes18', 'feccandid__noncand_cmtes18', 'party__noncand_cmtes18',
    'primcode__noncand_cmtes18', 'source__noncand_cmtes18', 'active__noncand_cmtes18',
    # crp_cats / lob_indus
    'catcode__crp_cats', 'catorder__crp_cats', 'year__lob_indus',
]

df_toy_network = df_toy_network.drop(columns=drop_columns)
df_full_network = df_full_network.drop(columns=drop_columns)

showdf(df_toy_network)

# Hand both networks to the project's to_csv() helper.
to_csv(df_toy_network)
to_csv(df_full_network)

### Prepare data
Create a dataframe connecting a table of personal candidate info to each candidate's PAC.

In [None]:
# Work on a copy so the graph preparation below cannot modify df_full_network.
df_graph_network = df_full_network.copy()

#### Fields to be graphed

In [None]:
# Target, source, weight, attribute
df_candpacs_to_pactrans = df_graph_network[['ultorg__cand_cmtes18', 'ultorg__noncand_cmtes18', 'amount__pacs18', 'party__cands18', 'recipcode__noncand_cmtes18', 'sector__crp_cats']]
showdf(df_candpacs_to_pactrans)

#### Join same-party candidate pacs

In [None]:
# Distinct Democratic candidate PACs (the duplicated assignment was removed; it
# recomputed exactly the same frame).
df_dem_candpacs = df_candpacs_to_pactrans[df_candpacs_to_pactrans['party__cands18'] == 'D'][['ultorg__cand_cmtes18']].drop_duplicates()

# Pair every PAC with every other PAC of the same party. The cross merge suffixes the
# shared column with _x/_y.
df_dem_candpacs_cross = pd.merge(df_dem_candpacs, df_dem_candpacs, how='cross')

# Drop self-pairs and move the x/y marker in front of the table suffix.
df_dem_candpacs_cross = df_dem_candpacs_cross[
    df_dem_candpacs_cross['ultorg__cand_cmtes18_x'] != df_dem_candpacs_cross['ultorg__cand_cmtes18_y']
].rename(columns={
    'ultorg__cand_cmtes18_x': 'ultorg_x__cand_cmtes18',
    'ultorg__cand_cmtes18_y': 'ultorg_y__cand_cmtes18',
})
showdf(df_dem_candpacs_cross)

In [None]:
# Distinct Republican candidate PACs (the duplicated assignment was removed; it
# recomputed exactly the same frame).
df_rep_candpacs = df_candpacs_to_pactrans[df_candpacs_to_pactrans['party__cands18'] == 'R'][['ultorg__cand_cmtes18']].drop_duplicates()

# Pair every PAC with every other PAC of the same party. The cross merge suffixes the
# shared column with _x/_y.
df_rep_candpacs_cross = pd.merge(df_rep_candpacs, df_rep_candpacs, how='cross')

# Drop self-pairs and move the x/y marker in front of the table suffix.
df_rep_candpacs_cross = df_rep_candpacs_cross[
    df_rep_candpacs_cross['ultorg__cand_cmtes18_x'] != df_rep_candpacs_cross['ultorg__cand_cmtes18_y']
].rename(columns={
    'ultorg__cand_cmtes18_x': 'ultorg_x__cand_cmtes18',
    'ultorg__cand_cmtes18_y': 'ultorg_y__cand_cmtes18',
})
showdf(df_rep_candpacs_cross)

### Candidates to Bills
Use data from api.congress.gov to lookup bills from a specific session of Congress, and join associated candidate [co]sponsoring information. 

#### Get list of member numbers
This list differs from cands18 in that it maps a different ID to each candidate, which we need in order to look up the bills they have [co]sponsored.

In [None]:
# Page size sent as the "limit" query parameter of the member request; kept as a
# string and converted with int() wherever arithmetic is needed.
cong_member_limit = '250'

In [None]:
# Local cache of the Congress.gov member list; the API is only called when it is missing.
filepath = '../../data/congress_api/congress_members116.json'

if not os.path.exists(filepath):
    print(f"Data source at {filepath} not found, generating from Congress.gov's remote API..")

    page_size = int(cong_member_limit)
    request_timeout = 30  # seconds; prevents the cell from hanging on a stalled connection

    api_endpoint = f"member/congress/{CONGRESS_NUM}"
    api_params = {
        "limit": cong_member_limit,
        "api_key": CONGRESS_API_KEY,
        "offset": '0',
    }

    # Ask for a single row first, only to read the total row count from the
    # pagination block of the response.
    api_page_params = api_params.copy()
    api_page_params['limit'] = '1'
    page_response = requests.get(CONGRESS_API_URL + api_endpoint, params=api_page_params, timeout=request_timeout)
    page_response.raise_for_status()  # surface HTTP errors (bad key, rate limit) instead of a KeyError below
    total_count = page_response.json()['pagination']['count']

    # Ceiling division: "count // size + 1" would request one extra, empty page
    # whenever the count is an exact multiple of the page size.
    page_count = -(-total_count // page_size)

    members = []
    for page_num in range(1, page_count + 1):
        print(f"Reading page {page_num}...")

        api_params['offset'] = (page_num - 1) * page_size
        response = requests.get(CONGRESS_API_URL + api_endpoint, params=api_params, timeout=request_timeout)
        response.raise_for_status()
        members.append(response.json()['members'])

    print(f"Formatting data...")

    # Flatten the per-page lists into one list of member records.
    members_dict = [item for sublist in members for item in sublist]

    # Save as json, creating the cache directory on first use.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'w') as json_file:
        print(f"Saving data...")
        json.dump(members_dict, json_file, indent=4)

    print(f"Data saved at {filepath}")

# Create and process df.
print(f"Loading dataframe...")

df_cong_members = pd.read_json(filepath)

print(f"Finished.")

showdf(df_cong_members)

#### Extract WM members

**Match Congress.gov records to CampaignFin18 records**
<br>Because the two sources share no common identifier and their name formats are inconsistent, we match on last name, state abbreviation, and party abbreviation, which should be accurate enough.

In [None]:
# Map states to abbreviations.
state_abbrev = pd.read_csv('../../data/state_abbrev.csv')
state_abbrev = state_abbrev.rename(columns={'State': 'state', 'Abbreviation': 'state_abbrev'})
showdf(state_abbrev)

In [None]:
# Remove accents from names.
import unicodedata
def remove_accents(input_str):
    """Return ``input_str`` with combining accent marks removed.

    The string is decomposed with Unicode NFKD normalization, then every
    character whose Unicode category is ``Mn`` (nonspacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# df_cong_members is created a few cells up.
# The derived columns are added only once: the presence of 'last_name' marks
# the frame as already processed, so re-running this cell changes nothing.
if 'last_name' not in df_cong_members.columns:
    df_cong_members = df_cong_members.merge(state_abbrev, on='state', how='inner')
    # Keep the text before the first comma of 'name' and strip its accents.
    df_cong_members['last_name'] = [
        remove_accents(full_name.split(",")[0]) for full_name in df_cong_members['name']
    ]
    # Party abbreviation is the first character of the party name.
    df_cong_members['party_abbrev'] = [party[:1] for party in df_cong_members['partyName']]

showdf(df_cong_members)

In [None]:
# Attach each member's Congress.gov bioguideId by matching on last name,
# state abbreviation and party abbreviation.
# Guarded so the merge only happens once: merging a second time would yield
# 'bioguideId_x' / 'bioguideId_y' and remove the plain 'bioguideId' column
# that later cells read.
if 'bioguideId' not in df_wm_members.columns:
    df_wm_members = df_wm_members.merge(
        df_cong_members[['last_name', 'state_abbrev', 'party_abbrev', 'bioguideId']],
        on=['last_name', 'state_abbrev', 'party_abbrev'],
        how='left'
    )

# A left join keeps unmatched members with a missing bioguideId; report them
# so they are not silently carried into the API lookups.
unmatched_members = int(df_wm_members['bioguideId'].isna().sum())
if unmatched_members:
    print(f"{unmatched_members} member(s) have no matching bioguideId.")

showdf(df_wm_members)

#### Find bills [co]sponsored by WM members

**Make into one function**

In [None]:
filepath = '../../data/congress_api/congress_sponsors116.json'
limit = 250  # maximum number of items requested per API page

if not os.path.exists(filepath):
    print(f"Data source at {filepath} not found, generating from Congress.gov's remote API..")

    page_size = int(limit)

    # Members that could not be matched to a Congress.gov record have a missing
    # bioguideId; there is no valid member endpoint for them, so skip them.
    matched_members = df_wm_members[df_wm_members['bioguideId'].notna()]
    skipped_members = len(df_wm_members) - len(matched_members)
    if skipped_members:
        print(f"Skipping {skipped_members} member(s) without a bioguideId.")

    # Get sponsored bills for each member.
    rows = []
    for member_num, (_, row) in enumerate(matched_members.iterrows(), start=1):
        print(f"Processing member {member_num} of {len(matched_members)}...")

        bioguide_id = row['bioguideId']
        api_endpoint = f"member/{bioguide_id}/sponsored-legislation"
        api_params = {
            "limit": limit,
            "api_key": CONGRESS_API_KEY,
            "offset": '0',
        }

        # Ask for a single item just to read the total count, then derive the
        # number of pages with ceiling division (no extra empty page when the
        # count is an exact multiple of the page size).
        api_page_params = api_params.copy()
        api_page_params['limit'] = '1'
        page_response = requests.get(CONGRESS_API_URL + api_endpoint, params=api_page_params)
        page_response.raise_for_status()  # fail with the HTTP error, not a KeyError below
        total_items = page_response.json()['pagination']['count']
        page_count = (total_items + page_size - 1) // page_size

        for i in range(page_count):
            print(f"Reading page {i + 1}...")

            api_params['offset'] = i * page_size
            response = requests.get(CONGRESS_API_URL + api_endpoint, params=api_params)
            response.raise_for_status()
            items = response.json().get('sponsoredLegislation', [])

            # Tag every bill with the member it was fetched for.
            for item in items:
                item['member_cid'] = row['cid__cands18']
                item['member_bioguideId'] = bioguide_id

            # Extend (not append) so `rows` stays one flat list of bill dicts.
            rows.extend(items)

    print(f"Formatting data...")

    # `rows` is already a flat list of dicts, one per sponsored bill.
    rows_dict = rows

    # Save as json.
    with open(filepath, 'w') as json_file:
        print(f"Saving data...")
        json.dump(rows_dict, json_file, indent=4)

    print(f"Data saved at {filepath}")

# Create and process df.
print(f"Loading dataframe...")

df_wm_sponsors = pd.read_json(filepath)

print(f"Finished.")

**CURRENTLY THIS CONTAINS THE SAME OUTPUT AS ABOVE!!!**

In [None]:
filepath = '../../data/congress_api/congress_cosponsors116.json'
limit = 250  # maximum number of items requested per API page

if not os.path.exists(filepath):
    print(f"Data source at {filepath} not found, generating from Congress.gov's remote API..")

    page_size = int(limit)

    # Members that could not be matched to a Congress.gov record have a missing
    # bioguideId; there is no valid member endpoint for them, so skip them.
    matched_members = df_wm_members[df_wm_members['bioguideId'].notna()]
    skipped_members = len(df_wm_members) - len(matched_members)
    if skipped_members:
        print(f"Skipping {skipped_members} member(s) without a bioguideId.")

    # Get cosponsored bills for each member.
    rows = []
    for member_num, (_, row) in enumerate(matched_members.iterrows(), start=1):
        print(f"Processing member {member_num} of {len(matched_members)}...")

        bioguide_id = row['bioguideId']
        api_endpoint = f"member/{bioguide_id}/cosponsored-legislation"
        api_params = {
            "limit": limit,
            "api_key": CONGRESS_API_KEY,
            "offset": '0',
        }

        # Ask for a single item just to read the total count, then derive the
        # number of pages with ceiling division (no extra empty page when the
        # count is an exact multiple of the page size).
        api_page_params = api_params.copy()
        api_page_params['limit'] = '1'
        page_response = requests.get(CONGRESS_API_URL + api_endpoint, params=api_page_params)
        page_response.raise_for_status()  # fail with the HTTP error, not a KeyError below
        total_items = page_response.json()['pagination']['count']
        page_count = (total_items + page_size - 1) // page_size

        for i in range(page_count):
            print(f"Reading page {i + 1}...")

            api_params['offset'] = i * page_size
            response = requests.get(CONGRESS_API_URL + api_endpoint, params=api_params)
            response.raise_for_status()
            items = response.json().get('cosponsoredLegislation', [])

            # Tag every item in 'cosponsoredLegislation' with the member it
            # was fetched for.
            for item in items:
                item['member_cid'] = row['cid__cands18']
                item['member_bioguideId'] = bioguide_id

            # Extend (not append) so `rows` stays one flat list of bill dicts.
            rows.extend(items)

    print(f"Formatting data...")

    # `rows` is already a flat list of dicts, one per cosponsored bill.
    rows_dict = rows

    # Save as json.
    with open(filepath, 'w') as json_file:
        print(f"Saving data...")
        json.dump(rows_dict, json_file, indent=4)

    print(f"Data saved at {filepath}")

# Create and process df.
print(f"Loading dataframe...")

df_wm_cosponsors = pd.read_json(filepath)

print(f"Finished.")

In [None]:
# Limit both frames to the 116th Congress and drop rows without a bill
# number (some bill numbers are NaN).
# The cosponsor frame is derived from df_wm_cosponsors; it was previously
# built from df_wm_sponsors, which made the two frames identical.
df_wm_sponsors116 = (
    df_wm_sponsors[df_wm_sponsors['congress'] == CONGRESS_NUM]
    .dropna(subset=['number'])
    .reset_index(drop=True)
)
df_wm_cosponsors116 = (
    df_wm_cosponsors[df_wm_cosponsors['congress'] == CONGRESS_NUM]
    .dropna(subset=['number'])
    .reset_index(drop=True)
)

In [None]:
# df_wm_sponsors116_copy = df_wm_sponsors116.copy()

In [None]:
# showdf(df_wm_sponsors116_copy)

#### Find details of each [co]sponsored bill

**Make into one function**

In [None]:
congress_chamber = 'hr'
filepath = '../../data/congress_api/congress_wm_spons_bills116.json'

if not os.path.exists(filepath):
    print(f"Data source at {filepath} not found, generating from Congress.gov's remote API..")

    total_bills = len(df_wm_sponsors116)
    # Report progress roughly every 1%; at least 1 so the modulo below never
    # divides by zero when there are fewer than 100 bills.
    progress_step = max(1, total_bills // 100)

    # Get details for each sponsored bill.
    # NOTE(review): every lookup uses the 'hr' bill type. If the sponsored
    # legislation also contains other types (e.g. resolutions), their numbers
    # would resolve to the wrong bill -- confirm against the source data.
    rows = []
    bill_cache = {}  # bill number -> bill dict (or None), so each bill is requested once
    for position, (_, row) in enumerate(df_wm_sponsors116.iterrows()):
        if position % progress_step == 0:
            print(f"Processing bill {position + 1} of {total_bills}...")

        bill_num = int(row['number'])
        if bill_num not in bill_cache:
            api_endpoint = f"bill/{CONGRESS_NUM}/{congress_chamber}/{bill_num}"
            api_params = {
                "api_key": CONGRESS_API_KEY,
            }
            response = requests.get(CONGRESS_API_URL + api_endpoint, params=api_params)
            bill_cache[bill_num] = response.json().get('bill')
            if bill_cache[bill_num] is None:
                # One missing bill should not discard the whole pull.
                print(f"No bill returned for {api_endpoint} (HTTP {response.status_code}); skipping.")

        if bill_cache[bill_num] is not None:
            rows.append(bill_cache[bill_num])

    print(f"Formatting data...")

    # Each entry of `rows` is a single bill dict, so the list is already flat.
    rows_dict = rows

    # Save as json.
    with open(filepath, 'w') as json_file:
        print(f"Saving data...")
        json.dump(rows_dict, json_file, indent=4)

    print(f"Data saved at {filepath}")

# Create and process df.
print(f"Loading dataframe...")

df_wm_spons_bills = pd.read_json(filepath)

print(f"Finished.")

showdf(df_wm_spons_bills)

In [None]:
congress_chamber = 'hr'
filepath = '../../data/congress_api/congress_wm_cospons_bills116.json'

if not os.path.exists(filepath):
    print(f"Data source at {filepath} not found, generating from Congress.gov's remote API..")

    total_bills = len(df_wm_cosponsors116)
    # Report progress roughly every 1%; at least 1 so the modulo below never
    # divides by zero when there are fewer than 100 bills.
    progress_step = max(1, total_bills // 100)

    # Get details for each cosponsored bill.
    # NOTE(review): every lookup uses the 'hr' bill type. If the cosponsored
    # legislation also contains other types (e.g. resolutions), their numbers
    # would resolve to the wrong bill -- confirm against the source data.
    rows = []
    bill_cache = {}  # bill number -> bill dict (or None), so each bill is requested once
    for position, (_, row) in enumerate(df_wm_cosponsors116.iterrows()):
        if position % progress_step == 0:
            print(f"Processing bill {position + 1} of {total_bills}...")

        bill_num = int(row['number'])
        if bill_num not in bill_cache:
            api_endpoint = f"bill/{CONGRESS_NUM}/{congress_chamber}/{bill_num}"
            api_params = {
                "api_key": CONGRESS_API_KEY,
            }
            response = requests.get(CONGRESS_API_URL + api_endpoint, params=api_params)
            bill_cache[bill_num] = response.json().get('bill')
            if bill_cache[bill_num] is None:
                # One missing bill should not discard the whole pull.
                print(f"No bill returned for {api_endpoint} (HTTP {response.status_code}); skipping.")

        if bill_cache[bill_num] is not None:
            rows.append(bill_cache[bill_num])

    print(f"Formatting data...")

    # Each entry of `rows` is a single bill dict, so the list is already flat.
    rows_dict = rows

    # Save as json.
    with open(filepath, 'w') as json_file:
        print(f"Saving data...")
        json.dump(rows_dict, json_file, indent=4)

    print(f"Data saved at {filepath}")

# Create and process df.
print(f"Loading dataframe...")

df_wm_cospons_bills = pd.read_json(filepath)

print(f"Finished.")

showdf(df_wm_cospons_bills)

#### Join [co]sponsors to bill details

In [None]:
# # I'm trying to extract the "name" from "policyArea"
# df_wm_spons_bills.iloc[1]['policyArea']

In [None]:
# Derive the sponsor id once. After the column subset below the 'sponsors'
# column no longer exists, so without this guard a re-run raises KeyError.
if 'bioguidID' not in df_wm_spons_bills.columns:
    # The column is spelled 'bioguidID' (sic); the merge with df_wm_members
    # uses this exact name, so it is kept as is.
    # Rows whose 'sponsors' value is not a non-empty list get None.
    df_wm_spons_bills['bioguidID'] = df_wm_spons_bills['sponsors'].apply(
        lambda sponsors: sponsors[0]['bioguideId'] if isinstance(sponsors, list) and sponsors else None
    )

    # # TRYING TO EXTRACT POLICY AREA!
    # df_wm_spons_bills['policyArea'] = df_wm_spons_bills['policyArea'].apply(lambda x: str(x))
    # df_wm_spons_bills['policyArea_clean'] = df_wm_spons_bills['policyArea'].apply(lambda x: json.loads(x)['name'] if pd.notnull(x) else None)

    df_wm_spons_bills = df_wm_spons_bills[['congress', 'introducedDate', 'number', 'title', 'bioguidID']]

showdf(df_wm_spons_bills)

#### Join candidates to [co]sponsors

In [None]:
# Sponsored bills -> sponsoring member -> that member's committee rows.
# Both joins are inner joins, so bills without a matching member or committee
# are dropped.
df_wm_candspons_bills = (
    df_wm_spons_bills
    .merge(df_wm_members, left_on='bioguidID', right_on='bioguideId', how='inner')
    .merge(df_cmtes18, left_on='cid__cands18', right_on='recipid__cmtes18', how='inner')
)
showdf(df_wm_candspons_bills)

### Bills to Issues, Policy Areas

#### Bills from 116th Congress

In [None]:
# Build df_lob_bills116 only if it does not exist yet in this kernel.
try:
    df_lob_bills116.head()
except NameError:  # only "not defined yet" should trigger a rebuild
    print("Creating dataframe...")
    # Work on a temporary name and assign df_lob_bills116 at the very end, so
    # a failure part-way through does not leave a half-built frame that the
    # try-branch would then accept on the next run.
    has_congress_and_hr = (
        df_lob_bills['congno__lob_bills'].notna()
        & df_lob_bills['b_id__lob_bills'].str.startswith('hr', na=False)
    )
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of df_lob_bills.
    lob_bills_hr = df_lob_bills[has_congress_and_hr].copy()
    lob_bills_hr['congno__lob_bills'] = lob_bills_hr['congno__lob_bills'].astype(int)
    lob_bills_hr = lob_bills_hr[lob_bills_hr['congno__lob_bills'] == CONGRESS_NUM]
    lob_bills_hr = lob_bills_hr[~ lob_bills_hr['b_id__lob_bills'].str.contains('hres')].copy()
    # Extract bill number text between "hr" and "-116".
    lob_bills_hr['b_id_num__lob_bills'] = lob_bills_hr['b_id__lob_bills'].apply(lambda x: re.search(r'[a-zA-Z]+(\d+)-', x).group(1))
    # NOTE: the extracted number is a string, so this sort is lexicographic
    # ("10" sorts before "2").
    lob_bills_hr = lob_bills_hr.sort_values(by='b_id_num__lob_bills', ascending=True)
    df_lob_bills116 = lob_bills_hr
    print("Finished")

showdf(df_lob_bills116)

#### Lob issues
filtered Jan-3-2019 to Jan-3-2021

In [None]:
# Lobbying issues reported in 2019 or 2020.
# The filter is applied to the copy whose year column was cast to int;
# filtering the original df_lob_issue instead would throw that cast away.
df_lob_issue116 = df_lob_issue.copy()
df_lob_issue116['year__lob_issue'] = df_lob_issue116['year__lob_issue'].astype(int)
df_lob_issue116 = df_lob_issue116[df_lob_issue116['year__lob_issue'].isin([2019, 2020])]

#### Join bills with issues

In [None]:
# Pair every lobbied bill with its issue record via the shared si_id key;
# the inner join keeps only bills that have a matching issue row.
df_lob_bills_issues116 = df_lob_bills116.merge(
    df_lob_issue116,
    left_on='si_id__lob_bills',
    right_on='si_id__lob_issue',
    how='inner',
)
showdf(df_lob_bills_issues116)

## Ways and Means Visualization
The intention is to create a very large, configurable network in which specified nodes/classes can be toggled in various ways to express different network concepts. This should really be a class object.

In [None]:
# ============================================================
# SETUP GRAPH
# ============================================================

# Add transactional cand_pac-noncand_pac edges.
# NOTE: G is a directed graph. Algorithms that are only defined for
# undirected graphs need G.to_undirected().
G = nx.from_pandas_edgelist(df_candpacs_to_pactrans, source='ultorg__noncand_cmtes18', target='ultorg__cand_cmtes18',
                            edge_attr='amount__pacs18', create_using=nx.DiGraph())

# Add Democrat cand_pac-cand_pac edges.
for _, row in df_dem_candpacs_cross.iterrows():
    G.add_edge(row['ultorg_x__cand_cmtes18'], row['ultorg_y__cand_cmtes18'], weight=1, attribute='D')

# Add Republican cand_pac-cand_pac edges.
for _, row in df_rep_candpacs_cross.iterrows():
    G.add_edge(row['ultorg_x__cand_cmtes18'], row['ultorg_y__cand_cmtes18'], weight=1, attribute='R')

# Add [co]sponsored bills and details.
# NOTE(review): bill edges attach to 'cmteid__cmtes18' values while the PAC
# edges use 'ultorg__*' values as node keys -- confirm these refer to the
# same nodes, otherwise bills hang off new, separate nodes.
for _, row in df_wm_candspons_bills.iterrows():
    G.add_edge(row['number'], row['cmteid__cmtes18'], weight=1, attribute=row['title'])

# # Add issues to bills.
# for _, row in df_lob_bills_issues116.iterrows():
#     G.add_edge(row['si_id__lob_issue'], row['b_id_num__lob_bills'], weight=1, attribute=row['issue__lob_issue'])

# Edge weights ~ dollar amounts (only edges that carry an amount).
edge_weights = [
    attributes['amount__pacs18']
    for _, _, attributes in G.edges(data=True)
    if 'amount__pacs18' in attributes
]

# Spread weight range.
min_thickness, max_thickness = 0.0025, 0.75
min_weight, max_weight = min(edge_weights), max(edge_weights)
threshold_1 = 100000  # Start of steep outliers
threshold_2 = 7000    # Flatten smaller values below this
# Width for edges without a dollar amount (party cross-links, bill links).
# Tunable; chosen to be thin but visible.
non_monetary_edge_width = 0.1


# Compress weights.
def transform_weight(weight):
    """Map a dollar amount to a compressed value using three bands.

    Amounts above ``threshold_1`` are log-scaled against ``max_weight``;
    amounts in ``(threshold_2, threshold_1]`` are scaled linearly within that
    band; smaller amounts are scaled linearly between ``min_weight`` and
    ``threshold_2``.
    """
    if weight > threshold_1:
        # Heavy punishment for extreme amounts.
        return np.log1p(weight) / np.log1p(max_weight)
    elif weight > threshold_2:
        # Moderate punishment for moderate amounts.
        return (weight - threshold_2) / (threshold_1 - threshold_2)
    else:
        # Linear scaling for minuscule amounts. The span is zero when the
        # smallest amount equals threshold_2; avoid dividing by zero then.
        low_span = threshold_2 - min_weight
        return (weight - min_weight) / low_span if low_span else 0.0


def edge_width_for(amount):
    """Convert a dollar amount into a line width within the thickness range."""
    # Exponent < 1 reduces larger values more.
    normalized = transform_weight(amount) ** 0.5
    return min_thickness + (max_thickness - min_thickness) * normalized


# Normalized (piecewise-scaled, then square-rooted) weights of the monetary edges.
normalized_weights = [transform_weight(weight) ** 0.5 for weight in edge_weights]

# One width per edge, in G.edges() order. Drawing takes widths positionally,
# so every edge needs an entry -- including the ones without an amount.
edge_widths = []
for _, _, attributes in G.edges(data=True):
    if 'amount__pacs18' in attributes:
        edge_widths.append(edge_width_for(attributes['amount__pacs18']))
    else:
        edge_widths.append(non_monetary_edge_width)

# Node pac-to-candidate transaction attributes.
for index, row in df_candpacs_to_pactrans.iterrows():
    # Party info
    G.nodes[row['ultorg__cand_cmtes18']]['party__cands18'] = row['party__cands18']
    # Make sure the contributor node has the key too (None unless already set).
    G.nodes[row['ultorg__noncand_cmtes18']].setdefault('party__cands18', None)
    # Link contributor non-candidate pacs to candidate pacs.
    G.nodes[row['ultorg__noncand_cmtes18']]['recipcode__noncand_cmtes18'] = row['recipcode__noncand_cmtes18']

# Add [co]sponsored bills attributes.
for index, row in df_wm_candspons_bills.iterrows():
    # Party info
    G.nodes[row['number']]['party__cands18'] = row['party__cands18']

# Party colors take precedence; otherwise color by contributor recipient code.
party_colors = {'D': 'skyblue', 'R': 'lightcoral'}
recipcode_colors = {
    'PB': '#cc00cc',
    'PL': '#33cc33',
    'PI': '#ff9900',
    'OB': '#ff66ff',
    'OL': '#99ff99',
    'OI': '#ffcc66',
}
color_map = []
for _, node_attrs in G.nodes(data=True):
    party_color = party_colors.get(node_attrs.get('party__cands18'))
    if party_color is not None:
        color_map.append(party_color)
    else:
        color_map.append(recipcode_colors.get(node_attrs.get('recipcode__noncand_cmtes18'), 'gray'))


# ============================================================
# DISPLAY GRAPH
# ============================================================

# Layouts to render; uncomment entries to produce one figure per layout.
layouts = {
    # "random": nx.random_layout,
    # "spring": nx.spring_layout,
    "kamada_kawai": nx.kamada_kawai_layout,
    # "graphviz": nx.nx_agraph.graphviz_layout,
    # "circular": nx.circular_layout,
    # "spectral": nx.spectral_layout,
}

# Size every node by the total dollar amount on the edges returned by
# G.edges(node) (for a directed graph these are the node's outgoing edges).
node_sizes = {}
for node in G.nodes():
    total_contribution = sum(
        data['amount__pacs18'] for _, _, data in G.edges(node, data=True) if 'amount__pacs18' in data
    )
    # Clamp at zero so a negative net amount cannot produce NaN from sqrt.
    node_sizes[node] = 100 + 10 * np.sqrt(max(total_contribution, 0))

# For graphviz_layout:
# $ brew install graphviz (bash terminal)
# conda install -c conda-forge pygraphviz (in notebook cell)
# pip install pydot (in notebook cell)

for layout_name, layout_func in layouts.items():
    plt.figure(figsize=(40, 30))

    # The graphviz layout is registered under the key "graphviz" above and
    # takes the extra `prog` argument.
    if layout_name == 'graphviz':
        pos = layout_func(G, prog='dot')
    else:
        pos = layout_func(G)

    nx.draw(G, pos, with_labels=False, node_color=color_map, font_size=7, font_color="black",
            width=edge_widths, node_size=[node_sizes.get(node, 300) for node in G.nodes()], bbox=dict(facecolor='white', edgecolor='black',
            boxstyle='round,pad=0.3'))

    # Only PAC to Candidate PAC edges have a weight derived from the amount field.
    # edge_labels = {
    #     (u, v): f"{d['amount__pacs18']}"
    #     for u, v, d in G.edges(data=True)
    #     if 'amount__pacs18' in d
    # }
    # nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=5)

    plt.title("PACs to Candidates – " + layout_name + " layout", fontsize=50)
    plt.savefig('../../outputs/viz_' + layout_name + '.png', format='png', dpi=300)
    plt.show()

## Analysis

### Network Analysis

A starting point for exploring which features and patterns might be useful for making predictions or finding other insights.

In [None]:
# Transitivity is the graph-wide clustering coefficient:
# the ratio of closed triplets to all triplets.
transitivity = nx.transitivity(G)
print("Transitivity:", transitivity)

In [None]:
# Global clustering coefficient (computed with nx.transitivity).
global_clustering = nx.transitivity(G)
print("Global Clustering Coefficient:", global_clustering)

In [None]:
# Mean of the per-node clustering coefficients.
average_clustering = nx.average_clustering(G)
print("Average Clustering Coefficient:", average_clustering)

In [None]:
# Density: the fraction of possible edges that are present, to 3 decimals.
density = round(nx.density(G), 3)
print("Density:", density)

In [None]:
# Network diameter.
# G is built as a DiGraph and nx.is_connected is not implemented for directed
# graphs, so connectivity and diameter are measured on an undirected copy.
G_undirected = G.to_undirected()
if nx.is_connected(G_undirected):
    print("Diameter:", nx.diameter(G_undirected))
else:
    print("Graph is not connected.")

In [None]:
# Average path length
# G is built as a DiGraph and nx.is_connected is not implemented for directed
# graphs, so the check and the path length use an undirected copy.
G_undirected = G.to_undirected()
if nx.is_connected(G_undirected):
    print("Average Path Length:", round(nx.average_shortest_path_length(G_undirected), 3))
else:
    print("Graph is not connected.")

In [None]:
# Radius: the minimum eccentricity over all nodes.
# G is built as a DiGraph and nx.is_connected is not implemented for directed
# graphs, so the check and the radius use an undirected copy.
G_undirected = G.to_undirected()
if nx.is_connected(G_undirected):
    print("Radius:", nx.radius(G_undirected))
else:
    print("Graph is not connected.")

In [None]:
# Number of triangles
# nx.triangles is not implemented for directed graphs and G is a DiGraph, so
# count on an undirected copy. Each triangle is counted once per corner node,
# hence the division by 3.
triangles_per_node = nx.triangles(G.to_undirected())
print("Total Triangles:", sum(triangles_per_node.values()) // 3)

In [None]:
# Degree assortativity coefficient, rounded to 3 decimals.
assortativity = round(nx.degree_assortativity_coefficient(G), 3)
print("Assortativity Coefficient:", assortativity)

In [None]:
# Connected components
# nx.connected_components is not implemented for directed graphs and G is a
# DiGraph, so components are counted on an undirected copy.
print("Connected Components:", nx.number_connected_components(G.to_undirected()))

In [None]:
# Edge connectivity: the smallest number of edges whose removal
# disconnects the graph.
edge_connectivity = nx.edge_connectivity(G)
print("Edge Connectivity:", edge_connectivity)

### Contribution analysis

#### Fundraising figures

In [None]:
# Split the contribution rows by recipient party. "All others" is every row
# that is neither 'D' nor 'R'.
is_dem = df_full_network['party__cands18'] == 'D'
is_rep = df_full_network['party__cands18'] == 'R'
party_groups = [
    ("Overall, members", df_full_network),
    ("Democrats", df_full_network[is_dem]),
    ("Republicans", df_full_network[is_rep]),
    ("All others", df_full_network[~(is_dem | is_rep)]),
]

# Num contributions
for label, subset in party_groups:
    print(f"{label} received {len(subset):,} contributions")

print('-' * 100)

# Money raised
for label, subset in party_groups:
    print(f"{label} raised ${subset['amount__pacs18'].sum():,}")

print('-' * 100)

# Averages (total amount divided by the number of contribution rows).
for label, subset in party_groups:
    contribution_count = len(subset)
    # An empty group reports 0 instead of dividing by zero.
    average_amount = round(subset['amount__pacs18'].sum() / contribution_count, 2) if contribution_count else 0
    print(f"{label} raised ${average_amount:,} on average")

Extreme outliers are present. For example, Steven Horsford (running for an open seat) has a single, massive \\$2,663,800 donation that vastly overwhelms other transactions. After that, Vern Buchanan has a rather large \\$588,015 donation. Transactions eventually settle at around \\$5,000 per contribution after the first 155 contributions.

#### Distribution of contributions
Earnings are extremely right-skewed before the curve goes linear.

In [None]:
# All contributions ordered from largest to smallest amount, re-indexed so
# the index is the rank (0 = largest).
df_all_amounts = (
    df_full_network
    .sort_values(by='amount__pacs18', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Preview the contributions sorted by amount (largest first).
showdf(df_all_amounts)

In [None]:
from matplotlib.ticker import FuncFormatter

# Format ticks as dollars.
def dollar_formatter(x, pos):
    """Render tick value ``x`` as whole dollars with thousands separators.

    ``pos`` is part of the signature but is not used.
    """
    return '${:,.0f}'.format(x)

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# One panel per cutoff: the N largest contributions, plotted by rank.
for ax, top_n in zip(axes, (10, 50, 500)):
    ax.plot(df_all_amounts.index[:top_n], df_all_amounts['amount__pacs18'][:top_n])
    ax.set_xlabel('Num of individual contributions')
    ax.set_ylabel('Contribution dollar amount')
    ax.set_title(f'Top {top_n} contributions')
    ax.yaxis.set_major_formatter(FuncFormatter(dollar_formatter))

plt.tight_layout()
plt.show()

#### Top recipients
Receipts represent all PAC-to-candidate transfers.

In [None]:
# Rank every transaction in df_graph_network from largest to smallest.
n_top_earners = 10  # how many distinct recipients to list below
df_all_amounts = df_graph_network.sort_values(by='amount__pacs18', ascending=False).reset_index(drop=True)
print(f"Number of transactions:\n{len(df_all_amounts)}\n")
print(f"Seems the top earner is going for an open seat!\n")
# The label is derived from the same constant as the slice so the two cannot
# disagree (it used to say "five" while listing ten names). Recipients are
# ordered by their single largest transaction.
print(f"The top {n_top_earners} earners are:\n{df_all_amounts['ultorg__cand_cmtes18'].unique()[:n_top_earners]}\n")
# Amount column only, sorted ascending.
df_top_amounts = df_all_amounts.sort_values(by='amount__pacs18')[['amount__pacs18']].reset_index(drop=True)
print(f"The largest contribution:\n{df_top_amounts['amount__pacs18'].max()}\n")
print(f"The top ten largest contributions:\n{df_top_amounts.nlargest(10, 'amount__pacs18')}")