In [1]:
import numpy as np
import pandas as pd

We will prepare our data and save it to CSV files so that we do not have to recalculate it every time we want to use it.

1. FedReporter data needs to be aggregated at the congressional district level from 2010 to 2018
2. Congressional Term data needs to be transformed to show the political party makeup of each district from 2010 to 2018

In [2]:
# Two-letter code -> full name lookup. Besides the states it contains DC,
# several territories and an 'NA' entry that maps to 'National'.
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming',
}

# Reverse lookup: full name -> two-letter code.
state_name_map = dict(zip(states.values(), states.keys()))

In [3]:
# Prepping fed reporter data to be used for analysis
def to_cd(row):
    """Build a district label such as ``"CA-12"`` from one FedRePORTER row.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide ``"ORGANIZATION_STATE"`` (a string) and
        ``"CONGRESSIONAL_DISTRICT"`` (an integer-valued number).

    Returns
    -------
    str
        ``"<state>-AL"`` when the district number is 0, otherwise
        ``"<state>-NN"`` with the district zero-padded to two digits.

    Raises
    ------
    ValueError
        If the district is not an integer-valued number.
    """
    raw_district = row['CONGRESSIONAL_DISTRICT']
    # Coerce to int before formatting: the ``d`` format code rejects floats,
    # so a value such as 5.0 would otherwise raise instead of giving "05".
    district = int(raw_district)
    if district != raw_district:
        # Refuse to silently truncate (e.g. 5.7) or accept non-numeric input.
        raise ValueError(
            "CONGRESSIONAL_DISTRICT must be an integer-valued number, got {!r}".format(raw_district))

    suffix = "AL" if district == 0 else "{:02d}".format(district)
    return row["ORGANIZATION_STATE"] + "-" + suffix

# Load one FedRePORTER project file per fiscal year (2010-2018), keep NSF rows
# for US organizations with a positive cost, and total FY_TOTAL_COST per
# (district, state, department, agency, fiscal year).
group_cols = ["CONGRESSIONAL_DISTRICT", "ORGANIZATION_STATE", "DEPARTMENT", "AGENCY", "FY"]

# Collect the yearly summaries in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame
# on every iteration.
yearly_totals = []
for year in range(2010, 2019):
    temp = pd.read_csv("/home/jovyan/Yandex.Disk/BigDataPubPol/data/projects/FedRePORTER_PRJ_C_FY{}.csv".format(year),
                       encoding='utf-8', skipinitialspace=True, low_memory=False)

    keep = ((temp['ORGANIZATION_COUNTRY'] == 'UNITED STATES')
            & (temp['DEPARTMENT'] == 'NSF')
            & (temp['FY_TOTAL_COST'] > 0.0))
    temp = temp.loc[keep, group_cols + ["FY_TOTAL_COST"]].groupby(group_cols, as_index=False).sum()

    temp["CONGRESSIONAL_DISTRICT"] = pd.to_numeric(temp["CONGRESSIONAL_DISTRICT"])
    temp["FY_TOTAL_COST"] = pd.to_numeric(temp["FY_TOTAL_COST"])
    yearly_totals.append(temp)

# Default concat keeps each yearly frame's own row labels, exactly as the
# repeated append did.
fedrep = pd.concat(yearly_totals)

# to_cd zero-pads with an integer format, so make the district column integral.
fedrep['CONGRESSIONAL_DISTRICT'] = fedrep['CONGRESSIONAL_DISTRICT'].astype(int)
fedrep["CD"] = fedrep.apply(to_cd, axis=1)

fedrep.to_csv('fedReporter_sum.csv', index=False, encoding='utf8')

fedrep.head()

Unnamed: 0,CONGRESSIONAL_DISTRICT,ORGANIZATION_STATE,DEPARTMENT,AGENCY,FY,FY_TOTAL_COST,CD
0,0,AK,NSF,NSF,2010,51603573.0,AK-AL
1,0,AS,NSF,NSF,2010,25504.0,AS-AL
2,0,CA,NSF,NSF,2010,1733993.0,CA-AL
3,0,CO,NSF,NSF,2010,19500.0,CO-AL
4,0,CT,NSF,NSF,2010,2547090.0,CT-AL


In [8]:
# Load the congressional terms table. The frame is displayed at the end of the
# cell; a bare .head() here would be discarded, because only a cell's last
# expression is rendered.
congress_terms = pd.read_csv('/home/jovyan/Yandex.Disk/Personal/data/congress_111_116.csv',float_precision='round_trip')

def create_cd_col(row):
    """Return the district key for one congressional-term row.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide ``'chamber'`` (a string), ``'state'`` (a full state name
        that is a key of ``state_name_map``) and, for House rows,
        ``'district'`` (``'At Large'`` or something ``int()`` accepts).

    Returns
    -------
    str
        ``"<ST>-AL"`` for an at-large House seat, ``"<ST>-NN"`` for a numbered
        House district, ``"<ST>-XX"`` for a Senate seat, and ``"Not found"``
        for any other chamber value.

    Raises
    ------
    KeyError
        If the chamber is House or Senate and ``row['state']`` is not a key of
        ``state_name_map``.
    """
    # Normalize once so both chamber comparisons treat padded values the same
    # way; previously only the House check stripped whitespace.
    chamber = row['chamber'].strip()
    if chamber not in ('House', 'Senate'):
        return "Not found"

    state_abbr = state_name_map[row['state']]
    if chamber == 'Senate':
        return state_abbr + "-" + "XX"

    district = row['district']
    if isinstance(district, str):
        # Tolerate padding around the 'At Large' marker as well.
        district = district.strip()
    if district == 'At Large':
        return state_abbr + "-" + "AL"
    return state_abbr + "-" + "{:02d}".format(int(district))

# Label every term with its district key, then keep only the terms whose
# 'end' value is greater than 2009.
cd_labels = congress_terms.apply(create_cd_col, axis=1)
congress_terms = congress_terms.assign(CD=cd_labels)
congress_terms = congress_terms.loc[congress_terms['end'] > 2009]

congress_terms.head()

Unnamed: 0,name,url,party,state,district,chamber,start,end,CD
0,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Democratic,Hawaii,1,House,1991,2011.0,HI-01
2,"Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Republican,Louisiana,5,House,2015,inf,LA-05
3,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...,Democratic,New York,5,House,1993,2013.0,NY-05
5,"Adams, Alma S.",https://www.congress.gov/member/alma-adams/A00...,Democratic,North Carolina,12,House,2014,inf,NC-12
6,"Adams, Sandy",https://www.congress.gov/member/sandy-adams/A0...,Republican,Florida,24,House,2011,2013.0,FL-24


In [10]:
# For every House district key, count in how many of the years 2010-2018 the
# first matching term belongs to a Democrat, a Republican, or any other party.
count_columns = ['CD', 'Democratic', 'Republican', 'Other']
house_cds = congress_terms.loc[congress_terms['chamber'] == 'House', 'CD'].unique()

# Build plain records and create the frame once: DataFrame.append was removed
# in pandas 2.0, and appending one-row frames left every row with index 0.
records = []
for cd in house_cds:
    entry = {'CD': cd, 'Democratic': 0, 'Republican': 0, 'Other': 0}
    # Narrow to this district once instead of rescanning the full table for
    # each year.
    cd_terms = congress_terms[congress_terms['CD'] == cd]
    for yr in range(2010, 2019):
        serving = cd_terms[(cd_terms['start'] <= yr) & (cd_terms['end'] > yr)]
        if serving.empty:
            # No term covers this district/year, so it contributes nothing.
            # This explicit check replaces a bare ``except: pass`` around
            # ``.iloc[0]`` that also hid every unrelated error.
            continue
        rep = serving.iloc[0]
        party = rep['party'] if rep['party'] in ('Democratic', 'Republican') else 'Other'
        entry[party] += 1
    if entry['Democratic'] + entry['Republican'] + entry['Other'] > 0:
        records.append(entry)

# Passing the columns explicitly keeps them present when no record was
# collected, so the 'Total' computation below cannot raise KeyError.
party_counts = pd.DataFrame(records, columns=count_columns)
party_counts['Total'] = party_counts['Democratic'] + party_counts['Republican'] + party_counts['Other']

party_counts.to_csv('cd_111_116_party_counts.csv', index=False, encoding='utf8')

party_counts.head()

Unnamed: 0,CD,Democratic,Republican,Other,Total
0,HI-01,9,0,0,9
0,LA-05,0,9,0,9
0,NY-05,9,0,0,9
0,NC-12,9,0,0,9
0,FL-24,7,2,0,9
