In [1]:
import os
import pandas as pd
import numpy as np
from parsons import Table, Redshift
import json
import requests
import base64
import zipfile
import io
from urllib.parse import urlencode, quote_plus

# Shared Parsons handles for the cells below.
# NOTE(review): `rs` is not referenced by any later cell visible here; the
# uploads go through `Table.to_redshift` instead. Presumably kept for ad-hoc
# queries -- confirm before removing.
rs = Redshift()
# Empty Table instance; later cells only use it to call `from_dataframe`.
table = Table()

In [None]:
##### pylegiscan functions adapted from https://github.com/poliquin/pylegiscan/blob/master/pylegiscan/legiscan.py

class LegiScanError(Exception):
    """Raised when a LegiScan request fails or the API reports an error."""

class LegiScan:
    """Small client for the LegiScan pull API.

    State parameters should always be passed as USPS abbreviations.  Bill
    numbers and abbreviations are case insensitive.  Register for an API key
    at https://legiscan.com/legiscan
    """

    # HTTPS, because the API key travels in the query string.
    BASE_URL = 'https://api.legiscan.com/?key={0}&op={1}&{2}'

    # Seconds to wait on the server before giving up; without a timeout
    # `requests` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 60

    def __init__(self, apikey=None):
        """Create a client.

        Args:
            apikey: LegiScan API key.  When omitted, the key is read from the
                ``LEGISCAN_API_KEY`` environment variable.

        Raises:
            ValueError: if no key was passed and the environment variable is
                not set.
        """
        if apikey is None:
            apikey = os.getenv('LEGISCAN_API_KEY')
        if apikey is None:
            raise ValueError(
                'No API key: pass apikey or set LEGISCAN_API_KEY.')
        self.key = apikey.strip()

    def _url(self, operation, params=None):
        """Build a URL for querying the API.

        Args:
            operation: value of the ``op`` query parameter.
            params: ``None``, an already-encoded query string, or a mapping
                (or sequence of pairs) that is passed through ``urlencode``.
        """
        if params is None:
            query = ''
        elif isinstance(params, str):
            query = params
        else:
            query = urlencode(params)
        return self.BASE_URL.format(self.key, operation, query)

    def _get(self, url):
        """Fetch ``url`` and return the decoded JSON payload.

        Raises:
            LegiScanError: on a non-success HTTP status, or when the payload
                has ``status == "ERROR"``.
            requests.exceptions.RequestException: on connection problems or
                when the server does not answer within ``REQUEST_TIMEOUT``.
        """
        req = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if not req.ok:
            raise LegiScanError(
                'Request returned {0}: {1}'.format(req.status_code, url))
        data = json.loads(req.content)
        if data['status'] == "ERROR":
            raise LegiScanError(data['alert']['message'])
        return data

    def _master_list(self, operation, state, session_id):
        """Shared implementation of the two master-list operations."""
        if state is not None:
            params = {'state': state}
        elif session_id is not None:
            params = {'id': session_id}
        else:
            raise ValueError('Must specify session identifier or state.')
        data = self._get(self._url(operation, params))
        return list(data['masterlist'].values())

    def get_session_list(self, state):
        """Get list of available sessions for a state."""
        data = self._get(self._url('getSessionList', {'state': state}))
        return data['sessions']

    def get_dataset_list(self, state=None, year=None):
        """Get list of available datasets, optionally filtered by state
           and/or year.  Both filters are sent when both are given.
        """
        params = {}
        if state is not None:
            params['state'] = state
        if year is not None:
            params['year'] = year
        # An empty filter set becomes None so the URL matches the unfiltered
        # form of the request.
        data = self._get(self._url('getDatasetList', params or None))
        return data['datasetlist']

    def get_dataset(self, id, access_key):
        """Get a single dataset.

        Args:
            id: sent as the ``id`` query parameter (the notebook passes the
                ``session_id`` taken from a dataset-list entry).
            access_key: the ``access_key`` from the same dataset-list entry.

        Returns:
            The ``dataset`` member of the response.
        """
        url = self._url('getDataset', {'id': id, 'access_key': access_key})
        data = self._get(url)
        return data['dataset']

    def get_session_people(self, session_id=None):
        """Get the ``sessionpeople`` record for a session identifier.

        Raises:
            ValueError: if ``session_id`` is not given.
        """
        if session_id is None:
            raise ValueError('Must specify session identifier.')
        data = self._get(self._url('getSessionPeople', {'id': session_id}))
        return data['sessionpeople']

    def get_master_list(self, state=None, session_id=None):
        """Get list of bills for the current session in a state or for
           a given session identifier.  ``state`` wins when both are given.
        """
        return self._master_list('getMasterList', state, session_id)

    def get_master_list_raw(self, state=None, session_id=None):
        """Get list of bills for the current session in a state or for
           a given session identifier, optimized for change hash detection.
           ``state`` wins when both are given.
        """
        return self._master_list('getMasterListRaw', state, session_id)

    def get_bill(self, bill_id=None, state=None, bill_number=None):
        """Get primary bill detail information including sponsors, committee
           references, full history, bill text, and roll call information.

           This function expects either a bill identifier or a state and bill
           number combination.  The bill identifier is preferred, and required
           for fetching bills from prior sessions.
        """
        if bill_id is not None:
            params = {'id': bill_id}
        elif state is not None and bill_number is not None:
            params = {'state': state, 'bill': bill_number}
        else:
            raise ValueError('Must specify bill_id or state and bill_number.')
        return self._get(self._url('getBill', params))['bill']

    def get_roll_call(self, roll_call_id):
        """Roll call detail for individual votes and summary information."""
        data = self._get(self._url('getRollcall', {'id': roll_call_id}))
        return data['roll_call']

# Shared API client for the cells below.  No key is passed, so the
# constructor falls back to the LEGISCAN_API_KEY environment variable.
legis = LegiScan()

In [None]:
""" If datasets is not defined in the namespace, this cell will create one API call """

statecode = 'me'

# Only hit the API when the notebook has no dataset list yet, so re-running
# this cell is free.
try:
    datasets
except NameError:
    datasets = legis.get_dataset_list(state=statecode)

# Same idea: keep an existing value, otherwise take the session of the first
# dataset returned.
try:
    current_session_id
except NameError:
    current_session_id = datasets[0]['session_id']

In [None]:
""" Running this cell will create one API call """

""" Orignal bulk import: use session_id and session_name to extract all files
    for each dataset."""

# Bulk import: download every dataset archive and collect its bill, vote and
# people records.  This makes one getDataset API call per entry in `datasets`.
votes = []
bills = []
people = []
# Archive members that looked like records but could not be read; kept so
# data loss is visible instead of being silently swallowed.
skipped_files = []

for dataset in datasets:
    session_id = dataset['session_id']
    access_key = dataset['access_key']
    api_output = legis.get_dataset(session_id, access_key)
    # The archive arrives as a base64-encoded zip inside the JSON response.
    archive_bytes = base64.b64decode(api_output['zip'])

    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zipped:
        for member in zipped.namelist():
            if member.endswith('/'):
                continue  # directory entry, nothing to parse

            # The folder in the member path decides which record it holds.
            if '/bill/' in member:
                record_key, target = 'bill', bills
            elif '/vote/' in member:
                record_key, target = 'roll_call', votes
            elif '/people/' in member:
                record_key, target = 'person', people
            else:
                continue  # not one of the three record folders

            try:
                payload = json.loads(zipped.read(member).decode("utf-8"))
                record = payload[record_key]
                if record_key == 'person':
                    # Person records are tagged with the session they were
                    # downloaded for.
                    record['session_id'] = session_id
            except (ValueError, KeyError, TypeError):
                # ValueError covers both bad UTF-8 and malformed JSON.
                skipped_files.append(member)
                continue

            target.append(record)

if skipped_files:
    print('Skipped {0} unreadable dataset file(s): {1}'.format(
        len(skipped_files), skipped_files))

In [None]:
""" Unpack nested columns into separate tables + clean column names """

# Flatten each list of raw JSON records into its own DataFrame.
bill_df, vote_df, people_df = (
    pd.json_normalize(records) for records in (bills, votes, people)
)


def simplify_df(df):
    """Return a flattened copy of ``df`` suitable for loading as one table.

    Steps, in order:
      1. Drop every column whose first row holds a list (these are unpacked
         into their own tables elsewhere in the notebook).
      2. Rename the remaining columns to the part after the last ``.``.
      3. Replace empty-list cells and NaN with ``None``.

    The input frame is not modified.

    NOTE(review): a column counts as nested only if its FIRST row is a list,
    so a non-empty list in a later row is kept as-is.  Step 2 can also produce
    duplicate column names (e.g. ``a.id`` and ``id``).  Both behaviours are
    inherited unchanged -- confirm they are acceptable for the source data.
    """
    # Look at the first row by position: the index may not contain the label
    # 0 (filtered frame), and a frame without rows has no first value at all.
    if len(df):
        nested_cols = [
            col for col in df.columns if isinstance(df[col].iloc[0], list)
        ]
    else:
        nested_cols = []

    # Drop before renaming, while the original (possibly dotted) labels are
    # still valid.
    output = df.drop(columns=nested_cols)
    output = output.rename(columns=lambda name: name.split('.')[-1])

    def _is_empty_list(value):
        return isinstance(value, list) and not value

    # Element-wise mask of cells that hold an empty list.
    empty_lists = output.apply(
        lambda column: column.map(_is_empty_list)).astype(bool)
    output = output.mask(empty_lists)

    return output.replace({np.nan: None})

def clean_names(tbl, prefix = ''):
    """Strip a leading ``prefix`` from the column names of ``tbl`` in place.

    Args:
        tbl: table-like object; only ``tbl.columns`` and
            ``tbl.rename_column(old, new)`` are used (a Parsons Table in
            this notebook).
        prefix: text to remove from the START of each column name.  Columns
            that do not start with it are left alone.

    Returns:
        The same ``tbl`` object, for convenience.

    A column is skipped when stripping the prefix would leave an empty name
    or a name that another column already has.
    """
    if not prefix:
        return tbl

    # Snapshot the names: the table's columns change as we rename.
    for old_name in list(tbl.columns):
        if not old_name.startswith(prefix):
            continue
        new_name = old_name[len(prefix):]
        # Check for a collision up front instead of swallowing whatever the
        # rename raises.
        if not new_name or new_name in tbl.columns:
            continue
        tbl.rename_column(old_name, new_name)

    return tbl

def _long_child_table(df, key, column):
    """Turn the ``column`` field of ``df`` into its own long Parsons table,
    keyed by ``key``, and strip the ``<column>_`` prefix from its columns."""
    child = table.from_dataframe(df).long_table(key, column)
    return clean_names(child, column + '_')


# Child tables unpacked from the nested bill / vote columns.
# note: roll_call_tbl ends up with a column called desc, which redshift will
# not create
roll_call_tbl = _long_child_table(bill_df, 'bill_id', 'votes')
subjects_tbl = _long_child_table(bill_df, 'bill_id', 'subjects')
sponsors_tbl = _long_child_table(bill_df, 'bill_id', 'sponsors')
vote_tbl = _long_child_table(vote_df, ['roll_call_id', 'bill_id'], 'votes')

# Parent tables: the flat remainder of the bill and person records.
simplified_bill_df = simplify_df(bill_df)
simplified_people_df = simplify_df(people_df)
bill_tbl = table.from_dataframe(simplified_bill_df)
people_tbl = table.from_dataframe(simplified_people_df)

In [None]:
""" This cell will completely overwrite all legiscan tables in redshift.
    Use this when loading data for the first time, or if redshift tables 
    become corrupted and need to be replaced from stored JSON files."""

# Every table is loaded into the same schema.
_LEGISCAN_SCHEMA = 'lkesich'

# One row per destination: (table name, sortkey, distkey, Parsons table).
_LEGISCAN_TABLE_SPECS = [
    ('legiscan_people', 'people_id', 'people_id', people_tbl),
    ('legiscan_roll_calls', 'date', 'roll_call_id', roll_call_tbl),
    ('legiscan_bills', 'session_id', 'bill_id', bill_tbl),
    ('legiscan_votes', 'roll_call_id, people_id', 'roll_call_id', vote_tbl),
    ('legiscan_subjects', 'subject_id', 'bill_id', subjects_tbl),
    ('legiscan_sponsors', 'people_id', 'bill_id', sponsors_tbl),
]

legiscan_tables = [
    {
        'table': table_name,
        'schema': _LEGISCAN_SCHEMA,
        'sortkey': sort_col,
        'distkey': dist_col,
        'tbl': source_tbl,
    }
    for table_name, sort_col, dist_col, source_tbl in _LEGISCAN_TABLE_SPECS
]

""" Send to redshift or Civis. If you have not defined a distkey or sortkey, 
    you will get a warning that can be safely ignored."""

# if_exists='drop' replaces any existing table of the same name.
for spec in legiscan_tables:
    spec['tbl'].to_redshift(
        spec['schema'] + '.' + spec['table'],
        distkey=spec['distkey'],
        sortkey=spec['sortkey'],
        if_exists='drop',
    )