## Load packages and data

In [1]:
## load packages
import os 
import pathlib
import gdown
import pyarrow
import pandas as pd
import numpy as np
import itertools
import re

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
## download data from google drive dir
path = pathlib.Path('data')
path.mkdir(exist_ok=True)
data_url = "https://drive.google.com/drive/folders/1JLxwurbx0ys4DUDB2o-WCtWWsjisVi8L?usp=sharing"
gdown.download_folder(data_url, output = "data")

In [3]:
#load data
block_as = pd.read_parquet("data/block_assignments.parquet")
positions = pd.read_parquet("data/positions.parquet")
clients = pd.read_parquet("data/clients.parquet")
bills = pd.read_parquet("data/bills.parquet")

## data: BILLS

In [None]:
#bills[bills.duplicated("legiscan_bill_id", keep = False)].dropna(how="all")
#bills[bills.duplicated("state_unified_bill_id", keep = False)].dropna(how="all")
bills[bills["state_unified_bill_id"] == "NE_L_0000000941_2013"]

Unnamed: 0,legiscan_bill_id,legiscan_bill,legiscan_url,status_date,status,last_action_date,last_action,title,description,year_start,year_end,prefile,sine_die,prior,session_title,state,bill_chamber,bill_number,bill_suffix,sponsor_list,description_ncsl,ncsl_summary,date_overlap_jaccard,title_overlap_jaccard,ncsl_topics,ncsl_databases,ncsl_metatopics,state_unified_bill_id
8830,591946.0,LB941,https://legiscan.com/NE/bill/LB941/2013,2014-03-31,4.0,2014-03-31,"Approved by Governor on March 28, 2014",Provide for a dairy growth study,Provide for a dairy growth study,2013.0,2014.0,0.0,1.0,1.0,2013-2014 Regular Session,NE,LB,941,,['Ken Schilz'],,,,,,,,NE_L_0000000941_2013
8831,633963.0,LB941A,https://legiscan.com/NE/bill/LB941A/2013,2014-03-31,4.0,2014-03-31,"Approved by Governor on March 28, 2014",Appropriation Bill,Appropriation Bill,2013.0,2014.0,0.0,1.0,1.0,2013-2014 Regular Session,NE,LB,941,A,['Ken Schilz'],,,,,,,,NE_L_0000000941_2013


Unnamed: 0,legiscan_bill_id,legiscan_bill,legiscan_url,status_date,status,last_action_date,last_action,title,description,year_start,year_end,prefile,sine_die,prior,session_title,state,bill_chamber,bill_number,bill_suffix,sponsor_list,description_ncsl,ncsl_summary,date_overlap_jaccard,title_overlap_jaccard,ncsl_topics,ncsl_databases,ncsl_metatopics,state_unified_bill_id
8830,591946.0,LB941,https://legiscan.com/NE/bill/LB941/2013,2014-03-31,4.0,2014-03-31,"Approved by Governor on March 28, 2014",Provide for a dairy growth study,Provide for a dairy growth study,2013.0,2014.0,0.0,1.0,1.0,2013-2014 Regular Session,NE,LB,941,,['Ken Schilz'],,,,,,,,NE_L_0000000941_2013
8831,633963.0,LB941A,https://legiscan.com/NE/bill/LB941A/2013,2014-03-31,4.0,2014-03-31,"Approved by Governor on March 28, 2014",Appropriation Bill,Appropriation Bill,2013.0,2014.0,0.0,1.0,1.0,2013-2014 Regular Session,NE,LB,941,A,['Ken Schilz'],,,,,,,,NE_L_0000000941_2013


In [14]:
##OVERVIEW
#column labels
print(bills.columns)
#length (bills)
print(len(bills))
#length per state
print(bills.value_counts(subset=["state"]))

Index(['legiscan_bill_id', 'legiscan_bill', 'legiscan_url', 'status_date',
       'status', 'last_action_date', 'last_action', 'title', 'description',
       'year_start', 'year_end', 'prefile', 'sine_die', 'prior',
       'session_title', 'state', 'bill_chamber', 'bill_number', 'bill_suffix',
       'sponsor_list', 'description_ncsl', 'ncsl_summary',
       'date_overlap_jaccard', 'title_overlap_jaccard', 'ncsl_topics',
       'ncsl_databases', 'ncsl_metatopics', 'state_unified_bill_id'],
      dtype='object')
588786
state
TX       96555
IL       92779
NJ       67914
MA       52677
FL       36595
MD       34312
RI       30057
MO       28700
IA       23035
WI       21952
AZ       21768
NE       17523
OH       16032
CO       13757
SD       13465
MT       11417
KS       10248
Name: count, dtype: int64


In [20]:
#OVERVIEW: status
bills["status"].isin([4,5]).value_counts()

status
False    450203
True     138583
Name: count, dtype: int64

In [4]:
#identify unique combis of metatopic, topic and database
bills_topics = bills[["ncsl_metatopics","ncsl_topics"]].dropna(how = "all", ignore_index = True) #extract topics and metatopics
#split both cols on "; "
bills_topics["ncsl_topics"] = bills_topics["ncsl_topics"].str.split("; ")
bills_topics["ncsl_metatopics"] = bills_topics["ncsl_metatopics"].str.split("; ")
#explode both cols to make sure that each unique combo has a row
bills_topics = bills_topics.explode("ncsl_topics", ignore_index=True)
bills_topics = bills_topics.explode("ncsl_metatopics", ignore_index=True)
#remove 'ncsl_metatopics" from metatopic col
bills_topics["ncsl_metatopics"] = bills_topics["ncsl_metatopics"].str.replace("ncsl_metatopic__", "")
#remove duplicates
print(f"number of rows before dropping duplicates: {len(bills_topics)}") #153220
bills_topics = bills_topics.drop_duplicates(ignore_index = True)
print(f"number of rows after dropping duplicates: {len(bills_topics)}")  #5373

number of rows before dropping duplicates: 153220
number of rows after dropping duplicates: 5373


In [5]:
#split topics into database and topics
topics_split = bills_topics["ncsl_topics"].str.split("__", expand = True).rename(columns={1: "database", 3: "ncsl_topic"})
print(topics_split[0].unique())                         #unique : ncsl_database & None
print(topics_split[2].unique())                         #unique : ncsl_topic & None
topics_split.drop(columns=[0,2], inplace=True)          #remove redunant cols
bills_topics.drop(columns=["ncsl_topics"], inplace=True)#remove ncsl_topic col from bills_topics_all

#merge the topics split with bills_toppics
bills_topics = bills_topics.merge(topics_split, left_index=True, right_index=True)

['ncsl_database' None]
['ncsl_topic' None]


In [7]:
bills_topics["ncsl_metatopics"].unique()

array(['military_and_veterans_affairs',
       'environment_and_natural_resources', 'elections_and_campaigns',
       'human_services', 'education', 'transportation',
       'labor_and_employment', 'energy', 'financial_services', 'fiscal',
       'quad_caucus', 'civil_and_criminal_justice', 'ethics', 'health',
       'technology_and_communication', 'immigration',
       'redistricting_and_census'], dtype=object)

In [29]:
bills_topics

Unnamed: 0,ncsl_metatopics,database,ncsl_topic
0,military_and_veterans_affairs,military_and_veterans_state_legislation_database,education
1,environment_and_natural_resources,,
2,elections_and_campaigns,state_elections_legislation_database,election_crimes
3,human_services,child_support_and_family_law_legislation_database,enforcement
4,education,education_legislation_bill_tracking,career_and_technical_education
...,...,...,...
5368,technology_and_communication,injury_prevention_legislation_database_opioid_...,prescription_drug_monitoring_program
5369,military_and_veterans_affairs,health_innovations_state_law_archive_database_...,medicaid_expansion_and_waivers
5370,military_and_veterans_affairs,child_welfare_enacted_legislation,courts_and_legal_representation
5371,technology_and_communication,covid_19_archive_database,legislative_operations


In [18]:
bills_topics_envi = bills_topics[bills_topics["ncsl_metatopics"] == "environment_and_natural_resources"]["ncsl_topic"].drop_duplicates().to_list()

In [19]:
bills_topics_envi

[None,
 'food_safety',
 'water_resources/planning',
 'body_art_tattoo',
 'eh_management',
 'indoor_air_quality',
 'water_waste_water',
 'drinking_water',
 'pesticides',
 'toxics_&_chemicals',
 'waste_and_recycling',
 'indoor_air_quality_–_radon',
 "children's_environmental_health",
 'waste_and_recycling_product_stewardship',
 'waste_and_recycling_composting',
 'transportation',
 'transportation_alt_fuel/hybrid',
 'energy_security_and_critical_infrastructure',
 '9_1_1_administration,_plans,_boards_&_commissions',
 '9_1_1_funding_and_appropriations',
 'enhanced_9_1_1',
 'waste_and_recycling_beverage_containers',
 'asthma',
 'energy_efficiency',
 'energy_efficiency_building_codes_and_standards',
 'water_conservation/efficiency',
 'nuclear_/_radioactive_waste',
 'nuclear_energy_facilities',
 'waste_nuclear/radioactive',
 'tax_credits_&_incentives_evaluation,_reporting_&_rqmts',
 'tax_credits_&_incentives_small_&_minority_businesses',
 'tax_credits_&_incentives_sunset_provisions',
 'tax_cre

In [None]:
# identify unique ncsl metatopics
bills_meta = bills["ncsl_metatopics"].dropna()
bills_meta_unique = bills_meta.str.split("; ").explode().unique()
bills_meta_unique = [x.replace("ncsl_metatopic__", "") for x in bills_meta_unique]
print(len(bills_meta_unique)) #17
bills_meta_unique

17


['military_and_veterans_affairs',
 'environment_and_natural_resources',
 'elections_and_campaigns',
 'human_services',
 'education',
 'transportation',
 'labor_and_employment',
 'energy',
 'financial_services',
 'fiscal',
 'quad_caucus',
 'civil_and_criminal_justice',
 'ethics',
 'health',
 'technology_and_communication',
 'immigration',
 'redistricting_and_census']

In [None]:
#bills_meta = [x.split("; ") for x in bills_meta] # split strings where there are multiple topics
#bills_meta = list(itertools.chain.from_iterable(bills_meta)) # flatten list

In [None]:
bills_topics["ncsl_topic"].str.contains("recycling|waste|plastic")

0       False
1        None
2       False
3       False
4       False
        ...  
5368    False
5369    False
5370    False
5371    False
5372    False
Name: ncsl_topic, Length: 5373, dtype: object

## data: POSITIONS

In [None]:
positions