In [1]:
## load packages
import os 
import pathlib
import gdown
import pyarrow
import pandas as pd
import numpy as np
import itertools
import re

In [None]:
## download data from google drive dir
# Local folder that holds the downloaded parquet files.
path = pathlib.Path('data')
path.mkdir(exist_ok=True)
data_url = "https://drive.google.com/drive/folders/1JLxwurbx0ys4DUDB2o-WCtWWsjisVi8L?usp=sharing"
# Only contact Google Drive when no parquet file is present yet, so that
# Restart & Run All does not re-download the whole folder on every run.
# Delete the files in `path` to force a fresh (or repaired) download.
if not any(path.glob("*.parquet")):
    # Reuse `path` rather than repeating the "data" literal.
    gdown.download_folder(data_url, output=str(path))

In [2]:
# Read the four parquet tables, in this order, into their own DataFrames.
block_as, positions, clients, bills = (
    pd.read_parquet(parquet_file)
    for parquet_file in (
        "data/block_assignments.parquet",
        "data/positions.parquet",
        "data/clients.parquet",
        "data/bills.parquet",
    )
)

In [None]:
# Build the table of distinct (metatopic, topic) combinations found in `bills`.
# Each column is split on "; " into a list and then exploded; exploding one
# column after the other gives every topic x metatopic pairing of a bill its own row.
bills_topics = (
    bills[["ncsl_metatopics", "ncsl_topics"]]
    # keep rows that have at least one of the two values
    .dropna(how = "all", ignore_index = True)
    .assign(
        ncsl_topics=lambda frame: frame["ncsl_topics"].str.split("; "),
        ncsl_metatopics=lambda frame: frame["ncsl_metatopics"].str.split("; "),
    )
    .explode("ncsl_topics", ignore_index=True)
    .explode("ncsl_metatopics", ignore_index=True)
    # strip the "ncsl_metatopic__" prefix from the metatopic labels
    .assign(
        ncsl_metatopics=lambda frame: frame["ncsl_metatopics"].str.replace("ncsl_metatopic__", "")
    )
)

# Collapse repeated combinations and report the row count on either side of the step
# (the trailing numbers are the counts observed in the recorded run).
print(f"number of rows before dropping duplicates: {len(bills_topics)}") #153220
bills_topics = bills_topics.drop_duplicates(ignore_index = True)
print(f"number of rows after dropping duplicates: {len(bills_topics)}")  #5373

number of rows before dropping duplicates: 153220
number of rows after dropping duplicates: 5373


In [None]:
# Split each topic string into its database and topic parts.
# The strings are "__"-separated. As the two prints below confirm on the recorded
# data, pieces 0 and 2 are the constant labels "ncsl_database" / "ncsl_topic"
# (or None where the topic is missing), while pieces 1 and 3 carry the values.

# This cell consumes the "ncsl_topics" column, so it can run only once per freshly
# built `bills_topics`; say so explicitly instead of failing with a bare KeyError.
if "ncsl_topics" not in bills_topics.columns:
    raise KeyError(
        "bills_topics has no 'ncsl_topics' column - re-run the cell that builds bills_topics first"
    )

# n=3 caps the split at four pieces, so a "__" inside a topic name stays in the
# topic instead of creating extra integer-named columns that would leak into the merge.
topics_split = (
    bills_topics["ncsl_topics"]
    .str.split("__", n=3, expand=True)
    .rename(columns={1: "database", 3: "ncsl_topic"})
)
print(topics_split[0].unique())                         #unique : ncsl_database & None
print(topics_split[2].unique())                         #unique : ncsl_topic & None
topics_split = topics_split.drop(columns=[0, 2])        #remove redundant label cols

# Swap the raw topic column for the parsed columns; rows are matched on the shared index.
# No in-place mutation: if anything above or here fails, `bills_topics` is left untouched.
bills_topics = bills_topics.drop(columns=["ncsl_topics"]).merge(
    topics_split, left_index=True, right_index=True
)

In [141]:
# Count how many rows of `bills_topics` share each (metatopic, topic) pair.
pair_columns = ["ncsl_metatopics", "ncsl_topic"]
pair_counts = bills_topics.value_counts(subset=pair_columns)
print(pair_counts)

ncsl_metatopics                    ncsl_topic                  
elections_and_campaigns            miscellaneous                   9
civil_and_criminal_justice         other                           7
education                          miscellaneous                   7
civil_and_criminal_justice         miscellaneous                   7
human_services                     miscellaneous                   6
                                                                  ..
environment_and_natural_resources  justice_system_related          1
                                   job_access                      1
                                   jails                           1
                                   insurance_and_death_benefits    1
                                   kinship_care                    1
Name: count, Length: 5117, dtype: int64


In [None]:
# Collect the distinct NCSL metatopics that occur in `bills`.
bills_meta = bills["ncsl_metatopics"].dropna()
# A cell can list several metatopics separated by "; ": split, flatten, de-duplicate.
prefixed_labels = bills_meta.str.split("; ").explode().unique()
# Drop the "ncsl_metatopic__" prefix from every label.
bills_meta_unique = [label.replace("ncsl_metatopic__", "") for label in prefixed_labels]
print(len(bills_meta_unique)) #17
bills_meta_unique

17


['military_and_veterans_affairs',
 'environment_and_natural_resources',
 'elections_and_campaigns',
 'human_services',
 'education',
 'transportation',
 'labor_and_employment',
 'energy',
 'financial_services',
 'fiscal',
 'quad_caucus',
 'civil_and_criminal_justice',
 'ethics',
 'health',
 'technology_and_communication',
 'immigration',
 'redistricting_and_census']

['ncsl_database' None]
['ncsl_topic' None]
number of rows before dropping duplicates: 6158
number of rows after dropping duplicates: 6158


Unnamed: 0,ncsl_metatopics,database,ncsl_topic
0,military_and_veterans_affairs,military_and_veterans_state_legislation_database,education
1,environment_and_natural_resources,,
2,elections_and_campaigns,state_elections_legislation_database,election_crimes
3,human_services,child_support_and_family_law_legislation_database,enforcement
4,education,education_legislation_bill_tracking,career_and_technical_education
...,...,...,...
6153,health; technology_and_communication,covid_19_archive_database,fiscal:_federal_relief_funding
6154,health; technology_and_communication,covid_19_archive_database,fiscal:_liability
6155,health; technology_and_communication,covid_19_archive_database,health:_access_and_coverage
6156,health; technology_and_communication,covid_19_archive_database,legislative_operations


In [None]:
# Number of distinct values in each column of `bills_topics_all`.
# NOTE(review): `bills_topics_all` is not assigned in any visible cell of this notebook
# (the cells above build `bills_topics`), so unless it is defined elsewhere this raises
# NameError on a fresh kernel - confirm which frame is intended here.
bills_topics_all.nunique()

ncsl_metatopics    310
database            53
ncsl_topic         967
dtype: int64

In [None]:
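# Not executed: pure-Python equivalent of the `.str.split("; ").explode()` step used above, kept for reference.
#bills_meta = [x.split("; ") for x in bills_meta] # split strings where there are multiple topics
#bills_meta = list(itertools.chain.from_iterable(bills_meta)) # flatten list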