In [1]:
## load packages
import os 
import pathlib
import gdown
import sklearn
import pyarrow
import pandas as pd
import numpy as np
import itertools
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from feature_processing import encode_multi_features, encode_categorical

# Lift pandas' column display limit so wide DataFrames are shown with every column.
pd.set_option('display.max_columns', None)

In [28]:
# Read the full bills table, then carve out a small working sample from it.
bills = pd.read_parquet("data/bills.parquet")
bills_test = (
    bills
    .sample(n=2000, random_state=400)  # fixed seed, so the same 2000 rows are drawn on every run
    .dropna(thresh=27)                 # keep only rows that have at least 27 non-null values
)

In [None]:
## Summary of bill status codes, including bills that have no status at all.
# Only the last expression of a cell is displayed, so the two counts are named and
# printed explicitly instead of being evaluated and thrown away.
n_bills = len(bills["status"])                                          # all rows (588786 when last run)
n_status_missing = n_bills - len(bills[bills["status"].notna()])        # rows whose status is NaN (52079 when last run)
print(f"bills: {n_bills}, missing status: {n_status_missing}")

# Frequency of each status code; dropna=False keeps the NaN bucket in the table.
bills["status"].value_counts(dropna = False)


status
1.0    289724
4.0    134720
6.0     75121
NaN     52079
2.0     25679
0.0      5016
5.0      3863
3.0      2584
Name: count, dtype: int64

In [30]:
# Write a status-sorted view of the first 1000 sampled bills to a text file for manual inspection.
# The text is built before the file is opened, so a failure in the DataFrame chain
# cannot leave behind a truncated or empty bills_status.txt.
status_columns = ["status", "last_action", "state_unified_bill_id", "sine_die"]
status_report = bills_test[status_columns].head(1000).sort_values(by = "status").to_string()

# The context manager closes the handle even if the write raises; the explicit
# encoding keeps the output independent of the platform's default locale encoding.
with open("bills_status.txt", 'w', encoding="utf-8") as status_file:
    status_file.write(status_report)

29        6.0
30        4.0
31        4.0
32        4.0
33        4.0
         ... 
707285    NaN
707286    NaN
707287    NaN
707288    NaN
707289    NaN
Name: status, Length: 588786, dtype: float64

In [12]:
# Load the two companion tables used alongside `bills`: block assignments and positions.
blocks = pd.read_parquet("data/block_assignments.parquet")
positions = pd.read_parquet("data/positions.parquet")

In [14]:
# Display the block-assignment table to inspect its columns and values.
blocks

Unnamed: 0,entity_id,block_0,block_1,block_2,block_3,block_4,block_5,block_6,state,record_type
0,MA_1,3283,150,0,12,3.0,3.0,2.0,MA,lobbying
1,MA_10,4139,20,27,26,0.0,3.0,2.0,MA,lobbying
2,MA_100,3291,209,42,6,0.0,3.0,2.0,MA,lobbying
3,MA_1000,1311,128,62,26,0.0,3.0,2.0,MA,lobbying
4,MA_1001,97,169,68,6,0.0,3.0,2.0,MA,lobbying
...,...,...,...,...,...,...,...,...,...,...
82631,RI_S_0000002245_2022,261,5,2,0,0.0,,,RI,lobbying
82632,RI_H_0000007427_2018,261,5,2,0,0.0,,,RI,lobbying
82633,RI_S_0000002475_2018,261,5,2,0,0.0,,,RI,lobbying
82634,RI_H_0000005127_2019,261,5,2,0,0.0,,,RI,lobbying


In [9]:
# Show the position records whose bill id also appears in the sampled bills.
sampled_bill_ids = bills_test["state_unified_bill_id"]
in_sample = positions["state_unified_bill_id"].isin(sampled_bill_ids)
positions.loc[in_sample]

Unnamed: 0,bill_version,client_name,lobbyist_rep_name,lobbyist_firm_name,position,start_date,end_date,year,record_type,session,description,committee,state,position_numeric,docket_number,docket_prefix,legiscan_bill_id,state_unified_bill_id,state_client_id,ncsl_metatopics,ncsl_topics
758626,,Nebraska State Education Association,"McGuire, Mark D.",,SUPPORT,2010-04-21 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
759457,,Fremont Public Schools,"Nabb, Douglas L.",,SUPPORT,2010-04-12 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_326,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
760523,,Nebraska State Education Association,"Sears, Jay",,SUPPORT,2010-05-17 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
760576,,Nebraska State Education Association,"Guinan, Trish",,SUPPORT,2010-05-17 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
760629,,Nebraska State Education Association,"Christiansen, Craig R.",,SUPPORT,2010-05-17 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
760781,,Nebraska State Education Association,"Clayburn, Neal",,SUPPORT,2010-05-17 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
760834,,Nebraska State Education Association,"Kilgarin, Karen",,SUPPORT,2010-05-17 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
760887,,Nebraska State Education Association,"Scherer, Larry",,SUPPORT,2010-05-17 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
760940,,Nebraska State Education Association,"Hoffman, Jerry",,SUPPORT,2010-05-17 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...
760993,,Nebraska State Education Association,"Mikkelsen, Brian",,SUPPORT,2010-05-17 00:00:00+00:00,NaT,2010,lobbying,2010R,Amendments: \nDescription: Authorize leases on...,,NE,1.0,,,150449.0,NE_L_0000000916_2009,NE_27,ncsl_metatopic__energy,ncsl_database__energy_legislation_tracking_dat...


In [3]:
# Encode features that take a varying number of values per row.
# Both helpers come from the project-local `feature_processing` module.
# NOTE(review): the return value of encode_multi_features is neither stored nor displayed
# (only a cell's last expression is shown) — confirm whether it works in place or
# whether its result should be assigned. "; " is presumably the separator between topics.
encode_multi_features(bills_test, "ncsl_metatopics", "; ")
# Last expression of the cell, so this result is what gets displayed; in the recorded
# run it is a (sparse matrix, feature-name array) tuple for the two listed columns.
encode_categorical(bills_test, ["state", "bill_chamber"])

(<Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 28 stored elements and shape (14, 16)>,
 array(['state_AZ', 'state_FL', 'state_IL', 'state_MD', 'state_MO',
        'state_MT', 'state_NE', 'state_NJ', 'state_RI', 'state_WI',
        'bill_chamber_A', 'bill_chamber_H', 'bill_chamber_HB',
        'bill_chamber_LB', 'bill_chamber_S', 'bill_chamber_SB'],
       dtype=object))

In [None]:
#extract representative subset to test feature preparation
#make metatopic column a list instead of a single string
#bills_test["ncsl_metatopics"] = bills_test["ncsl_metatopics"].str.split("; ")
#bills_test_sub = bills_test[["state", "ncsl_metatopics"]].to_numpy() #extract columns and turn df to array
#bills_test_sub

In [114]:
# Trial run: one-hot encode a small (state, metatopics) array.
# NOTE(review): `bills_test_sub` is not assigned by any active code visible in this notebook;
# the only assignment shown is commented out, so this cell relies on leftover kernel state
# and would presumably fail with a NameError on a fresh Restart & Run All.
print(bills_test_sub)
print(bills_test_sub.shape)

# NOTE(review): the recorded run of this cell ended in a TypeError because the second
# column holds Python lists, which the encoder rejects (it requires uniformly strings or
# numbers). The multi-valued column needs different handling before this cell can succeed.
enc = OneHotEncoder(sparse_output=True)                             # define the one-hot encoder (sparse output)
enc.fit(bills_test_sub)                                              # fit the encoder to the subset
bills_test_sub_tr = enc.transform(bills_test_sub)                    # transform the subset
enc.get_feature_names_out()                                          # last expression: display the generated feature names

[['NE' list(['ncsl_metatopic__energy'])]
 ['MO' list(['ncsl_metatopic__elections_and_campaigns'])]
 ['MT' list(['ncsl_metatopic__energy'])]
 ['IL' list(['ncsl_metatopic__energy'])]
 ['FL' list(['ncsl_metatopic__technology_and_communication'])]
 ['MO' list(['ncsl_metatopic__elections_and_campaigns'])]
 ['IL' list(['ncsl_metatopic__ethics'])]
 ['MO' list(['ncsl_metatopic__transportation'])]
 ['MD' list(['ncsl_metatopic__transportation'])]
 ['IL' list(['ncsl_metatopic__technology_and_communication'])]
 ['WI'
  list(['ncsl_metatopic__fiscal', 'ncsl_metatopic__labor_and_employment'])]
 ['NJ' list(['ncsl_metatopic__transportation'])]
 ['RI' list(['ncsl_metatopic__labor_and_employment'])]
 ['AZ' list(['ncsl_metatopic__energy'])]]
(14, 2)


TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['list']

In [96]:
# Display the categories the fitted encoder learned, one array per input column.
# NOTE(review): `enc` is reassigned in more than one cell, so what is shown here depends
# on which fit ran most recently in the kernel.
enc.categories_

[array(['AZ', 'FL', 'IL', 'MD', 'MO', 'MT', 'NE', 'NJ', 'RI', 'WI'],
       dtype=object),
 array(['ncsl_metatopic__elections_and_campaigns',
        'ncsl_metatopic__energy', 'ncsl_metatopic__ethics',
        'ncsl_metatopic__fiscal; ncsl_metatopic__labor_and_employment',
        'ncsl_metatopic__labor_and_employment',
        'ncsl_metatopic__technology_and_communication',
        'ncsl_metatopic__transportation'], dtype=object)]

In [35]:
# One-hot encoding of states.
# The category list is fixed up front from every distinct value of the `state` column,
# so the encoded columns do not depend on which rows are encoded below.
states = bills["state"].unique()
enc = OneHotEncoder(categories=[states])

# Take the rows at positions 29..99 as a small trial batch and reshape them into a
# single column, because the encoder expects 2-D input.
state_slice = bills["state"].iloc[29:100]
b_states = state_slice.array.reshape(-1, 1)
b_states_new = enc.fit_transform(b_states)