In [2]:
## load packages
import os 
import pathlib
import pickle as pkl
import gdown

import sklearn
import seaborn as sns
import pyarrow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import re

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from feature_processing import encode_multi_features, encode_categorical
# Display option: with max_columns set to None, pandas renders every column
# of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

Variables needed to match records across files (source files in parentheses):
- status on bill                    (bills)
- client id from positions on bills (blocks, positions)
- bill id                           (bills, positions)
- block of the client id            (blocks)

In [39]:
# States for which only lobbying records are available.
states_lobby = [
    "IA",
    "MA",
    "NE",
    "NJ",
    "RI",
    "WI",
]

In [73]:
# Load the raw bills table.
bills = pd.read_parquet("../data/raw/chorus/bills.parquet")

# Keep rows that belong to a lobbying-only state AND carry status metadata.
bills_lob = bills[bills["state"].isin(states_lobby) & bills["status"].notna()]

In [79]:
# Display the positions frame.
# NOTE(review): the only visible assignments of `positions` appear in later
# cells of this notebook, so on a fresh kernel this cell presumably raises
# NameError unless those cells were run first -- confirm the cell order.
positions

Unnamed: 0,bill_version,client_name,lobbyist_rep_name,lobbyist_firm_name,position,start_date,end_date,year,record_type,session,description,committee,state,position_numeric,docket_number,docket_prefix,legiscan_bill_id,state_unified_bill_id,state_client_id,ncsl_metatopics,ncsl_topics
0,,NJBA,,,O,NaT,NaT,2014,lobbying,2014-2015,"Agriculture and Food - ""The """"Preserve New Jer...",,NJ,oppose,,,595945.0,NJ_A_0000001363_2014,NJ_0,,
1,,NJBA,,,O,NaT,NaT,2014,lobbying,2014-2015,Agriculture and Food - Imposes water consumpti...,,NJ,oppose,,,595591.0,NJ_A_0000001364_2014,NJ_0,,
2,,NJBA,,,O,NaT,NaT,2014,lobbying,2014-2015,"Agriculture and Food - """"""Water Supply Open Sp...",,NJ,oppose,,,597169.0,NJ_A_0000001483_2014,NJ_0,ncsl_metatopic__transportation,ncsl_database__state_traffic_safety_legislatio...
3,,NJBA,,,P,NaT,NaT,2014,lobbying,2014-2015,Agriculture and Food - Permits construction of...,,NJ,support,,,597358.0,NJ_A_0000001711_2014,NJ_0,,
6,,NJBA,,,O,NaT,NaT,2014,lobbying,2014-2015,Agriculture and Food - Provides that solar and...,,NJ,oppose,,,636438.0,NJ_A_0000002788_2014,NJ_0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13619400,A,CLEAN WATER ACTION,JED THORP,,PROMOTE,2022-07-01 00:00:00+00:00,2022-09-30 00:00:00+00:00,2022,lobbying,2022,,,RI,support,,,,RI_H_0000007233_2022,RI_140,,
13619401,A,CLEAN WATER ACTION,JED THORP,,PROMOTE,2022-07-01 00:00:00+00:00,2022-09-30 00:00:00+00:00,2022,lobbying,2022,,,RI,support,,,,RI_H_0000007438_2022,RI_140,,
13619402,,CLEAN WATER ACTION,JED THORP,,OPPOSE,2022-07-01 00:00:00+00:00,2022-09-30 00:00:00+00:00,2022,lobbying,2022,,,RI,oppose,,,,RI_H_0000008089_2022,RI_140,,
13619403,,CLEAN WATER ACTION,JED THORP,,PROMOTE,2022-07-01 00:00:00+00:00,2022-09-30 00:00:00+00:00,2022,lobbying,2022,,,RI,support,,,,RI_S_0000002044_2022,RI_140,,


In [72]:
# Missing-value check: isna() turns every cell into a True/False NA flag, and
# value_counts("state") then counts rows by the flag of the `state` column
# (a single False entry means no row has a missing state).
bills_lob.isna().value_counts("state")

state
False    213158
Name: count, dtype: int64

In [53]:
# Number of bills per (state, year_start) for the lobbying-only states.
# dropna=False keeps rows with a missing year_start as their own group and
# sort=False leaves the groups in key order instead of ordering by count.
# (Chain .to_csv("data.txt", sep="\t") onto the result to export the table.)
bills.loc[bills["state"].isin(states_lobby)].value_counts(
    subset=["state", "year_start"],
    sort=False,
    dropna=False,
)

state  year_start
IA     2009.0         2426
       2011.0         2293
       2013.0         3012
       2015.0         3055
       2017.0         3076
       2019.0         3603
       2021.0         3602
       NaN            1968
MA     2009.0         6437
       2011.0         7007
       2013.0         6947
       2015.0         7302
       2017.0         7723
       2019.0         8218
       2021.0         8206
       NaN             837
NE     2009.0          702
       2011.0         1879
       2013.0         1825
       2015.0         1841
       2017.0         1680
       2019.0         1767
       2021.0         1850
       NaN            5979
NJ     2010.0         8234
       2012.0         8408
       2014.0         9971
       2016.0         9908
       2018.0        11459
       2020.0        11429
       2022.0         8090
       NaN             415
RI     2010.0         2189
       2011.0         2420
       2012.0         1386
       2013.0         2389
       201

In [3]:
#BLOCKS
# Load the block assignments for all entities.
blocks = pd.read_parquet("../data/raw/chorus/block_assignments.parquet")

# Keep the rows whose entity_id starts with two capitals, an underscore and
# digits (the state_client_id pattern), rename the id column accordingly and
# store block_1 as strings.
blocks_clients = (
    blocks
    .loc[lambda frame: frame["entity_id"].str.match(r"[A-Z][A-Z]_\d+")]
    .rename(columns={"entity_id": "state_client_id"})
    .astype({"block_1": str})
)
blocks_clients.head(3)

Unnamed: 0,state_client_id,block_0,block_1,block_2,block_3,block_4,block_5,block_6,state,record_type
0,MA_1,3283,150,0,12,3.0,3.0,2.0,MA,lobbying
1,MA_10,4139,20,27,26,0.0,3.0,2.0,MA,lobbying
2,MA_100,3291,209,42,6,0.0,3.0,2.0,MA,lobbying


In [None]:
#BILLS
# Load the raw bills table and drop rows without a status.
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments below cannot trigger SettingWithCopyWarning.
bills = pd.read_parquet("../data/raw/chorus/bills.parquet")
bills = bills[bills["status"].notna()].copy()

# Binary outcome column: True when status is 4 or 5, False otherwise.
# Computed row-wise from the column itself, so it does not depend on the
# index labels being unique.
bills["pass"] = bills["status"].isin([4, 5])

# Missing topic strings become the placeholder "M"; every value is then split
# on "; " so each cell holds a list (["M"] for bills without topics).
bills["ncsl_topics"] = bills["ncsl_topics"].fillna("M").str.split("; ")
bills["ncsl_metatopics"] = bills["ncsl_metatopics"].fillna("M").str.split("; ")
bills.head(3)

Unnamed: 0,legiscan_bill_id,legiscan_bill,legiscan_url,status_date,status,last_action_date,last_action,title,description,year_start,year_end,prefile,sine_die,prior,session_title,state,bill_chamber,bill_number,bill_suffix,sponsor_list,description_ncsl,ncsl_summary,date_overlap_jaccard,title_overlap_jaccard,ncsl_topics,ncsl_databases,ncsl_metatopics,state_unified_bill_id,pass
29,450474.0,HB2001,https://legiscan.com/KS/bill/HB2001/2013,2014-05-30,6.0,2014-05-30,House Died in Committee,"Substitute HB 2001 by Committee on Veterans, M...","Substitute HB 2001 by Committee on Veterans, M...",2013.0,2014.0,0.0,1.0,1.0,2013-2014 Regular Session,KS,HB,2001,,"['Veterans, Military and Homeland Security']",National Guard Educational Assistance Act,Relates to the national guard educational assi...,0.067164,0.263158,[ncsl_database__military_and_veterans_state_le...,ncsl_database__military_and_veterans_state_leg...,[ncsl_metatopic__military_and_veterans_affairs],KS_HB_0000002001_2013,False
30,450475.0,HB2002,https://legiscan.com/KS/bill/HB2002/2013,2014-04-30,4.0,2014-04-30,"House Approved by Governor on Thursday, 10 Apr...",Substitute for HB 2002 by Committee on Federal...,Substitute for HB 2002 by Committee on Federal...,2013.0,2014.0,0.0,1.0,1.0,2013-2014 Regular Session,KS,HB,2002,,['Federal and State Affairs'],,,,,[M],,[M],KS_HB_0000002002_2013,True
31,451743.0,HB2011,https://legiscan.com/KS/bill/HB2011/2013,2013-05-08,4.0,2013-05-08,"House Approved by Governor on Tuesday, 16 Apri...",Senate Substitute for HB 2011 by Committee on ...,Senate Substitute for HB 2011 by Committee on ...,2013.0,2014.0,0.0,1.0,1.0,2013-2014 Regular Session,KS,HB,2011,,['Transportation'],,,,,[M],,[M],KS_HB_0000002011_2013,True


In [13]:
#POSITIONS
# Load all positions, then keep only rows whose client appears in
# blocks_clients and whose bill appears in bills.
positions = pd.read_parquet("../data/raw/chorus/positions.parquet")
positions = positions[
    positions["state_client_id"].isin(blocks_clients["state_client_id"])
    & positions["state_unified_bill_id"].isin(bills["state_unified_bill_id"])
]
positions.head(3)

Unnamed: 0,bill_version,client_name,lobbyist_rep_name,lobbyist_firm_name,position,start_date,end_date,year,record_type,session,description,committee,state,position_numeric,docket_number,docket_prefix,legiscan_bill_id,state_unified_bill_id,state_client_id,ncsl_metatopics,ncsl_topics
0,,NJBA,,,O,NaT,NaT,2014,lobbying,2014-2015,"Agriculture and Food - ""The """"Preserve New Jer...",,NJ,-1.0,,,595945.0,NJ_A_0000001363_2014,NJ_0,,
1,,NJBA,,,O,NaT,NaT,2014,lobbying,2014-2015,Agriculture and Food - Imposes water consumpti...,,NJ,-1.0,,,595591.0,NJ_A_0000001364_2014,NJ_0,,
2,,NJBA,,,O,NaT,NaT,2014,lobbying,2014-2015,"Agriculture and Food - """"""Water Supply Open Sp...",,NJ,-1.0,,,597169.0,NJ_A_0000001483_2014,NJ_0,ncsl_metatopic__transportation,ncsl_database__state_traffic_safety_legislatio...


In [14]:
#MERGE POSITIONS & BLOCKS
# Relabel the numeric position codes with readable names.
positions = positions.replace(
    to_replace={
        "position_numeric": {-1.0: "oppose", 0.0: "neutral", 1.0: "support"}
    }
)

# Attach each client's block_1 to its positions. The left join keeps every
# position row; a block is matched on client id AND record type.
positions_blocks = pd.merge(
    positions[
        [
            "state_unified_bill_id",
            "state_client_id",
            "state",
            "record_type",
            "year",
            "position_numeric",
        ]
    ],
    blocks_clients[["state_client_id", "block_1", "record_type"]],
    how="left",
    on=["state_client_id", "record_type"],
)
positions_blocks.head(3)

Unnamed: 0,state_unified_bill_id,state_client_id,state,record_type,year,position_numeric,block_1
0,NJ_A_0000001363_2014,NJ_0,NJ,lobbying,2014,oppose,63
1,NJ_A_0000001364_2014,NJ_0,NJ,lobbying,2014,oppose,63
2,NJ_A_0000001483_2014,NJ_0,NJ,lobbying,2014,oppose,63


In [38]:
#WRANGLE POSITION_BLOCKS TO: LIST_BLOCKS & LIST_COUNTS
# Count the rows per (bill, position, block, record type) combination and show
# them as a flat table. The result is only displayed, not stored.
(
    positions_blocks
    .value_counts(
        subset=["state_unified_bill_id", "position_numeric", "block_1", "record_type"],
        sort=False,
    )
    .to_frame()
    .reset_index()
)

# Draft of the follow-up steps (not executed):
#positions_blocks_counts = <the table above>
#positions_blocks_counts.groupby(["state_unified_bill_id", "position_numeric"]).agg(list).reset_index().pivot(columns="position_numeric", values="block_1", index="state_unified_bill_id").reset_index()
#test = test.replace({None: list(["N"])})

Unnamed: 0,state_unified_bill_id,position_numeric,block_1,record_type,count
0,AZ_HB_0000002001_2010,neutral,113,testimony,11
1,AZ_HB_0000002001_2010,neutral,157,testimony,22
2,AZ_HB_0000002001_2010,support,94,testimony,11
3,AZ_HB_0000002001_2011-X1,neutral,63,testimony,5
4,AZ_HB_0000002001_2011-X1,neutral,99,testimony,5
...,...,...,...,...,...
723877,WI_SR_0000000004_2021,support,88,lobbying,2
723878,WI_SR_0000000006_2015,neutral,88,lobbying,1
723879,WI_SR_0000000011_2021,oppose,202,lobbying,2
723880,WI_SR_0000000021_2011,support,42,lobbying,1


In [15]:
#SUBSET BILLS
# Keep only the id, the outcome and the feature columns.
bills_subset = bills.loc[
    :,
    [
        "state_unified_bill_id",
        "state",
        "pass",
        "ncsl_topics",
        "ncsl_metatopics",
        "status_date",
    ],
]

In [32]:
#MERGE BILLS_SUBSET & POSITIONS_BLOCKS
# NOTE(review): this cell only displays `test`; no merge is performed here and
# no visible cell assigns `test`, so it presumably relies on leftover kernel
# state -- confirm or replace with the intended merge.
test

position_numeric,state_unified_bill_id,neutral,oppose,support
0,AZ_HB_0000002001_2010,"[113, 157]",N,[94]
1,AZ_HB_0000002001_2011-X1,"[63, 99]",N,N
2,AZ_HB_0000002001_2011-X2,N,N,"[1, 147, 160, 75]"
3,AZ_HB_0000002001_2014,N,N,[128]
4,AZ_HB_0000002001_2015,N,N,"[1, 147, 65, 75, 94, 99]"
...,...,...,...,...
167080,WI_SR_0000000004_2021,N,N,"[155, 219, 88]"
167081,WI_SR_0000000006_2015,[88],N,N
167082,WI_SR_0000000011_2021,N,[202],N
167083,WI_SR_0000000021_2011,N,N,[42]


In [31]:
# Display the positions/blocks table for inspection.
positions_blocks

Unnamed: 0,state_unified_bill_id,state_client_id,state,record_type,year,position_numeric,block_1
0,NJ_A_0000001363_2014,NJ_0,NJ,lobbying,2014,oppose,63
1,NJ_A_0000001364_2014,NJ_0,NJ,lobbying,2014,oppose,63
2,NJ_A_0000001483_2014,NJ_0,NJ,lobbying,2014,oppose,63
3,NJ_A_0000001711_2014,NJ_0,NJ,lobbying,2014,support,63
4,NJ_A_0000002788_2014,NJ_0,NJ,lobbying,2014,oppose,63
...,...,...,...,...,...,...,...
4467652,RI_H_0000007233_2022,RI_140,RI,lobbying,2022,support,8
4467653,RI_H_0000007438_2022,RI_140,RI,lobbying,2022,support,8
4467654,RI_H_0000008089_2022,RI_140,RI,lobbying,2022,oppose,8
4467655,RI_S_0000002044_2022,RI_140,RI,lobbying,2022,support,8


Fix rows in BILLS:
- Remove rows where 'status' = NA
- Remove duplicates (of 'state_unified_bill_id')
- Set index as 'state_unified_bill_id'

In [72]:
# Start over from the raw bills file and list the columns it provides.
bills = pd.read_parquet(path="../data/raw/chorus/bills.parquet")
print(bills.columns)

Index(['legiscan_bill_id', 'legiscan_bill', 'legiscan_url', 'status_date',
       'status', 'last_action_date', 'last_action', 'title', 'description',
       'year_start', 'year_end', 'prefile', 'sine_die', 'prior',
       'session_title', 'state', 'bill_chamber', 'bill_number', 'bill_suffix',
       'sponsor_list', 'description_ncsl', 'ncsl_summary',
       'date_overlap_jaccard', 'title_overlap_jaccard', 'ncsl_topics',
       'ncsl_databases', 'ncsl_metatopics', 'state_unified_bill_id'],
      dtype='object')


In [75]:
# For every bill, flag whether its id also occurs as an entity_id in the block
# assignments, then tally the True/False flags.
bill_id_in_blocks = bills["state_unified_bill_id"].isin(blocks["entity_id"])
bill_id_in_blocks.value_counts()

state_unified_bill_id
False    527375
True      61411
Name: count, dtype: int64

In [3]:
# remove rows where status = NA
bills = bills[bills["status"].notna()]

# all rows whose state_unified_bill_id occurs more than once (keep=False marks
# every copy, not just the later ones)
bills_dupl = bills[bills.duplicated(subset="state_unified_bill_id", keep=False)]

# count the distinct duplicated ids directly; this stays an integer and stays
# correct even if an id occurs more than twice
n_dupl_ids = bills_dupl["state_unified_bill_id"].nunique()
print(f"number of state_unified_bill_id values that are present twice: {n_dupl_ids}")

# rules selecting which copy of a duplicated bill is removed
# (row counts in parentheses are the ones recorded by the author)
is_suffix_a = bills_dupl["bill_suffix"] == "A"  # (468)
is_il_hjr = (bills_dupl["state"] == "IL") & (bills_dupl["bill_chamber"] == "HJR")  # (2)
is_tx_nj_no_overlap = (
    bills_dupl["state"].isin(["TX", "NJ"]) & bills_dupl["date_overlap_jaccard"].isna()
)  # (896)
is_wi_suffixed = (bills_dupl["state"] == "WI") & bills_dupl["bill_suffix"].notna()  # (3)

# one combined mask, so a row matching several rules is listed only once
drop_index = bills_dupl.index[
    is_suffix_a | is_il_hjr | is_tx_nj_no_overlap | is_wi_suffixed
]

# remove rows from BILLS
bills = bills.drop(drop_index)

# free the helpers; only `bills` is needed downstream
del is_suffix_a, is_il_hjr, is_tx_nj_no_overlap, is_wi_suffixed
del drop_index, n_dupl_ids, bills_dupl

number of state_unified_bill_id values that are present twice: 1369.0


In [4]:
#set index as state_unified_bill_id
# drop=False keeps the id available as a regular column as well;
# verify_integrity=True raises if any id is still duplicated.
# Reassigning (instead of inplace=True) leaves the previous frame untouched.
bills = bills.set_index(keys="state_unified_bill_id", drop=False, verify_integrity=True)

Subset BILLS - keep feature columns in BILLS_F:
- 'status' - 'state' - 'ncsl_topics' - 'ncsl_metatopics' - 'status_date'

In [5]:
# Feature columns kept from the cleaned bills table.
bills_f = bills.loc[
    :,
    ["state", "status", "ncsl_topics", "ncsl_metatopics", "status_date"],
]

# Persist the cleaned subset for later processing steps.
bills_f.to_parquet("../data/preprocessed/bills.parquet", engine="pyarrow")
bills_f

Unnamed: 0_level_0,state,status,ncsl_topics,ncsl_metatopics,status_date
state_unified_bill_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KS_HB_0000002001_2013,KS,6.0,ncsl_database__military_and_veterans_state_leg...,ncsl_metatopic__military_and_veterans_affairs,2014-05-30
KS_HB_0000002002_2013,KS,4.0,,,2014-04-30
KS_HB_0000002011_2013,KS,4.0,,,2013-05-08
KS_HB_0000002101_2013,KS,4.0,,,2014-04-30
KS_SB_0000000118_2013,KS,4.0,,,2013-04-05
...,...,...,...,...,...
MD_HB_0000001806_2012-X1,MD,1.0,,,2012-05-14
MD_HB_0000001807_2012-X1,MD,1.0,,,2012-05-14
MD_HB_0000001808_2012-X1,MD,1.0,,,2012-05-14
MD_HB_0000001809_2012-X1,MD,1.0,,,2012-05-14


Retrieve POSITIONS on bills in BILLS_F and subset the relevant columns into POSITIONS_F
- rename position numeric: oppose - neutral - support 

In [6]:
# Load all positions and keep those that refer to a bill indexed in bills_f.
positions = (
    pd.read_parquet("../data/raw/chorus/positions.parquet")
    .loc[lambda frame: frame["state_unified_bill_id"].isin(bills_f.index)]
)

In [46]:
# Mapping from the numeric position codes to readable labels.
pos_dict = {"position_numeric": {-1.0: "oppose", 0.0: "neutral", 1.0: "support"}}

# Keep the bill id, client id and position, with the position relabelled.
positions_f = (
    positions[["state_unified_bill_id", "state_client_id", "position_numeric"]]
    .replace(to_replace=pos_dict)
)
positions_f

Unnamed: 0,state_unified_bill_id,state_client_id,position_numeric
0,NJ_A_0000001363_2014,NJ_0,oppose
1,NJ_A_0000001364_2014,NJ_0,oppose
2,NJ_A_0000001483_2014,NJ_0,oppose
3,NJ_A_0000001711_2014,NJ_0,support
4,NJ_A_0000001814_2014,NJ_1,oppose
...,...,...,...
13619404,RI_S_0000002298_2022,RI_140,support
13619405,RI_S_0000002011_2022,RI_93,support
13619406,RI_S_0000002012_2022,RI_93,support
13619407,RI_S_0000002135_2022,RI_93,oppose


Load BLOCKS and subset relevant columns: 
- 'entity_id' - 'block_1' - 'record_type'
- rename 'entity_id' to 'state_client_id'
- make 'block_1' type str

In [42]:
# Load the block assignments and keep the entity id, its block_1 value and the
# record type; the id column is renamed to match POSITIONS_F and block_1 is
# stored as strings.
blocks = pd.read_parquet("../data/raw/chorus/block_assignments.parquet")
blocks_f = (
    blocks[["entity_id", "block_1", "record_type"]]
    .rename(columns={'entity_id': 'state_client_id'})
    .astype({"block_1": str})
)
blocks_f

Unnamed: 0,state_client_id,block_1,record_type
0,MA_1,150,lobbying
1,MA_10,20,lobbying
2,MA_100,209,lobbying
3,MA_1000,128,lobbying
4,MA_1001,169,lobbying
...,...,...,...
82631,RI_S_0000002245_2022,5,lobbying
82632,RI_H_0000007427_2018,5,lobbying
82633,RI_S_0000002475_2018,5,lobbying
82634,RI_H_0000005127_2019,5,lobbying


Match positions from POSITIONS_F with 'state_client_id' in BLOCKS
- 'block_1' is used for the orgs
- NaNs are changed to 'N' (block value indicating that no orgs lobbied)

In [47]:
# Attach block_1 and record_type to every position via the client id
# (default inner join: positions without a matching block row are dropped).
# NOTE(review): the join key is state_client_id only; if blocks_f holds several
# rows per client (e.g. one per record_type) each position row is repeated --
# confirm this is intended.
# NOTE(review): the cell overwrites its own input, so running it a second time
# merges the block columns again -- re-create positions_f before re-running.
positions_f = pd.merge(positions_f, blocks_f, on="state_client_id")
positions_f

Unnamed: 0,state_unified_bill_id,state_client_id,position_numeric,block_1,record_type
0,NJ_A_0000001363_2014,NJ_0,oppose,63,lobbying
1,NJ_A_0000001364_2014,NJ_0,oppose,63,lobbying
2,NJ_A_0000001483_2014,NJ_0,oppose,63,lobbying
3,NJ_A_0000001711_2014,NJ_0,support,63,lobbying
4,NJ_A_0000002788_2014,NJ_0,oppose,63,lobbying
...,...,...,...,...,...
4878667,RI_H_0000007233_2022,RI_140,support,8,lobbying
4878668,RI_H_0000007438_2022,RI_140,support,8,lobbying
4878669,RI_H_0000008089_2022,RI_140,oppose,8,lobbying
4878670,RI_S_0000002044_2022,RI_140,support,8,lobbying


In [48]:
# Number of position rows per (bill, position, block) combination, as a flat
# table with one row per combination (unsorted, i.e. in key order).
positions_count = (
    positions_f
    .value_counts(["state_unified_bill_id", "position_numeric", "block_1"], sort=False)
    .to_frame()
    .reset_index()
)
positions_count

Unnamed: 0,state_unified_bill_id,position_numeric,block_1,count
0,AZ_HB_0000002001_2010,neutral,113,11
1,AZ_HB_0000002001_2010,neutral,157,22
2,AZ_HB_0000002001_2010,support,94,11
3,AZ_HB_0000002001_2011-X1,neutral,63,5
4,AZ_HB_0000002001_2011-X1,neutral,99,5
...,...,...,...,...
758926,WI_SR_0000000004_2021,support,88,2
758927,WI_SR_0000000006_2015,neutral,88,1
758928,WI_SR_0000000011_2021,oppose,202,2
758929,WI_SR_0000000021_2011,support,42,1


In [66]:
# Number of distinct block_1 values in the full block assignments.
blocks["block_1"].unique().size

381

In [67]:
# Number of distinct block_1 values that actually occur in positions_count.
positions_count["block_1"].unique().size

250

In [62]:
# One row per bill with a column per position label; each cell holds the list
# of block_1 values recorded for that bill and position.
# Bill/position combinations without any block are replaced via {None: ["N"]}.
# NOTE(review): the displayed result shows the plain string N in those cells
# rather than a one-element list -- confirm that this is the intended value.
positions_blocks = (
    positions_count
    .groupby(["state_unified_bill_id", "position_numeric"])["block_1"]
    .apply(list)
    .to_frame()
    .reset_index()
    .pivot(index="state_unified_bill_id", columns="position_numeric", values="block_1")
    .reset_index()
    .replace({None: ["N"]})
)

In [57]:
# Display the pivoted table (one row per bill, one column per position label).
positions_blocks

position_numeric,state_unified_bill_id,neutral,oppose,support
0,AZ_HB_0000002001_2010,"[113, 157]",N,[94]
1,AZ_HB_0000002001_2011-X1,"[63, 99]",N,N
2,AZ_HB_0000002001_2011-X2,N,N,"[1, 147, 160, 75]"
3,AZ_HB_0000002001_2014,N,N,[128]
4,AZ_HB_0000002001_2015,N,N,"[1, 147, 65, 75, 94, 99]"
...,...,...,...,...
167117,WI_SR_0000000004_2021,N,N,"[155, 219, 88]"
167118,WI_SR_0000000006_2015,[88],N,N
167119,WI_SR_0000000011_2021,N,[202],N
167120,WI_SR_0000000021_2011,N,N,[42]


In [71]:
# Count-encode the "neutral" column. The analyzer hands each cell back
# unchanged, so the vectorizer iterates over the cell itself and treats every
# item it yields as one token.
features = positions_blocks["neutral"]
enc = CountVectorizer(analyzer=lambda tokens: tokens)

# Learn the vocabulary and build the count matrix, then convert it from sparse
# to a dense array.
sparse_counts = enc.fit_transform(features)
transformed_features = sparse_counts.toarray()

# Vocabulary entries, in the column order of the count matrix.
feature_names = enc.get_feature_names_out()
feature_names

array(['0', '1', '100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '11', '110', '111', '112', '113', '114', '115',
       '116', '117', '118', '119', '12', '120', '121', '122', '123',
       '124', '125', '128', '13', '130', '131', '132', '133', '134',
       '135', '137', '138', '139', '14', '140', '141', '142', '144',
       '145', '146', '147', '148', '149', '15', '150', '154', '155',
       '157', '158', '159', '16', '160', '161', '162', '163', '165',
       '166', '168', '169', '17', '172', '174', '175', '176', '179', '18',
       '180', '181', '185', '186', '189', '19', '190', '191', '194',
       '195', '196', '199', '2', '20', '200', '201', '202', '203', '204',
       '207', '208', '209', '21', '210', '211', '212', '213', '214',
       '215', '216', '218', '219', '22', '220', '223', '224', '229', '23',
       '230', '231', '232', '233', '234', '236', '238', '24', '240',
       '243', '25', '250', '253', '255', '257', '26', '262', '265', '267',
       

In [61]:
# Inspect the configuration of the fitted vectorizer.
enc.get_params()

{'analyzer': <function __main__.<lambda>(lst)>,
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [60]:

# Count vector of the first row, as a quick check of the encoding.
transformed_features[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1])

In [50]:
# Sorted array of the distinct block_1 values present in positions_f.
block_1_ls = np.sort(positions_f["block_1"].unique())
block_1_ls

array([  0,   1,   2,   4,   5,   6,   7,   8,   9,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  48,  49,  50,  51,  52,  53,  54,
        55,  57,  58,  59,  61,  62,  63,  64,  65,  66,  68,  69,  70,
        71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,
        97,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
       111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
       124, 125, 128, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140,
       141, 142, 144, 145, 146, 147, 148, 149, 150, 154, 155, 157, 158,
       159, 160, 161, 162, 163, 165, 166, 168, 169, 172, 174, 175, 176,
       179, 180, 181, 185, 186, 189, 190, 191, 194, 195, 196, 199, 200,
       201, 202, 203, 204, 207, 208, 209, 210, 211, 212, 213, 21

In [None]:
# Split a delimited string column into lists and count-encode the items.
# NOTE(review): `dataframe`, `col_name` and `split_chr` are not assigned in any
# visible cell; this looks like the body of a helper function pasted into a
# cell -- confirm where these names come from before running it.
features = dataframe[col_name].str.split(split_chr)
# The analyzer returns each list unchanged, so the vectorizer iterates over the list items.
enc = CountVectorizer(analyzer=lambda lst: lst)
# Dense count matrix from the fitted vectorizer.
transformed_features = enc.fit_transform(features).toarray()
# Vocabulary entries reported by the fitted vectorizer.
feature_names = enc.get_feature_names_out() 

Unnamed: 0_level_0,state,status,ncsl_topics,ncsl_metatopics,status_date
state_unified_bill_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KS_HB_0000002001_2013,KS,6.0,ncsl_database__military_and_veterans_state_leg...,ncsl_metatopic__military_and_veterans_affairs,2014-05-30
KS_HB_0000002002_2013,KS,4.0,,,2014-04-30
KS_HB_0000002011_2013,KS,4.0,,,2013-05-08
KS_HB_0000002101_2013,KS,4.0,,,2014-04-30
KS_SB_0000000118_2013,KS,4.0,,,2013-04-05
...,...,...,...,...,...
MD_HB_0000001806_2012-X1,MD,1.0,,,2012-05-14
MD_HB_0000001807_2012-X1,MD,1.0,,,2012-05-14
MD_HB_0000001808_2012-X1,MD,1.0,,,2012-05-14
MD_HB_0000001809_2012-X1,MD,1.0,,,2012-05-14


In [55]:
# One row per bill with a column per position value; each cell holds the list
# of client ids recorded for that bill and position.
# NOTE(review): the grouping needs a `state` column in positions_f -- confirm
# that the frame has one at this point of the notebook.
(
    positions_f
    .groupby(["state_unified_bill_id", "position_numeric", "state"])["state_client_id"]
    .apply(list)
    .to_frame()
    .reset_index()
    .pivot(index="state_unified_bill_id", columns="position_numeric", values="state_client_id")
    .reset_index()
)

position_numeric,state_unified_bill_id,-1.0,0.0,1.0
0,AZ_HB_0000002001_2010,,"[AZ_51, AZ_243, AZ_242, AZ_51, AZ_243, AZ_242,...","[AZ_24, AZ_24, AZ_24, AZ_24, AZ_24, AZ_24, AZ_..."
1,AZ_HB_0000002001_2011-X1,,"[AZ_35, AZ_559, AZ_35, AZ_559, AZ_35, AZ_559, ...",
2,AZ_HB_0000002001_2011-X2,,"[None, None, None, None, None, None]","[AZ_268, None, AZ_63, AZ_136, AZ_6, AZ_420, AZ..."
3,AZ_HB_0000002001_2014,,"[AZ_650, AZ_650, AZ_650, AZ_650, AZ_650, AZ_65...","[None, AZ_852, AZ_41, None, AZ_40, None, AZ_85..."
4,AZ_HB_0000002001_2015,,"[None, None, None, None, None, None, None, Non...","[AZ_890, AZ_4, None, AZ_545, AZ_545, None, AZ_..."
...,...,...,...,...
185785,WI_SR_0000000006_2015,,[WI_360],
185786,WI_SR_0000000010_2021,,,"[WI_1524, WI_1650]"
185787,WI_SR_0000000011_2021,"[WI_231, WI_519]",,
185788,WI_SR_0000000021_2011,,,[WI_142]
