## EDA on ACLU Bill Data + Merging All Data Together

In [2]:
# IMPORT PACKAGES
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re
import wordninja
from nltk.corpus import stopwords
import ast
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miamayerhofer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# LOAD IN THE DATA
data = pd.read_csv("../modified_data/merged_bill_data.csv")
# Drop the unnamed column
data = data.drop(data.columns[0], axis = 1)
# Get the number of characters in each bill
data["number_characters"] = data["text"].str.len()

In [4]:
# Find section of the pdf text after the "be it enacted by"
def text_shortener(list_of_strings):
    # Make all lowercase
    long_string = list_of_strings.replace(" ", "")
    # Convert to real list
    long_list = ast.literal_eval(long_string)
    starting_index = 0
    for i in range(len(long_list)):
        # Set the starting index of the content of the bill
        if "beitenactedby" in long_list[i].lower(): 
            starting_index = i
    shortened_list = long_list[starting_index:]
    new_string = ""
    for i in range(len(shortened_list)):
        # If there is nothing in the string
        if len(list_of_strings[i]) == 0:
            continue
        # If the string just contains space characters
        if list_of_strings[i].isspace():
            continue
        # If the string just contains digits
        if list_of_strings[i].isdigit():
            continue
        # Remove any digits and add to the shortened bill text string
        curr_string = re.sub(r'\d+', '', shortened_list[i])
        new_string = new_string + curr_string
    if "NewTextUnderlinedDELETEDTEXTBRACKETED" in new_string:
        new_string.replace("NewTextUnderlinedDELETEDTEXTBRACKETED", "")
    return(new_string)

In [5]:
# Make a new column with the shortened bill text using the function above
data["shortened_text"] = data["text"].apply(text_shortener)

### Tokenize Option: Word Ninja Inference Tokenization

In [6]:
# Function to tokenize each line of a bill with word ninja
def word_ninja_tokenize(string):
    return wordninja.split(string)

In [7]:
# Make a new column of infered words
data["infered_wordninja_words"] = data["shortened_text"].apply(word_ninja_tokenize)

In [8]:
# Removing stop words
stop_words = stopwords.words('english')
data["infered_wordninja_words_no_stopwords"] = ""
for i in range(len(data["infered_wordninja_words"])):
    curr_tokens = data["infered_wordninja_words"][i]
    tokens_no_stopwords = [word for word in curr_tokens if word.lower() not in stop_words]
    data["infered_wordninja_words_no_stopwords"][i] = tokens_no_stopwords
# Get the number of tokens with NLTK in each bill
data["number_wordninja_tokens"] = [len(token_list) for token_list in data["infered_wordninja_words"]]
data["number_wordninja_tokens_no_stopwords"] = [len(token_list) for token_list in data["infered_wordninja_words_no_stopwords"]]
# Get the number of bills with less than or equal to 512 tokens
print(len(data[(data["number_wordninja_tokens"] < 512)]))
print(len(data[(data["number_wordninja_tokens_no_stopwords"] < 512)]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["infered_wordninja_words_no_stopwords"][i] = tokens_no_stopwords


141
195


In [9]:
data

Unnamed: 0,full_state,link,bill_name,category,status,state,text,number_characters,shortened_text,infered_wordninja_words,infered_wordninja_words_no_stopwords,number_wordninja_tokens,number_wordninja_tokens_no_stopwords
0,Alaska,https://www.akleg.gov/basis/Bill/Detail/33?Roo...,HB27,Schools & Education,"Referred to committee, 01/19/2023",AK,"['', 'HB0027a1HB27', 'NewTextUnderlinedDELETED...",2475,ABILLFORANACTENTITLEDAnActrelatingtoschoolathl...,"[A, BILL, FOR, AN, ACT, ENTITLED, An, Act, rel...","[BILL, ACT, ENTITLED, Act, relating, school, a...",158,105
1,Alaska,https://www.akleg.gov/basis/Bill/Detail/33?Roo...,HB105,Schools & Education,"First read and referred to committee, 03/08/2023",AK,"['', 'HB0105a1HB105', 'NewTextUnderlinedDELETE...",6759,ABILLFORANACTENTITLEDAnActrelatingtoparentalri...,"[A, BILL, FOR, AN, ACT, ENTITLED, An, Act, rel...","[BILL, ACT, ENTITLED, Act, relating, parental,...",1076,631
2,Alaska,https://www.akleg.gov/basis/Bill/Detail/33?Roo...,SB96,Schools & Education,"First read and referred to committee, 03/08/2023",AK,"['', 'SB0096A1SB96', 'NewTextUnderlinedDELETED...",6753,ABILLFORANACTENTITLEDAnActrelatingtoparentalri...,"[A, BILL, FOR, AN, ACT, ENTITLED, An, Act, rel...","[BILL, ACT, ENTITLED, Act, relating, parental,...",1078,628
3,Arizona,https://apps.azleg.gov/BillStatus/BillOverview...,SB1028,Free Speech & Expression,"Passed Senate; House second read, 03/22/2023",AZ,"['', 'iSenateEngrossedadultcabaretperformances...",1339,BeitenactedbytheLegislatureoftheStateofArizona...,"[Be, it, enacted, by, the, Legislature, of, th...","[enacted, Legislature, State, Arizona, Section...",188,103
4,Arizona,https://apps.azleg.gov/BillStatus/BillOverview...,SB1026,Free Speech & Expression,"Passed Senate; House second read, 03/09/2023",AZ,"['', 'iSenateEngrossedstatemoniesdragshowsmino...",1922,BeitenactedbytheLegislatureoftheStateofArizona...,"[Be, it, enacted, by, the, Legislature, of, th...","[enacted, Legislature, State, Arizona, Section...",243,148
...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,Wyoming,https://wyoleg.gov/Legislation/2023/SF0111,SF111,Healthcare,Passed Senate; House did not consider for intr...,WY,"['2023STATEOFWYOMING23LSO0215', '1SF0111SENATE...",2160,BeItEnactedbytheLegislatureoftheStateofWyoming...,"[Be, It, Enacted, by, the, Legislature, of, th...","[Enacted, Legislature, State, Wyoming, dA, per...",214,115
428,Wyoming,https://wyoleg.gov/Legislation/2023/SF0117,SF117,Schools & Education,Passed Senate; House did not consider for intr...,WY,"['2023STATEOFWYOMING23LSO0156', 'ENGROSSED', '...",7938,BeItEnactedbytheLegislatureoftheStateofWyoming...,"[Be, It, Enacted, by, the, Legislature, of, th...","[Enacted, Legislature, State, Wyoming, STATE, ...",967,559
429,Wyoming,https://wyoleg.gov/Legislation/2023/HB0262,HB262,Civil Rights,"Died in committee, 02/07/2023",WY,"['2023STATEOFWYOMING23LSO0575', '1HB0262HOUSEB...",3739,BeItEnactedbytheLegislatureoftheStateofWyoming...,"[Be, It, Enacted, by, the, Legislature, of, th...","[Enacted, Legislature, State, Wyoming, RELIGIO...",386,234
430,Wyoming,https://wyoleg.gov/Legislation/2023/SF0133,SF133,Schools & Education,"Became act without Governor’s signature, 03/17...",WY,"['ORIGINALSENATEENGROSSED', 'FILENOSF0133', 'E...",17664,BeItEnactedbytheLegislatureoftheStateofWyoming...,"[Be, It, Enacted, by, the, Legislature, of, th...","[Enacted, Legislature, State, Wyoming, Section...",2611,1546


### Merge with Legiscan Summaries

In [11]:
# Read in legiscan summaries for merging
summarydf = pd.read_csv("../modified_data/legiscan_summaries.csv")
summarydf.drop(['Unnamed: 0'], axis = 1, inplace = True)
# Merge
all_data = pd.merge(summarydf, data, on = ["full_state", "bill_name"])

In [12]:
all_data

Unnamed: 0,full_state,bill_name,legiscan_summary,link,category,status,state,text,number_characters,shortened_text,infered_wordninja_words,infered_wordninja_words_no_stopwords,number_wordninja_tokens,number_wordninja_tokens_no_stopwords
0,Alaska,HB27,"An Act relating to school athletics, recreatio...",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,Schools & Education,"Referred to committee, 01/19/2023",AK,"['', 'HB0027a1HB27', 'NewTextUnderlinedDELETED...",2475,ABILLFORANACTENTITLEDAnActrelatingtoschoolathl...,"[A, BILL, FOR, AN, ACT, ENTITLED, An, Act, rel...","[BILL, ACT, ENTITLED, Act, relating, school, a...",158,105
1,Alaska,HB27,"An Act relating to school athletics, recreatio...",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,Schools & Education,"Referred to committee, 01/19/2023",AK,"['', 'HB0027a1HB27', 'NewTextUnderlinedDELETED...",2475,ABILLFORANACTENTITLEDAnActrelatingtoschoolathl...,"[A, BILL, FOR, AN, ACT, ENTITLED, An, Act, rel...","[BILL, ACT, ENTITLED, Act, relating, school, a...",158,105
2,Alaska,HB27,"An Act relating to school athletics, recreatio...",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,Schools & Education,"Referred to committee, 01/19/2023",AK,"['', 'HB0027a1HB27', 'NewTextUnderlinedDELETED...",2475,ABILLFORANACTENTITLEDAnActrelatingtoschoolathl...,"[A, BILL, FOR, AN, ACT, ENTITLED, An, Act, rel...","[BILL, ACT, ENTITLED, Act, relating, school, a...",158,105
3,Alaska,HB27,"An Act relating to school athletics, recreatio...",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,Schools & Education,"Referred to committee, 01/19/2023",AK,"['', 'HB0027a1HB27', 'NewTextUnderlinedDELETED...",2475,ABILLFORANACTENTITLEDAnActrelatingtoschoolathl...,"[A, BILL, FOR, AN, ACT, ENTITLED, An, Act, rel...","[BILL, ACT, ENTITLED, Act, relating, school, a...",158,105
4,Alaska,HB27,"An Act relating to school athletics, recreatio...",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,Schools & Education,"Referred to committee, 01/19/2023",AK,"['', 'HB0027a1HB27', 'NewTextUnderlinedDELETED...",2475,ABILLFORANACTENTITLEDAnActrelatingtoschoolathl...,"[A, BILL, FOR, AN, ACT, ENTITLED, An, Act, rel...","[BILL, ACT, ENTITLED, Act, relating, school, a...",158,105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,New Hampshire,HB619,Prohibiting gender transition procedures for m...,https://www.gencourt.state.nh.us/bill_status/b...,Schools & Education,"Retained in committee, 03/15/2023",NH,"['HB619FNASINTRODUCED', '2023SESSION', '230071...",26874,BeitEnactedbytheSenateandHouseofRepresentative...,"[Be, it, Enacted, by, the, Senate, and, House,...","[Enacted, Senate, House, Representatives, Gene...",3938,2363
395,New Hampshire,SB272,Establishing a parents' bill of rights in educ...,https://www.gencourt.state.nh.us/bill_status/b...,Schools & Education,"Passed Senate; House public hearing, 04/18/2023",NH,"['SB272FNASINTRODUCED', '2023SESSION', '231062...",13921,BeitEnactedbytheSenateandHouseofRepresentative...,"[Be, it, Enacted, by, the, Senate, and, House,...","[Enacted, Senate, House, Representatives, Gene...",1930,1096
396,New Hampshire,HB417,Relative to the definition of child abuse.,https://gencourt.state.nh.us/bill_status/legac...,Healthcare,"Inexpedient to legislate, 03/22/2023",NH,"['HB417FNASINTRODUCED', '2023SESSION', '230257...",5680,BeitEnactedbytheSenateandHouseofRepresentative...,"[Be, it, Enacted, by, the, Senate, and, House,...","[Enacted, Senate, House, Representatives, Gene...",609,398
397,New Jersey,S3076,"Establishes ""Child Protection and Anti-Mutilat...",https://www.njleg.state.nj.us/bill-search/2022...,Healthcare,"Active, 01/10/2023",NJ,"['', 'SENATENo3076', '', 'STATEOFNEWJERSEY', '...",4556,BEITENACTEDbytheSenateandGeneralAssemblyoftheS...,"[BE, IT, ENACTED, by, the, Senate, and, Genera...","[ENACTED, Senate, General, Assembly, State, Ne...",591,343


In [13]:
all_data.to_csv("../modified_data/all_data_initial.csv")