## EDA on ACLU Bill Data

In [103]:
# IMPORT PACKAGES
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re
import wordninja
from nltk.corpus import stopwords
import ast
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miamayerhofer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
# LOAD IN THE DATA
data = pd.read_csv("../modified_data/merged_bill_data.csv")
# Drop the unnamed column
data = data.drop(data.columns[0], axis = 1)
# Get the number of characters in each bill
data["number_characters"] = data["text"].str.len()

In [128]:
# Find section of the pdf text after the "be it enacted by"
def text_shortener(list_of_strings):
    # Make all lowercase
    long_string = list_of_strings.replace(" ", "")
    # Convert to real list
    long_list = ast.literal_eval(long_string)
    starting_index = 0
    for i in range(len(long_list)):
        # Set the starting index of the content of the bill
        if "beitenactedby" in long_list[i].lower(): 
            starting_index = i
    shortened_list = long_list[starting_index:]
    new_string = ""
    for i in range(len(shortened_list)):
        # If there is nothing in the string
        if len(list_of_strings[i]) == 0:
            continue
        # If the string just contains space characters
        if list_of_strings[i].isspace():
            continue
        # If the string just contains digits
        if list_of_strings[i].isdigit():
            continue
        # Remove any digits and add to the shortened bill text string
        curr_string = re.sub(r'\d+', '', shortened_list[i])
        new_string = new_string + curr_string
    if "NewTextUnderlinedDELETEDTEXTBRACKETED" in new_string:
        new_string.replace("NewTextUnderlinedDELETEDTEXTBRACKETED", "")
    return(new_string)

In [119]:
# Make a new column with the shortened bill text using the function above
data["shortened_text"] = data["text"].apply(text_shortener)

In [127]:
data["shortened_text"][0]

'ABILLFORANACTENTITLEDAnActrelatingtoschoolathleticsrecreationathleticteamsandsportsBEITENACTEDBYTHELEGISLATUREOFTHESTATEOFALASKASectionASisamendedbyaddinganewsubsectiontoreaddInthissectionsexmeansbiologicalsexSecASisamendedbyaddingnewsectionstoreadArticleDesignationofAthleticTeamsandSportsSecAthleticteamandsportdesignationaApublicschooloraprivateschoolwhosestudentsorteamscompeteagainstapublicschoolmustdesignateeachschoolsponsoredathleticteamorsportamalemenorboysteamorsportfemalewomenorgirlsteamorsportorcoeducationalormixedteamorsportbAstudentwhoparticipatesinanathleticteamorsportdesignatedfemalewomenorgirlsmustbefemalebasedontheparticipantsbiologicalsexSecComplianceprotectedAgovernmentalentitylicensingorLSAHBHBaNewTextUnderlinedDELETEDTEXTBRACKETED'

### Tokenize Option: Word Ninja Inference Tokenization

In [132]:
# Function to tokenize each line of a bill with word ninja
def word_ninja_tokenize(string):
    return wordninja.split(string)

In [133]:
# Make a new column of infered words
data["infered_wordninja_words"] = data["shortened_text"].apply(word_ninja_tokenize)

In [135]:
# Removing stop words
stop_words = stopwords.words('english')
data["infered_wordninja_words_no_stopwords"] = ""
for i in range(len(data["infered_wordninja_words"])):
    curr_tokens = data["infered_wordninja_words"][i]
    tokens_no_stopwords = [word for word in curr_tokens if word.lower() not in stop_words]
    data["infered_wordninja_words_no_stopwords"][i] = tokens_no_stopwords
# Get the number of tokens with NLTK in each bill
data["number_wordninja_tokens"] = [len(token_list) for token_list in data["infered_wordninja_words"]]
data["number_wordninja_tokens_no_stopwords"] = [len(token_list) for token_list in data["infered_wordninja_words_no_stopwords"]]
# Get the number of bills with less than or equal to 512 tokens
print(len(data[(data["number_wordninja_tokens"] < 512)]))
print(len(data[(data["number_wordninja_tokens_no_stopwords"] < 512)]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["infered_wordninja_words_no_stopwords"][i] = tokens_no_stopwords


141
195


## Testing Transformers

Source: https://anubhav20057.medium.com/step-by-step-guide-abstractive-text-summarization-using-roberta-e93978234a90

In [66]:
import transformers
from transformers import RobertaTokenizerFast, Seq2SeqTrainer, EncoderDecoderModel, TrainingArguments
from typing import Optional

In [67]:
# TOKENIZING WITH ROBERTA
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
batch_size = 256  
encoder_max_length = 40
decoder_max_length = 8

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [68]:
def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["Text"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["Summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because RoBERTa automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch