## EDA on ACLU Bill Data

In [60]:
# IMPORT PACKAGES
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re
import wordninja
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miamayerhofer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
# LOAD IN THE DATA
data = pd.read_csv("../modified_data/merged_bill_data.csv")
# Drop the unnamed column
data = data.drop(data.columns[0], axis = 1)
# Get the number of characters in each bill
data["number_characters"] = data["text"].str.len()

### Tokenize Option 1: NLTK word_tokenize()

In [62]:
data["nltk_tokens"] = [word_tokenize(text) for text in data["text"]]
# Removing stop words
data["nltk_tokens_no_stopwords"] = ""
stop_words = set(stopwords.words("english"))
for i in range(len(data["nltk_tokens"])):
    curr_tokens = data["nltk_tokens"][i]
    tokens_no_stopwords = [word for word in curr_tokens if word.lower() not in stop_words]
    data["nltk_tokens_no_stopwords"][i] = tokens_no_stopwords
# Get the number of tokens with NLTK in each bill
data["number_nltk_tokens"] = [len(token_list) for token_list in data["nltk_tokens"]]
data["number_nltk_tokens_no_stopwords"] = [len(token_list) for token_list in data["nltk_tokens_no_stopwords"]]
# Get the number of bills with less than or equal to 512 tokens
print(len(data[(data["number_nltk_tokens"] < 512)]))
print(len(data[(data["number_nltk_tokens_no_stopwords"] < 512)]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["nltk_tokens_no_stopwords"][i] = tokens_no_stopwords


86
132


### Tokenize Option 2: Word Ninja Inference Tokenization (after removing all whitespace)

In [49]:
# Make a column of the text without spaces
data["text_no_spaces"] = data["text"].str.replace(r'\s+', '')
# Make a new column of infered words
data["infered_wordninja_words"] = ""
# Infer the words from this new column
for i in range(len(data.text_no_spaces)):
    data.infered_wordninja_words[i] = wordninja.split(data.text_no_spaces[i])

  data["text_no_spaces"] = data["text"].str.replace(r'\s+', '')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.infered_wordninja_words[i] = wordninja.split(data.text_no_spaces[i])


In [63]:
# Removing stop words
data["infered_wordninja_words_no_stopwords"] = ""
for i in range(len(data["infered_wordninja_words"])):
    curr_tokens = data["infered_wordninja_words"][i]
    tokens_no_stopwords = [word for word in curr_tokens if word.lower() not in stop_words]
    data["infered_wordninja_words_no_stopwords"][i] = tokens_no_stopwords
# Get the number of tokens with NLTK in each bill
data["number_wordninja_tokens"] = [len(token_list) for token_list in data["infered_wordninja_words"]]
data["number_wordninja_tokens_no_stopwords"] = [len(token_list) for token_list in data["infered_wordninja_words_no_stopwords"]]
# Get the number of bills with less than or equal to 512 tokens
print(len(data[(data["number_wordninja_tokens"] < 512)]))
print(len(data[(data["number_wordninja_tokens_no_stopwords"] < 512)]))

85
150


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["infered_wordninja_words_no_stopwords"][i] = tokens_no_stopwords


## Testing Transformers

Source: https://anubhav20057.medium.com/step-by-step-guide-abstractive-text-summarization-using-roberta-e93978234a90

In [66]:
import transformers
from transformers import RobertaTokenizerFast, Seq2SeqTrainer, EncoderDecoderModel, TrainingArguments
from typing import Optional

In [67]:
# TOKENIZING WITH ROBERTA
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
batch_size = 256  
encoder_max_length = 40
decoder_max_length = 8

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [68]:
def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["Text"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["Summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because RoBERTa automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch