In [1]:
!pip install datasets
!pip install transformers 
!pip install evaluate
!pip install rouge-score
!pip install torch

You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import datasets
import re
import time
import math

# Read in files and subset

In [7]:
# read in text and summaries
all_text = pd.read_csv(Path().absolute().parents[1]/Path("modified_data")/"text_and_summaries_filtered_split.csv")

all_text.head()
# remove the rows that don't have a valid summary
no_summary_filter = all_text.state == "IN" # subset to just one state for the test run
# no_summary_filter = (summary_df.keep == 1) & (summary_df.summary != 'still need') & (~summary_df.summary.isna()) & (~summary_df.summary.fillna('').str.startswith('NA'))
all_text = all_text[no_summary_filter].copy(deep=True)
cols_keep = ['state', 'state_name', 'bill_name', 'summary', 'doc_number', 'split_text']

all_text = all_text[cols_keep]
all_text.rename({'bill_name':'title',
                 'split_text':'text',
                }, axis='columns', inplace=True)
all_text.head()

Unnamed: 0,state,state_name,title,summary,doc_number,text
294,IN,Indiana,HB1118,This bill prohibits specified health care prof...,1,Introduced Version HOUSE BILL No DIGEST OF ...
295,IN,Indiana,HB1118,This bill prohibits specified health care prof...,2,A nurse including an advanced practice regist...
296,IN,Indiana,HB1118,This bill prohibits specified health care prof...,3,professional and may bring an action in a cour...
297,IN,Indiana,HB1220,This bill prohibits a physician or other pract...,1,Introduced Version HOUSE BILL No DIGEST OF ...
298,IN,Indiana,HB1220,This bill prohibits a physician or other pract...,2,reassignment surgery or nongenital gender reas...


## Split into training and testing

In [8]:
train, test = train_test_split(all_text[['text', 'summary']], test_size = 0.2)


In [9]:
print(len(train))
print(len(test))

48
12


# Create HuggingFace Objects

In [10]:
train_dataset = datasets.Dataset.from_dict(train)
test_dataset = datasets.Dataset.from_dict(test)
billsum = datasets.DatasetDict({"train": train_dataset,
                                "test": test_dataset
                               })

In [8]:
# from datasets import load_dataset

# Data Loading

In [9]:
# # Load the smaller California state bill subset of the BillSum dataset
# billsum = load_dataset("billsum", split="ca_test")

# # Split into train and test dataset
# billsum = billsum.train_test_split(test_size=0.2)

In [11]:
billsum['test'][0]['summary']

"This bill provides that the state of Indiana, a political subdivision or other governmental entity of the state of Indiana, a government official, or any other person acting under the color of law shall not infringe on the fundamental right of a parent to direct the upbringing, education, health care, and mental health of the parent's child without demonstrating that the infringement: (1) is required by a compelling governmental interest of the highest order as long recognized in the history and traditions of the state of Indiana; and (2) as applied to the child, is narrowly tailored and not otherwise served by a less restrictive means. This bill also creates a right of action for violation of a parent's rights with respect to the upbringing, education, and health care of the parent's child. This bill provides that a child is not a child in need of services due to the child's parent, guardian, or custodian: (1) referring to and raising the child consistent with the child's biological 

In [11]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# Load Tokenizer

In [12]:
## PREPROCESSING
from transformers import AutoTokenizer
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

2023-04-17 23:29:28.580142: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [13]:
#The preprocessing function you want to create needs to:

# 1. Prefix the input with a prompt so T5 knows this is a summarization task. 
#    Some models capable of multiple NLP tasks require prompting for specific tasks.
# 2. Use the keyword text_target argument when tokenizing labels.
# 3. Truncate sequences to be no longer than the maximum length set by the max_length parameter.

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    # Tokenizes and truncates
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
# To apply the preprocessing function over the entire dataset
# You can speed up the map function by setting batched=True to process multiple elements of the dataset at once
tokenized_billsum = billsum.map(preprocess_function, batched=True)
# tokenized_billsum = billsum.map(preprocess_function)

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [15]:
tokenized_billsum['train']

Dataset({
    features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 48
})

In [16]:
# Create batch of examples using DataCollatorForSeq2Seq. It's better to dynamically pad the sentences to the longest length
# in a batch during collation instead of padding to the model's maximum length
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Set Up Evaluation

In [17]:
## SET UP EVALUATOR

import evaluate
import numpy as np

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Create function that passes predictions and labels to compute the ROUGE metric
# This is what is used for model training
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Train Model

In [18]:
#model.save_weights("/content/gdrive/My Drive/weights.h5")

In [18]:
# output_directory = "/content/gdrive/My Drive/my_awesome_billsum_model"
output_directory = Path().absolute().parents[1]/Path("modified_data")
model_directory =  Path().absolute()/Path("summarizer_model")

In [19]:
import torch

In [20]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
#     output_dir="/content/gdrive/My Drive/my_awesome_billsum_model",
    output_dir = output_directory,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
#     fp16=True,
    push_to_hub=False, # changed to false
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [21]:
start = time.time()
trainer.train()
end = time.time()
print(end - start)

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 48
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,3.825474,0.1661,0.0587,0.1327,0.1357,19.0
2,No log,3.755203,0.1687,0.0587,0.1366,0.1392,19.0
3,No log,3.710465,0.1696,0.0622,0.1395,0.1415,19.0
4,No log,3.692454,0.1696,0.0622,0.1395,0.1415,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12
 

310.2452349662781


In [22]:
print((end - start)/60)

5.170753916104634


In [24]:
model_directory

PosixPath('/Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model')

In [25]:
# Save the model
# trainer.save_model("/content/gdrive/My Drive/my_awesome_billsum_model")
trainer.save_model(model_directory)

Saving model checkpoint to /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model
Configuration saved in /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/config.json
Model weights saved in /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/pytorch_model.bin
tokenizer config file saved in /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/tokenizer_config.json
Special tokens file saved in /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/special_tokens_map.json
Copy vocab file to /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/spiece.model


# Test Model

* Test one one bill

In [33]:
# text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
test_bill_text = all_text[(all_text.state == 'IN') & (all_text.title == 'HB1118')]
test_bill_text

Unnamed: 0,state,state_name,title,summary,doc_number,text
294,IN,Indiana,HB1118,This bill prohibits specified health care prof...,1,Introduced Version HOUSE BILL No DIGEST OF ...
295,IN,Indiana,HB1118,This bill prohibits specified health care prof...,2,A nurse including an advanced practice regist...
296,IN,Indiana,HB1118,This bill prohibits specified health care prof...,3,professional and may bring an action in a cour...


In [43]:
from transformers import pipeline

summarizer = pipeline("summarization", model=str(model_directory))

compiled_summary = []

for each_bill_part in list(test_bill_text.text):
    print("BILL SUBSET:")
    print("\t", each_bill_part)
    print("BILL SUMMARY:")
    model_summary = summarizer(each_bill_part)
    print(model_summary)
    compiled_summary.append(model_summary[0]['summary_text'])
    print("--------------------------------------------------------------------------")
    

loading configuration file /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/config.json
Model config T5Config {
  "_name_or_path": "/Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
     

BILL SUBSET:
	   Introduced Version HOUSE BILL No DIGEST OF  INTRODUCED B ILL Citations Affected   IC IC IC Synopsis   Prohibited services relating to care of minors  Prohibits specified health care professionals from performing  or causing to be performed  certain medical procedures on a minor  or subjecting a minor to certain activities that purposely attempt to change  reinforce  or affirm a minor s perception of the minor s own sexual attraction or sexual behavior  or attempt to change  reinforce  or affirm a minor s gender identity when the identity is inconsistent with the minor s biological sex   Effective   July DI PRINTING CODE  Amendments  Whenever an existing statute  or a section of the Indiana Constitution  is being amended  the text of the existing provision will appear in this style type  additions will appear in this sty le type  and deletions will appear in this style type    Additions  Whenever a new statutory provision is being enacted  or a new constitutional provis

In [45]:
print("Actual Summary:\n\t", test_bill_text.summary.unique())
print("Model Summary:\n\t", ' '.join(compiled_summary))

Actual Summary:
	 ["This bill prohibits specified health care professionals from performing, or causing to be performed, certain medical procedures on a minor or subjecting a minor to certain activities that purposely attempt to change, reinforce, or affirm a minor's perception of the minor's own sexual attraction or sexual behavior, or attempt to change, reinforce, or affirm a minor's gender identity when the identity is inconsistent with the minor's biological sex."]
Model Summary:
	 HOUSE BILL No DIGEST OF INTRODUCED B ILL Citations Affected IC IC Synopsis Prohibited services relating to care of minors Prohibits specified health care professionals from performing or subjecting a minor to certain activities that purposely attempt to change reinforce or affirma minor s own sexual attraction or sexual behavior . health care professional may not purposely attempt to change reinforce or affirm a minor's perception of the minor s own sexual attraction or sexual behavior . a person who has