In [1]:
!pip install datasets
!pip install transformers 
!pip install evaluate
!pip install rouge-score
!pip install torch

You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/corrina/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import datasets
import re
import time
import math

# Read in files and subset

In [3]:
# read in text and summaries
summary_df = pd.read_csv(Path().absolute().parents[1]/Path("modified_data")/"text_and_summaries.csv")

# remove the rows that don't have a valid summary
# no_summary_filter = full_text.state == "IN" # subset to just one state for the test run
no_summary_filter = (summary_df.keep == 1) & (summary_df.summary != 'still need') & (~summary_df.summary.isna()) & (~summary_df.summary.fillna('').str.startswith('NA'))
summary_df = summary_df[no_summary_filter].copy(deep=True)
summary_cols_keep = ['state_code', 'state_name', 'bill_name', 'summary']

# read in the full text of the bills scraped from the PDFs
full_text =  pd.read_csv(Path().absolute().parents[1]/Path("modified_data")/"bill_texts_full_text.csv")
full_text_cols_keep = ['state', 'bill_name', 'cleaned_text']

all_text = summary_df[summary_cols_keep].merge(full_text[full_text_cols_keep],
                                               how = 'inner',
                                               left_on = ['state_code', 'bill_name'],
                                               right_on = ['state', 'bill_name'])
all_text.rename({'bill_name':'title',
                 'cleaned_text':'text',
                }, axis='columns', inplace=True)
all_text.head()

Unnamed: 0,state_code,state_name,title,summary,state,text
0,AK,Alaska,HB105,This bill adds to the list of parental rights ...,AK,Section AS 03 a is amended to read a A local s...
1,AK,Alaska,SB96,This bill adds to the list of parental rights ...,AK,Section AS 03 a is amended to read a A local s...
2,AR,Arkansas,HB1156,This bill would apply to multiple-occupancy re...,AR,SECTION Arkansas Code Title Chapter Sub chapte...
3,AR,Arkansas,SB199,This bill imposes severe consequences aimed so...,AR,SECTION Arkansas Code Title Chapter is amended...
4,AR,Arkansas,SB294,This bill is a comprehensive education reform ...,AR,SECTION DO NOT CODIFY Title This act shall be ...


## Split into training and testing

In [4]:
train, test = train_test_split(all_text[['text', 'summary']], test_size = 0.2)


In [6]:
print(len(train))
print(len(test))

188
47


# Create HuggingFace Objects

In [7]:
train_dataset = datasets.Dataset.from_dict(train)
test_dataset = datasets.Dataset.from_dict(test)
billsum = datasets.DatasetDict({"train": train_dataset,
                                "test": test_dataset
                               })

In [8]:
# from datasets import load_dataset

# Data Loading

In [9]:
# # Load the smaller California state bill subset of the BillSum dataset
# billsum = load_dataset("billsum", split="ca_test")

# # Split into train and test dataset
# billsum = billsum.train_test_split(test_size=0.2)

In [10]:
billsum['test'][0]['summary']

'A BILL for an Act to create and enact a new section to chapter 14-02.4, a new section to chapter 15.1-07, and two new sections to chapter 15.1-21 of the North Dakota Century Code, relating to school discrimination, parental rights and involvement in school, curbing of social emotional learning, and the review and recommendation of instructional materials; to amend and reenact subsection 6 of section 14-02.4-02 and section 15.1-21-24 of the North Dakota Century Code, relating to the definition of a discriminatory practice and reproductive health education requirements; and to provide a penalty.'

In [11]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# Load Tokenizer

In [12]:
## PREPROCESSING
from transformers import AutoTokenizer
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [13]:
#The preprocessing function you want to create needs to:

# 1. Prefix the input with a prompt so T5 knows this is a summarization task. 
#    Some models capable of multiple NLP tasks require prompting for specific tasks.
# 2. Use the keyword text_target argument when tokenizing labels.
# 3. Truncate sequences to be no longer than the maximum length set by the max_length parameter.

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    # Tokenizes and truncates
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
# To apply the preprocessing function over the entire dataset
# You can speed up the map function by setting batched=True to process multiple elements of the dataset at once
tokenized_billsum = billsum.map(preprocess_function, batched=True)
# tokenized_billsum = billsum.map(preprocess_function)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
tokenized_billsum['train']

Dataset({
    features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 188
})

In [16]:
# Create batch of examples using DataCollatorForSeq2Seq. It's better to dynamically pad the sentences to the longest length
# in a batch during collation instead of padding to the model's maximum length
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Set Up Evaluation

In [17]:
## SET UP EVALUATOR

import evaluate
import numpy as np

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Create function that passes predictions and labels to compute the ROUGE metric
# This is what is used for model training
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Train Model

In [18]:
#model.save_weights("/content/gdrive/My Drive/weights.h5")

In [32]:
# output_directory = "/content/gdrive/My Drive/my_awesome_billsum_model"
output_directory = Path().absolute().parents[1]/Path("modified_data")
model_directory =  Path().absolute()/Path("summarizer_model")

In [20]:
import torch

In [21]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
#     output_dir="/content/gdrive/My Drive/my_awesome_billsum_model",
    output_dir = output_directory,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
#     fp16=True,
    push_to_hub=False, # changed to false
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



NOTE: Redirects are currently not supported in Windows or MacOs.


In [22]:
start = time.time()
trainer.train()
end = time.time()
print(end - start)

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 188
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 48
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,3.662995,0.1806,0.0848,0.1487,0.1484,19.0
2,No log,3.307576,0.1731,0.0788,0.1428,0.1425,19.0
3,No log,3.127627,0.1732,0.0795,0.1421,0.1419,19.0
4,No log,3.077767,0.1728,0.0773,0.1419,0.1419,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 47
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 47
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 47
 

6893.898317098618


In [51]:
print((end - start)/60)

114.89830528497696


In [27]:
output_directory

PosixPath('/Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/modified_data')

In [33]:
# Save the model
# trainer.save_model("/content/gdrive/My Drive/my_awesome_billsum_model")
trainer.save_model(model_directory)

Saving model checkpoint to /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model
Configuration saved in /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/config.json
Model weights saved in /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/pytorch_model.bin
tokenizer config file saved in /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/tokenizer_config.json
Special tokens file saved in /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/special_tokens_map.json
Copy vocab file to /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/spiece.model


# Test Model

In [37]:
# text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
bill_text = list(all_text.text)[4]
chunk_size = 512
chunks = len(bill_text) // chunk_size
bill_text_parts = [ bill_text[i:i + chunk_size] for i in range(0, len(bill_text), chunk_size)]
bill_text_parts

['SECTION DO NOT CODIFY Title This act shall be known and may be cited as the LEARNS Act 35 SECTION Arkansas Code 10 d concerning school resource As Engrossed H 27 SB 2 27 11 52 T NL officer training requirements is amended to read as follows d Sworn non supervisory law enforcement personnel including without limitation school resource officers who are assigned to a public school campus during the instructional day or employed by a public school district shall A Within eighteen months of being assigned or emp',
 'loyed by the public school district i a Complete a forty hour basic school resource officer training program developed and provided or approved by the Arkansas Center for School Safety of the Criminal Justice Institute b The training required under subdivision d A i a of this section shall in cl ude without limitation 1 The roles and responsibilities of school resource officers in public schools 2 Laws that are specific to public schools and students in public schools and 3 Ad

In [52]:
# from transformers import pipeline

# summarizer = pipeline("summarization", model=str(model_directory))


print(summarizer(bill_text_parts[0]))
#     print("--------------------------------------------------------------------------")
    

Your max_length is set to 200, but you input_length is only 113. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


[{'summary_text': 'this act shall be known and may be cited as the LEARNS Act 35 SECTION Arkansas Code 10 d concerning school resource . d Sworn non supervisory law enforcement personnel including without limitation school resource officers who are assigned to a public school campus during the instructional day or employed by an public school district shall A Within eighteen months of being assigned or emprisoned .'}]


In [48]:
from transformers import pipeline

summarizer = pipeline("summarization", model=str(model_directory))

for each_bill_part in bill_text_parts[0:5]:
    print("BILL SUBSET:")
    print("\t", each_bill_part)
    print("BILL SUMMARY:")
    print(summarizer(each_bill_part))
    print("--------------------------------------------------------------------------")
    

loading configuration file /Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model/config.json
Model config T5Config {
  "_name_or_path": "/Users/corrina/Documents/anly-521/final_project/ANLY521_Final_Project/code/summarization/summarizer_model",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
     

BILL SUBSET:
	 SECTION DO NOT CODIFY Title This act shall be known and may be cited as the LEARNS Act 35 SECTION Arkansas Code 10 d concerning school resource As Engrossed H 27 SB 2 27 11 52 T NL officer training requirements is amended to read as follows d Sworn non supervisory law enforcement personnel including without limitation school resource officers who are assigned to a public school campus during the instructional day or employed by a public school district shall A Within eighteen months of being assigned or emp
BILL SUMMARY:


Your max_length is set to 200, but you input_length is only 106. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


[{'summary_text': 'this act shall be known and may be cited as the LEARNS Act 35 SECTION Arkansas Code 10 d concerning school resource . d Sworn non supervisory law enforcement personnel including without limitation school resource officers who are assigned to a public school campus during the instructional day or employed by an public school district shall A Within eighteen months of being assigned or emprisoned .'}]
--------------------------------------------------------------------------
BILL SUBSET:
	 loyed by the public school district i a Complete a forty hour basic school resource officer training program developed and provided or approved by the Arkansas Center for School Safety of the Criminal Justice Institute b The training required under subdivision d A i a of this section shall in cl ude without limitation 1 The roles and responsibilities of school resource officers in public schools 2 Laws that are specific to public schools and students in public schools and 3 Adolescen

Your max_length is set to 200, but you input_length is only 115. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


[{'summary_text': 'the training required under subdivision d A i a of this section shall in cl ude without limitation 1 The roles and responsibilities of school resource officers in public schools 2 Laws that are specific to public schools . 3 Adolescent behavior and developmeme .'}]
--------------------------------------------------------------------------
BILL SUBSET:
	 nt and ii a Obtain certification in Youth Mental Health First Aid Attend a training in youth mental health as required by the State Board of Education b Youth Mental Health First Aid certification shall be maintained and renewed The youth mental he a l th training required under subdivision d A ii a of this section shall be obtained every four years if the school resource officer remains assigned to or employed by a public school district B i Within five years after receiving the initial basic school resour
BILL SUMMARY:


Your max_length is set to 200, but you input_length is only 106. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


[{'summary_text': 'nt and ii a Obtain certification in Youth Mental Health First Aid Attend a training in youth mental health as required by the state board of education . The youth mental training required under subdivision d A ia of this section shall be obtained every four years if the school resource officer remains assigned to or employed by a public school district .'}]
--------------------------------------------------------------------------
BILL SUBSET:
	 ce officer training program complete a sixteen hour school resource officer refresher training developed and provided or approved by the Arkansas Center for School Safety of the Criminal Justice Institute ii The s choo l resource officer refresher training required under subdivision d B i of this section shall be completed every five years and C i Annually complete twelve hours of public school specific continuing education developed and provided or approved by As Engrossed H 27 SB 3 27 11 52 T NL the Arkan
BILL SUMMARY:


Your max_length is set to 200, but you input_length is only 103. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': 'ce officer training program complete a sixteen hour school resource officer refresher training developed and provided or approved by the Arkansas Center for School Safety of the Criminal Justice Institute . the s choo l training required under subdivision d B i of this section shall be completed every five years and C i Annually complete twelve hours of public school specific continuing education developed .'}]
--------------------------------------------------------------------------
BILL SUBSET:
	 sas Center for School Safety of the Criminal Justice Institute ii The Youth Mental Health First Aid The youth mental health training required under subdivision d A ii of this section and the school resource officer refresher training required under subdivision d B of this section shall count towards the twelve 5 hours of public school specific continuing education required under subdivision d C i of this section in the years during which the Youth Mental Health First Aid 

In [49]:
billtext = "An Act concerning education; relating to school districts and employees thereof; requiring parental consent for use of a student's pronouns; prohibiting schools from requiring use of an individual's pronouns over moral or religious objections; requiring school districts to adopt policies thereon. Be it enacted by the Legislature of the State of Kansas: Section 1. (a) An employee or independent contractor of a school district shall not knowingly address, identify or refer to a student who is less than 18 years of age by a pronoun that differs from the pronoun that aligns with the student's biological sex unless the school district has received written permission from the student's parent or guardian. (b) A school district shall not require an employee or independent contractor of such school district to address, identify or refer to an individual by a pronoun that differs from the pronoun that aligns with the student's biological sex if doing so is contrary to the employee's or independent contractor's moral or religious convictions. (c) The school board of each school district shall adopt a policy to implement this section. (d) This section shall not be construed to prohibit any employee or independent contractor of a school district from discussing matters of public concern outside such employee's or independent contractor's official duties. Sec. 2. This act shall take effect and be in force from and after its publication in the statute book"

In [50]:
summarizer(billtext)

[{'summary_text': "an act concerning education; relating to school districts and employees thereof; prohibiting schools from requiring use of an individual's pronouns over moral or religious objections; requiring school districts to adopt policies thereon . an employee or independent contractor of such school district shall not knowingly address, identify or refer to a student who is less than 18 years of age unless the school district has received written permission from the student's parent or guardian ."}]