In [1]:
!pip install datasets
!pip install transformers 
!pip install evaluate
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCo

In [2]:
from datasets import load_dataset

# Data Loading

In [3]:
# Load the smaller California state bill subset of the BillSum dataset
billsum = load_dataset("billsum", split="ca_test")

# Split into train and test dataset
billsum = billsum.train_test_split(test_size=0.2)

Downloading builder script:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.70k [00:00<?, ?B/s]

Downloading and preparing dataset billsum/default to /root/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc...


Downloading data:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Dataset billsum downloaded and prepared to /root/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc. Subsequent calls will reuse this data.


In [7]:
billsum['train'][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 42 is added to the Revenue and Taxation Code, to read:\n42.\n(a) Where a tax, fee, assessment, surcharge, or other amount levied or collected by the tax agency has been determined to have been illegally levied or collected in a final and nonappealable decision of a court of competent jurisdiction, any person who paid that tax, fee, assessment, surcharge, or other amount may file with the tax agency a claim for refund of the amount so paid in accordance with this section and the tax agency shall refund the amount so paid.\n(b) Notwithstanding subdivision (a), a person who has paid to the tax agency a tax, fee, assessment, surcharge, or other amount described in subdivision (a) that filed a claim for refund prior to the effective date of the act adding this section that the tax agency has not refunded before that date shall be refunded by the tax agency to the person in accordance with the provi

In [10]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Load Tokenizer

In [11]:
## PREPROCESSING
from transformers import AutoTokenizer
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [12]:
#The preprocessing function you want to create needs to:

# 1. Prefix the input with a prompt so T5 knows this is a summarization task. 
#    Some models capable of multiple NLP tasks require prompting for specific tasks.
# 2. Use the keyword text_target argument when tokenizing labels.
# 3. Truncate sequences to be no longer than the maximum length set by the max_length parameter.

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    # Tokenizes and truncates
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
# To apply the preprocessing function over the entire dataset
# You can speed up the map function by setting batched=True to process multiple elements of the dataset at once
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [14]:
# Create batch of examples using DataCollatorForSeq2Seq. It's better to dynamically pad the sentences to the longest length
# in a batch during collation instead of padding to the model's maximum length
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Set Up Evaluation

In [15]:
## SET UP EVALUATOR

import evaluate
import numpy as np

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Create function that passes predictions and labels to compute the ROUGE metric
# This is what is used for model training
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Train Model

In [None]:
#model.save_weights("/content/gdrive/My Drive/weights.h5")

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/gdrive/My Drive/my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False, # changed to false
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [17]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.741642,0.1241,0.0372,0.1045,0.1048,19.0
2,No log,2.514765,0.1332,0.0477,0.1135,0.1136,19.0
3,No log,2.450777,0.1369,0.0498,0.1142,0.1143,19.0
4,No log,2.433548,0.1387,0.0512,0.1162,0.1162,19.0


TrainOutput(global_step=248, training_loss=3.0527995940177672, metrics={'train_runtime': 278.7399, 'train_samples_per_second': 14.192, 'train_steps_per_second': 0.89, 'total_flos': 1070824333246464.0, 'train_loss': 3.0527995940177672, 'epoch': 4.0})

In [20]:
# Save the model
trainer.save_model("/content/gdrive/My Drive/my_awesome_billsum_model")

# Test Model

In [18]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [21]:
from transformers import pipeline

summarizer = pipeline("summarization", model="/content/gdrive/My Drive/my_awesome_billsum_model")
summarizer(text)

Your max_length is set to 200, but you input_length is only 103. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs . it's the most aggressive action on tackling the climate crisis in American history . it'll ask the ultra-wealthy and corporations to pay their fair share ."}]

In [22]:
billtext = "An Act concerning education; relating to school districts and employees thereof; requiring parental consent for use of a student's pronouns; prohibiting schools from requiring use of an individual's pronouns over moral or religious objections; requiring school districts to adopt policies thereon. Be it enacted by the Legislature of the State of Kansas: Section 1. (a) An employee or independent contractor of a school district shall not knowingly address, identify or refer to a student who is less than 18 years of age by a pronoun that differs from the pronoun that aligns with the student's biological sex unless the school district has received written permission from the student's parent or guardian. (b) A school district shall not require an employee or independent contractor of such school district to address, identify or refer to an individual by a pronoun that differs from the pronoun that aligns with the student's biological sex if doing so is contrary to the employee's or independent contractor's moral or religious convictions. (c) The school board of each school district shall adopt a policy to implement this section. (d) This section shall not be construed to prohibit any employee or independent contractor of a school district from discussing matters of public concern outside such employee's or independent contractor's official duties. Sec. 2. This act shall take effect and be in force from and after its publication in the statute book"

In [23]:
summarizer(billtext)

[{'summary_text': "an act concerning education; relating to school districts and employees thereof; prohibiting schools from requiring use of an individual's pronouns over moral or religious objections; prohibiting schools from requiring use of an individual's pronouns over moral or religious objections; prohibiting schools from requiring use of an individual's pronouns over moral or religious objections; prohibiting schools from requiring use of an individual's pronouns over moral or religious objections; prohibiting schools from requiring use of an employee or independent contractor of a school district to address, identify or refer to an individual's pronouns; relating to school districts and employees thereof; . . . . . . . . . . . . . . . . . . . . . . . . ."}]

In [24]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})