In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install optuna
!pip install rouge-metric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 21.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 62.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 22.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K   

In [2]:
from pyexpat import features
import datasets 
import optuna 
from datasets import load_dataset 
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os.path
from os import path
from datasets import load_dataset

In [3]:
# Create function for printing 
def print_custom(text):
    print('\n')
    print(text)
    print('-'*100)


In [24]:
# Specify our parameter and project variables
LR_MIN = 4e-5
LR_CEIL = 0.01
WD_MIN = 4e-5
WD_CEIL = 0.01
MIN_EPOCHS = 2
MAX_EPOCHS = 5
PER_DEVICE_EVAL_BATCH = 8
PER_DEVICE_TRAIN_BATCH = 8
NUM_TRIALS = 1
SAVE_DIR = 'opt-test'
SAVE_MODEL_DIR = 'models'
SAVE_TOKENIZER_DIR = 'tokenizer'
NAME_OF_MODEL = 'huggingoptunaface'
MAX_LENGTH = 512

In [5]:
# Loading dataset
billsum = load_dataset("billsum", split="ca_test") 
billsum = billsum.train_test_split(test_size=0.2)
billsum = billsum.filter(lambda x: x['text'] is not None and x['summary'] is not None)
# billsum["train"][0]

Downloading builder script:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.70k [00:00<?, ?B/s]

Downloading and preparing dataset billsum/default to /root/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc...


Downloading data:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Dataset billsum downloaded and prepared to /root/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
# create a dataset with the text and summary columns only, by removing the 'title' since that is not necessary for the summarization task
billsum = billsum.remove_columns(["title"])
billsum["train"][0]  

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares all of the following:\n(a) County assessors are required to follow complex state laws in the performance of their duties.\n(b) The application of those laws is intended to provide consistent and accurate assessment practices across the state.\n(c) It is the intent of the Legislature that the interests of taxpayers be protected by ensuring that decisions regarding eligibility of a property for exemption, other than homeowners’ exemptions, or whether a transaction is a change in ownership or qualifies for an exclusion from a change in ownership, or both, be made by staff who is certified to make those decisions.\n(d) It is further the intent of the Legislature that implementation of education and certification requirements required by Section 3 of this act be undertaken in the most efficient and economical manner, utilizing existing resources of California county asses

In [7]:
# Dataset structure check
print_custom('Dataset structure check')
print(billsum)



Dataset structure check
----------------------------------------------------------------------------------------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 248
    })
})


In [8]:
# Loading t5-small model for summarization
print_custom('Initializing T5 Small pretrained tokenizer')
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



Initializing T5 Small pretrained tokenizer
----------------------------------------------------------------------------------------------------


Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [9]:
# Preprocessing the data
prefix = "summarize: "
print_custom('Tokenizing the dataset')
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
    
# Tokenize the dataset
tokenized_billsum = billsum.map(preprocess_function, batched=True)



Tokenizing the dataset
----------------------------------------------------------------------------------------------------


  0%|          | 0/1 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
# Craeting a data collectior
print_custom('Creating a data collector')
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



Creating a data collector
----------------------------------------------------------------------------------------------------


In [11]:
# Viewing the tokenized dataset structure
print_custom('Tokenized dataset structure')
print(tokenized_billsum)



Tokenized dataset structure
----------------------------------------------------------------------------------------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 248
    })
})


In [12]:
# Creating the optuna objective function for t5-small model for summarization 
print_custom('Creating the optuna objective function for t5-small model for summarization')
def objective(trial: optuna.Trial):
    # Specify the model name and folder
    model_name = "t5-small"
    model_folder = "model"
    model_path = f'{model_folder}/{model_name}'

    # Specify the training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=SAVE_DIR,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        learning_rate=trial.suggest_float("learning_rate", LR_MIN, LR_CEIL, log=True),
        weight_decay=trial.suggest_float("weight_decay", WD_MIN, WD_CEIL, log=True),
        num_train_epochs=trial.suggest_int("num_train_epochs", MIN_EPOCHS, MAX_EPOCHS),
        warmup_ratio=trial.suggest_float("warmup_ratio", 0.0, 1.0),
        per_device_train_batch_size=trial.suggest_int("per_device_train_batch_size", 4, 16),
        per_device_eval_batch_size=trial.suggest_int("per_device_eval_batch_size", 4, 16),
        save_total_limit=1,
        load_best_model_at_end=True,
        greater_is_better=True,
        predict_with_generate=True,
        run_name=NAME_OF_MODEL,
        report_to="none",
    )

    # Create the trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_billsum["train"],
        eval_dataset=tokenized_billsum["test"],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    metrics = trainer.evaluate()

    # Return the loss
    return metrics["eval_loss"]




Creating the optuna objective function for t5-small model for summarization
----------------------------------------------------------------------------------------------------


In [13]:
# Create the study
print_custom('Creating the study')
study = optuna.create_study(direction="minimize")

[32m[I 2022-12-02 05:15:57,987][0m A new study created in memory with name: no-name-dd50f44a-777b-44d0-9fde-e28261c83bcc[0m




Creating the study
----------------------------------------------------------------------------------------------------


In [14]:
# Optimize the objective function
print_custom('Optimizing the objective function')
study.optimize(objective, n_trials=NUM_TRIALS)



Optimizing the objective function
----------------------------------------------------------------------------------------------------


The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 989
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 496
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.418267
2,No log,2.061304


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 248
  Batch size = 4
Saving model checkpoint to opt-test/checkpoint-248
Configuration saved in opt-test/checkpoint-248/config.json
Model weights saved in opt-test/checkpoint-248/pytorch_model.bin
tokenizer config file saved in opt-test/checkpoint-248/tokenizer_config.json
Special tokens file saved in opt-test/checkpoint-248/special_tokens_map.json
Copy vocab file to opt-test/checkpoint-248/spiece.model
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** 

[32m[I 2022-12-02 05:20:02,706][0m Trial 0 finished with value: 2.418267011642456 and parameters: {'learning_rate': 0.006866675712996326, 'weight_decay': 0.0011078748185077215, 'num_train_epochs': 2, 'warmup_ratio': 0.6307792186533049, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4}. Best is trial 0 with value: 2.418267011642456.[0m


In [15]:
# Print the best parameters
print_custom('Printing the best parameters')
print(study.best_params)




Printing the best parameters
----------------------------------------------------------------------------------------------------
{'learning_rate': 0.006866675712996326, 'weight_decay': 0.0011078748185077215, 'num_train_epochs': 2, 'warmup_ratio': 0.6307792186533049, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4}


In [16]:
print('Best learning rate:', study.best_params['learning_rate'])

Best learning rate: 0.006866675712996326


In [17]:
# Using the best parameters to train the model
print_custom('Using the best parameters to train the model')
training_args = Seq2SeqTrainingArguments(
    output_dir=SAVE_DIR,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=study.best_params["learning_rate"],
    weight_decay=study.best_params["weight_decay"],
    per_device_train_batch_size=study.best_params["per_device_train_batch_size"],
    per_device_eval_batch_size=study.best_params["per_device_eval_batch_size"],
    num_train_epochs=study.best_params["num_train_epochs"],
    warmup_ratio=study.best_params["warmup_ratio"],
    save_total_limit=1,
    load_best_model_at_end=True,
    greater_is_better=True,
    predict_with_generate=True,
    run_name=NAME_OF_MODEL,
    report_to="none",
)

PyTorch: setting up devices




Using the best parameters to train the model
----------------------------------------------------------------------------------------------------


In [18]:
# Create the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
)

In [19]:
# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 989
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 496
  Number of trainable parameters = 60506624


Epoch,Training Loss,Validation Loss
1,No log,2.337054
2,No log,2.032898


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 248
  Batch size = 4
Saving model checkpoint to opt-test/checkpoint-248
Configuration saved in opt-test/checkpoint-248/config.json
Model weights saved in opt-test/checkpoint-248/pytorch_model.bin
tokenizer config file saved in opt-test/checkpoint-248/tokenizer_config.json
Special tokens file saved in opt-test/checkpoint-248/special_tokens_map.json
Copy vocab file to opt-test/checkpoint-248/spiece.model
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** 

TrainOutput(global_step=496, training_loss=2.0768671343403478, metrics={'train_runtime': 223.0418, 'train_samples_per_second': 8.868, 'train_steps_per_second': 2.224, 'total_flos': 535412166623232.0, 'train_loss': 2.0768671343403478, 'epoch': 2.0})

In [20]:
# Evaluating the model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, summary. If text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 248
  Batch size = 4


{'eval_loss': 2.3370535373687744,
 'eval_runtime': 9.0023,
 'eval_samples_per_second': 27.549,
 'eval_steps_per_second': 6.887,
 'epoch': 2.0}

In [21]:
# Model Evaluation using ROUGE metrics
print_custom('Making use of rouge metric to evaluate the model')
from rouge_metric import PyRouge

print_custom('Evaluating the model using rouge metric')
rouge = PyRouge(rouge_n=(1, 2), rouge_l=True, rouge_w=True, rouge_s=True, rouge_su=True)

print_custom('Using the sample format to evaluate the model')
hypotheses = []
references = []

# Looping through the test dataset
for i in range(len(tokenized_billsum["test"])):
    # Getting the input and target
    input = tokenized_billsum["test"][i]["input_ids"]
    target = tokenized_billsum["test"][i]["labels"]

    # Decoding the input and target
    input = tokenizer.decode(input, skip_special_tokens=True)
    target = tokenizer.decode(target, skip_special_tokens=True)

    # Appending the input and target to the lists
    hypotheses.append(input)
    references.append([target])

# Evaluating the model
print_custom('Evaluating the model')
scores = rouge.evaluate(hypotheses, references)



Making use of rouge metric to evaluate the model
----------------------------------------------------------------------------------------------------


Evaluating the model using rouge metric
----------------------------------------------------------------------------------------------------


Using the sample format to evaluate the model
----------------------------------------------------------------------------------------------------


Evaluating the model
----------------------------------------------------------------------------------------------------


In [22]:
# print the results
print_custom('Printing the results')
print(scores)



Printing the results
----------------------------------------------------------------------------------------------------
{'rouge-1': {'r': 0.6662854362017033, 'p': 0.08541137671621327, 'f': 0.15141305753590956}, 'rouge-2': {'r': 0.34100979690254696, 'p': 0.04328458967696099, 'f': 0.07681855187180418}, 'rouge-l': {'r': 0.4802788186724612, 'p': 0.06147020926263846, 'f': 0.10899083511321317}, 'rouge-w-1.2': {'r': 0.13205189039016682, 'p': 0.04223681694857227, 'f': 0.0640024429268655}, 'rouge-s*': {'r': 0.44276931333169867, 'p': 0.007278930277288221, 'f': 0.014322406570546875}, 'rouge-su*': {'r': 0.4472986631293172, 'p': 0.007479270130111619, 'f': 0.014712532362354239}}


In [25]:
# Save the model in the models folder with the name of the model
print_custom('Saving the model in the models folder with the name of the model')
trainer.save_model(f'{SAVE_MODEL_DIR}/{NAME_OF_MODEL}')

# Save the tokenizer in the models folder with the name of the model
print_custom('Saving the tokenizer in the models folder with the name of the model')
tokenizer.save_pretrained(f'{SAVE_TOKENIZER_DIR}/{NAME_OF_MODEL}')

# Save the study
print_custom('Saving the study')
import joblib
joblib.dump(study, f'{SAVE_DIR}/study.pkl')

Saving model checkpoint to models/huggingoptunaface
Configuration saved in models/huggingoptunaface/config.json




Saving the model in the models folder with the name of the model
----------------------------------------------------------------------------------------------------


Model weights saved in models/huggingoptunaface/pytorch_model.bin
tokenizer config file saved in models/huggingoptunaface/tokenizer_config.json
Special tokens file saved in models/huggingoptunaface/special_tokens_map.json
Copy vocab file to models/huggingoptunaface/spiece.model
tokenizer config file saved in tokenizer/huggingoptunaface/tokenizer_config.json
Special tokens file saved in tokenizer/huggingoptunaface/special_tokens_map.json
Copy vocab file to tokenizer/huggingoptunaface/spiece.model




Saving the tokenizer in the models folder with the name of the model
----------------------------------------------------------------------------------------------------


Saving the study
----------------------------------------------------------------------------------------------------


['opt-test/study.pkl']

In [27]:
# Loading the model and tokenizer to make predictions
print_custom('Loading the model and tokenizer to make predictions')
from transformers import T5ForConditionalGeneration, T5Tokenizer
loaded_model = T5ForConditionalGeneration.from_pretrained(f'{SAVE_MODEL_DIR}/{NAME_OF_MODEL}')
loaded_tokenizer = T5Tokenizer.from_pretrained(f'{SAVE_TOKENIZER_DIR}/{NAME_OF_MODEL}')

loading configuration file models/huggingoptunaface/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early



Loading the model and tokenizer to make predictions
----------------------------------------------------------------------------------------------------


All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at models/huggingoptunaface.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
loading file spiece.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [28]:
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
# Testing out the model with the sample text 
print_custom('Testing out the model with the sample text')
# sample text
text = "While film reviews tend to be fairly short (approximately 600 to 1200 words), they require a lot of preparation before you begin writing. Prior to viewing the film, you may want to get a sense of the bodies of work by the director, writer, or individual actor."

# make to sure to resolve the expected all tensors to be on the same device to be resolved when using the model on cpu 
import torch
print_custom('Resolving the expected all tensors to be on the same device to be resolved when using the model on cpu')
device = torch.device("cpu")
loaded_model.to(device)

# Tokenize the text
print_custom('Tokenizing the text')
inputs = loaded_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

# Generate the summary
print_custom('Generating the summary')
summary_ids = loaded_model.generate(inputs["input_ids"].to(device), num_beams=4, max_length=150, early_stopping=True)

# Decode the summary
print_custom('Decoding the summary')
summary = loaded_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print_custom('Printing the summary')
print(summary)



Testing out the model with the sample text
----------------------------------------------------------------------------------------------------


Resolving the expected all tensors to be on the same device to be resolved when using the model on cpu
----------------------------------------------------------------------------------------------------


Tokenizing the text
----------------------------------------------------------------------------------------------------


Generating the summary
----------------------------------------------------------------------------------------------------


Decoding the summary
----------------------------------------------------------------------------------------------------


Printing the summary
----------------------------------------------------------------------------------------------------
Existing law, the California Film Review Act, provides for the licensure and regulation of film reviews by the director, writer, or individual actor. T