In [24]:
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
import evaluate
import numpy as np

In [2]:
# Pin the checkpoint and revision explicitly instead of relying on the
# pipeline's implicit default, so the summarizer stays the same model even if
# the library's default for the 'summarization' task changes.
summarizer = pipeline(
    'summarization',
    model='sshleifer/distilbart-cnn-12-6',
    revision='a4f8f3e',
)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
# Sample passage about text mining; it is the input document for the
# summarization calls that follow. The literal keeps its line breaks and
# indentation, so consumers see that whitespace unless they normalize it.
text = '''Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics, 
        is the process of deriving high-quality information from text. It involves 
        "the discovery by computer of new, previously unknown information, 
        by automatically extracting information from different written resources." 
        Written resources may include websites, books, emails, reviews, and articles. 
        High-quality information is typically obtained by devising patterns and trends 
        by means such as statistical pattern learning. According to Hotho et al. (2005)
        we can distinguish between three different perspectives of text mining: 
        information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process.''' 

In [4]:
# Summarize the sample passage with the pipeline; the value is indexed later
# as result[0]['summary_text'].
result = summarizer(text)
# Bare expression so the notebook displays the returned value.
result

[{'summary_text': ' Text mining involves deriving high-quality information from text . Written resources may include websites, books, emails, reviews, and articles . Text mining is similar to text analytics . It involves the discovery by computer of new, previously unknown information by automatically extracting information from different written resources .'}]

In [5]:
# Character counts (not token counts) of the source passage and of its summary.
len(text), len(result[0]['summary_text'])

(778, 341)

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the "t5-small" tokenizer and seq2seq model through the Auto* factories,
# then place the model on the selected device.
tokenizer = AutoTokenizer.from_pretrained("t5-small", model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small').to(device)

In [7]:
# Show which concrete classes the Auto* factories resolved to.
type(tokenizer), type(model)

(transformers.models.t5.tokenization_t5_fast.T5TokenizerFast,
 transformers.models.t5.modeling_t5.T5ForConditionalGeneration)

In [8]:
# Collapse every run of whitespace (line breaks plus the indentation carried by
# the triple-quoted literal) into a single space. Deleting "\n" alone would
# join adjacent lines without a separator and keep the indentation padding.
# The cleaned string gets its own name instead of `preprocess_text`, which this
# notebook also uses for the dataset tokenization function.
clean_text = " ".join(text.split())
# Prepend the task prefix used for summarization prompts.
input_text = "summarize: " + clean_text

In [9]:
# Encode the prefixed prompt as a PyTorch tensor and move it to `device`.
tokenized_text = tokenizer.encode(
    input_text,
    return_tensors="pt",
).to(device)

In [10]:
# Decoding settings: 4-beam search, no repeated trigrams, and a summary
# length bounded to between 30 and 100 tokens.
generation_options = dict(
    num_beams=4,
    no_repeat_ngram_size=3,
    min_length=30,
    max_length=100,
    early_stopping=True,
)
summary_ids = model.generate(tokenized_text, **generation_options)

In [11]:
# Decode the first generated sequence back to text, dropping special tokens.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Bare expression so the notebook displays the summary string.
output

'text data mining is the process of deriving high-quality information from text. it involves the discovery by computer of new, previously unknown information. a KDD (Knowledge Discovery in Databases) process is similar to text analytics.'

In [12]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reload "t5-small" through the explicit T5 classes; the tokenizer's
# model_max_length is raised to 1024 here.
tokenizer = T5TokenizerFast.from_pretrained('t5-small', model_max_length=1024)
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

In [13]:
# Rebind `text` to a second sample passage (a statement about the Inflation
# Reduction Act); the literal keeps its line breaks.
text = '''The Inflation Reduction Act lowers prescription drug costs, health care costs, 
and energy costs. It's the most aggressive action on tackling the climate crisis in American history, 
which will lift up American workers and create good-paying, union jobs across the country. 
It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. 
And no one making under $400,000 per year will pay a penny more in taxes.'''

In [14]:
# Collapse every run of whitespace, including the literal's line breaks, into
# a single space. Deleting '\n' alone would join adjacent lines without a
# separator whenever a line has no trailing space.
# The cleaned string gets its own name instead of `preprocess_text`, which this
# notebook also uses for the dataset tokenization function; re-running this
# cell therefore can no longer overwrite that function.
clean_text = ' '.join(text.split())
# Prepend the task prefix used for summarization prompts.
input_text = 'summarize: ' + clean_text

In [15]:
# Encode the prefixed prompt as a PyTorch tensor and move it to `device`.
tokenized_text = tokenizer.encode(
    input_text,
    return_tensors='pt',
).to(device)

# Decoding settings: 4-beam search, no repeated trigrams, and a summary
# length bounded to between 30 and 100 tokens.
beam_search_options = {
    "num_beams": 4,
    "no_repeat_ngram_size": 3,
    "min_length": 30,
    "max_length": 100,
    "early_stopping": True,
}
summary_ids = model.generate(tokenized_text, **beam_search_options)

In [16]:
# Decode the first generated sequence back to text, dropping special tokens.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Bare expression so the notebook displays the summary string.
output

"the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in history. no one making under $400,000 per year will pay a penny more in taxes."

In [17]:
# Load the "ca_test" split of the BillSum dataset.
billsum = load_dataset("billsum", split="ca_test")
# Hold out 20% for evaluation. The seed is fixed so the train/test partition,
# and therefore the reported metrics, are the same on every run.
billsum = billsum.train_test_split(test_size=0.2, seed=42)
# Look at one training record to see the available fields.
example = billsum["train"][0]
example

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 12012.75 of the Government Code is amended to read:\n12012.75.\nThere is hereby created in the State Treasury a special fund called the “Indian Gaming Revenue Sharing Trust Fund” for the receipt and deposit of moneys received by the state from Indian tribes pursuant to the terms of tribal-state gaming compacts for the purpose of making distributions to eligible recipient Indian tribes. Moneys in the Indian Gaming Revenue Sharing Trust Fund shall be available to the California Gambling Control Commission, upon appropriation by the Legislature, for the purpose of making distributions to eligible recipient Indian tribes, in accordance with distribution plans specified in tribal-state gaming compacts.\nSEC. 2.\nSection 12012.90 of the Government Code is amended to read:\n12012.90.\nFor each fiscal year commencing with the 2016–17 fiscal year, all of the following shall apply:\n(a) On or before the

In [18]:
# Inspect the title field of the sampled training record.
example['title']

'An act to amend Sections 12012.75 and 12012.90 of the Government Code, relating to gaming.'

In [19]:
def preprocess_text(data, prefix="summarize: ", max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents and summaries for seq2seq training.

    Uses the notebook-level `tokenizer`. Intended for `Dataset.map(..., batched=True)`;
    the extra parameters all have defaults matching the previously hard-coded
    values, so existing one-argument calls behave as before.

    Args:
        data: Batch mapping with a "text" column (iterable of document strings)
            and a "summary" column (the reference summaries).
        prefix: String prepended to every document before tokenization.
        max_input_length: Token limit for the inputs; longer inputs are truncated.
        max_target_length: Token limit for the summaries; longer ones are truncated.

    Returns:
        The tokenizer output for the prefixed documents, with an added "labels"
        entry holding the token ids of the tokenized summaries.
    """
    inputs = [prefix + doc for doc in data["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # The summaries are tokenized separately; only their ids are kept as labels.
    labels = tokenizer(data["summary"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [20]:
# Drop every original column after tokenization so only the fields produced
# by preprocess_text remain.
raw_columns = billsum["train"].column_names
tokenized_billsum = billsum.map(
    preprocess_text,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_billsum

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 989
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 248
    })
})

In [21]:
# Seq2seq batch collator built from the current tokenizer and model; it is
# handed to the trainer as its data_collator.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [23]:
# Load the ROUGE metric via the `evaluate` library; compute_metrics calls
# rouge.compute on the decoded predictions and references.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode predictions and references and score them with ROUGE.

    Uses the notebook-level `tokenizer` and `rouge` objects.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays. Positions
            equal to -100 are treated as padding.

    Returns:
        Dict mapping each key returned by `rouge.compute` to its value rounded
        to four decimals.
    """
    predictions, labels = eval_pred
    # Some configurations hand back a tuple; the token ids are its first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding sentinel, not a vocabulary id, and cannot be decoded.
    # Substitute the real pad token in BOTH arrays before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [27]:
# Hyperparameters for fine-tuning.
# NOTE(review): no generation length is set here, so evaluation summaries use
# the model's default generation length — confirm that it is long enough
# relative to the 128-token reference summaries.
training_args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    # compute_metrics decodes the predictions as token ids, which requires
    # generation-based evaluation.
    predict_with_generate=True,
    # Stated explicitly: the library warns that the default will change from
    # "all installed integrations" to none, and "all" keeps today's behaviour.
    report_to="all",
)

# Wire the model, hyperparameters, tokenized splits, collator, and metric
# function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [28]:
# Run fine-tuning with the configured trainer. The call is the cell's last
# expression, so its return value is displayed.
trainer.train()

***** Running training *****
  Num examples = 989
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 124
  Number of trainable parameters = 60506624


  0%|          | 0/124 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 248
  Batch size = 16


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.857461929321289, 'eval_rouge1': 0.1304, 'eval_rouge2': 0.0391, 'eval_rougeL': 0.1083, 'eval_rougeLsum': 0.1085, 'eval_runtime': 389.279, 'eval_samples_per_second': 0.637, 'eval_steps_per_second': 0.041, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 248
  Batch size = 16


  0%|          | 0/16 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 2.7139813899993896, 'eval_rouge1': 0.1298, 'eval_rouge2': 0.0388, 'eval_rougeL': 0.1056, 'eval_rougeLsum': 0.1058, 'eval_runtime': 386.0763, 'eval_samples_per_second': 0.642, 'eval_steps_per_second': 0.041, 'epoch': 2.0}
{'train_runtime': 5574.0904, 'train_samples_per_second': 0.355, 'train_steps_per_second': 0.022, 'train_loss': 3.3591325821415072, 'epoch': 2.0}


TrainOutput(global_step=124, training_loss=3.3591325821415072, metrics={'train_runtime': 5574.0904, 'train_samples_per_second': 0.355, 'train_steps_per_second': 0.022, 'train_loss': 3.3591325821415072, 'epoch': 2.0})

In [29]:
# Summarize the earlier prompt again with the fine-tuned model.
# Put the model in eval mode explicitly instead of relying on the state the
# trainer left it in, so dropout is disabled during generation.
model.eval()
# `tokenized_text` was placed on `device` before training. Move it to wherever
# the model lives now, in case the trainer relocated the model.
summary_ids = model.generate(tokenized_text.to(model.device),
                             num_beams=4,
                             no_repeat_ngram_size=3,
                             min_length=30,
                             max_length=100,
                             early_stopping=True)

In [30]:
# Decode the first generated sequence back to text, dropping special tokens.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Bare expression so the notebook displays the post-training summary.
output

"the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in history. no one making under $400,000 per year will pay a penny more in taxes."