In [20]:
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5TokenizerFast, T5ForConditionalGeneration
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset

In [2]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


(…)ilbart-cnn-12-6/resolve/main/config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

(…)-12-6/resolve/main/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

(…)tilbart-cnn-12-6/resolve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

(…)tilbart-cnn-12-6/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [3]:
text = '''Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics, 
        is the process of deriving high-quality information from text. It involves 
        "the discovery by computer of new, previously unknown information, 
        by automatically extracting information from different written resources." 
        Written resources may include websites, books, emails, reviews, and articles. 
        High-quality information is typically obtained by devising patterns and trends 
        by means such as statistical pattern learning. According to Hotho et al. (2005)
        we can distinguish between three different perspectives of text mining: 
        information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process.''' 

In [4]:
result = summarizer(text)
result

[{'summary_text': ' Text mining involves deriving high-quality information from text . Written resources may include websites, books, emails, reviews, and articles . Text mining is similar to text analytics . It involves the discovery by computer of new, previously unknown information by automatically extracting information from different written resources .'}]

In [5]:
len(text), len(result[0]['summary_text'])

(778, 341)

In [8]:
tokenizer = AutoTokenizer.from_pretrained("t5-small", model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

(…)small/resolve/main/tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

(…)ce.co/t5-small/resolve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

(…).co/t5-small/resolve/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

(…)ace.co/t5-small/resolve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [9]:
type(tokenizer), type(model)

(transformers.models.t5.tokenization_t5_fast.T5TokenizerFast,
 transformers.models.t5.modeling_t5.T5ForConditionalGeneration)

In [10]:
preprocess_text = text.strip().replace("\n","")
input_text = "summarize: " + preprocess_text

In [11]:
tokenized_text = tokenizer.encode(input_text, return_tensors="pt").to(device)

In [12]:
summary_ids = model.generate(tokenized_text,
                             num_beams=4,
                             no_repeat_ngram_size=3,
                             min_length=30,
                             max_length=100,
                             early_stopping=True)

In [13]:
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
output

'text data mining is the process of deriving high-quality information from text. it involves the discovery by computer of new, previously unknown information. a KDD (Knowledge Discovery in Databases) process is similar to text analytics.'

In [15]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5TokenizerFast.from_pretrained('t5-small', model_max_length=1024)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

In [16]:
text = '''The Inflation Reduction Act lowers prescription drug costs, health care costs, 
and energy costs. It's the most aggressive action on tackling the climate crisis in American history, 
which will lift up American workers and create good-paying, union jobs across the country. 
It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. 
And no one making under $400,000 per year will pay a penny more in taxes.'''

In [17]:
preprocess_text = text.strip().replace('\n', '')
input_text = 'summarize: ' + preprocess_text

In [18]:
tokenized_text = tokenizer.encode(input_text, return_tensors='pt').to(device)
summary_ids = model.generate(tokenized_text,
                             num_beams=4,
                             no_repeat_ngram_size=3,
                             min_length=30,
                             max_length=100,
                             early_stopping=True)

In [19]:
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
output

"the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in history. no one making under $400,000 per year will pay a penny more in taxes."

In [21]:
billsum = load_dataset("billsum", split="ca_test")
billsum = billsum.train_test_split(test_size=0.2)
example = billsum["train"][0]
example

Downloading builder script:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.70k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 4830 of the Business and Professions Code is amended to read:\n4830.\n(a) This chapter does not apply to:\n(1) Veterinarians while serving in any armed branch of the military service of the United States or the United States Department of Agriculture while actually engaged and employed in their official capacity.\n(2) Regularly licensed veterinarians in actual consultation from other states.\n(3) Regularly licensed veterinarians actually called from other states to attend cases in this state, but who do not open an office or appoint a place to do business within this state.\n(4) Veterinarians employed by the University of California while engaged in the performance of duties in connection with the College of Agriculture, the Agricultural Experiment Station, the School of Veterinary Medicine, or the agricultural extension work of the university or employed by the Western University of Health Sc

In [25]:
example['title']

'An act to amend Section 4830 of the Business and Professions Code, relating to veterinarians.'