In [1]:
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
import evaluate
import numpy as np

In [None]:
# Build the summarization pipeline with the checkpoint named explicitly.
# Relying on the implicit task default makes the notebook depend on whatever
# the installed transformers version picks and emits a "no model was supplied"
# warning; this is the checkpoint the library uses for this task with PyTorch.
summarizer = pipeline('summarization', model='sshleifer/distilbart-cnn-12-6')

# Sample passage to summarize; it is reused by a later T5 cell.
text = '''Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics, 
        is the process of deriving high-quality information from text. It involves 
        "the discovery by computer of new, previously unknown information, 
        by automatically extracting information from different written resources." 
        Written resources may include websites, books, emails, reviews, and articles. 
        High-quality information is typically obtained by devising patterns and trends 
        by means such as statistical pattern learning. According to Hotho et al. (2005)
        we can distinguish between three different perspectives of text mining: 
        information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process.'''

result = summarizer(text)
# The result is indexed as a list of dicts; display the first summary string.
result[0]['summary_text']

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the t5-small tokenizer (model_max_length set to 512) and the matching
# seq2seq model through the Auto* factories, then place the model on `device`.
tokenizer = AutoTokenizer.from_pretrained('t5-small', model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small').to(device)

In [None]:
# Show the concrete classes the Auto* factories resolved to.
tuple(map(type, (tokenizer, model)))

In [None]:
# Collapse every run of whitespace (line breaks plus the indentation inside
# the triple-quoted literal) into a single space. Deleting only '\n' would
# fuse the words on either side of a line break whenever a line has no
# trailing space, and would leave the indentation runs in the prompt.
preprocess_text = ' '.join(text.split())

# The task is selected by a textual prefix on the input.
input_text = 'summarize: ' + preprocess_text

# Encode to a PyTorch tensor of ids and move it to the same device as the model.
tokenized_text = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Beam search with 4 beams, no repeated 3-grams, and a 30-200 length window.
summary_ids = model.generate(tokenized_text, num_beams=4, no_repeat_ngram_size=3,
                             min_length=30, max_length=200, early_stopping=True)

# Decode the first returned sequence, dropping special tokens.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Print the decoded text held in `output` (plain text, without the quotes a
# bare-expression repr would add).
print(output)

In [None]:
# Display the encoded input ids next to the generated ids for inspection.
(tokenized_text, summary_ids)

In [None]:
# T5 task prefixes are matched as literal, case-sensitive text: the
# English-to-German task prefix is 'translate English to German: '. The
# lowercase spelling tokenizes differently and is not the prompt the
# checkpoint was trained with.
input_text = 'translate English to German: That is good'

# Encode to a PyTorch tensor of ids and move it to the same device as the model.
tokenized_text = tokenizer.encode(input_text, return_tensors='pt').to(device)

# The name `summary_ids` is kept for continuity with the other cells; here it
# holds the generated translation ids.
summary_ids = model.generate(tokenized_text, num_beams=4, no_repeat_ngram_size=3,
                             max_length=200, early_stopping=True)

# Decode the first returned sequence, dropping special tokens, and print it.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(output)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reload t5-small through the T5-specific classes, this time with the
# tokenizer's model_max_length set to 1024, and place the model on `device`.
tokenizer = T5TokenizerFast.from_pretrained('t5-small', model_max_length=1024)
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

In [None]:
# Passage to summarize, built from adjacent string literals so that the line
# breaks and the trailing space before each one are explicit in the source.
text = (
    "The Inflation Reduction Act lowers prescription drug costs, health care costs, \n"
    "and energy costs. It's the most aggressive action on tackling the climate crisis in American history, \n"
    "which will lift up American workers and create good-paying, union jobs across the country. \n"
    "It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. \n"
    "And no one making under $400,000 per year will pay a penny more in taxes."
)

In [None]:
# Collapse every run of whitespace, line breaks included, into a single
# space. Deleting only '\n' would fuse the words on either side of a line
# break whenever a line has no trailing space.
preprocess_text = ' '.join(text.split())

# The task is selected by a textual prefix on the input.
input_text = 'summarize: ' + preprocess_text

In [None]:
# Encode the prompt as a PyTorch tensor of ids, then move it to the model's device.
tokenized_text = tokenizer.encode(input_text, return_tensors='pt')
tokenized_text = tokenized_text.to(device)

# Beam-search generation with one option per line.
summary_ids = model.generate(
    tokenized_text,
    num_beams=4,
    no_repeat_ngram_size=3,
    min_length=30,
    max_length=100,
    early_stopping=True,
)

# Decode the first returned sequence without special tokens and display it.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
output