In [1]:
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
import evaluate
import numpy as np

In [2]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
text = '''Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics, 
        is the process of deriving high-quality information from text. It involves 
        "the discovery by computer of new, previously unknown information, 
        by automatically extracting information from different written resources." 
        Written resources may include websites, books, emails, reviews, and articles. 
        High-quality information is typically obtained by devising patterns and trends 
        by means such as statistical pattern learning. According to Hotho et al. (2005)
        we can distinguish between three different perspectives of text mining: 
        information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process.''' 

In [5]:
result = summarizer(text)
result[0]['summary_text']

' Text mining involves deriving high-quality information from text . Written resources may include websites, books, emails, reviews, and articles . Text mining is similar to text analytics . It involves the discovery by computer of new, previously unknown information by automatically extracting information from different written resources .'

In [7]:
tokenizer = AutoTokenizer.from_pretrained('t5-small', model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

In [8]:
type(tokenizer), type(model)

(transformers.models.t5.tokenization_t5_fast.T5TokenizerFast,
 transformers.models.t5.modeling_t5.T5ForConditionalGeneration)

In [9]:
preprocess_text = text.strip().replace('\n', '')
input_text = 'summarize: ' + preprocess_text

In [11]:
tokenized_text = tokenizer.encode(input_text, return_tensors='pt').to(device)
summary_ids = model.generate(tokenized_text, num_beams=4, no_repeat_ngram_size=3,
                             min_length=30, max_length=200, early_stopping=True)
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [12]:
print(output)

text data mining is the process of deriving high-quality information from text. it involves the discovery by computer of new, previously unknown information. a KDD (Knowledge Discovery in Databases) process is similar to text analytics.


In [14]:
tokenized_text, summary_ids

(tensor([[21603,    10,  5027,  5558,     6,    92,     3,  4822,    12,    38,
           1499,   331,  5558,    41, 12982,    52,     5,    10,   332,  7407,
            201,  1126,    12,  1499,  9952,     6,    19,     8,   433,    13,
             20,  5927,    53,   306,    18,  4497,   251,    45,  1499,     5,
             94,  5806,    96,   532,  9087,    57,  1218,    13,   126,     6,
           3150,  7752,   251,     6,    57,  3269,  5819,    53,   251,    45,
            315,  1545,  1438,   535, 22812,  1438,   164,   560,  3395,     6,
           1335,     6,  7594,     6,  2456,     6,    11,  2984,     5,  1592,
             18,  4497,   251,    19,  3115,  5105,    57, 13282,    53,  4264,
             11,  5001,    57,   598,   224,    38, 11775,  3275,  1036,     5,
           2150,    12,  1546,   189,    32,     3,    15,    17,   491,     5,
              3, 29495,    62,    54, 15849,   344,   386,   315, 14013,    13,
           1499,  5558,    10,   251, 16

In [16]:
input_text = 'translate english to german: That is good'
tokenized_text = tokenizer.encode(input_text, return_tensors='pt').to(device)
summary_ids = model.generate(tokenized_text, num_beams=4, no_repeat_ngram_size=3,
                             max_length=200, early_stopping=True)
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(output)

Das ist gut.
