In [2]:
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
import evaluate
import numpy as np

In [3]:
# Pin the checkpoint and revision explicitly instead of relying on the
# implicit default for the 'summarization' task. These are the exact values
# the unpinned call reported falling back to, so the result is unchanged but
# no longer depends on what the library resolves as its default.
summarizer = pipeline(
    'summarization',
    model='sshleifer/distilbart-cnn-12-6',
    revision='a4f8f3e',
)

# Sample English passage used by the summarization demos that follow.
text = '''Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics, 
        is the process of deriving high-quality information from text. It involves 
        "the discovery by computer of new, previously unknown information, 
        by automatically extracting information from different written resources." 
        Written resources may include websites, books, emails, reviews, and articles. 
        High-quality information is typically obtained by devising patterns and trends 
        by means such as statistical pattern learning. According to Hotho et al. (2005)
        we can distinguish between three different perspectives of text mining: 
        information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process.'''

# The pipeline returns a list with one dict per input; display its summary text.
result = summarizer(text)
result[0]['summary_text']

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


' Text mining involves deriving high-quality information from text . Written resources may include websites, books, emails, reviews, and articles . Text mining is similar to text analytics . It involves the discovery by computer of new, previously unknown information by automatically extracting information from different written resources .'

In [4]:
# Load the 't5-small' checkpoint through the Auto* factory classes and place
# the model on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('t5-small', model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small').to(device)

In [5]:
# Display the concrete classes the Auto* factories resolved to.
tuple(type(obj) for obj in (tokenizer, model))

(transformers.models.t5.tokenization_t5_fast.T5TokenizerFast,
 transformers.models.t5.modeling_t5.T5ForConditionalGeneration)

In [6]:
# Collapse every run of whitespace (newlines and the literal's indentation)
# into a single space. Merely deleting '\n' would glue two words together
# whenever a line of `text` does not end with a space.
preprocess_text = ' '.join(text.split())

# Prepend the 'summarize: ' prefix to the cleaned passage.
input_text = 'summarize: ' + preprocess_text

# Encode to a tensor of token ids and move it to the model's device.
tokenized_text = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Generate with 4 beams, blocking repeated 3-grams, with the output length
# bounded by min_length=30 and max_length=200.
summary_ids = model.generate(
    tokenized_text,
    num_beams=4,
    no_repeat_ngram_size=3,
    min_length=30,
    max_length=200,
    early_stopping=True,
)

# Decode the first returned sequence, dropping special tokens.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [7]:
# Print the decoded summary text currently held in `output`.
print(output)

text data mining is the process of deriving high-quality information from text. it involves the discovery by computer of new, previously unknown information. a KDD (Knowledge Discovery in Databases) process is similar to text analytics.


In [8]:
# Display the input token ids next to the generated token ids.
(tokenized_text, summary_ids)

(tensor([[21603,    10,  5027,  5558,     6,    92,     3,  4822,    12,    38,
           1499,   331,  5558,    41, 12982,    52,     5,    10,   332,  7407,
            201,  1126,    12,  1499,  9952,     6,    19,     8,   433,    13,
             20,  5927,    53,   306,    18,  4497,   251,    45,  1499,     5,
             94,  5806,    96,   532,  9087,    57,  1218,    13,   126,     6,
           3150,  7752,   251,     6,    57,  3269,  5819,    53,   251,    45,
            315,  1545,  1438,   535, 22812,  1438,   164,   560,  3395,     6,
           1335,     6,  7594,     6,  2456,     6,    11,  2984,     5,  1592,
             18,  4497,   251,    19,  3115,  5105,    57, 13282,    53,  4264,
             11,  5001,    57,   598,   224,    38, 11775,  3275,  1036,     5,
           2150,    12,  1546,   189,    32,     3,    15,    17,   491,     5,
              3, 29495,    62,    54, 15849,   344,   386,   315, 14013,    13,
           1499,  5558,    10,   251, 16

In [9]:
# Reuse the same tokenizer/model for a different task by changing the prefix.
input_text = 'translate english to german: That is good'

# Encode first, then move the id tensor to the model's device.
tokenized_text = tokenizer.encode(input_text, return_tensors='pt')
tokenized_text = tokenized_text.to(device)

summary_ids = model.generate(
    tokenized_text,
    max_length=200,
    num_beams=4,
    no_repeat_ngram_size=3,
    early_stopping=True,
)

# Decode the first returned sequence without special tokens and print it.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(output)

Das ist gut.


In [10]:
# Reload 't5-small', this time through the concrete T5 classes and with the
# tokenizer's model_max_length set to 1024.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = T5TokenizerFast.from_pretrained('t5-small', model_max_length=1024)
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

In [11]:
# Second sample passage to summarize; rebinds `text`, which previously held
# the text-mining paragraph.
text = '''The Inflation Reduction Act lowers prescription drug costs, health care costs, 
and energy costs. It's the most aggressive action on tackling the climate crisis in American history, 
which will lift up American workers and create good-paying, union jobs across the country. 
It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. 
And no one making under $400,000 per year will pay a penny more in taxes.'''

In [12]:
# Collapse all whitespace runs (including newlines) into single spaces.
# Deleting '\n' outright would join the last word of one line to the first
# word of the next whenever a line has no trailing space.
preprocess_text = ' '.join(text.split())

# Prepend the 'summarize: ' prefix to the cleaned passage.
input_text = 'summarize: ' + preprocess_text

In [13]:
# Encode the prefixed passage, then move the id tensor to the model's device.
tokenized_text = tokenizer.encode(input_text, return_tensors='pt')
tokenized_text = tokenized_text.to(device)

summary_ids = model.generate(
    tokenized_text,
    min_length=30,
    max_length=100,
    num_beams=4,
    no_repeat_ngram_size=3,
    early_stopping=True,
)

# Decode the first returned sequence; the bare name below displays it.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
output

"the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in history. no one making under $400,000 per year will pay a penny more in taxes."

In [14]:
# Load the "ca_test" split of billsum, then hold out 20% of it for evaluation.
# A fixed seed keeps the split identical from one run to the next, so the
# training and evaluation numbers are reproducible.
billsum = load_dataset("billsum", split="ca_test")
billsum = billsum.train_test_split(test_size=0.2, seed=42)

# Print every field of the first training example, separated by blank lines.
example = billsum["train"][0]
for key, value in example.items():
    print(key, '\n', value)
    print('\n\n')

text 
 The people of the State of California do enact as follows:


SECTION 1.
(a) It is the intent of the Legislature in enacting this act that cost, quality, and equity data be made available and to encourage health care service plans, health insurers, and providers to develop innovative approaches, services, and programs that may have the potential to deliver health care that is both cost effective and responsive to the needs of all enrollees, including recognizing the diversity of California and the impact of social determinants of health.
(b) It is further the intent of the Legislature that a cost, quality, and equity data atlas be utilized in California to inform efforts to:
(1) Assess California health care needs and available resources.
(2) Contain the cost of health care services and coverage.
(3) Improve the quality and medical appropriateness of health care.
(4) Eliminate or reduce health disparities and address the social determinants of health.
(5) Increase the transparenc

In [15]:
def preprocess_text(data):
    """Tokenize a batch of documents and their summaries.

    Args:
        data: Batch with a "text" entry (iterable of document strings) and a
            "summary" entry passed to the tokenizer as the targets.

    Returns:
        The tokenizer output for the "summarize: "-prefixed documents
        (truncated to 1024 tokens), with the summaries' token ids (truncated
        to 128 tokens) stored under "labels".
    """
    prefixed_docs = []
    for doc in data["text"]:
        prefixed_docs.append("summarize: " + doc)

    encoded = tokenizer(prefixed_docs, max_length=1024, truncation=True)
    target_ids = tokenizer(data["summary"], max_length=128, truncation=True)["input_ids"]
    encoded["labels"] = target_ids
    return encoded

In [16]:
# Tokenize both splits in batches and drop the original columns, leaving only
# what `preprocess_text` returns.
tokenized_billsum = billsum.map(
    preprocess_text,
    batched=True,
    remove_columns=billsum["train"].column_names,
)
tokenized_billsum

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 989
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 248
    })
})

In [17]:
# Collator that batches the tokenized examples for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [18]:
# ROUGE scorer loaded through the `evaluate` library.
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Decode predictions and references, then score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the ids.

    Returns:
        dict mapping each ROUGE score name to its value rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some configurations hand back a tuple; the token ids are its first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is used as a padding / ignore marker and is not a real token id, so
    # it must be replaced before decoding. The original only did this for the
    # labels; predictions can carry it too when batches are concatenated.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v, 4) for k, v in result.items()}

In [19]:
# Training configuration: one epoch, evaluation at the end of each epoch, and
# generation-based predictions (predict_with_generate=True) for the metrics.
training_args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    num_train_epochs=1,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
)

# Wire the model, tokenized splits, collator and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned training output is displayed.
trainer.train()

***** Running training *****
  Num examples = 989
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 62
  Number of trainable parameters = 60506624


  0%|          | 0/62 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
***** Running Evaluation *****
  Num examples = 248
  Batch size = 16


  0%|          | 0/16 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 2.187960624694824, 'eval_rouge1': 0.1926, 'eval_rouge2': 0.0959, 'eval_rougeL': 0.1635, 'eval_rougeLsum': 0.1639, 'eval_runtime': 386.4739, 'eval_samples_per_second': 0.642, 'eval_steps_per_second': 0.041, 'epoch': 1.0}
{'train_runtime': 2790.8764, 'train_samples_per_second': 0.354, 'train_steps_per_second': 0.022, 'train_loss': 2.519801478232107, 'epoch': 1.0}


TrainOutput(global_step=62, training_loss=2.519801478232107, metrics={'train_runtime': 2790.8764, 'train_samples_per_second': 0.354, 'train_steps_per_second': 0.022, 'train_loss': 2.519801478232107, 'epoch': 1.0})

In [20]:
# Run generation again on the existing `tokenized_text` ids with the same
# decoding settings, now that `trainer.train()` has been run on `model`.
summary_ids = model.generate(
    tokenized_text,
    min_length=30,
    max_length=100,
    num_beams=4,
    no_repeat_ngram_size=3,
    early_stopping=True,
)

# Decode the first returned sequence; the bare name below displays it.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
output

"The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. This bill would ask the ultra-wealthy and corporations to pay their fair share."

In [21]:
# Korean sample passage for the KoBART summarization demo.
text = """디아블로는 액션 롤플레잉 핵 앤드 슬래시 비디오 게임이다. 
플레이어는 주변 환경을 마우스로 사용해 영웅을 움직이게 한다. 
주문을 외는 등의 다른 활동은 키보드 입력으로 이루어진다. 
플레이어는 이 게임에서 장비를 획득하고, 주문을 배우고, 적을 쓰러뜨리며, NPC와 대화를 나눌 수 있다.
지하 미궁은 주어진 형식이 있고 부분적으로 반복되는 형태가 존재하나 전체적으로 보면 무작위로 생성된다. 
예를 들어 지하 묘지의 경우에는 긴 복도와 닫힌 문들이 존재하고, 동굴은 좀 더 선형 형태를 띠고 있다. 
플레이어에게는 몇몇 단계에서 무작위의 퀘스트를 받는다. 
이 퀘스트는 선택적인 사항이나 플레이어의 영웅들을 성장시키거나 줄거리를 이해하는데 도움을 준다. 
그러나 맨 뒤에 두 퀘스트는 게임을 끝내기 위해 완료시켜야 한다."""

# Collapse all whitespace runs (including newlines) into single spaces.
# Deleting "\n" outright glued two sentences together wherever a line had no
# trailing space (e.g. "...있다." immediately followed by "지하 ...").
preprocess_text = " ".join(text.split())

In [22]:
# Switch to the 'gogamza/kobart-summarization' checkpoint (tokenizer and BART
# model) and place the model on the GPU when available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-summarization')
model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-summarization').to(device)

loading file tokenizer.json from cache at C:\Users\admin/.cache\huggingface\hub\models--gogamza--kobart-summarization\snapshots\31f181b155a0ad74bd93bd90ee04310ff72691f4\tokenizer.json
loading file added_tokens.json from cache at C:\Users\admin/.cache\huggingface\hub\models--gogamza--kobart-summarization\snapshots\31f181b155a0ad74bd93bd90ee04310ff72691f4\added_tokens.json
loading file special_tokens_map.json from cache at C:\Users\admin/.cache\huggingface\hub\models--gogamza--kobart-summarization\snapshots\31f181b155a0ad74bd93bd90ee04310ff72691f4\special_tokens_map.json
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--gogamza--kobart-summarization\snapshots\31f181b155a0ad74bd93bd90ee04310ff72691f4\config.json
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Model config BartConfig {


In [23]:
# Encode the cleaned passage and move the ids to the same device as the model.
# The model was moved with `model.to(device)`, so leaving the input on the CPU
# makes `generate` fail with a device mismatch whenever `device` is CUDA.
tokenized_text = tokenizer.encode(preprocess_text, return_tensors="pt").to(device)

# Generate with 4 beams, blocking repeated 3-grams, with the output length
# bounded by min_length=10 and max_length=150.
summary_ids = model.generate(
    tokenized_text,
    num_beams=4,
    no_repeat_ngram_size=3,
    min_length=10,
    max_length=150,
    early_stopping=True,
)

# Decode the first returned sequence without special tokens and print it.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

디아블로는 액션 롤플레잉 핵 앤드 슬래시 비디오 게임이다.
