In [None]:
import torch

In [None]:
pip install transformers datasets evaluate rouge_score huggingface_hub

In [None]:
from datasets import load_dataset

# Load the `multi_news` dataset; no split is requested, so all of its splits are returned.
d = load_dataset(path="multi_news")

In [None]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the t5-small checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

In [None]:
# Task prefix prepended to every source document before tokenization.
prefix = "summarize: "

def preprocess(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping with a "document" list (source texts) and a
            "summary" list (target texts).
        max_input_length: Maximum number of tokens kept per prefixed
            document; longer inputs are truncated.
        max_target_length: Maximum number of tokens kept per summary; longer
            targets are truncated.

    Returns:
        The tokenizer output for the prefixed documents, with an additional
        "labels" entry holding the token ids of the summaries.
    """
    # Prepend the task prefix to each document in the batch.
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Summaries are tokenized as targets so they can serve as labels.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply `preprocess` to the dataset, passing examples in batches rather than one by one.
tokenized = d.map(function=preprocess, batched=True)

  0%|          | 0/45 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the t5-small checkpoint as a sequence-to-sequence language model.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path="t5-small")

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Seq2seq batch collator, configured with the tokenizer and the model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Show the tokenized dataset as this cell's output.
tokenized

In [None]:
import evaluate

# ROUGE metric object; `compute_metrics` calls its `compute` method.
rouge = evaluate.load(path="rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import numpy as np

In [None]:

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays. Entries
            equal to -100 are treated as padding in both arrays.

    Returns:
        Dict with the scores returned by `rouge.compute` plus "gen_len" (the
        average number of non-pad tokens per prediction), each value rounded
        to 4 decimal places.
    """
    predictions, labels = eval_pred

    # -100 is a padding/ignore marker, not a decodable token id, so map it to
    # the pad token in BOTH arrays before decoding. Previously only the labels
    # were cleaned, so -100-padded predictions reached `batch_decode` and were
    # also counted as generated tokens in gen_len.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction = number of positions that are not padding.
    prediction_lens = np.count_nonzero(predictions != pad_id, axis=-1)
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Hyperparameters for fine-tuning; evaluation runs once per epoch and scores
# generated summaries (predict_with_generate) via `compute_metrics`.
training_args = Seq2SeqTrainingArguments(
    output_dir="summarizer",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    # Let evaluation generate up to the same 128-token budget the labels are
    # truncated to in `preprocess`. With the default cap, the stored run
    # reported Gen Len ~19, so ROUGE was scored on cut-off summaries.
    generation_max_length=128,
    fp16=True,
    # NOTE(review): pushing to the Hub needs valid credentials; the login cell
    # currently comes after this one -- confirm the order on a fresh runtime.
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # Fit on the training split and monitor on the validation split. The
    # previous setup trained on "validation" and evaluated on "test", which
    # never used the training data and leaked the test set into model
    # selection. The test split now stays held out for a final evaluation.
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
/content/summarizer is already a clone of https://huggingface.co/pablo-chocobar/summarizer. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, document. If summary, document are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5622
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimizatio

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.857162,0.1386,0.0423,0.106,0.106,18.9968
2,3.201600,2.802902,0.1415,0.0435,0.108,0.108,18.9966
3,3.036100,2.781355,0.143,0.0446,0.1093,0.1093,18.9968
4,3.036100,2.774536,0.1434,0.0448,0.1097,0.1097,18.9968


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_toke

TrainOutput(global_step=1408, training_loss=3.087070031599565, metrics={'train_runtime': 2647.0944, 'train_samples_per_second': 8.495, 'train_steps_per_second': 0.532, 'total_flos': 6087132863004672.0, 'train_loss': 3.087070031599565, 'epoch': 4.0})

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): the Trainer above is already created with push_to_hub=True;
# confirm whether this login has to run before that cell on a fresh runtime.
notebook_login()

Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Save the fine-tuned model and tokenizer files and upload them to the Hub
# repository; the value shown as the cell output is the resulting commit URL.
trainer.push_to_hub()


Saving model checkpoint to summarizer
Configuration saved in summarizer/config.json
Configuration saved in summarizer/generation_config.json
Model weights saved in summarizer/pytorch_model.bin
tokenizer config file saved in summarizer/tokenizer_config.json
Special tokens file saved in summarizer/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 32.0k/231M [00:00<?, ?B/s]

Upload file runs/Jan26_19-22-07_f74ab1315fea/events.out.tfevents.1674760931.f74ab1315fea.2166.2: 100%|########…

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/pablo-chocobar/summarizer
   54c8dd3..62181e8  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/pablo-chocobar/summarizer
   54c8dd3..62181e8  main -> main

To https://huggingface.co/pablo-chocobar/summarizer
   62181e8..f0c577b  main -> main

   62181e8..f0c577b  main -> main



'https://huggingface.co/pablo-chocobar/summarizer/commit/62181e89f10a57c8d86f0761c64954d4788935e1'

In [None]:
from transformers import pipeline

# Summarization pipeline backed by the checkpoint stored in the Hub repository.
summarizer = pipeline(task="summarization", model="pablo-chocobar/summarizer")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pablo-chocobar--summarizer/snapshots/f0c577b20ac67db1fa6318ce1ca2130f23359479/config.json
Model config T5Config {
  "_name_or_path": "pablo-chocobar/summarizer",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
 

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/242M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--pablo-chocobar--summarizer/snapshots/f0c577b20ac67db1fa6318ce1ca2130f23359479/pytorch_model.bin
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at pablo-chocobar/summarizer.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--pablo-chocobar--summarizer/snapshots/f0c577b20ac67db1fa6318ce1ca2130f23359479/generation_config.json
Generate config GenerationConfig {
  "_from_model_config": true,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}



Downloading (…)okenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

loading file spiece.model from cache at None
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--pablo-chocobar--summarizer/snapshots/f0c577b20ac67db1fa6318ce1ca2130f23359479/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--pablo-chocobar--summarizer/snapshots/f0c577b20ac67db1fa6318ce1ca2130f23359479/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--pablo-chocobar--summarizer/snapshots/f0c577b20ac67db1fa6318ce1ca2130f23359479/tokenizer_config.json


In [None]:
# Sample passage about carbon capture and storage (CCS), used as input to the
# summarization pipeline.
text = """Today, CCS projects are storing almost 45 million tons of CO2 every year, which is about the amount of CO2 emissions created by 10 million passenger cars. Capture generally takes place at large stationary sources of CO2, like power plants or industrial plants that make cement, steel, and chemicals. Most current carbon capture projects use a liquid to chemically remove the CO2 before it goes out the smokestack, but several new types of capture processes are under development.

The captured CO2 gas is then compressed so it becomes liquid-like and transported to a storage site, generally through a pipeline. Ship transport is more expensive than using pipelines, but it is being considered in both Europe and Japan. Once at the storage site, the CO2 is pumped more than 2,500 feet down wells into geological formations like used-up oil and gas reservoirs, as well as formations that contain unusable, salty water. 
CCS is sometimes referred to as CCUS, where the “U” stands for utilization. Enhanced oil recovery (EOR) is the major use of CO2 today. EOR is where CO2 is injected into active oil reservoirs in order to recover more oil. Other possible uses of CO2 include making chemicals or fuels, but they require large amounts of carbon-free energy, making the costs too high to be competitive today. For large-scale implementation of CCS, utilization is projected to use less than 10% of the captured CO2.
There has also been considerable interest recently in using CCS technologies to remove CO2 from the atmosphere.  One option is bioenergy with CCS (BECCS), where biomass (like wood or grasses) removes CO2 from the air through photosynthesis. The biomass is then harvested and burned in a power plant to produce energy, with the CO2 being captured and stored. This creates what is called “negative emissions” because it takes CO2 from the atmosphere and stores it.  Another negative emission option is called direct air capture (DAC), where CO2 is removed from the air using a chemical process.  However, the concentration of CO2 in the air is about 300 times less than in the smokestacks of power plants or industrial plants, making it much less efficient to capture. Because of this, DAC is quite expensive today."""

In [None]:
# Run the summarization pipeline on the sample passage; the result is shown as the cell output.
summarizer(text)

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 1,
  "length_penalty": 2.0,
  "max_length": 200,
  "min_length": 30,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}



[{'summary_text': '– Today, CCS projects are storing almost 45 million tons of CO2 every year, which is about the amount of emissions created by 10 million passenger cars. Most current carbon capture projects use a liquid to chemically remove the CO2 before it goes out the smokestack, but several new types of capture processes are under development. The captured CO2 gas is then compressed so it becomes liquid-like and transported to a storage site, generally through a pipeline, but it is being considered in both Europe and Japan, where the “U” stands for utilization, where CO2 is pumped more than 2,500 feet down wells into geological formations that contain unusable, salty water, and'}]