# Text summarization - eksperymenty



*   Model: mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization
*   Zbiór danych: cnn_dailymail




In [1]:
! pip install datasets transformers rouge-score nltk

Collecting datasets
  Downloading datasets-1.18.0-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 5.4 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 29.8 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.4 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 32.0 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 34.6 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)


**Przygotowanie zbioru danych, modelu i niezbędnych bibliotek**

In [2]:
import transformers
from datasets import load_dataset, load_metric, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk
import numpy as np
import torch.nn.functional as F
import torch
nltk.download('punkt')
print(transformers.__version__)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
4.15.0


In [5]:
raw_datasets_train = load_dataset("ccdv/cnn_dailymail", "3.0.0", split='train')
raw_datasets_val = load_dataset("ccdv/cnn_dailymail", "3.0.0", split='validation[:30%]')
raw_datasets_test = load_dataset("ccdv/cnn_dailymail", "3.0.0", split='test[:30%]')
metric = load_metric("rouge")

Downloading:   0%|          | 0.00/9.27k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/ccdv___cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f...


  0%|          | 0/5 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/376M [00:00<?, ?B/s]

  0%|          | 0/5 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/ccdv___cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f. Subsequent calls will reuse this data.


Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/ccdv___cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f)
Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/ccdv___cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f)


Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [6]:
raw_datasets = {'train':raw_datasets_train,'validation':raw_datasets_val,'test':raw_datasets_test}

In [7]:
raw_datasets = DatasetDict(raw_datasets)

In [8]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 4010
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 3447
    })
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained("mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization")

Downloading:   0%|          | 0.00/324 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/236M [00:00<?, ?B/s]

**Przetworzenie danych**

In [10]:
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    inputs = [doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, batch_size=1)

  0%|          | 0/287113 [00:00<?, ?ba/s]

  0%|          | 0/4010 [00:00<?, ?ba/s]

  0%|          | 0/3447 [00:00<?, ?ba/s]

**Reimplementacja funkcji straty ScaleGrad**

In [12]:
#reimplementacja ScaleGrad
def getNovelMask(target, vocab_size):
    b,l = target.size()
    zeros = torch.zeros(b,l,vocab_size).to(target.device)
    ones = torch.ones(b,l,vocab_size).to(target.device)
    target_index = target.unsqueeze(1).expand(b,l,l).transpose(-2,-1).triu().transpose(-2,-1)
    target_index[target_index==-100] = 0
   
    matrix = zeros.scatter_add_(2, target_index, ones)
    matrix[:,:,0] = 0
    summ_true = torch.tensor(range(1,l+1)).unsqueeze(0).float().to(target.device)
    summ_now = torch.sum(matrix,dim=-1)
    diff = summ_true - summ_now
    matrix[:,:,0] = diff
    matrix = torch.cat((torch.zeros(b,1,vocab_size).to(target.device),matrix[:,:-1,:]),1)
    novel_mask = matrix < 1.

    return novel_mask

def sg_loss(inputs, labels, logits):
    inp = inputs
    pad = torch.ones((logits.size(-1)))
    target = labels
    target_to_loss = torch.flatten(labels)
    num = int(logits.size(-1)/target_to_loss.shape[0]) + 1
    target_to_loss_pad = target_to_loss.repeat(num).cuda()
    logits = logits
  
    # ScaleGrad
    probs = F.softmax(logits, dim=-1)
    novel_mask = getNovelMask(target,logits.size(-1))
    rep_mask = ~novel_mask

    new_probs = probs * novel_mask * gamma + probs * rep_mask + 1e-8
    new_probs = F.normalize(new_probs, p=1, dim=-1)
    l_probs_to_loss = torch.log(torch.argmax(new_probs,dim=-1))
    l_probs_to_loss = torch.flatten(l_probs_to_loss).clone().detach().requires_grad_(True)
    l_probs_to_loss_pad = l_probs_to_loss.repeat(num).cuda()
    loss = -F.nll_loss(l_probs_to_loss_pad, target_to_loss_pad.long(), reduction='sum')
    ntokens = inp['input_ids'].numel()

    return loss / (ntokens*num)

**Przygotowanie treningu**

In [13]:
batch_size = 1
model_name = 'bert-small2bert-small-finetuned-cnn_daily_mail-summarization'

training_args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned--cnn_daily_mail",
    evaluation_strategy = "epoch",
    learning_rate=2e-3,
    weight_decay=0.01,
    push_to_hub=False,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    num_train_epochs=1, 
    predict_with_generate=True,
    overwrite_output_dir=True,
    save_strategy = 'epoch'
)

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [16]:
#dla ScaleGrad gamma = 0.8, dla MLE: gamma = 1.0
gamma = 0.8
class ScaleGradTrainer(Seq2SeqTrainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss = sg_loss(inputs,labels,logits)
        return (loss, outputs) if return_outputs else loss

trainer = ScaleGradTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

**Fintuning modelu**

In [17]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: id, article, token_type_ids, highlights.
***** Running training *****
  Num examples = 287113
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 287113


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,2.600605,38.5396,16.769,26.5166,35.4815,72.5788


The following columns in the evaluation set  don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: id, article, token_type_ids, highlights.
***** Running Evaluation *****
  Num examples = 4010
  Batch size = 1
Saving model checkpoint to bert-small2bert-small-finetuned-cnn_daily_mail-summarization-finetuned--cnn_daily_mail/checkpoint-287113
Configuration saved in bert-small2bert-small-finetuned-cnn_daily_mail-summarization-finetuned--cnn_daily_mail/checkpoint-287113/config.json
Model weights saved in bert-small2bert-small-finetuned-cnn_daily_mail-summarization-finetuned--cnn_daily_mail/checkpoint-287113/pytorch_model.bin
tokenizer config file saved in bert-small2bert-small-finetuned-cnn_daily_mail-summarization-finetuned--cnn_daily_mail/checkpoint-287113/tokenizer_config.json
Special tokens file saved in bert-small2bert-small-finetuned-cnn_daily_mail-summarization-finetuned--cnn_daily_mail/checkpoint-287113/special_tokens_map.json


Training complet

TrainOutput(global_step=287113, training_loss=3.417546753119593e-05, metrics={'train_runtime': 14961.7435, 'train_samples_per_second': 19.19, 'train_steps_per_second': 19.19, 'total_flos': 2.511542179396866e+16, 'train_loss': 3.417546753119593e-05, 'epoch': 1.0})

In [18]:
model.save_pretrained('text_summarization_scalegrad')
tokenizer.save_pretrained('text_summarization_tokenizer_scalegrad')

Configuration saved in text_summarization_scalegrad/config.json
Model weights saved in text_summarization_scalegrad/pytorch_model.bin
tokenizer config file saved in text_summarization_tokenizer_scalegrad/tokenizer_config.json
Special tokens file saved in text_summarization_tokenizer_scalegrad/special_tokens_map.json


('text_summarization_tokenizer_scalegrad/tokenizer_config.json',
 'text_summarization_tokenizer_scalegrad/special_tokens_map.json',
 'text_summarization_tokenizer_scalegrad/vocab.txt',
 'text_summarization_tokenizer_scalegrad/added_tokens.json',
 'text_summarization_tokenizer_scalegrad/tokenizer.json')

**Zapisanie modelu i zebranie wyników**

In [19]:
tokenizer = AutoTokenizer.from_pretrained("text_summarization_tokenizer_scalegrad")
model = AutoModelForSeq2SeqLM.from_pretrained("text_summarization_scalegrad") 
model = model.cuda()
device = 'cuda'
def generate_summary_text(text):
    inputs = tokenizer([text], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    output = model.generate(input_ids, attention_mask=attention_mask)

    return tokenizer.decode(output[0], skip_special_tokens=True)

Didn't find file text_summarization_tokenizer_scalegrad/added_tokens.json. We won't load it.
loading file text_summarization_tokenizer_scalegrad/vocab.txt
loading file text_summarization_tokenizer_scalegrad/tokenizer.json
loading file None
loading file text_summarization_tokenizer_scalegrad/special_tokens_map.json
loading file text_summarization_tokenizer_scalegrad/tokenizer_config.json
loading configuration file text_summarization_scalegrad/config.json
Model config EncoderDecoderConfig {
  "_name_or_path": "text_summarization_scalegrad",
  "architectures": [
    "EncoderDecoderModel"
  ],
  "decoder": {
    "_name_or_path": "google/bert_uncased_L-4_H-512_A-8",
    "add_cross_attention": true,
    "architectures": null,
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": null,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
   

In [20]:
def generate_summary(batch):
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

In [21]:
pred_str = []
label_str = []

for i in range(20):
  exp = raw_datasets['test'][i]
  res = generate_summary(exp)
  pred_str.append(res["pred"])
  label_str.append(res["highlights"])

rouge_output = metric.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)

Score(precision=0.05924679907661112, recall=0.1080148409077864, fmeasure=0.0747530847701591)


In [22]:
model_res = []
true_highlights = []
texts = []
for j in ['train','test']:
  for i in range(5):
    text = raw_datasets[j][i]['article']
    texts.append(text)
    model_res.append(generate_summary_text(text))
    true_highlights.append(raw_datasets[j][i]['highlights'])

**Prezentacja wyników**

In [23]:
import pandas as pd

res = pd.DataFrame({'tekst oryginalny':texts,'wynik modelu':model_res, 'referencja':true_highlights})

In [24]:
res

Unnamed: 0,tekst oryginalny,wynik modelu,referencja
0,It's official: U.S. President Barack Obama wan...,obama wants congress to approve use of militar...,Syrian official: Obama climbed to the top of t...
1,(CNN) -- Usain Bolt rounded off the world cham...,usain bolt wins men's 4x100m relay gold in mos...,Usain Bolt wins third gold of world championsh...
2,"Kansas City, Missouri (CNN) -- The General Ser...",gsa employee flown back to mainland u. s. nine...,The employee in agency's Kansas City office is...
3,Los Angeles (CNN) -- A medical doctor in Vanco...,dr. blaga stancheva : california arson suspect...,NEW: A Canadian doctor says she was part of a ...
4,(CNN) -- Police arrested another teen Thursday...,"jose carlos montano, 18, was arrested on charg...",Another arrest made in gang rape outside Calif...
5,"(CNN)James Best, best known for his portrayal ...","james best, best known for his role in bumblin...","James Best, who played the sheriff on ""The Duk..."
6,(CNN)The attorney for a suburban New York card...,"randy zelin defends his client, dr. anthony mo...",A lawyer for Dr. Anthony Moschetto says the ch...
7,(CNN)President Barack Obama took part in a rou...,president obama is trying to reframe the topic...,"""No challenge poses more of a public threat th..."
8,Moscow (CNN)A Russian TV channel aired Hillary...,a russian tv channel aired hillary clinton's f...,"Presidential hopeful's video, featuring gay co..."
9,(CNN)Marco Rubio is all in. The Republican se...,marco rubio is running on an optimistic messag...,"Raul Reyes: In seeking Latino vote, Marco Rubi..."


In [25]:
res.to_csv('wyniki.csv')

In [26]:
pred_str = []
label_str = []

for i in range(raw_datasets['test'].num_rows):
  exp = raw_datasets['test'][i]
  res = generate_summary(exp)
  pred_str.append(res["pred"])
  label_str.append(res["highlights"])

rouge_output = metric.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)

Score(precision=0.14614111627221946, recall=0.17806014571841425, fmeasure=0.1557089506411709)
