In [1]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime
import json
from sklearn.model_selection import train_test_split
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

In [2]:
model_name = 'facebook/bart-large-xsum'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

encoder_max_length = 1024
decoder_max_length = 64

In [3]:
def unzip(xys):
    xs, ys = zip(*xys)
    return (list(xs), list(ys))

def to_dataset(items):
    titles, texts = unzip([(item["translated_title"], item["translated_text"]) for item in items])
    return Dataset.from_dict({
        'summary': titles,
        'document': texts
    })

In [4]:
with open('nplus1/train_translated_data.json') as f:
    train_items = json.load(f)

with open('nplus1/test_translated_data.json') as f:
    test_items = json.load(f)
    
nplus1_dataset = DatasetDict({
    'train': to_dataset(train_items),
    'test': to_dataset(test_items),
})

In [7]:
# train_data_txt

In [8]:
train_data_txt = nplus1_dataset['train']
validation_data_txt = nplus1_dataset['test']

In [9]:
validation_data_txt

Dataset({
    features: ['summary', 'document'],
    num_rows: 175
})

**Preprocess and tokenize**

In [10]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["document"], batch["summary"]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    # Ignore padding in the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch


train_data = train_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
train_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1569
})

## Training

---

### Metrics

In [12]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

nltk.download("punkt", quiet=True)

metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

  metric = datasets.load_metric("rouge")


### Training arguments

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=10,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=50,
    save_steps=50,
    eval_steps=50,
    save_total_limit=10,
    evaluation_strategy="steps",
    # optim="adamw_torch",
    metric_for_best_model = 'rouge1',
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [52]:
trainer.evaluate()
# {'eval_loss': 5.45355749130249,
#  'eval_rouge1': 29.1556,
#  'eval_rouge2': 8.8158,
#  'eval_rougeL': 23.6041,
#  'eval_rougeLsum': 23.555,
#  'eval_gen_len': 24.2571,
#  'eval_runtime': 1236.1095,
#  'eval_samples_per_second': 0.283,
#  'eval_steps_per_second': 0.071}

***** Running Evaluation *****
  Num examples = 350
  Batch size = 4
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 5.45355749130249,
 'eval_rouge1': 29.1556,
 'eval_rouge2': 8.8158,
 'eval_rougeL': 23.6041,
 'eval_rougeLsum': 23.555,
 'eval_gen_len': 24.2571,
 'eval_runtime': 1236.1095,
 'eval_samples_per_second': 0.283,
 'eval_steps_per_second': 0.071}

In [None]:
trainer.train()

***** Running training *****
  Num examples = 1569
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3930
  Number of trainable parameters = 406290432
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
50,4.6623,3.827141,35.4728,13.8773,30.4115,30.2769,23.9714
100,3.5234,3.450229,36.7248,15.511,32.3221,32.1836,18.6571
150,3.3518,3.389961,39.3509,17.5478,34.8286,34.7176,19.2057
200,3.3342,3.359379,40.9713,19.2534,36.9751,37.04,18.2686
250,3.2388,3.361399,42.495,19.4434,37.7553,37.703,19.9429
300,3.3066,3.367665,42.9941,18.8477,37.4142,37.325,20.5086
350,3.3026,3.471125,39.9959,17.7979,35.931,35.9317,20.8971
400,3.1834,3.410693,42.0947,17.3278,35.7259,35.7199,20.84
450,2.7852,3.427615,42.4795,18.5068,37.1818,37.0393,19.1143
500,2.9252,3.503216,40.717,17.7608,36.33,36.3138,19.9886


***** Running Evaluation *****
  Num examples = 175
  Batch size = 4
Saving model checkpoint to results/checkpoint-50
Configuration saved in results/checkpoint-50/config.json
Model weights saved in results/checkpoint-50/pytorch_model.bin
tokenizer config file saved in results/checkpoint-50/tokenizer_config.json
Special tokens file saved in results/checkpoint-50/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 175
  Batch size = 4
Saving model checkpoint to results/checkpoint-100
Configuration saved in results/checkpoint-100/config.json
Model weights saved in results/checkpoint-100/pytorch_model.bin
tokenizer config file saved in results/checkpoint-100/tokenizer_config.json
Special tokens file saved in results/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 175
  Batch size = 4
Saving model checkpoint to results/checkpoint-150
Configuration saved in results/checkpoint-150/config.json
Model weights saved in results/checkpoint-