In [None]:
# |default_exp summarization
# |default_cls_lvl 3

In [None]:
# |hide
%reload_ext autoreload
%autoreload 2

# summarization

Training, saving, and tuning code for building summarization model(s) that can predict both headlines and short summaries of topics given the text associated with a topic segment.

In [None]:
# |export
from __future__ import annotations

import datetime
import gc
import os
import time
import warnings

import wandb
from blurr.text.data.seq2seq.core import Seq2SeqBatchTokenizeTransform, Seq2SeqTextBlock, default_text_gen_kwargs
from blurr.text.modeling.core import BaseModelCallback, BaseModelWrapper
from blurr.text.modeling.seq2seq.core import Seq2SeqMetricsCallback, blurr_seq2seq_splitter
from blurr.text.utils import get_hf_objects
from blurr.utils import PreCalculatedCrossEntropyLoss
from fastcore.all import *
from fastai.data.block import DataBlock, ColReader, ItemGetter, ColSplitter, RandomSplitter
from fastai.callback.wandb import WandbCallback
from fastai.imports import *
from fastai.learner import *
from fastai.losses import CrossEntropyLossFlat
from fastai.optimizer import Adam, ranger
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers.utils import logging as hf_logging
from transformers import PegasusForConditionalGeneration, BartForConditionalGeneration, T5ForConditionalGeneration

from course_copilot import utils, training, preprocessing

  from .autonotebook import tqdm as notebook_tqdm


## Development environment

In [None]:
# | hide
import pdb

from fastcore.test import *
import nbdev

from blurr.utils import print_versions

In [None]:
# |export
# silence all the HF warnings
# NOTE: the "ignore" filter is process-wide, so it hides warnings raised by every library, not only Hugging Face
warnings.simplefilter("ignore")
# restrict the transformers logger to error-level messages
hf_logging.set_verbosity_error()

In [None]:
# | echo: false
# set the tokenizers parallelism switch to "false" before any tokenizer is used in this notebook
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# record the library versions this documentation was rendered with
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")

What we're running with at the time this documentation was generated:
torch: 1.12.1+cu102
fastai: 2.7.9
transformers: 4.22.1


In [None]:
# |hide
# |cuda
# make the GPU at index 1 the current CUDA device for the rest of the notebook
# NOTE(review): the index is hard-coded, so this cell needs at least two visible GPUs — adjust for your hardware
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")

Using GPU #1: Tesla V100-SXM2-16GB


## Setup

In [None]:
# |export
class SummarizationConfig(training.TrainConfig):
    """Base training configuration shared by the summarization tasks in this module.

    Extends `training.TrainConfig` with the Hugging Face model choice and the
    tokenization/generation settings read by `_get_task_hf_objects`, `_get_dls`
    and `_get_learner`.
    """

    # model class and checkpoint handed to `get_hf_objects` in `_get_task_hf_objects`
    hf_model_cls = PegasusForConditionalGeneration
    hf_model_checkpoint = "sshleifer/distill-pegasus-cnn-16-4"

    # datablock/dataloaders
    # NOTE: both dicts are class attributes, so every subclass that does not override them shares the same object
    # `text_gen_kwargs` is forwarded to `Seq2SeqBatchTokenizeTransform` in `_get_dls`
    text_gen_kwargs = {}
    # NOTE(review): `tok_kwargs` is not read by the functions visible in this notebook — confirm where it is used
    tok_kwargs = {}

    # learner
    # `input_sequence_size` / `max_target_length` become `max_length` / `max_target_length` of the batch tokenize transform
    input_sequence_size = 512
    max_target_length = 5

    # `batch_size` is the `bs` of the dataloaders; `use_fp16` switches the learner to mixed precision in `_get_learner`
    batch_size = 8
    use_fp16 = True
    # NOTE(review): `train` checks the trainer's own `use_wandb` attribute rather than this one — confirm how they relate
    use_wandb = True

In [None]:
# |export
class ContentSummarizationConfig(SummarizationConfig):
    """Configuration for the longer "content" summaries: raises the target length and enables sampling-based generation."""

    # overrides the base value of 5
    max_target_length = 80
    # replaces the (empty) base dict; forwarded to the batch tokenize transform as its `text_gen_kwargs`
    text_gen_kwargs = {"do_sample": True, "max_length": 100, "top_k": 50, "top_p": 0.95}

In [None]:
ContentSummarizationConfig.max_target_length

80

In [None]:
# |export
class HeadlineSummarizationConfig(SummarizationConfig):
    """Configuration for headline-style summaries; only the target length differs from `SummarizationConfig`."""

    # overrides the base value of 5; `text_gen_kwargs` stays the inherited empty dict
    max_target_length = 10

In [None]:
class XsumCFG(ContentSummarizationConfig):
    """Small, quick configuration used by the example and smoke-test cells of this notebook."""

    # NOTE(review): `training_subset` and `n_frozen_epochs` are not read by the functions visible in this notebook
    training_subset = 0.25
    n_frozen_epochs = 0
    # number of epochs passed to `fit_one_cycle` by `SummarizationModelTrainer.train`
    n_unfrozen_epochs = 1


# show every resolved configuration property (inherited ones included) as "name: value" strings
[f"{k}: {v}" for k, v in training.get_train_config_props(XsumCFG).items()]

['batch_size: 8',
 'hf_model_checkpoint: sshleifer/distill-pegasus-cnn-16-4',
 'hf_model_cls: PegasusForConditionalGeneration',
 'input_sequence_size: 512',
 'max_target_length: 80',
 'n_frozen_epochs: 0',
 'n_unfrozen_epochs: 1',
 'only_seed_splits: True',
 'preprocess_strategy: None',
 'random_seed: 2022',
 "text_gen_kwargs: {'do_sample': True, 'max_length': 100, 'top_k': 50, 'top_p': 0.95}",
 'tok_kwargs: {}',
 'training_subset: 0.25',
 'use_fp16: True',
 'use_wandb: True',
 'val_pct: 0.25']

## Data

In [None]:
# |export
def _get_training_data(
    cfg: SummarizationConfig,  # configuration for summarization
    data_dir="data",  # data directory
):
    """Load the "train" dataset and return only its summarization dataframe.

    `cfg` is accepted for signature parity with the other helpers but is not
    read here. Nothing is written to disk (`save_file=False`).
    """
    # `preprocess_data` hands back a (segmentation, summarization) pair; only the second is needed
    frames = preprocessing.preprocess_data(ds="train", data_path=data_dir, return_file=True, save_file=False)
    _, summarization_frame = frames
    return summarization_frame

In [None]:
# load the summarization dataframe from the repository-level data directory and preview its first rows
sdf = _get_training_data(XsumCFG, data_dir="../data")
sdf.head()

Unnamed: 0,course_title,lesson_num,start_seconds,topic,transcript
0,C-Squared Podcast,1,0,Intro,[Music] welcome everybody to episode one of a chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up christian well not so much fabi uh it's first of all great um to finally start a podcast the chess podcast i know that um there's a lot of podcasts out there but i wanted to bring our own tune to the mix and i think uh yeah i'm excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's been a while at your home it's good to be here it's my first time in uh visiting here and uh yeah it's been an intere...
1,C-Squared Podcast,1,137,Candidates 2018,camps look like in general yeah well you mentioned the 2018 cycle uh where we worked together we started with the training before the candidates and for me it's interesting because i've i've played a lot of these candidates tournaments and i'm always doing it a bit differently trying different things trying to improve it but sometimes it goes less or more successfully you never know what will work out i think what we did in 2018 not just for the candidates but also for the world championship because i qualified for that i think what we did then was extremely successful um we we arranged it...
2,C-Squared Podcast,1,464,Candidates training,going in the candidates like how was the experience yeah i think the preparation was pretty serious it included a bunch of uh camps and preparation devoted to players as i assume i think everyone has the same sort of general approach which is to think about their openings their strategy look at the opponents try to get in shape make sure that you're not you know rusty or blundering things or hallucinating variations uh but there's a lot of nerves and i i felt a lot of nerves before the tournament and i think possibly i you know overworked over trained a bit because it was yeah it was like ...
3,C-Squared Podcast,1,610,Playing for 2nd place,were you just like focused on grabbing first well i was only focused on first but of course there were always these thoughts that well maybe second is enough but you can't play for second like let's say once i had achieved plus three in the tournament and john was plus four and i tried to go and go into like full like risk reverse mode which is still difficult to do but let's say i had gone that mode and and achieved it and like finished second with like plus three and john got plus five uh and then like magnus says well i'm going to play right then you also feel kind of stupid you know li...
4,C-Squared Podcast,1,916,Magnus' WC decision,know you can't uh you can't tell him you have to do something i i guess let me rephrase that fair to let you guys play the tournament first and then tell you the decision well i think he said it in a strange way which was that i'll play against alireza which to me is strange because if you don't want to play world championship match i fully understand you know but did he say that did he actually name him yeah that's kind of what he said um yeah he more he like he didn't say definitively like i won't play against anyone but he was like i probably won't play unless it's frozen right and yeah...


## Huggingface objects

In [None]:
# | export
def _get_task_hf_objects(cfg: SummarizationConfig):
    """Fetch the Hugging Face architecture name, config, tokenizer and model for `cfg`.

    Reads `cfg.hf_model_checkpoint` and `cfg.hf_model_cls`. The
    "sshleifer/tiny-mbart" checkpoint additionally gets English source and
    target language codes passed to its tokenizer.

    Returns the tuple `(hf_arch, hf_config, hf_tokenizer, hf_model)`.
    """
    is_tiny_mbart = cfg.hf_model_checkpoint == "sshleifer/tiny-mbart"
    tokenizer_kwargs = {"src_lang": "en_XX", "tgt_lang": "en_XX"} if is_tiny_mbart else {}

    arch, config, tokenizer, model = get_hf_objects(
        pretrained_model_name_or_path=cfg.hf_model_checkpoint,
        model_cls=cfg.hf_model_cls,
        tokenizer_kwargs=tokenizer_kwargs,
    )
    return arch, config, tokenizer, model

In [None]:
# build the Hugging Face objects for the example config and display the architecture name plus the object types
hf_arch, hf_config, hf_tokenizer, hf_model = _get_task_hf_objects(XsumCFG)
hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)

('pegasus',
 transformers.models.pegasus.configuration_pegasus.PegasusConfig,
 transformers.models.pegasus.tokenization_pegasus_fast.PegasusTokenizerFast,
 transformers.models.pegasus.modeling_pegasus.PegasusForConditionalGeneration)

## Dataloaders

In [None]:
# |export
def _get_dls(cfg: SummarizationConfig, df, hf_arch, hf_config, hf_tokenizer, hf_model):
    """Build the fastai `DataLoaders` used to train a summarization model.

    Args:
        cfg: summarization configuration. Reads `text_gen_kwargs`, `input_sequence_size`,
            `max_target_length`, `batch_size` and, when present, `val_pct` and `random_seed`.
        df: dataframe with a "transcript" column (inputs) and a "topic" column (targets).
        hf_arch: architecture name, e.g. "pegasus", "bart", "t5" or "mbart".
        hf_config: Hugging Face model config; its summarization task parameters seed the
            generation kwargs for bart/t5.
        hf_tokenizer: Hugging Face tokenizer.
        hf_model: Hugging Face model; the signature of its `generate` method decides which
            generation kwargs are kept.

    Returns:
        The `DataLoaders` built from `df` with batch size `cfg.batch_size`.

    Side effects:
        When the effective generation kwargs differ from `cfg.text_gen_kwargs`, a *new* dict is
        assigned to `cfg.text_gen_kwargs`. The existing dict is never mutated because it may be
        a class attribute shared with other configurations.
    """
    # work on a private copy so the class-level dict shared between config classes is left untouched
    text_gen_kwargs = dict(cfg.text_gen_kwargs or {})

    if hf_arch in ["bart", "t5"]:
        # `task_specific_params` can be missing/None on some checkpoints; treat that as "no defaults"
        task_params = getattr(hf_config, "task_specific_params", None) or {}
        text_gen_kwargs = {**task_params.get("summarization", {}), "max_length": 40, "min_length": 5}

    # not all "summarization" parameters are for the model.generate method ... remove them here
    # TODO: add text_gen_kwargs dynamically
    generate_func_args = set(inspect.signature(hf_model.generate).parameters)
    text_gen_kwargs = {k: v for k, v in text_gen_kwargs.items() if k in generate_func_args}

    if hf_arch == "mbart":
        text_gen_kwargs["decoder_start_token_id"] = hf_tokenizer.get_vocab()["en_XX"]

    # publish the effective kwargs on the config only when something actually changed
    if text_gen_kwargs != cfg.text_gen_kwargs:
        cfg.text_gen_kwargs = text_gen_kwargs

    # NOTE(review): an unused `add_t5_prefix` helper was dropped here; no "summarize: " prefix is
    # applied to T5 inputs. If one is wanted, wire it into `get_x` with a module-level (picklable)
    # function so that `learn.export` keeps working — TODO confirm.

    batch_tokenize_tfm = Seq2SeqBatchTokenizeTransform(
        hf_arch,
        hf_config,
        hf_tokenizer,
        hf_model,
        padding="max_length",
        max_length=cfg.input_sequence_size,
        max_target_length=cfg.max_target_length,
        text_gen_kwargs=text_gen_kwargs,
    )

    # honor the configured validation fraction and seed; fall back to the `RandomSplitter` defaults otherwise
    valid_pct = getattr(cfg, "val_pct", None) or 0.2
    split_seed = getattr(cfg, "random_seed", None) or None

    blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=batch_tokenize_tfm), noop)
    dblock = DataBlock(
        blocks=blocks,
        get_x=ColReader("transcript"),
        get_y=ColReader("topic"),
        splitter=RandomSplitter(valid_pct=valid_pct, seed=split_seed),
    )

    dls = dblock.dataloaders(df, bs=cfg.batch_size)
    return dls

In [None]:
# build the dataloaders for the example config and pull a single batch to inspect in the following cells
dls = _get_dls(XsumCFG, sdf, hf_arch, hf_config, hf_tokenizer, hf_model)
b = dls.one_batch()

In [None]:
len(b), len(b[0]), b[0]["input_ids"].shape, len(b[1]), b[1].shape

(2, 3, torch.Size([8, 512]), 8, torch.Size([8, 80]))

In [None]:
print(hf_tokenizer.decode(b[0]["input_ids"][0]))

hey everybody we're getting ready to start here everyone's click in on putting my shirt on here oh my dress shirt I was always wearing clothes ok and 3 2 1 boom mics on everything we're ready to go welcome ladies and gentlemen back to my studio here in Vancouver Canada my name is Michael Markowski I'm gonna be showing you how to do some drawing today I'm super excited because I think today we're really going to learn a lot about how to take all these different techniques we've been doing over the past three classes so far put them together to make some new drawings that are gonna really excite us and you're I think you're really gonna be surprised by how much you already know I mean based on just what we've learned so far so we're gonna put it all together and to create some new artworks let me see I'm just gonna turn this light on here okay so let me see what are the little housekeeping things I want to get cleared away right at the beginning if you have any drawings you'd like for me

## Models

In [None]:
# |export
def _get_learner(cfg: SummarizationConfig, dls, hf_config, hf_model, hf_arch):
    """Create a frozen fastai `Learner` around the Hugging Face seq2seq model.

    Seeds the run when `cfg.random_seed` is truthy, wraps `hf_model` in
    `BaseModelWrapper`, and builds a `Learner` with the `ranger` optimizer,
    `PreCalculatedCrossEntropyLoss`, the `BaseModelCallback`, and the blurr
    seq2seq splitter for `hf_arch`. The optimizer is created and the learner
    frozen before it is returned; mixed precision is enabled when
    `cfg.use_fp16` is truthy. `hf_config` is accepted but not read here.
    """
    if cfg.random_seed:
        set_seed(cfg.random_seed)

    learner = Learner(
        dls,
        BaseModelWrapper(hf_model),
        opt_func=ranger,
        loss_func=PreCalculatedCrossEntropyLoss(),
        cbs=[BaseModelCallback],
        splitter=partial(blurr_seq2seq_splitter, arch=hf_arch),
    )

    # the optimizer has to exist before parameter groups can be frozen
    learner.create_opt()
    learner.freeze()

    return learner.to_fp16() if cfg.use_fp16 else learner

In [None]:
learn = _get_learner(cfg=XsumCFG, dls=dls, hf_config=hf_config, hf_model=hf_model, hf_arch=hf_arch)

In [None]:
# learn.lr_find(suggest_funcs=[minimum, steep, valley, slide])

In [None]:
# ROUGE configuration handed to `Seq2SeqMetricsCallback` through `custom_metrics`
# NOTE(review): presumably `compute_kwargs` is forwarded to the metric computation and `returns`
# selects which scores become columns in the training table — confirm against the blurr docs
seq2seq_metrics = {
    "rouge": {
        "compute_kwargs": {"rouge_types": ["rouge1", "rouge2", "rougeL", "rougeLsum"], "use_stemmer": True},
        "returns": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    }
}
# callbacks passed only to the fit call below (not attached to the learner permanently)
fit_cbs = [Seq2SeqMetricsCallback(custom_metrics=seq2seq_metrics)]

In [None]:
# one epoch of one-cycle training at a peak learning rate of 1e-4; the learner is still frozen from `_get_learner`
learn.fit_one_cycle(1, lr_max=1e-4, cbs=fit_cbs)

epoch,train_loss,valid_loss,rouge1,rouge2,rougeL,rougeLsum,time
0,6.355293,6.34308,0.082162,0.021642,0.076186,0.076393,01:05


In [None]:
# clear the learner's metrics ahead of the export in the next cell
# NOTE(review): presumably so metric objects are not pickled with the learner — confirm
learn.metrics = None

In [None]:
# export the trained learner as "<export_fname>.pkl"
export_fname = "summarize_export"
learn.export(fname=f"{export_fname}.pkl")

In [None]:
# drop the notebook's references to the large training objects, then release cached CUDA memory
del learn, dls, hf_model, hf_tokenizer, hf_config
torch.cuda.empty_cache()
# the number displayed below is `gc.collect()`'s return value (unreachable objects found)
gc.collect()

1347

## Model Trainer

In [None]:
# |export
class SummarizationModelTrainer(training.ModelTrainer):
    """`training.ModelTrainer` specialization for the summarization task.

    Construction only forwards its arguments to the base trainer; the class
    adds a data-loading hook for summarization data.
    """

    def __init__(
        self,
        experiment_name,
        train_config: SummarizationConfig,
        data_path="data",
        model_output_path="models",
        log_output_path="logs",
        log_preds=False,
        log_n_preds=None,
        use_wandb=False,
        verbose=False,
        **kwargs,
    ):
        """Forward every argument, unchanged, to `training.ModelTrainer.__init__`."""
        base_options = dict(
            experiment_name=experiment_name,
            train_config=train_config,
            data_path=data_path,
            model_output_path=model_output_path,
            log_output_path=log_output_path,
            log_preds=log_preds,
            log_n_preds=log_n_preds,
            use_wandb=use_wandb,
            verbose=verbose,
        )
        super().__init__(**base_options, **kwargs)

    def get_training_data(self):
        """Return the summarization dataframe loaded from `self.data_path`."""
        return _get_training_data(self.train_config, self.data_path)

## Train

In [None]:
# | export
@patch
def train(self: SummarizationModelTrainer):
    """Train a summarization model end to end and export the resulting learner.

    Steps: load the training data, build the Hugging Face objects, dataloaders and
    learner from `self.train_config`, fit with the one-cycle policy for
    `train_config.n_unfrozen_epochs` epochs (ROUGE metrics, plus a `WandbCallback`
    when `self.use_wandb` is truthy), export the learner to
    `<model_output_path>/<experiment_name>.pkl`, run the base trainer's `train`
    hook, and finally release GPU memory.

    NOTE(review): the learner comes back frozen from `_get_learner` and is never
    unfrozen here, and `n_frozen_epochs` is not read — confirm this is intended.

    Returns:
        None
    """
    # local import keeps the block self-contained instead of relying on a star import for `Path`
    from pathlib import Path

    summarization_df = self.get_training_data()

    hf_arch, hf_config, hf_tokenizer, hf_model = _get_task_hf_objects(self.train_config)

    dls = _get_dls(self.train_config, summarization_df, hf_arch, hf_config, hf_tokenizer, hf_model)

    learn = _get_learner(cfg=self.train_config, dls=dls, hf_config=hf_config, hf_model=hf_model, hf_arch=hf_arch)

    seq2seq_metrics = {
        "rouge": {
            "compute_kwargs": {"rouge_types": ["rouge1", "rouge2", "rougeL", "rougeLsum"], "use_stemmer": True},
            "returns": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
        }
    }
    fit_cbs = [Seq2SeqMetricsCallback(custom_metrics=seq2seq_metrics)]
    if self.use_wandb:
        fit_cbs.append(WandbCallback(log_preds=False))

    # re-seed right before fitting unless the seed is only meant to fix the data splits
    if self.train_config.random_seed and not self.train_config.only_seed_splits:
        set_seed(self.train_config.random_seed)

    learn.fit_one_cycle(
        self.train_config.n_unfrozen_epochs,
        cbs=fit_cbs,
        lr_max=1e-4,
    )

    # metrics are cleared before exporting the learner
    learn.metrics = None
    # `Path(...)` makes the join work whether `model_output_path` is a `str` or already a `Path`
    learn.export(Path(self.model_output_path) / f"{self.experiment_name}.pkl")

    # clean up
    # name the class explicitly: `super(self.__class__, self)` recurses forever once this trainer is
    # subclassed, and zero-argument `super()` is unavailable in a function patched in from outside the class
    super(SummarizationModelTrainer, self).train()

    del learn, dls, hf_model, hf_tokenizer, hf_config
    torch.cuda.empty_cache()
    gc.collect()

    return None

In [None]:
# end-to-end smoke test of the trainer with the small example config; paths are relative to the notebook directory
trainer = SummarizationModelTrainer(
    experiment_name="test_summarization",
    train_config=XsumCFG,
    data_path="../data",
    model_output_path="../models",
    log_output_path="../logs",
    log_preds=True,
    log_n_preds=2,
    use_wandb=True,
)



In [None]:
trainer.train()

Could not gather input dimensions


epoch,train_loss,valid_loss,rouge1,rouge2,rougeL,rougeLsum,time
0,6.358508,6.284994,0.096207,0.026339,0.086273,0.085904,01:08


## Export -

In [None]:
# |hide
nbdev.nbdev_export()