## Install dependencies

In [19]:
%%capture
!sudo apt install git-lfs

In [None]:
!pip install "transformers==4.26.0" "datasets==2.9.0" "accelerate==0.16.0" "evaluate==0.4.0" --upgrade
!pip install "deepspeed==0.8.0" ninja --upgrade
!pip install rouge-score nltk py7zr tensorboard


In [4]:
import deepspeed
deepspeed.ops.op_builder.CPUAdamBuilder().load()

Installed CUDA version 11.0 does not match the version torch was compiled with 11.6 but since the APIs are compatible, accepting this combination
Time to load cpu_adam op: 2.3554303646087646 seconds


Using /home/ubuntu/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
No modifications detected for re-loaded extension module cpu_adam, skipping build step...
Loading extension module cpu_adam...


<module 'cpu_adam' from '/home/ubuntu/.cache/torch_extensions/py38_cu116/cpu_adam/cpu_adam.so'>

In [9]:
!python3 -m ds_report

/usr/bin/python3: No module named ds_report


# Process dataset

In [10]:
# experiment config
model_id = "google/flan-t5-xxl"
repository_id = "flan-t5-xl-cnn"

# Dataset 
dataset_id = "cnn_dailymail"
dataset_config = "3.0.0"
save_dataset_path = "data"
text_column = "article"
summary_column = "highlights"
prompt_start = "Summarize the following news article:\n"
generation_start = "\nSummary:\n"
prompt_template = f"{prompt_start}{{input}}{generation_start}"

max_source_length=500
max_target_length=129

In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np 


# Load dataset from the hub
dataset = load_dataset(dataset_id,name=dataset_config)
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

Downloading builder script: 100%|██████████| 8.33k/8.33k [00:00<00:00, 5.79MB/s]
Downloading metadata: 100%|██████████| 9.88k/9.88k [00:00<00:00, 6.98MB/s]
Downloading readme: 100%|██████████| 15.1k/15.1k [00:00<00:00, 9.46MB/s]


Downloading and preparing dataset cnn_dailymail/3.0.0 to /home/ubuntu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data: 100%|██████████| 159M/159M [00:02<00:00, 65.4MB/s]
Downloading data: 100%|██████████| 376M/376M [00:06<00:00, 60.7MB/s]]
Downloading data: 46.4MB [00:00, 73.3MB/s]/5 [00:08<00:14,  4.77s/it]
Downloading data: 2.43MB [00:00, 64.3MB/s]                  3.90s/it]
Downloading data: 2.11MB [00:00, 63.8MB/s]                  2.46s/it]
Downloading data files: 100%|██████████| 5/5 [00:12<00:00,  2.45s/it]
                                                                                           

Dataset cnn_dailymail downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 305.14it/s]
Downloading (…)okenizer_config.json: 100%|██████████| 2.54k/2.54k [00:00<00:00, 830kB/s]
Downloading (…)"spiece.model";: 100%|██████████| 792k/792k [00:00<00:00, 103MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.42M/2.42M [00:00<00:00, 157MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<00:00, 723kB/s]


Train dataset size: 287113
Test dataset size: 11490


In [16]:
prompt_lenght = len(tokenizer(prompt_template.format(input=""))["input_ids"])
max_sample_length = tokenizer.model_max_length - prompt_lenght
print(f"Prompt lenght: {prompt_lenght}")
print(f"Max input lenght: {max_sample_length}")

Prompt lenght: 12
Max input lenght: 500


In [24]:
from datasets import concatenate_datasets
import numpy as np

# The maximum total input sequence length after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x[text_column], truncation=True), batched=True, remove_columns=[text_column, summary_column])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
max_source_length = min(max_source_length, max_sample_length)
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x[summary_column], truncation=True), batched=True, remove_columns=[text_column, summary_column])
target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
# use 95th percentile as max target length
max_target_length = int(np.percentile(target_lenghts, 95))
print(f"Max target length: {max_target_length}")

  0%|          | 0/299 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-2243e7f93d3afe60.arrow


Max source length: 500
Max target length: 129


In [2]:
max_source_length=500
max_target_length=129

In [8]:
import os

def preprocess_function(sample,padding="max_length"):
    # created prompted input
    inputs = [prompt_template.format(input=item) for item in sample[text_column]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample[summary_column], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=list(dataset["train"].features))
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

tokenized_dataset["train"].save_to_disk(os.path.join(save_dataset_path,"train"))
tokenized_dataset["test"].save_to_disk(os.path.join(save_dataset_path,"eval"))

100%|██████████| 288/288 [05:51<00:00,  1.22s/ba]
100%|██████████| 14/14 [00:16<00:00,  1.15s/ba]
100%|██████████| 12/12 [00:14<00:00,  1.17s/ba]


Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


                                                                                                   

## Launch Training


In [9]:
import nltk
nltk.download("punkt", quiet=True)

True

In [None]:
!python3 scripts/run_seq2seq_deepspeed.py \
    --model_id $model_id \
    --dataset_path $save_dataset_path \
    --repository_id $repository_id \
    --epochs 3 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --generation_max_length $max_target_length \
    --lr 5e-5 

46h on v100 with deespeed z3 offload

In [11]:
!echo deepspeed --num_gpus=4 scripts/run_seq2seq_deepspeed.py \
    --model_id $model_id \
    --dataset_path $save_dataset_path \
    --epochs 3 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --generation_max_length $max_target_length \
    --lr 1e-5 \
    --deepspeed configs/ds_flan_t5_z3_offload.json \
    --repository_id $repository_id 

deepspeed --num_gpus=4 scripts/run_seq2seq_deepspeed.py --model_id google/flan-t5-xxl --dataset_path data --epochs 3 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --generation_max_length 129 --lr 1e-5 --deepspeed configs/ds_flan_t5_z3_offload.json --repository_id flan-t5-xl-cnn


In [37]:
from transformers import AutoConfig, AutoModelWithLMHead
from accelerate import init_empty_weights
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_cold


model_id="google/flan-t5-xl"
num_gpus=1

# get parameters
config = AutoConfig.from_pretrained(model_id)
with init_empty_weights():
    model = AutoModelWithLMHead.from_config(config)
    model_parameter = sum(p.numel() for p in model.parameters())
    largest_layer = max([p.numel() for p in model.parameters()])


# calculate needed memory
estimate_zero3_model_states_mem_needs_all_cold(model_parameter,largest_layer_params=largest_layer, num_gpus_per_node=num_gpus,num_nodes=1)

Estimated memory needed for params, optim states and gradients for a:
HW: Setup with 1 node, 1 GPU per node.
SW: Model with 2849M total params, 65M largest layer params.
  per CPU  |  per GPU |   Options
   71.66GB |   0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1
   71.66GB |   0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0
   63.70GB |   5.55GB | offload_param=none, offload_optimizer=cpu , zero_init=1
   63.70GB |   5.55GB | offload_param=none, offload_optimizer=cpu , zero_init=0
    0.37GB |  48.02GB | offload_param=none, offload_optimizer=none, zero_init=1
   15.92GB |  48.02GB | offload_param=none, offload_optimizer=none, zero_init=0


# Results

dataset: `"cnn_dailymail"`
training examples: `287113`
validation examples: `13368`


| Model           | DS offload | Hardware     | batch size per GPU | duration | cost |
|-----------------|------------|--------------|--------------------|----------|------|
| FLAN-T5-XL (3B) | No         | 4x V100 16GB | OOM                | -        | -    |
| FLAN-T5-XL (3B) | No         | 8x V100 16GB | 1                | 105h        | ~$2570    |
| FLAN-T5-XL (3B) | Yes         | 4x V100 16GB | 8                | 69h        | ~$828    |
| FLAN-T5-XL (3B) | Yes         | 8x V100 16GB | 8                | 32        | ~$768   |
| FLAN-T5-XXL (11B) | Yes         | 4x V100 16GB | OOM                | -        | -   |
| FLAN-T5-XXL (11B) | Yes         | 8x V100 16GB | OOM                | -        | -   |