Access tokens: https://huggingface.co/settings/tokens (if you don't have one, create a new token with write permission)

In [1]:
!pip install bitsandbytes
!pip install datasets -U
!pip install trl -U
!pip install peft -U
!huggingface-cli login

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-

In [2]:
#Imports
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, TrainingArguments, set_seed
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [3]:
# --- Run configuration --------------------------------------------------
# Model identity: every name below is derived from MODEL_SIZE.
MODEL_SIZE = "350M"
MODEL_NAME = f"sapienzanlp/Minerva-{MODEL_SIZE}-base-v1.0"
RUN_NAME = f"minerva_summary_{MODEL_SIZE}"
OUTPUT_DIR = f"minerva_summary/{RUN_NAME}"

# Number of examples drawn for training / evaluation.
TRAIN_SAMPLES = 5000
EVALUATION_SAMPLES = 500

# Markers that open the article and the summary inside each formatted sample.
PROMPT_TEMPLATE = "### Text:"
RESPONSE_TEMPLATE = "### Summary:"
# -------------------------------------------------------------------------

def prepare_model_with_template(model, tokenizer):
  """
    Register the prompt and response markers as special tokens and resize the
    model's token embeddings accordingly.

    The tokenizer is modified in place. The same model object is returned
    after `resize_token_embeddings` has been called on it.
  """
  template_tokens = [PROMPT_TEMPLATE, RESPONSE_TEMPLATE]

  vocab_size_before = len(tokenizer)
  num_new_tokens = tokenizer.add_special_tokens({"additional_special_tokens": template_tokens})

  # New size = tokenizer length before the call + the count reported by the tokenizer.
  model.resize_token_embeddings(new_num_tokens=vocab_size_before + num_new_tokens)
  return model

def save_model_push_hub(peft_model_id, save_model_path, revision_id):
  """
    Merge a saved LoRA adapter into the base model and push the result to the Hub.

    With LoRA fine-tuning the Trainer saves only the adapter parameters, so the
    base model is loaded again, extended with the template tokens, wrapped with
    the saved adapter, merged, and then pushed together with the tokenizer.

    Args:
      peft_model_id: location of the saved adapter (e.g. a checkpoint directory).
      save_model_path: target Hub repository, "<hf_account>/<model_name>".
      revision_id: revision passed to `push_to_hub` for both model and tokenizer.
  """
  # Imported here so the function does not rely on an import executed in
  # some other cell.
  from peft import PeftModel

  print("PEFT_MODEL_ID: " + peft_model_id)
  print("SAVE_MODEL_PATH: " + save_model_path)
  print("REVISION_ID: " + revision_id)

  # Only the adapter weights were saved, so start again from the base model.
  original_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  # NOTE(review): the embedding rows added by this resize are created fresh
  # here; unless the adapter checkpoint also stores the embedding layers they
  # presumably differ from the rows used during training -- TODO confirm.
  original_model = prepare_model_with_template(original_model, tokenizer)

  model = PeftModel.from_pretrained(original_model, peft_model_id)
  merged_model = model.merge_and_unload()

  merged_model.push_to_hub(save_model_path, revision=revision_id, private=True)
  tokenizer.push_to_hub(save_model_path, revision=revision_id, private=True)

In [4]:
# CC-News corpus.
# Columns: title, text, domain, date, description, url, image_url
# Size: 708241 rows
dataset_ccnews = load_dataset("vblagoje/cc_news")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.57k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/211M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/245M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/215M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/708241 [00:00<?, ? examples/s]

In [5]:
# Drop rows that are too long or have no description.
def _short_with_description(example):
    """True when text + description stay under 2700 characters and the description is non-empty."""
    combined_chars = len(example["text"]) + len(example["description"])
    return combined_chars < 2700 and len(example["description"]) > 0

dataset_ccnews_filtered = dataset_ccnews.filter(_short_with_description)

Filter:   0%|          | 0/708241 [00:00<?, ? examples/s]

In [6]:
# Build the train/test splits from the filtered "train" split:
# rows 0..19999 become the training set, rows 20000..24999 the test set.
dataset_summary = DatasetDict({
    "train": dataset_ccnews_filtered["train"].select(range(20000)),
    "test": dataset_ccnews_filtered["train"].select(range(20000, 25000)),
})
dataset_summary

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url'],
        num_rows: 5000
    })
})

In [7]:
# Fix the random seed before any stochastic step of this notebook runs.
set_seed(17)

# TOKENIZER
# Load the tokenizer that belongs to MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = 'right'

# FORMATTING
def formatting_prompts_func(example):
    """Turn a batch of rows into one training string per row.

    Every string consists of PROMPT_TEMPLATE followed by the row's text, a
    line break, RESPONSE_TEMPLATE followed by the row's description, and
    finally the tokenizer's EOS token.
    """
    n_rows = len(example['text'])
    return [
        f"{PROMPT_TEMPLATE} {example['text'][row]}\n{RESPONSE_TEMPLATE} {example['description'][row]}{tokenizer.eos_token}"
        for row in range(n_rows)
    ]

# MODEL
# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Use bfloat16 for the load dtype as well, so it agrees with the quantization
# compute dtype above and with the bfloat16 used elsewhere in this notebook
# (previously float16 was mixed in here).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# Adapter settings
# NOTE(review): "dense" presumably matches no module in this architecture
# (the attention output projection is usually named "o_proj" in Mistral-style
# models) -- check the trainable-parameter count printed below. TODO confirm.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Disable the generation cache on the model config for training.
model.config.use_cache = False

# Add the template tokens and resize the embeddings.
# NOTE(review): this happens after the LoRA wrap and the config has no
# `modules_to_save`, so the new embedding rows are presumably neither trained
# nor stored in the adapter checkpoints -- TODO confirm.
model = prepare_model_with_template(model, tokenizer)

print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

tokenizer_config.json:   0%|          | 0.00/959 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/795k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/703M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

Trainable parameters: 663552


In [8]:
from random import sample

# Completion-only collator keyed on RESPONSE_TEMPLATE.
collator = DataCollatorForCompletionOnlyLM(RESPONSE_TEMPLATE, tokenizer=tokenizer)

# Training Parameters
training_bs = 8
evaluation_bs = 8
# NOTE: `max_steps` below takes precedence over this value.
num_train_epochs = 1

## OPTIMIZER Parameters
weight_decay = 5e-3
learning_rate = 5e-4
lr_scheduler_type = "linear"
warmup_ratio = 0.2

training_args = TrainingArguments(
    run_name=RUN_NAME,
    per_device_train_batch_size=training_bs,
    per_device_eval_batch_size=evaluation_bs,
    lr_scheduler_type=lr_scheduler_type,
    optim="paged_adamw_8bit", # "adamw_torch",
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    overwrite_output_dir=True, # a real bool: the string 'True' only worked because it is truthy
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=600, # number of steps between training-loss logs
    eval_steps=600, # number of steps between evaluations (stated explicitly instead of inherited from logging_steps)
    save_steps=600, # number of steps between model checkpoints
    max_steps=1200,
    # gradient_accumulation_steps=4, # simulate larger batch sizes
    output_dir=OUTPUT_DIR,
    bf16=True,
)

# Training on every row may not be affordable on Colab machines, so draw
# TRAIN_SAMPLES / EVALUATION_SAMPLES random row indices from the two splits.
train_subset_idx = sample(range(len(dataset_summary["train"])), TRAIN_SAMPLES)
eval_subset_idx = sample(range(len(dataset_summary["test"])), EVALUATION_SAMPLES)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_summary["train"].select(train_subset_idx),
    eval_dataset=dataset_summary["test"].select(eval_subset_idx),
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=700,
)

trainer.train()

# Write the trained model and the tokenizer into the same directory.
finetuned_dir = OUTPUT_DIR + f"/minerva_{MODEL_SIZE}_finetuned"
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
600,1.5803,1.079293
1200,1.2406,1.071014


Maybe they love one of them, maybe they don't.
Elway has done a good job of hiding his intentions, saying only that he'll take "the best player that is best for the Denver Broncos" regardless of what position he plays.
"I'm open to trading," too, Elway added, although it would certainly take a blow-me-away offer for him to move out of the fifth spot in a draft so deep in high-level, even generational talent outside the enticing quarterback group.
Elway always tries to fill his biggest needs in free agency so he won't have to reach in the draft and can simply take the best player. Signing Keenum gave him that safeguard this year.
Elway won't have to settle for anything less than the QB atop his wish list, and if he's not there, Elway can bypass the quarterbacks altogether to grab a prime playmaker at another position.
RUN ON RUNNING BACKS
Elway suggested the Broncos will dive into the deep pool of running backs after releasing sixth-year pro C.J. Anderson last week.
"It's a deep running

('minerva_summary/minerva_summary_350M/minerva_350M_finetuned/tokenizer_config.json',
 'minerva_summary/minerva_summary_350M/minerva_350M_finetuned/special_tokens_map.json',
 'minerva_summary/minerva_summary_350M/minerva_350M_finetuned/tokenizer.model',
 'minerva_summary/minerva_summary_350M/minerva_350M_finetuned/added_tokens.json',
 'minerva_summary/minerva_summary_350M/minerva_350M_finetuned/tokenizer.json')

In [9]:
!ls minerva_summary/minerva_summary_350M

checkpoint-1200  checkpoint-600  minerva_350M_finetuned  runs


In [10]:
# Disabled shell commands parked inside a string literal: nothing is executed,
# the cell only echoes the string. The commands would rename a checkpoint
# directory and list the working directory.
"""!mv "minerva_summary/minerva_summary_350M/checkpoint-400" "minerva_summary/minerva_summary_350M/checkpoint400"
!ls"""

'!mv "minerva_summary/minerva_summary_350M/checkpoint-400" "minerva_summary/minerva_summary_350M/checkpoint400"\n!ls'

In [12]:
from peft import PeftModel

# Merge the step-600 checkpoint and publish it under its own Hub revision.
# save_model_path has the form "<hf_account>/<model_name>".
peft_model_id = f"{OUTPUT_DIR}/checkpoint-600"
save_model_path = "Aivalf/MinervaSummarization"
revision_id = "steps-600"

save_model_push_hub(
    peft_model_id=peft_model_id,
    save_model_path=save_model_path,
    revision_id=revision_id,
)

PEFT_MODEL_ID: minerva_summary/minerva_summary_350M/checkpoint-600
SAVE_MODEL_PATH: Aivalf/MinervaSummarization
REVISION_ID: steps-600


model.safetensors:   0%|          | 0.00/703M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/795k [00:00<?, ?B/s]

In [13]:
# Merge the step-1200 checkpoint and publish it under its own Hub revision.
peft_model_id = f"{OUTPUT_DIR}/checkpoint-1200"
save_model_path = "Aivalf/MinervaSummarization"
revision_id = "steps-1200"

save_model_push_hub(
    peft_model_id=peft_model_id,
    save_model_path=save_model_path,
    revision_id=revision_id,
)

PEFT_MODEL_ID: minerva_summary/minerva_summary_350M/checkpoint-1200
SAVE_MODEL_PATH: Aivalf/MinervaSummarization
REVISION_ID: steps-1200


model.safetensors:   0%|          | 0.00/703M [00:00<?, ?B/s]