<a href="https://colab.research.google.com/github/phamnguyenlongvu/LLMs/blob/main/Efficiently_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install libraries

In [None]:
!pip install "peft==0.2.0"
!pip install "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes" loralib --upgrade --quiet
!pip install rouge-score tensorboard py7zr

In [None]:
!pip install -U datasets

### Load dataset
We will use the SAMSum dataset. It contains about 16k messenger-like conversations, each paired with a summary.

Dataset fields:
+ Dialogue: text of dialogue
+ Summary: human written summary of the dialogue
+ id: unique id of an example

I will use the first 5,000 samples of the training split for this demo.

In [None]:
from datasets import load_dataset

# Fetch every SAMSum split, then keep only the first 5000 training rows for the demo.
dataset = load_dataset("samsum")
train_dataset = dataset["train"].select(range(5000))

# Report how many examples each split used below contains.
for split_name, split in (("Train", train_dataset), ("Test", dataset["test"])):
    print(f"{split_name} dataset size: {len(split)}")

In [None]:
# Index of the training example to inspect.
x = 1
# Print the raw record at that index of the full training split.
print(dataset["train"][x])

In [None]:
### Tokenizer

In [None]:
from transformers import  AutoTokenizer, AutoModelForSeq2SeqLM

# Identifier of the checkpoint whose tokenizer is loaded below.
model_id = "google/flan-t5-large"

# Load the tokenizer associated with `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Bare expression: shows the tokenizer's repr as the cell output.
tokenizer

### Data preprocessing
Summarization is a text-generation task: the model takes a text as input and generates a summary as output. We need to know how long our inputs and outputs are so that we can pad and batch the data efficiently.

In [None]:
from datasets import concatenate_datasets
import numpy as np

# Length statistics are computed over the training subset plus the test split.
combined_splits = concatenate_datasets([train_dataset, dataset["test"]])
text_columns = ["dialogue", "summary"]


def _tokenize_dialogues(batch):
    """Tokenize a batch of dialogues with truncation enabled."""
    return tokenizer(batch["dialogue"], truncation=True)


def _tokenize_summaries(batch):
    """Tokenize a batch of summaries with truncation enabled."""
    return tokenizer(batch["summary"], truncation=True)


# Source side: use the 85th percentile of dialogue token counts as the max length.
tokenizer_inputs = combined_splits.map(_tokenize_dialogues, batched=True, remove_columns=text_columns)
input_lengths = list(map(len, tokenizer_inputs["input_ids"]))
max_source_length = int(np.percentile(input_lengths, 85))
print(f"Max source length: {max_source_length}")

# Target side: use the 90th percentile of summary token counts as the max length.
tokenizer_tagets = combined_splits.map(_tokenize_summaries, batched=True, remove_columns=text_columns)
target_lengths = list(map(len, tokenizer_tagets["input_ids"]))
max_target_length = int(np.percentile(target_lengths, 90))
print(f"Max target length: {max_target_length}")


In [None]:
def preprocess_function(sample, padding="max_length"):
  """Tokenize a batch of dialogues and summaries into model inputs and labels.

  Args:
    sample: Batch mapping with "dialogue" and "summary" lists of strings.
    padding: Padding strategy forwarded to the tokenizer for both inputs and
      labels. With "max_length", pad token ids in the labels are replaced
      by -100.

  Returns:
    The tokenizer output for the prefixed dialogues, with an added "labels"
    entry holding the tokenized summaries.
  """
  # Every dialogue gets the "summarize: " task prefix before tokenization.
  prompts = ["summarize: " + dialogue for dialogue in sample["dialogue"]]
  model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

  targets = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)
  label_ids = targets["input_ids"]

  if padding == "max_length":
    # Swap pad ids for -100 (presumably so padded positions are ignored by the loss).
    pad_id = tokenizer.pad_token_id
    label_ids = [[-100 if token == pad_id else token for token in row] for row in label_ids]

  model_inputs["labels"] = label_ids
  return model_inputs

# Raw columns that are dropped once each example has been tokenized.
raw_columns = ["dialogue", "summary", "id"]

tokenizer_dataset_train = train_dataset.map(preprocess_function, batched=True, remove_columns=raw_columns)
tokenizer_dataset_test = dataset["test"].map(preprocess_function, batched=True, remove_columns=raw_columns)
print(f"Keys of tokenizer dataset: {list(tokenizer_dataset_train.features)}")

# Write both tokenized splits to disk.
for split_dir, split in (("data/train", tokenizer_dataset_train), ("data/test", tokenizer_dataset_test)):
    split.save_to_disk(split_dir)


In [None]:
# Run `make` in the current directory with CUDA_VERSION=122.
# NOTE(review): no Makefile is created by the visible cells — presumably a leftover
# from building a CUDA-dependent library from source; confirm this cell is still needed.
!make CUDA_VERSION=122

In [None]:
# Environment check: print the version of the installed CUDA compiler (nvcc).
!nvcc --version

In [None]:
# Environment check: list conda packages whose entry mentions "cuda".
!conda list | grep cuda

### Finetuning T5 with Lora

In [None]:
from transformers import AutoModelForSeq2SeqLM
import torch

# Alternative checkpoint (disabled): sharded fp16 FLAN-T5 XXL.
# model_id = "philschmid/flan-t5-xxl-sharded-fp16"
model_id = "google/flan-t5-large"

# Alternative load (disabled): same call with load_in_8bit=True.
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
# Load the base seq2seq model; device_map="auto" leaves device placement to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

### LoRA - Low rank Adaptation
- LoRA là một kỹ thuật học sâu giúp giảm số lượng tham số cần huấn luyện trong các mô hình LLM. Thay vì tinh chỉnh toàn bộ trọng số, LoRA chỉ huấn luyện một số ma trận hạng thấp được thêm vào mô hình.

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# LoRA hyperparameters for the adapter.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Kind of task being fine-tuned
    target_modules=["q", "v"],  # Modules LoRA is applied to - query, value
    r=4,  # Size (rank) of the low-rank matrices
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.01,  # Dropout rate, to limit overfitting
    bias="none",  # How bias terms are handled
)

# Prepare the base model first, then wrap it with the LoRA adapter.
# NOTE(review): the model in this notebook is loaded without load_in_8bit — confirm
# that the int8 preparation step is still wanted.
model = get_peft_model(prepare_model_for_int8_training(model), lora_config)
model.print_trainable_parameters()


In [None]:
from transformers import DataCollatorForSeq2Seq

# Id used to pad label sequences; matches the -100 written by preprocess_function.
label_pad_token_id = -100

# Collator that batches tokenized examples, padding lengths to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Directory passed to the trainer as its output location; logs go to a subfolder.
output_dir = "lora-flan-t5-xxl"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=1e-3,
    auto_find_batch_size=True,
    # Logging: one log entry every 1000 steps, no external reporting
    logging_strategy="steps",
    logging_steps=1000,
    logging_dir=f"{output_dir}/logs",
    report_to="none",
    # Checkpoint saving is disabled
    save_strategy="no",
)

# Trainer over the tokenized training subset; no evaluation dataset is attached.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenizer_dataset_train,
    data_collator=data_collator,
)


In [None]:
# Turn off the `use_cache` flag on the model config before training.
# NOTE(review): presumably to avoid cache-related warnings/overhead while training — confirm.
model.config.use_cache = False

In [None]:
# Run fine-tuning with the configured trainer; the returned value is shown as cell output.
trainer.train()

In [None]:
# Local directory that receives the saved model and tokenizer files.
peft_model_id="T5-LARGE"
# Save the trained (PEFT-wrapped) model held by the trainer.
trainer.model.save_pretrained(peft_model_id)
# Save the tokenizer alongside it; the return value is shown as cell output.
tokenizer.save_pretrained(peft_model_id)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Directory written by the save step; its PEFT config names the base checkpoint.
peft_model_id = "T5-LARGE"
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

# Reload the tokenizer and base model from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

# Attach the saved adapter to the base model and switch to evaluation mode.
model = PeftModel.from_pretrained(base_model, peft_model_id)
model.eval()

print("Model loaded")

In [None]:
from random import randrange
# Pick one random test example for a qualitative check.
sample = dataset["test"][randrange(len(dataset["test"]))]

# Use the same "summarize: " prefix that preprocess_function adds during training,
# so the prompt format at inference matches the fine-tuning data.
prompt = "summarize: " + sample["dialogue"]
input_id = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids

# Allow up to 50 new tokens (the default used by evaluate_peft_model); the previous
# limit of 10 cut most summaries off mid-sentence.
outputs = model.generate(input_ids=input_id, max_new_tokens=50, do_sample=True, top_p=0.9)

# Decode once and reuse the result in the report below.
generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Input sentence: {sample['dialogue']}")
print(f"\n {'-' * 30}")
print(f"Baseline human summary: {sample['summary']}")
print(f"\n {'-' * 30}")
print(f"Model generated summary: {generated_summary}")

In [None]:
import evaluate
import numpy as np
from tqdm import tqdm

# Load the ROUGE metric through the `evaluate` library.
metric = evaluate.load("rouge")

def evaluate_peft_model(sample, max_target_length=50):
    """Generate a summary for one tokenized example and decode its reference.

    Args:
        sample: Mapping with "input_ids" and "labels" (lists of token ids). If an
            "attention_mask" entry is present it is forwarded to generation so
            that padded input positions are masked explicitly.
        max_target_length: Maximum number of new tokens to generate.

    Returns:
        A ``(prediction, labels)`` tuple of decoded strings.
    """
    generate_kwargs = {
        "input_ids": torch.tensor(sample["input_ids"]).unsqueeze(0),
        "do_sample": True,
        "top_p": 0.9,
        "max_new_tokens": max_target_length,
    }
    # Inputs are padded to a fixed length, so pass the mask instead of relying on
    # the library to infer it from pad token ids.
    if "attention_mask" in sample:
        generate_kwargs["attention_mask"] = torch.tensor(sample["attention_mask"]).unsqueeze(0)

    # generate summary
    outputs = model.generate(**generate_kwargs)
    prediction = tokenizer.decode(outputs[0].detach(), skip_special_tokens=True)

    # decode eval sample
    # Replace -100 in the labels as we can't decode them.
    label_ids = np.asarray(sample["labels"])
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    labels = tokenizer.decode(label_ids, skip_special_tokens=True)

    return prediction, labels

# Evaluate on 30 training-split rows that lie just past the 5000 rows used for training.
eval_dataset = dataset["train"].select(range(5000, 5030))
tokenizer_dataset_eval = eval_dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])

print(list(tokenizer_dataset_eval.features))

# Generate a prediction and decode the reference for every evaluation example.
predictions, references = [], []
for sample in tqdm(tokenizer_dataset_eval):
    p, l = evaluate_peft_model(sample)
    predictions.append(p)
    references.append(l)

# compute metric
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# print results as percentages with two decimals
# (the previous ":2f" spec set a minimum width of 2 and printed six decimals)
for rouge_name in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
    print(f"{rouge_name}: {rogue[rouge_name] * 100:.2f}%")