# **Text Summarization using RLHF**

In [None]:
# Mount Google Drive at /content/drive (needs the Colab runtime, which provides google.colab).
# NOTE(review): no cell visible in this notebook reads or writes under /content/drive — confirm it is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install -q datasets==2.18.0 trl==0.8.1 evaluate==0.4.1 rouge_score==0.1.2 peft==0.10.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [

## **1. Supervised Fine-Tuning**

### **Dataset**

In [2]:
from datasets import load_dataset

# TL;DR summarization dataset used for supervised fine-tuning.
sft_ds_name = 'CarperAI/openai_summarize_tldr'
sft_ds = load_dataset(sft_ds_name)

# Bind each split to its own name.
sft_train, sft_valid, sft_test = (sft_ds[split] for split in ('train', 'valid', 'test'))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/532 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/111M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/116722 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6553 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/6447 [00:00<?, ? examples/s]

In [3]:
# Display the train split summary (its features and row count).
sft_train

Dataset({
    features: ['prompt', 'label'],
    num_rows: 116722
})

In [4]:
# First prompt. Index the row first so only one example is loaded instead of the whole 'prompt' column.
sft_train[0]['prompt']

"SUBREDDIT: r/relationships\nTITLE: I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting\nPOST: Not sure if this belongs here but it's worth a try. \n\nBackstory:\nWhen I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. \n\nNow: Its been almost 3 years now and I've gotten better after counselling and mild anti depressants. My mother has been 

In [5]:
# First reference summary. Index the row first so only one example is loaded instead of the whole 'label' column.
sft_train[0]['label']

"I still have contact with an old ex's friends but can't stand to see or talk to him. His friends are really nice ,so how do I tell them I possibly want to unfriend them on Facebook because of him?"

In [6]:
def formatting_func(example):
    """Render one dataset row as a single training string for summarization.

    Args:
        example: mapping with a 'prompt' entry (the text to summarize) and a
            'label' entry (its reference summary).

    Returns:
        The prompt and the summary joined under "### Text:" / "### Summary:" markers.
    """
    template = "### Text: {}\n ### Summary: {}"
    return template.format(example['prompt'], example['label'])

In [7]:
# Preview the formatted training text for the first train example only
# (the break stops the loop after one iteration).
for example in sft_train:
    print(formatting_func(example))
    break

### Text: SUBREDDIT: r/relationships
TITLE: I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting
POST: Not sure if this belongs here but it's worth a try. 

Backstory:
When I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. 

Now: Its been almost 3 years now and I've gotten better after counselling and mild anti depressants. My mother has bee

### **Model**

In [9]:
import torch
from trl import ModelConfig, get_quantization_config, get_kbit_device_map
from transformers import AutoTokenizer

# Base checkpoint for the SFT stage: OPT-350M.
model_config = ModelConfig(model_name_or_path='facebook/opt-350m')

# Model-loading options used for training.
# "auto" / None are passed through untouched; any other value names an attribute of torch.
if model_config.torch_dtype in ["auto", None]:
    torch_dtype = model_config.torch_dtype
else:
    torch_dtype = getattr(torch, model_config.torch_dtype)

quantization_config = get_quantization_config(model_config)
model_kwargs = {
    "revision": model_config.model_revision,
    "trust_remote_code": model_config.trust_remote_code,
    "attn_implementation": model_config.attn_implementation,
    "torch_dtype": torch_dtype,
    "use_cache": False,
    # A k-bit device map is requested only when a quantization config was produced.
    "device_map": get_kbit_device_map() if quantization_config is not None else None,
    "quantization_config": quantization_config,
}

tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True)
# Reuse the end-of-sequence token for padding (token string and id).
tokenizer.pad_token = tokenizer.eos_token

tokenizer.pad_token_id = tokenizer.eos_token_id

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [10]:
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings for the SFT policy: train low-rank adapters to cut the number of trainable parameters.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # adapter flavour for a causal language model
    r=16,                   # adapter rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

### **Metric**

In [12]:
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE between decoded predictions and decoded reference labels.

    Args:
        eval_preds: either an object exposing ``predictions`` and ``label_ids``,
            a tuple whose first element is such an object, or a plain
            ``(predictions, label_ids)`` tuple. ``predictions`` may be token ids
            of shape (batch, seq) or raw logits of shape (batch, seq, vocab).

    Returns:
        The dict returned by ``rouge.compute``.
    """
    import numpy as np  # local import keeps this cell self-contained

    if isinstance(eval_preds, tuple) and not hasattr(eval_preds[0], "label_ids"):
        # Plain (predictions, labels) pair.
        pred_ids, labels_ids = eval_preds[0], eval_preds[1]
    else:
        if isinstance(eval_preds, tuple):
            eval_preds = eval_preds[0]
        pred_ids, labels_ids = eval_preds.predictions, eval_preds.label_ids

    # Some models return extra outputs next to the logits; keep the first entry.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    labels_ids = np.asarray(labels_ids)

    # Raw logits cannot be decoded: reduce them to the most likely token id per position.
    if pred_ids.ndim == 3:
        pred_ids = pred_ids.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    if pred_ids.ndim == 2 and pred_ids.shape == labels_ids.shape and pred_ids.shape[1] > 1:
        # Assumes per-position next-token predictions from a causal LM (position t
        # predicts token t+1): drop the last prediction and the first label so the
        # two line up, and blank predictions wherever the label is masked.
        pred_ids, labels_ids = pred_ids[:, :-1], labels_ids[:, 1:]
        pred_ids = np.where(labels_ids < 0, pad_id, pred_ids)

    # Negative ids (the -100 ignore index) are not valid tokens; map them to the
    # pad token, which skip_special_tokens removes while decoding.
    pred_ids = np.where(pred_ids < 0, pad_id, pred_ids)
    labels_ids = np.where(labels_ids < 0, pad_id, labels_ids)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    return rouge.compute(predictions=pred_str, references=label_str)

### **Trainer**

In [13]:
from transformers import TrainingArguments

num_epochs = 1 # 10

training_args = TrainingArguments(
    # Output location; evaluate and checkpoint once per epoch, then reload the best checkpoint.
    output_dir='./save_model',
    save_strategy='epoch',
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    # Schedule and batch sizes.
    num_train_epochs=num_epochs,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    # Adam moment coefficients.
    adam_beta2=0.95,
    adam_beta1=0.9,
)

In [14]:
from trl import SFTTrainer

max_input_length = 512

def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to arg-max token ids before they are gathered for compute_metrics.

    Without this the evaluation loop keeps the full (batch, seq, vocab) logits
    for the whole evaluation set in memory.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

trainer = SFTTrainer(
    model=model_config.model_name_or_path,  # loaded by the trainer using model_init_kwargs
    model_init_kwargs=model_kwargs,
    args=training_args,
    train_dataset=sft_train,
    eval_dataset=sft_valid,
    max_seq_length=max_input_length,
    tokenizer=tokenizer,
    peft_config=peft_config,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    packing=True,
    formatting_func=formatting_func  # turns each row into the prompt/summary text defined above
)



pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()
# Save the fine-tuned weights where the PPO section expects the SFT model
# (it sets model_path = "./save_sft_model"); training alone only writes checkpoints under output_dir.
trainer.save_model('./save_sft_model')

Epoch,Training Loss,Validation Loss


## **2. Reward Modeling**

### **Dataset**

In [None]:
from datasets import load_dataset

# Pairwise comparison dataset: each row has a prompt with a chosen and a rejected summary.
rw_ds_name = 'CarperAI/openai_summarize_comparisons'
rw_ds = load_dataset(rw_ds_name)

# Bind the splits used below (validation comes from the 'valid1' split).
rw_train, rw_valid, rw_test = (rw_ds[split] for split in ('train', 'valid1', 'test'))

In [None]:
# Display the reward-model train split summary.
rw_train

In [None]:
# Peek at the first comparison example.
next(iter(rw_train))

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of comparison rows for reward-model training.

    Each prompt is combined with its chosen and its rejected summary using the
    "### Text: ... ### Summary: ..." layout, and both texts are tokenized with
    the module-level ``tokenizer``.

    Args:
        examples: batch with parallel "prompt", "chosen" and "rejected" lists.

    Returns:
        Dict with four parallel lists: input_ids_chosen, attention_mask_chosen,
        input_ids_rejected, attention_mask_rejected.
    """
    fields = ("input_ids", "attention_mask")
    new_examples = {
        f"{field}_{side}": [] for side in ("chosen", "rejected") for field in fields
    }

    rows = zip(examples["prompt"], examples["chosen"], examples["rejected"])
    for prompt, chosen, rejected in rows:
        for side, summary in (("chosen", chosen), ("rejected", rejected)):
            encoded = tokenizer(f"### Text: {prompt}\n ### Summary: {summary}")
            for field in fields:
                new_examples[f"{field}_{side}"].append(encoded[field])

    return new_examples

In [None]:
# Tokenize every split in batches across 4 worker processes.
rw_ds_processed = rw_ds.map(preprocess_function, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/92534 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/83629 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/33082 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/50715 [00:00<?, ? examples/s]

In [None]:
# Display the processed train split (original columns plus the four tokenized ones).
rw_ds_processed['train']

Dataset({
    features: ['prompt', 'chosen', 'rejected', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 92534
})

In [None]:
max_input_length = 512

# Drop pairs where either tokenized text is longer than max_input_length tokens.
rw_ds_filted = rw_ds_processed.filter(
    lambda x: max(len(x["input_ids_chosen"]), len(x["input_ids_rejected"])) <= max_input_length
)

Filter:   0%|          | 0/92534 [00:00<?, ? examples/s]

Filter:   0%|          | 0/83629 [00:00<?, ? examples/s]

Filter:   0%|          | 0/33082 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50715 [00:00<?, ? examples/s]

In [None]:
# Display the train split after length filtering.
rw_ds_filted["train"]

Dataset({
    features: ['prompt', 'chosen', 'rejected', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 81192
})

In [None]:
# Rebind the split names to the tokenized, length-filtered data.
rw_train, rw_valid, rw_test = (rw_ds_filted[split] for split in ("train", "valid1", "test"))

### **Model**

In [None]:
import torch
from trl import ModelConfig, get_quantization_config, get_kbit_device_map
# AutoTokenizer is imported here because this cell uses it;
# the sequence-classification head is what produces the reward score.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_config = ModelConfig(
    model_name_or_path='facebook/opt-350m' # ./save_sft_model/checkpoint-1000
)

# "auto" / None are passed through untouched; any other value names an attribute of torch.
torch_dtype = (
    model_config.torch_dtype
    if model_config.torch_dtype in ["auto", None]
    else getattr(torch, model_config.torch_dtype)
)
quantization_config = get_quantization_config(model_config)
model_kwargs = dict(
    revision=model_config.model_revision,
    trust_remote_code=model_config.trust_remote_code,
    attn_implementation=model_config.attn_implementation,
    torch_dtype=torch_dtype,
    use_cache=False,
    # A k-bit device map is requested only when a quantization config was produced.
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True)
# num_labels=1: the model emits a single logit per sequence, used as the reward.
model = AutoModelForSequenceClassification.from_pretrained(
    model_config.model_name_or_path, num_labels=1, **model_kwargs
)

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings for the reward model (a sequence classifier).
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

### **Trainer**

In [None]:
from trl import RewardConfig

num_epochs = 1 # 10

reward_config = RewardConfig(
    # Output location; evaluate and checkpoint once per epoch, then reload the best checkpoint.
    output_dir='./save_rw_model',
    save_strategy='epoch',
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    # Schedule and batch sizes.
    num_train_epochs=num_epochs,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    # Same token budget that was used to filter the dataset.
    max_length=max_input_length,
)

In [None]:
from trl import RewardTrainer

# Reward trainer over the tokenized chosen/rejected pairs, with LoRA adapters on the classifier.
trainer = RewardTrainer(
    args=reward_config,
    model=model,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=rw_train,
    eval_dataset=rw_valid,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()
# Save the trained reward model into the root of the output directory: the PPO stage
# loads from './save_save_rw_model'.replace('save_save', 'save') == reward_config.output_dir,
# while training alone only writes checkpoint sub-folders there.
# NOTE(review): with peft_config set this presumably saves the adapter weights — confirm the PPO loader handles that.
trainer.save_model(reward_config.output_dir)

## **3. PPO**

### Sau khi thực hiện 2 bước trên sẽ thu được kết quả SFT model và reward model

### **Dataset**

In [None]:
# PPO prompts come from the train split of the TL;DR dataset.
ppo_ds_name = 'CarperAI/openai_summarize_tldr'
# Pass the name defined in this cell rather than the SFT section's variable.
ppo_ds = load_dataset(ppo_ds_name, split="train")

In [None]:
# Peek at the first raw PPO example.
next(iter(ppo_ds))

{'prompt': "SUBREDDIT: r/relationships\nTITLE: I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting\nPOST: Not sure if this belongs here but it's worth a try. \n\nBackstory:\nWhen I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. \n\nNow: Its been almost 3 years now and I've gotten better after counselling and mild anti depressants. My mothe

In [None]:
def build_dataset(ds, tokenizer, max_length=200):
    """Prepare the PPO query dataset.

    Rows whose prompt has ``max_length`` characters or fewer are dropped. For
    each remaining row three columns are added:

    * ``text``: prompt followed by the reference label (kept for inspection).
    * ``input_ids``: the first ``max_length`` token ids of the prompt only, so
      the reference summary can never end up inside the query.
    * ``query``: ``input_ids`` decoded back to text without special tokens, so
      re-tokenizing it later does not duplicate the leading special token.

    Args:
        ds: dataset with "prompt" and "label" columns.
        tokenizer: object providing ``encode`` and ``decode``.
        max_length: character threshold for the filter and token budget for the query.

    Returns:
        The processed dataset with its output format set to "torch".
    """
    ds = ds.filter(lambda x: len(x["prompt"]) > max_length, batched=False)

    def tokenize(sample):
        sample["text"] = sample["prompt"] + sample["label"]
        # Encode the prompt alone: encoding prompt + label leaked the gold summary
        # into the query whenever the prompt was shorter than max_length tokens.
        sample["input_ids"] = tokenizer.encode(sample["prompt"])[:max_length]
        sample["query"] = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the PPO policy. This rebinds the notebook-level `tokenizer` name
# and reuses the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Filter and tokenize the PPO prompts (default max_length=200). Rebinds ppo_ds to the processed dataset.
ppo_ds = build_dataset(ppo_ds, tokenizer)

Filter:   0%|          | 0/116722 [00:00<?, ? examples/s]

Map:   0%|          | 0/116528 [00:00<?, ? examples/s]

In [None]:
# Peek at the first processed PPO example.
next(iter(ppo_ds))

{'prompt': "SUBREDDIT: r/relationships\nTITLE: I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting\nPOST: Not sure if this belongs here but it's worth a try. \n\nBackstory:\nWhen I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. \n\nNow: Its been almost 3 years now and I've gotten better after counselling and mild anti depressants. My mothe

### **Model**

In [None]:
from trl import AutoModelForCausalLMWithValueHead

from peft import LoraConfig

# LoRA adapter settings for the PPO policy. The policy is a causal language model
# (it generates the summaries), so the adapter task type must be CAUSAL_LM, not SEQ_CLS.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# The PPO policy is meant to start from the SFT model.
# NOTE(review): model_path is not used below; the base checkpoint is loaded instead.
# Switch the load to model_path once an SFT model has been saved there.
model_path = "./save_sft_model"
model = AutoModelForCausalLMWithValueHead.from_pretrained(  # causal LM plus a value head for PPO
    pretrained_model_name_or_path='facebook/opt-350m', # ./save_sft_model/checkpoint-1000
    peft_config=peft_config,
)

### **Trainer**

In [None]:
from trl import PPOConfig, PPOTrainer

def collator(data):
    """Turn a list of row dicts into a dict of per-column lists.

    The keys are taken from the first row; values are gathered in row order
    without any padding or stacking.
    """
    collated = {}
    for key in data[0]:
        collated[key] = [row[key] for row in data]
    return collated

ppo_config = PPOConfig(model_name="facebook/opt-350m")

# Device handed to the reward pipeline later: GPU index 0 when CUDA is available, else CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# No separate reference model is passed (ref_model=None).
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=ppo_ds,
    data_collator=collator,
)

### **Reward Model**

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Load the trained reward model. It is bound only to rw_model so the PPO policy's
# `model` name is not overwritten.
rw_model = AutoModelForSequenceClassification.from_pretrained('./save_rw_model')

# pipeline() cannot infer a tokenizer from a model object, so pass one explicitly.
# A fresh base tokenizer is used (the PPO `tokenizer` had its pad token changed to EOS)
# so that padding matches the tokenizer the reward model was trained with.
rw_tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
sentiment_pipe = pipeline("sentiment-analysis", model=rw_model, tokenizer=rw_tokenizer, device=device)

In [None]:
# Fall back to the PPO tokenizer's pad id wherever the reward pipeline has none:
# first on its tokenizer, then on its model config.
for _pad_holder in (sentiment_pipe.tokenizer, sentiment_pipe.model.config):
    if _pad_holder.pad_token_id is None:
        _pad_holder.pad_token_id = tokenizer.pad_token_id

### **Training**

In [None]:
from tqdm import tqdm

# Sampling settings for the policy's generations.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 200,
}
# Ask the pipeline for every label's raw, un-normalized score.
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

# Wrap the dataloader (not the enumerate object) so tqdm can show the total number of batches.
for _epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]  # tokenized queries are the policy input

    # Generate a response from the policy and one from the reference model.
    response_tensors, ref_response_tensors = ppo_trainer.generate(
        query_tensors, return_prompt=False, generate_ref_response=True, **generation_kwargs
    )
    batch["response"] = tokenizer.batch_decode(response_tensors)
    batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)

    # Score query + response with the reward model.
    # The reward model was trained with num_labels=1, so each output holds a single
    # score at index 0; indexing [1] would raise IndexError.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]
    ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
    ref_pipe_outputs = sentiment_pipe(ref_texts, **sent_kwargs)
    ref_rewards = [torch.tensor(output[0]["score"]) for output in ref_pipe_outputs]
    batch["ref_rewards"] = ref_rewards

    # Run one PPO optimization step on this batch, then log its statistics.
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])