<a href="https://colab.research.google.com/github/phamnguyenlongvu/LLMs/blob/main/RLHF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.5.0
!pip install -q sentencepiece

In [None]:
import random
import torch
from datasets import Dataset, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments
from trl import RewardTrainer

In [None]:
# Seed Python's built-in RNG so the random chosen/rejected assignment made
# with `random.randint` further down is repeatable.
# NOTE(review): this presumably does not seed other libraries' RNGs — confirm.
random.seed(42)

In [None]:
# Load the "train" split of the response-comparison dataset.
ds = load_dataset("argilla/dolly-curated-comparison-falcon-7b-instruct", split="train")

In [None]:
# Convert the dataset to a pandas DataFrame; the bare `df` shows it as the
# cell output for a quick visual check.
df = ds.to_pandas()
df

In [None]:
# Keys of the two candidate-response fields in each dataset record.
responses = ["response-1", "response-2"]

def get_chosen_and_not_chosen(responses):
  """Randomly split a pair of items into a chosen and a not-chosen one.

  Args:
    responses: Sequence of exactly two items (here, record field names).

  Returns:
    A tuple ``(chosen, not_chosen, chosen_id)`` where ``chosen_id`` is the
    index (0 or 1) of ``chosen`` within ``responses``.

  Raises:
    ValueError: If ``responses`` does not contain exactly two items.
  """
  if len(responses) != 2:
    # `1 - chosen_id` only names "the other item" for a pair; for any other
    # length it would wrap around to the same item or run out of range.
    raise ValueError(f"expected exactly 2 responses, got {len(responses)}")

  chosen_id = random.randint(0, 1)
  not_chosen_id = 1 - chosen_id
  return responses[chosen_id], responses[not_chosen_id], chosen_id

# Build one preference row (instruction, chosen, rejected) per dataset record.
rows = []

for record in ds:
  # A single random draw per record decides which response field is labelled
  # "chosen"; the other one becomes "rejected".
  # NOTE(review): the preference label is random rather than derived from the
  # data — confirm this is intended before trusting the trained reward model.
  chosen, not_chosen, _ = get_chosen_and_not_chosen(responses)

  rows.append(
      {
          "instruction": record["prompt"],
          "chosen_response": record[chosen],
          "rejected_response": record[not_chosen],
      }
  )

In [None]:
# Wrap the list of row dicts in a Dataset and display it as a DataFrame.
prepared_dataset = Dataset.from_list(rows)
prepared_dataset.to_pandas()

In [None]:
# Display the dataset object itself as the cell output.
prepared_dataset

In [None]:
# Keep only the first 1000 rows; this subset is what gets tokenized and split
# into train/test for the reward model.
prepared_dataset_mini = prepared_dataset.select(range(1000))
prepared_dataset_mini

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    AutoTokenizer,
)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

# Load the base model with 4-bit quantization and a single-output
# classification head; that single logit is used as the reward score.
quantization_config = BitsAndBytesConfig(load_in_8bit=False, load_in_4bit=True)
model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/opt-350m",
    quantization_config=quantization_config,
    device_map={"": 0},  # Place the whole model on device 0
    # trust_remote_code is intentionally not enabled: OPT ships with
    # transformers, so no code from the Hub repository needs to be executed.
    num_labels=1,
)
# The generation cache is not needed for training a classifier head.
model.config.use_cache = False


In [None]:
# Padding to a fixed length needs a pad token. If the tokenizer has none,
# reuse EOS and keep the model config in sync with the id the tokenizer
# will actually emit for padding.
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
  model.config.pad_token_id = tokenizer.pad_token_id

def formatting_func(examples):
  """Tokenize one preference example into chosen/rejected model inputs.

  The instruction is joined to each response with a newline, then encoded to
  a fixed length of 256 tokens (padded or truncated as needed).

  Args:
    examples: A single record with "instruction", "chosen_response" and
      "rejected_response" string fields.

  Returns:
    A dict with "input_ids_*" and "attention_mask_*" entries for both the
    chosen and the rejected text.
  """
  def _encode(response_key):
    # Same text layout and tokenizer settings for both responses.
    text = examples["instruction"] + "\n" + examples[response_key]
    return tokenizer.encode_plus(
        text,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )

  chosen_encoding = _encode("chosen_response")
  rejected_encoding = _encode("rejected_response")

  # return_tensors="pt" yields a batch of one; take row 0 to drop that axis.
  features = {}
  features["input_ids_chosen"] = chosen_encoding["input_ids"][0]
  features["attention_mask_chosen"] = chosen_encoding["attention_mask"][0]
  features["input_ids_rejected"] = rejected_encoding["input_ids"][0]
  features["attention_mask_rejected"] = rejected_encoding["attention_mask"][0]
  return features

# Tokenize every example, then hold out a test split for evaluation during
# training. The fixed seed keeps the split identical across runs.
formatted_dataset = prepared_dataset_mini.map(formatting_func)
formatted_dataset = formatted_dataset.train_test_split(seed=42)

In [None]:
import os
# Set the WANDB_DISABLED environment variable before the trainer is created.
# NOTE(review): presumably this turns off Weights & Biases logging — confirm.
os.environ["WANDB_DISABLED"] = "True"

In [None]:
# Display the training split.
formatted_dataset["train"]

In [None]:
# Spot-check the instruction text of training example 1.
formatted_dataset["train"]["instruction"][1]

In [None]:
# Token count of the chosen input for training example 1; formatting_func
# pads/truncates to max_length=256, so 256 is the expected value.
len(formatted_dataset["train"]["input_ids_chosen"][1])

In [None]:
from transformers import TrainingArguments
from peft import LoraConfig
from trl import RewardTrainer

# Prepare training parameters
training_args = TrainingArguments(
    output_dir="./train_logs",  # Output folder for checkpoints and logs
    # The run length is set by max_steps; a positive max_steps takes
    # precedence over num_train_epochs, so no epoch count is given.
    max_steps=100,
    per_device_train_batch_size=4,  # Batch size per device for training
    gradient_accumulation_steps=1,  # No gradient accumulation
    learning_rate=1.0e-4,  # Learning rate
    optim="adamw_torch",  # Optimizer
    save_steps=50,  # How often to save checkpoints
    logging_steps=10,  # How often to log training information
    report_to="tensorboard",  # Reporting backend
    # Keep the pre-tokenized *_chosen / *_rejected columns in the batches.
    remove_unused_columns=False,
    evaluation_strategy="steps",  # Evaluate on a step schedule
)

# Prepare PEFT (LoRA) parameters
peft_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=16,  # LoRA scaling factor
    bias="none",  # Do not train bias terms
    task_type="SEQ_CLS",  # Task type (Sequence Classification)
    # Train and save the classification head in full alongside the adapters;
    # the head of the OPT sequence-classification model is named "score".
    modules_to_save=["score"],
)

# Prepare RewardTrainer
trainer = RewardTrainer(
    model=model,  # The reward model being trained
    tokenizer=tokenizer,  # The tokenizer for processing input data
    args=training_args,  # Training arguments
    train_dataset=formatted_dataset["train"],  # Training dataset
    eval_dataset=formatted_dataset["test"],  # Evaluation dataset
    peft_config=peft_config,  # PEFT configuration
    max_length=512,  # Maximum length of input
)

# Execute training
trainer.train()

# Save the trained reward model
trainer.model.save_pretrained("./reward_model")

In [None]:
import torch

def get_Score(model, tokenizer, prompt, response):
  """Return the scalar reward the model assigns to a prompt/response pair.

  The input text is the prompt and the response joined by a newline, which is
  the same layout `formatting_func` uses to build the training examples.

  Args:
    model: Sequence-classification model producing a single logit.
    tokenizer: Tokenizer matching ``model``.
    prompt: Instruction text.
    response: Candidate response text to score.

  Returns:
    The model's single output logit as a Python float.
  """
  # Score the text in the layout the model was trained on; encoding prompt and
  # response as a tokenizer "text pair" would produce a different sequence.
  text = prompt + "\n" + response
  inputs = tokenizer.encode_plus(
      text,
      truncation=True,
      padding="max_length",
      max_length=512,
      return_tensors="pt",
  ).to(next(model.parameters()).device)  # Follow the model's device

  # Inference mode: disable dropout so repeated calls give the same score.
  model.eval()
  with torch.no_grad():
    outputs = model(**inputs)

  # The model has a single output, so the logits hold exactly one value.
  return outputs.logits.item()


In [None]:
# Index of the example to inspect. Index 1000 is the first row past the
# 1000-row subset selected for training, so it was not used for training.
x = 1000
prepared_dataset[x]

In [None]:
# Fetch the example at index `x` once and pull out its three text fields.
example = prepared_dataset[x]
prompt = example["instruction"]
rejected_response = example["rejected_response"]
chosen_response = example["chosen_response"]

# Show the prompt, then the rejected response, then the chosen response.
for text in (prompt, rejected_response, chosen_response):
  print(text)

In [None]:
# Reward score for the rejected response of the inspected example.
score = get_Score(model, tokenizer, prompt, rejected_response)
score

In [None]:
# Reward score for the chosen response of the same example, for comparison
# with the rejected score above.
score = get_Score(model, tokenizer, prompt, chosen_response)
score

In [None]:
# Evaluate only on rows outside the first 1000 (the subset the reward model
# was trained on), so scores are not inflated by examples it has seen.
# The fixed seed makes the 500-row sample repeatable.
held_out = prepared_dataset.select(range(1000, len(prepared_dataset)))
test_dataset = held_out.shuffle(seed=42).select(range(min(500, len(held_out))))

In [None]:
def _score_record(record):
  """Score both responses of one preference record with the reward model."""
  prompt_text = record["instruction"]
  rejected_text = record["rejected_response"]
  chosen_text = record["chosen_response"]

  # Dict values are evaluated top to bottom, so the rejected response is
  # scored before the chosen one.
  return {
      "prompt": prompt_text,
      "rejected_response": rejected_text,
      "chosen_response": chosen_text,
      "rejected_score": get_Score(model, tokenizer, prompt_text, rejected_text),
      "chosen_score": get_Score(model, tokenizer, prompt_text, chosen_text),
  }


# One result row per test record, collected incrementally.
rs = []
for example in test_dataset:
  rs.append(_score_record(example))

In [None]:
import pandas as pd
# Tabulate the per-example scores for inspection. This rebinds `df`, which
# earlier held the raw dataset preview.
df = pd.DataFrame(rs)
df