### Clone and install the repository

In [None]:
!git clone https://github.com/OpenRLHF/OpenRLHF.git

In [1]:
%cd OpenRLHF

/mnt/dataset1/thuannd/Repository/aivn-aio2024-chatbot-llm-rlhf/OpenRLHF


In [None]:
!pip install openrlhf[vllm_latest]

### Train Reward Model

In [None]:
!deepspeed --module openrlhf.cli.train_rm \
   --save_path ./checkpoint/Llama-3.2-1B-rm-dpo \
   --save_steps -1 \
   --logging_steps 1 \
   --eval_steps -1 \
   --train_batch_size 96 \
   --micro_train_batch_size 8 \
   --pretrain thuanan/Llama-3.2-1B-Instruct-Chat-sft \
   --value_head_prefix score \
   --bf16 \
   --max_epochs 1 \
   --max_len 2048 \
   --zero_stage 2 \
   --learning_rate 5e-6 \
   --dataset thuanan/Vi-Alpaca-Preference \
   --apply_chat_template \
   --chosen_key chosen \
   --rejected_key rejected \
   --flash_attn \
   --load_checkpoint \
   --packing_samples \
   --gradient_checkpointing \
   --adam_offload \
   --lora_rank 16 \
   --lora_alpha 32

### Merge LoRA Adapter Weights

In [3]:
!python -m openrlhf.cli.lora_combiner \
   --model_path /mnt/dataset1/thuannd/Repository/aivn-aio2024-chatbot-llm-rlhf/checkpoint/Llama-3.2-1B-rm-init \
   --lora_path ./checkpoint/Llama-3.2-1B-rm-dpo \
   --output_path ./checkpoint/Llama-3.2-1B-rm-dpo-combined \
   --is_rm \
   --bf16

Loading the base model from /mnt/dataset1/thuannd/Repository/aivn-aio2024-chatbot-llm-rlhf/checkpoint/Llama-3.2-1B-rm-init
Loading the LoRA adapter from ./checkpoint/Llama-3.2-1B-rm-dpo
Applying and merging the LoRA weights
Saving the complete model to ./checkpoint/Llama-3.2-1B-rm-dpo-combined


### Push to Hugging Face Hub

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

ckpt_path = "./checkpoint/Llama-3.2-1B-rm-dpo-combined"
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
model = AutoModelForSequenceClassification.from_pretrained(ckpt_path)

In [13]:
type(model)

transformers.models.llama.modeling_llama.LlamaForSequenceClassification

In [14]:
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((20

In [31]:
model.push_to_hub(
    "thuanan/Llama-3.2-1B-RM-DPO",
    commit_message="Add model ckpt",
)

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/thuanan/Llama-3.2-1B-RM-DPO/commit/35273dea7e8a6fa128ed8d53a7e4b429ad12ffaa', commit_message='Add model ckpt', commit_description='', oid='35273dea7e8a6fa128ed8d53a7e4b429ad12ffaa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/thuanan/Llama-3.2-1B-RM-DPO', endpoint='https://huggingface.co', repo_type='model', repo_id='thuanan/Llama-3.2-1B-RM-DPO'), pr_revision=None, pr_num=None)

In [32]:
tokenizer.push_to_hub(
    "thuanan/Llama-3.2-1B-RM-DPO",
    commit_message="Add tokenizer",
)

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/thuanan/Llama-3.2-1B-RM-DPO/commit/740231d7bb2e843aadeea234cb5088f5b8a43140', commit_message='Add tokenizer', commit_description='', oid='740231d7bb2e843aadeea234cb5088f5b8a43140', pr_url=None, repo_url=RepoUrl('https://huggingface.co/thuanan/Llama-3.2-1B-RM-DPO', endpoint='https://huggingface.co', repo_type='model', repo_id='thuanan/Llama-3.2-1B-RM-DPO'), pr_revision=None, pr_num=None)

### Test Reward Model

In [26]:
inputs = tokenizer(
    "Tại sao bạn lại thích học lập trình?",
    return_tensors="pt",
    max_length=2048,
    truncation=True,
)

In [27]:
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((20

In [28]:
reward = model.model(**inputs).last_hidden_state
reward


tensor([[[ 2.3110,  3.2832,  0.0715,  ..., -0.8754, -2.5920,  1.2897],
         [ 0.1436, -0.1955,  0.9064,  ..., -1.4842, -3.2248, -0.3736],
         [ 0.1863,  3.4998,  1.6896,  ...,  1.3823, -2.3932,  1.7603],
         ...,
         [-1.9774,  4.3347, -0.2467,  ...,  1.8927,  1.4284,  2.0450],
         [-1.4019,  2.4985,  1.7585,  ..., -1.9504, -1.1227,  2.3001],
         [-1.0206,  2.1333,  2.1011,  ..., -1.6172, -1.9382, -1.5404]]],
       grad_fn=<MulBackward0>)

In [29]:
reward = model.score(reward)[:, -1]
reward

tensor([[0.0406]], grad_fn=<SelectBackward0>)