In [None]:
!pip install -q accelerate
!pip install -q bitsandbytes
!pip install -q peft
!pip install datasets -q
!pip install torch
!pip install numpy
!pip install pandas
!pip install accelerate transformers huggingface_hub

In [None]:
!nvidia-smi

In [13]:
import os
import gc
import pandas as pd
import numpy as np
from types import SimpleNamespace
from datasets import load_dataset

import torch
from torch import nn,optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, get_cosine_schedule_with_warmup)

os.environ['TOKENIZERS_PARALLELISM'] = 'false'



In [4]:
cfg = {
    'model_id': 'h2oai/h2o-danube-1.8b-sft',
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'seed': 2004, 'beta': 0.1,
    'batch_size': 1, 'learning_rate': 1e-07,
    'weight_decay': 1e-02, 'num_epochs': 1,
    'warmup_steps': 0, 'accumulation_steps': 2,
    'logging_steps': 50
}

cfg = SimpleNamespace(**cfg)

In [5]:
datatoy = {
    'prompt': ['What is New York', '5+3 = '],
}
train = pd.DataFrame(datatoy)
train

Unnamed: 0,prompt
0,What is New York
1,5+3 =


In [6]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)

In [7]:
messages = [
    {"role": "user", "content": "How are you?"},
    {"role": "assistant", "content": "I'm an AI agent, so I have no feelings, how can I help you today?"}

]

In [8]:
encoded_input_ids = tokenizer.apply_chat_template(messages)
tokenizer.decode(encoded_input_ids)

"<|prompt|>How are you?</s> <|answer|>I'm an AI agent, so I have no feelings, how can I help you today?</s>"

In [9]:
#Use the supervised finetuning model: Hanube
ref_model = AutoModelForCausalLM.from_pretrained(
    cfg.model_id, torch_dtype= torch.bfloat16,
    device_map = 'auto'
)

ref_model.eval();

In [10]:
def get_messages(prompt):
    return [
        {'role': 'user', 'content': prompt},
        {'role': 'assistant', 'content': ''}
    ]

def extract_response(question_response):
    #Extract response from model output
    response = question_response.strip().split('<|answer|>')[-1]
    return response

In [11]:
prompts = train['prompt']
responses = []

for prompt in prompts:
    input_ids = tokenizer.apply_chat_template(
        get_messages(prompt),
        return_tensors='pt'
    ).to(cfg.device)

    input_ids = input_ids[:, :-1] # remove </s>

    # Generate response using the model
    with torch.no_grad():
        ref_model.eval()
        output_ids = ref_model.generate(
            input_ids,
            max_new_tokens=1024,
            pad_token_id=tokenizer.pad_token_id
        )[0]

    # Decode model output and extract response
    question_response = tokenizer.decode(output_ids, skip_special_tokens=True)

    responses.append(
        extract_response(question_response)
    )


In [12]:
for idx, response in enumerate(responses):
    print(f'#Response for question {idx+1}: \n', response.strip(), '\n')


#Response for question 1: 
 New York is a state in the United States of America. It's known for its big cities, like New York City and Albany, as well as its beautiful parks, like Central Park and the Adirondack Mountains. It also has many famous landmarks, like the Statue of Liberty and the Empire State Building. People from all over the world come to visit New York because it's so exciting and full of fun things to do! 

#Response for question 2: 
 To solve the equation, we need to perform the addition operation.
The given equation is 5 + 3 = X.
To find the value of X, we can add the two numbers together:
5 + 3 = 8
So, the value of X is 8.
The answer is: 8 



In [14]:
#Create a preference dataset
chosen = ['New York is a state of the US', '5+3=8']
rejected = responses

train['chosen'] = chosen
train['rejected'] = rejected
train

Unnamed: 0,prompt,chosen,rejected
0,What is New York,New York is a state of the US,New York is a state in the United States of A...
1,5+3 =,5+3=8,"To solve the equation, we need to perform the..."


In [15]:
#Dataloader
def encode(prompt, response):
    eos_token = '</s>'
    start_inst_token = '<|prompt|>'
    end_inst_token = '<|answer|>'
    prompt = start_inst_token + prompt + eos_token + end_inst_token
    prompt_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').input_ids
    response_ids = tokenizer(response, add_special_tokens=False, return_tensors='pt').input_ids
    eos_ids = tokenizer(eos_token, add_special_tokens=False, return_tensors='pt').input_ids
    input_ids = torch.cat([prompt_ids,response_ids, eos_ids], dim=1).view(-1)
    labels = torch.cat([
        torch.full_like(prompt_ids[:, :-1], tokenizer.eos_token_id), #a tensor with full eos_token_id
        prompt_ids[:, -1].view(1,-1),  #dont take '>' of '<|answer|>'
        response_ids, eos_ids
    ], dim =1).view(-1)

    return input_ids, labels

In [16]:
class CustomData(Dataset):
    def __init__(self, prompts, chosen, rejected):
        self.prompts = prompts
        self.chosen = chosen
        self.rejected = rejected

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, index):
        prompt = self.prompts[index]
        chosen_response = self.chosen[index]
        rejected_response = self.rejected[index]
        chosen_input_ids, chosen_labels = encode(prompt, chosen_response)
        rejected_input_ids, rejected_labels = encode(prompt, rejected_response)

        return {
            'chosen_input_ids': chosen_input_ids,
            'chosen_labels': chosen_labels,
            'rejected_input_ids': rejected_input_ids,
            'rejected_labels': rejected_labels

        }

def collate_fn(batch):
    padded_data = {}
    data = pd.DataFrame(batch)
    keys = data.columns
    for key in keys:
      padded_data[key] = pad_sequence(
          sequences = data[key].values.tolist(),
          batch_first = True, padding_value = tokenizer.eos_token_id
      )
    return padded_data

In [17]:
prompts = train['prompt'].values.tolist()
chosen_responses = train['chosen'].values.tolist()
rejected_responses = train['rejected'].values.tolist()

torch.manual_seed(seed = cfg.seed)
trainset = CustomData(prompts, chosen_responses, rejected_responses)
trainloader = DataLoader(trainset, batch_size = cfg.batch_size, shuffle = True, drop_last=True,  collate_fn = collate_fn)



In [18]:
#DPO Loss
def get_batch_logps(logits, labels, pad_id_value):
    labels = labels.clone()
    loss_mask = labels != pad_id_value
    per_token_logps = torch.gather(logits.log_softmax(dim=-1), dim=2, index=labels.unsqueeze(2)).squeeze(2)
    return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)

def dpo_loss(chosen_logps, rejected_logps, ref_chosen_logps, ref_rejected_logps, beta):
    policy_ratio = chosen_logps - rejected_logps
    ref_ratio = ref_chosen_logps - ref_rejected_logps
    losses = (-1) * F.logsigmoid((beta * policy_ratio) - beta * ref_ratio)
    return losses.mean()

def get_ref_logits(ref_model, chosen_pair, rejected_pair):
    ref_model.eval()
    with torch.no_grad():
        ref_chosen_logits = ref_model(chosen_pair).logits #
        ref_rejected_logits = ref_model(rejected_pair).logits
    return ref_chosen_logits, ref_rejected_logits # Return logits tensors directly

In [19]:
#Finetune sample dataset using DPO loss
class Policy(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.backbone = AutoModelForCausalLM.from_pretrained(
            cfg.model_id, low_cpu_mem_usage=True,
            device_map = 'cuda', torch_dtype = torch.bfloat16,
            use_cache = False
        )
        self.backbone.gradient_checkpointing_enable()

    def forward(self, input_ids):
        logits = self.backbone(input_ids).logits
        return logits

In [20]:
model = Policy(cfg)
optimizer = optim.AdamW(
    model.parameters(), lr = cfg.learning_rate,
    weight_decay = cfg.weight_decay
)
scheduler = get_cosine_schedule_with_warmup(
    optimizer = optimizer, num_warmup_steps = cfg.warmup_steps,
    num_training_steps = len(trainloader) * cfg.num_epochs
)
scaler = GradScaler()

for epoch in range(cfg.num_epochs):
    for batch_idx, batch in enumerate(trainloader):
        model.train()

        chosen_input_ids = batch['chosen_input_ids'].long().to(cfg.device)
        chosen_labels = batch['chosen_labels'].long().to(cfg.device)
        rejected_input_ids = batch['rejected_input_ids'].long().to(cfg.device)
        rejected_labels = batch['rejected_labels'].long().to(cfg.device)

        ref_chosen_logits, ref_rejected_logits = get_ref_logits(ref_model, chosen_input_ids, rejected_input_ids)

        policy_chosen_logits = model(chosen_input_ids)
        policy_rejected_logits = model(rejected_input_ids)
        policy_chosen_logps = get_batch_logps(policy_chosen_logits, chosen_labels, pad_id_value = tokenizer.eos_token_id)
        policy_rejected_logps = get_batch_logps(policy_rejected_logits, rejected_labels, pad_id_value= tokenizer.eos_token_id)
        ref_chosen_logps = get_batch_logps(ref_chosen_logits, chosen_labels, pad_id_value= tokenizer.eos_token_id)
        ref_rejected_logps = get_batch_logps(ref_rejected_logits, rejected_labels, pad_id_value= tokenizer.eos_token_id)

        loss = dpo_loss(
            policy_chosen_logps, policy_rejected_logps,
            ref_chosen_logps, ref_rejected_logps,
            beta = cfg.beta
        ) / cfg.accumulation_steps

        print(
            f'Epoch: {epoch+1} / {cfg.num_epochs}'
            f'| Batch: {batch_idx}/{len(trainloader)}'
            f'| DPO Loss: {loss.item():.8f}'
        )

        loss.backward()
        if not (batch_idx % cfg.accumulation_steps):
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

Epoch: 1 / 1| Batch: 0/2| DPO Loss: 0.34657359
Epoch: 1 / 1| Batch: 1/2| DPO Loss: 0.34713626


In [21]:
#The model after DPO
test_prompts = ['What is New York', '5+3 = ']
responses = []

for prompt in test_prompts:

    input_ids = tokenizer.apply_chat_template(
        get_messages(prompt),
        return_tensors='pt'
    ).to(cfg.device)

    input_ids = input_ids[:, :-1]

    with torch.no_grad():
        model.eval()
        output_ids = model.backbone.generate(
                input_ids=input_ids,
                max_new_tokens=512,
                pad_token_id=tokenizer.pad_token_id
        )[0]

    question_response = tokenizer.decode(output_ids, skip_special_tokens=True)
    response = extract_response(question_response)

    print('Prompt:', tokenizer.decode(input_ids[0]))
    print('Response:', response)
    print('\n')

Prompt: <|prompt|>What is New York</s> <|answer|>
Response:  New York is a state in the United States of America. It's known for its big cities, like New York City and Albany, as well as its beautiful parks and museums. People from all over the world come to visit New York because it has so much to offer!


Prompt: <|prompt|>5+3 = </s> <|answer|>
Response:  To solve the equation, we need to perform the addition operation.
The given equation is 5 + 3 = X.
To find the value of X, we can add the two numbers together:
5 + 3 = 8
So, the value of X is 8.
The answer is: 8




In [28]:
#Train on a reference dataset
#https://huggingface.co/datasets/argilla/ultrafeedback-binarized-preferences

from datasets import load_dataset

data_path = 'argilla/ultrafeedback-binarized-preferences'
train = load_dataset(data_path)['train'].to_pandas()
train = train[['instruction', 'chosen_response', 'rejected_response']]
train.columns = ['prompt', 'chosen', 'rejected']
train = train.sample(2000, random_state=1).reset_index(drop=True)
train

Unnamed: 0,prompt,chosen,rejected
0,You will be given a definition of a task first...,"[256, 512, 4096, 4096]","Sure, I'd be happy to help! Here's the definit..."
1,"const [upgradePrimeToPlusMutation, { isUpgrade...",It appears that the code you've provided is a ...,It's essential to test various scenarios to en...
2,I work in Typescript and use Google Data Stora...,"Sure, I can help you with that. First, let's d...","Sure, here's an example of a generic base clas..."
3,"TASK DEFINITION: In this task, you are given a...","Sure, I'd be happy to help! Here's the transla...",همراحم از زمان محل را به نظر می رسند، من بنا ب...
4,Develop an algorithm that can detect a given t...,"Certainly, I can help you develop an algorithm...","As a helpful and respectful AI assistant, I mu..."
...,...,...,...
1995,"Teacher:Given a premise as input, generate a h...",If all eight jackets that weren't wasting away...,"Yes, I understand the problem. Here's a possib..."
1996,"Question and answer: Q: Can we conclude from ""...","The correct answer is: no.\n\nThe premise, ""Pe...","Confidence: 100%\n\nYes, we can conclude that ..."
1997,Are most Italians devout Catholics?,Hello! Thank you for your question. It's impor...,It is complex to provide a simple answer to su...
1998,can u propose a 2-min music based upon my curr...,"Of course, I'd be happy to help! However, I ne...","Certainly, I can create a playlist for you bas..."


In [29]:
for col in ['prompt', 'chosen', 'rejected']:
    if col == 'prompt':
      max_len = 500
    else:
      max_len = 1024
    train[col] = train[col].str[:max_len]

In [30]:
prompts = train['prompt'].values.tolist()
chosen_responses = train['chosen'].values.tolist()
rejected_responses = train['rejected'].values.tolist()

torch.manual_seed(seed=cfg.seed)
train_dataset = CustomData(
    prompts=prompts,
    chosen=chosen_responses,
    rejected=rejected_responses
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn
)

In [31]:
model = Policy(cfg)

optimizer = optim.AdamW(
    model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=cfg.warmup_steps,
    num_training_steps=cfg.num_epochs*len(train_dataloader)
)


train_losses = []
for epoch in range(cfg.num_epochs):
    for batch_idx, batch in enumerate(train_dataloader):
        torch.cuda.empty_cache()
        gc.collect()
        model.train()

        chosen_input_ids = batch['chosen_input_ids'].long().to(cfg.device)
        chosen_labels = batch['chosen_labels'].long().to(cfg.device)
        rejected_input_ids = batch['rejected_input_ids'].long().to(cfg.device)
        rejected_labels = batch['rejected_labels'].long().to(cfg.device)

        ref_chosen_logits, ref_rejected_logits = get_ref_logits(ref_model, chosen_input_ids, rejected_input_ids)

        policy_chosen_logits = model(chosen_input_ids)
        policy_rejected_logits = model(rejected_input_ids)

        policy_chosen_logps = get_batch_logps(policy_chosen_logits, chosen_labels, pad_id_value = tokenizer.eos_token_id)
        policy_rejected_logps = get_batch_logps(policy_rejected_logits, rejected_labels, pad_id_value= tokenizer.eos_token_id)
        ref_chosen_logps = get_batch_logps(ref_chosen_logits, chosen_labels, pad_id_value= tokenizer.eos_token_id)
        ref_rejected_logps = get_batch_logps(ref_rejected_logits, rejected_labels, pad_id_value= tokenizer.eos_token_id)

        loss = dpo_loss(
            policy_chosen_logps, policy_rejected_logps,
            ref_chosen_logps, ref_rejected_logps,
            beta = cfg.beta
        ) / cfg.accumulation_steps
        train_losses.append(loss.item())
        if not (batch_idx % 100):
          print(
              f'Epoch: {epoch+1} / {cfg.num_epochs}'
              f'| Batch: {batch_idx}/{len(train_dataloader)}'
              f'| DPO Loss: {loss.item():.8f}'
          )

        loss.backward()
        if not (batch_idx % cfg.accumulation_steps):
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

Epoch: 1 / 1| Batch: 0/2000| DPO Loss: 0.34657359
Epoch: 1 / 1| Batch: 100/2000| DPO Loss: 0.34695885
Epoch: 1 / 1| Batch: 200/2000| DPO Loss: 0.34621421
Epoch: 1 / 1| Batch: 300/2000| DPO Loss: 0.34662130
Epoch: 1 / 1| Batch: 400/2000| DPO Loss: 0.34747672
Epoch: 1 / 1| Batch: 500/2000| DPO Loss: 0.34407410
Epoch: 1 / 1| Batch: 600/2000| DPO Loss: 0.34660622
Epoch: 1 / 1| Batch: 700/2000| DPO Loss: 0.34711385
Epoch: 1 / 1| Batch: 800/2000| DPO Loss: 0.34665397
Epoch: 1 / 1| Batch: 900/2000| DPO Loss: 0.34603325
Epoch: 1 / 1| Batch: 1000/2000| DPO Loss: 0.34673220
Epoch: 1 / 1| Batch: 1100/2000| DPO Loss: 0.34637719
Epoch: 1 / 1| Batch: 1200/2000| DPO Loss: 0.34638396
Epoch: 1 / 1| Batch: 1300/2000| DPO Loss: 0.34642366
Epoch: 1 / 1| Batch: 1400/2000| DPO Loss: 0.34655908
Epoch: 1 / 1| Batch: 1500/2000| DPO Loss: 0.34657669
Epoch: 1 / 1| Batch: 1600/2000| DPO Loss: 0.34659299
Epoch: 1 / 1| Batch: 1700/2000| DPO Loss: 0.34626752
Epoch: 1 / 1| Batch: 1800/2000| DPO Loss: 0.34677899
Epoch

In [None]:
#Test model again

test_prompts = ['What is New York', '5+3 =']
responses = []

for prompt in test_prompts:

    input_ids = tokenizer.apply_chat_template(
        get_messages(prompt),
        return_tensors='pt'
    ).to(cfg.device)

    input_ids = input_ids[:, :-1] 

    with torch.no_grad():
        model.eval()
        output_ids = model.backbone.generate(
                input_ids=input_ids,
                max_new_tokens=1024,
                pad_token_id=tokenizer.pad_token_id
        )[0]

    question_response = tokenizer.decode(output_ids, skip_special_tokens=True)
    response = extract_response(question_response)

    print('Prompt:', tokenizer.decode(input_ids[0]))
    print('Response:', response)
    print('\n')