In [1]:
import random
import pandas as pd
from operator import itemgetter
import torch
import warnings
warnings.filterwarnings('ignore')
from datasets import Dataset, load_dataset
from transformers import AutoModelForSequenceClassification,AutoTokenizer,TrainingArguments
from trl import RewardTrainer

In [2]:
df = pd.read_csv('scenarioData3_llm_reward.csv')
print(df.head())

                                              prompt  \
0  <bos><start_of_turn>user\nGiven the mission ba...   
1  <bos><start_of_turn>user\nGiven the mission ba...   
2  <bos><start_of_turn>user\nGiven the mission ba...   
3  <bos><start_of_turn>user\nGiven the mission ba...   
4  <bos><start_of_turn>user\nGiven the mission ba...   

                                          completion  label  
0  ## Operation Crimson Echo - Success Scenario\n...   True  
1  ## Operation Crimson Echo - Success Scenario\n...   True  
2  ## Operation Crimson Echo - Success Scenario\n...   True  
3  ## Operation Crimson Echo - Success Scenario\n...   True  
4  ## Operation Crimson Echo - Success Scenario\n...   True  


In [3]:
completion_lst = df['completion'].tolist()

In [4]:
new_df = pd.DataFrame(columns=['chosen_response', 'rejected_response'])
chosen_lst = []
rejected_lst = [] 

for i in range(50): 
    chosen_lst.append(completion_lst[i])
    rejected_lst.append(completion_lst[i+50])

new_df['chosen_response'] = chosen_lst
new_df['rejected_response'] = rejected_lst

In [5]:
print(new_df.head())

                                     chosen_response  \
0  ## Operation Crimson Echo - Success Scenario\n...   
1  ## Operation Crimson Echo - Success Scenario\n...   
2  ## Operation Crimson Echo - Success Scenario\n...   
3  ## Operation Crimson Echo - Success Scenario\n...   
4  ## Operation Crimson Echo - Success Scenario\n...   

                                   rejected_response  
0  ## Operation Crimson Echo - Success Scenario\n...  
1  ## Operation Crimson Echo - Success Scenario\n...  
2  ## Operation Crimson Echo - Success Scenario\n...  
3  ## Operation Crimson Echo - Success Scenario\n...  
4  ## Operation Crimson Echo - Success Scenario\n...  


In [6]:
#Select a base model whch we need to train for reward modeling.
model_name = "distilroberta-base"
#model_name = "allenai/longformer-base-4096"
#model_name = "sshleifer/distilbart-cnn-12-6"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def formatting_func(row):
    kwargs = {"padding": "max_length", "truncation": True, "max_length": 512, "return_tensors": "pt"}
    #kwargs = {"padding": "max_length", "truncation": True, "max_length": 2048, "return_tensors": "pt"}
    prompt_plus_chosen_response = row["chosen_response"]
    prompt_plus_rejected_response = row["rejected_response"]
    tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)
    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }
    
formatted_dataset = new_df.apply(formatting_func, axis=1)
formatted_df= pd.DataFrame(list(formatted_dataset))

In [8]:
print(formatted_df.iloc[0, :])

input_ids_chosen           [tensor(0), tensor(48342), tensor(13346), tens...
attention_mask_chosen      [tensor(1), tensor(1), tensor(1), tensor(1), t...
input_ids_rejected         [tensor(0), tensor(48342), tensor(13346), tens...
attention_mask_rejected    [tensor(1), tensor(1), tensor(1), tensor(1), t...
Name: 0, dtype: object


In [9]:
from datasets import Dataset
formatted_dict = {
    'input_ids_chosen': formatted_df['input_ids_chosen'].tolist(), 
    'attention_mask_chosen': formatted_df['attention_mask_chosen'].tolist(), 
    'input_ids_rejected': formatted_df['input_ids_rejected'].tolist(), 
    'attention_mask_rejected': formatted_df['attention_mask_rejected'].tolist()
}
formatted_dataset = Dataset.from_dict(formatted_dict)

In [10]:
print(formatted_dataset)

Dataset({
    features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 50
})


In [11]:
tokenizer.model_max_length

512

In [12]:
formatted_dataset = formatted_dataset.train_test_split()
# Configuring the training arguments
training_args = TrainingArguments(
    output_dir="./reward_model2_4",
    per_device_train_batch_size=32,
    evaluation_strategy="steps",
    logging_steps=1,
    num_train_epochs=50,
    report_to=None,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine", 
    learning_rate=1e-5,
    no_cuda=True
)

# Loading the RewardTrainer from TRL
trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
)

trainer.train()

wandb: Currently logged in as: pkailin2002 (nus-pkailin). Use `wandb login --relogin` to force relogin


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
1,0.6837,0.693043,0.538462
2,0.7138,0.69293,0.538462
3,0.7041,0.692681,0.692308
4,0.6873,0.692344,0.769231
5,0.702,0.691867,0.846154
6,0.6786,0.691323,0.923077
7,0.6846,0.69063,0.923077
8,0.6772,0.689844,1.0
9,0.6824,0.688861,1.0
10,0.6787,0.687809,1.0


TrainOutput(global_step=100, training_loss=0.2382880789099727, metrics={'train_runtime': 3396.9533, 'train_samples_per_second': 0.545, 'train_steps_per_second': 0.029, 'total_flos': 0.0, 'train_loss': 0.2382880789099727, 'epoch': 50.0})

In [13]:
trainer.save_model("./rewardmodel5")