In [1]:
from datasets import load_dataset
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch



In [2]:
# Load all helpfulness/harmless subsets (share the same schema)
raw_datasets = load_dataset("Anthropic/hh-rlhf")

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 160800
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 8552
    })
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [5]:
model = AutoModelForSequenceClassification.from_pretrained("reward_modeling_anthropic_hh", num_labels=1)

In [6]:
tokens = tokenizer("hello, I am your father.", return_tensors="pt")
print(tokens)

{'input_ids': tensor([[31373,    11,   314,   716,   534,  2988,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [7]:
with torch.no_grad():
    output = model(**tokens)

In [8]:
output.logits

tensor([[-0.2241]])

In [9]:
def preprocess_function(examples):
        new_examples = {
            "input_ids_chosen": [],
            "attention_mask_chosen": [],
            "input_ids_rejected": [],
            "attention_mask_rejected": [],
        }
        for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
            tokenized_chosen = tokenizer(chosen)
            tokenized_rejected = tokenizer(rejected)

            new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"])
            new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
            new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"])
            new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])

        return new_examples

In [10]:
raw_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=16,
    )

In [11]:
max_length = 1024
raw_datasets = raw_datasets.filter(
        lambda x: len(x["input_ids_chosen"]) <= max_length
        and len(x["input_ids_rejected"]) <= max_length
    )

In [12]:
with torch.no_grad():
    output = model(input_ids=torch.tensor([raw_datasets["test"]["input_ids_chosen"][0]]),
        attention_mask=torch.tensor([raw_datasets["test"]["attention_mask_chosen"][0]]))

In [13]:
output.logits

tensor([[-0.2263]])

In [14]:
# calculate f1 score on the test set
from datasets import load_metric
metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits > 0
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [19]:
from tqdm import tqdm
# make predictions on test set using model
prediction_results = np.zeros((len(raw_datasets["test"]),))
for i, sample in enumerate(tqdm(raw_datasets["test"])):
    with torch.no_grad():
        output = model(input_ids=torch.tensor([sample["input_ids_chosen"]]),
            attention_mask=torch.tensor([sample["attention_mask_chosen"]]))
        sample["logits_chosen"] = output.logits

        output = model(input_ids=torch.tensor([sample["input_ids_rejected"]]),
            attention_mask=torch.tensor([sample["attention_mask_rejected"]]))
        sample["logits_rejected"] = output.logits
        
        prediction_results[i] = float(sample["logits_chosen"] > sample["logits_rejected"])

100%|██████████| 8489/8489 [19:07<00:00,  7.40it/s]


In [20]:
np.sum(prediction_results) / len(prediction_results)

0.5763929791494876

In [21]:
# calculate f1 score
compute_metrics((prediction_results, np.ones((len(prediction_results),))))

{'f1': 0.731280824988791}

In [22]:
shp_datasets = load_dataset("stanfordnlp/shp")


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

In [25]:
def transform_example(example):
    history = example['history']
    human_ref_A = example['human_ref_A']
    human_ref_B = example['human_ref_B']
    label = example['labels']

    chosen_text = f"Human: {history} Assistant: {human_ref_A}" if label == 1 else f"Human: {history} Assistant: {human_ref_B}"
    rejected_text = f"Human: {history} Assistant: {human_ref_B}" if label == 1 else f"Human: {history} Assistant: {human_ref_A}"
    
    return {'chosen': chosen_text, 'rejected': rejected_text}

shp_datasets = shp_datasets.map(transform_example, num_proc=16)
shp_datasets = shp_datasets.map(preprocess_function, batched=True, num_proc=16)
shp_datasets = shp_datasets.filter(
        lambda x: len(x["input_ids_chosen"]) <= max_length
        and len(x["input_ids_rejected"]) <= max_length
    )

Map (num_proc=16):   0%|          | 0/348718 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/18436 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/18409 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/348718 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1169 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1344 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1152 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1384 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1703 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Map (num_proc=16):   0%|          | 0/18436 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1128 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1045 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1139 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1465 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1053 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Map (num_proc=16):   0%|          | 0/18409 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1156 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1393 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1153 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1517 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1178 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Filter:   0%|          | 0/348718 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18436 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18409 [00:00<?, ? examples/s]

In [26]:
shp_prediction_results = np.zeros((len(shp_datasets["test"]),))
for i, sample in enumerate(tqdm(shp_datasets["test"])):
    with torch.no_grad():
        output = model(input_ids=torch.tensor([sample["input_ids_chosen"]]),
            attention_mask=torch.tensor([sample["attention_mask_chosen"]]))
        sample["logits_chosen"] = output.logits

        output = model(input_ids=torch.tensor([sample["input_ids_rejected"]]),
            attention_mask=torch.tensor([sample["attention_mask_rejected"]]))
        sample["logits_rejected"] = output.logits
        
        shp_prediction_results[i] = float(sample["logits_chosen"] > sample["logits_rejected"])

100%|██████████| 17791/17791 [48:16<00:00,  6.14it/s] 


In [27]:
compute_metrics((shp_prediction_results, np.ones((len(shp_prediction_results),))))

{'f1': 0.7203481263036754}

In [28]:
np.sum(prediction_results) / len(prediction_results)

0.5763929791494876

In [29]:
shp_model = AutoModelForSequenceClassification.from_pretrained("reward_modeling_shp", num_labels=1)

In [32]:
# make predictions on test set using model
shphh_prediction_results = np.zeros((len(raw_datasets["test"]),))
for i, sample in enumerate(tqdm(raw_datasets["test"])):
    with torch.no_grad():
        output = shp_model(input_ids=torch.tensor([sample["input_ids_chosen"]]),
            attention_mask=torch.tensor([sample["attention_mask_chosen"]]))
        sample["logits_chosen"] = output.logits

        output = shp_model(input_ids=torch.tensor([sample["input_ids_rejected"]]),
            attention_mask=torch.tensor([sample["attention_mask_rejected"]]))
        sample["logits_rejected"] = output.logits
        
        shphh_prediction_results[i] = float(sample["logits_chosen"] > sample["logits_rejected"])

print(f"accuracy {np.sum(shphh_prediction_results) / len(shphh_prediction_results)}")
print(f"f1 score {compute_metrics((shphh_prediction_results, np.ones((len(shphh_prediction_results),))))}")

100%|██████████| 8489/8489 [18:47<00:00,  7.53it/s]

accuracy 0.5289197785369302
f1 score {'f1': 0.6918868942137298}





In [33]:
shpshp_prediction_results = np.zeros((len(shp_datasets["test"]),))
for i, sample in enumerate(tqdm(shp_datasets["test"])):
    with torch.no_grad():
        output = shp_model(input_ids=torch.tensor([sample["input_ids_chosen"]]),
            attention_mask=torch.tensor([sample["attention_mask_chosen"]]))
        sample["logits_chosen"] = output.logits

        output = shp_model(input_ids=torch.tensor([sample["input_ids_rejected"]]),
            attention_mask=torch.tensor([sample["attention_mask_rejected"]]))
        sample["logits_rejected"] = output.logits
        
        shpshp_prediction_results[i] = float(sample["logits_chosen"] > sample["logits_rejected"])

print(f"accuracy {np.sum(shpshp_prediction_results) / len(shpshp_prediction_results)}")
print(f"f1 score {compute_metrics((shpshp_prediction_results, np.ones((len(shpshp_prediction_results),))))}")

100%|██████████| 17791/17791 [47:12<00:00,  6.28it/s]  

accuracy 0.6979933674329717
f1 score {'f1': 0.8221390976199145}



