In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Fetch the hotel-review dataset and the tokenizer that matches the
# "openai-gpt" checkpoint used for fine-tuning later in the notebook.
dataset = load_dataset(path="argilla/tripadvisor-hotel-reviews")
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="openai-gpt",
)


In [None]:
# Register "[PAD]" as the tokenizer's padding token so that the
# padding="max_length" call in tokenize_function has a token to pad with.
tokenizer.add_special_tokens(
    special_tokens_dict={"pad_token": "[PAD]"},
)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Uses the notebook-level ``tokenizer`` with ``padding="max_length"`` and
    ``truncation=True``.

    Args:
        examples: Mapping with a "text" entry (a batch when used with
            ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    review_texts = examples["text"]
    encoded_batch = tokenizer(
        review_texts,
        padding="max_length",
        truncation=True,
    )
    return encoded_batch

# Run the tokenizer over every split, one batch of rows at a time.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)

In [6]:
from transformers import TrainingArguments

# Fine-tuning configuration.
#
# The previous version of this cell was a pasted repr() of a TrainingArguments
# object. Three things were removed on purpose:
#   * hub_token="<HUB_TOKEN>"  - a placeholder string, and push_to_hub is off,
#     so no token is needed (never hard-code a real one here either).
#   * logging_dir=<timestamped path of an old run> - leaving it unset lets the
#     library choose a fresh run directory instead of reusing a stale one.
#   * deprecated / legacy keyword arguments (dispatch_batches, split_batches,
#     fp16_backend, no_cuda, push_to_hub_model_id, push_to_hub_organization,
#     use_mps_device, tpu_metrics_debug, fsdp_min_num_params,
#     fsdp_transformer_layer_cls_to_wrap).
# Every other omitted argument was being passed at what is, to the reviewer's
# knowledge, its library default - NOTE(review): confirm against the installed
# transformers version if exact parity with the old run matters.
training_args = TrainingArguments(
    # --- Output ---------------------------------------------------------
    output_dir="output_dir",
    overwrite_output_dir=False,

    # --- Batching -------------------------------------------------------
    # 8 samples per device x 4 accumulation steps = 32 samples per
    # optimizer step on each device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,

    # --- Optimisation ---------------------------------------------------
    optim="adamw_torch",
    learning_rate=5e-05,
    weight_decay=0.0,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    warmup_ratio=0.0,
    warmup_steps=0,
    num_train_epochs=3.0,
    max_steps=-1,  # -1: let num_train_epochs decide the run length

    # --- Evaluation / logging / checkpointing -----------------------------
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_safetensors=True,
    report_to=["tensorboard"],
    disable_tqdm=True,

    # --- Misc -------------------------------------------------------------
    remove_unused_columns=True,
    push_to_hub=False,
    seed=42,
)

In [9]:
# Hold out 20% of the training split for validation.
#
# train_test_split() returns a DatasetDict keyed "train" / "test". It must be
# indexed by key: tuple-unpacking a dict iterates over its keys, which would
# store the strings "train" and "test" instead of the datasets themselves.
# The seed makes the split reproducible across kernel restarts.
# NOTE: re-running this cell splits the already-reduced train set again;
# restart the kernel (or re-run the tokenization cell) first.
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
tokenized_datasets["train"] = split_datasets["train"]
tokenized_datasets["validation"] = split_datasets["test"]

In [10]:
from transformers import Trainer, TrainingArguments, OpenAIGPTForSequenceClassification


# Load the pretrained "openai-gpt" weights with a sequence-classification head.
# NOTE(review): num_labels is left at the library default - confirm it matches
# the number of classes in the dataset, and that the tokenized dataset exposes
# a label column the Trainer can use.
model = OpenAIGPTForSequenceClassification.from_pretrained("openai-gpt")

# A "[PAD]" token was added to the tokenizer after this checkpoint was trained,
# so its id lies outside the checkpoint's embedding matrix. Grow the embeddings
# to the tokenizer's current size, and tell the model which id is padding so the
# classification head can locate the last non-padding token of each sequence.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

training_dataset = tokenized_datasets["train"]
testing_dataset = tokenized_datasets["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=testing_dataset,
)

Some weights of OpenAIGPTForSequenceClassification were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from datasets import load_dataset

# Fetch only the "train" split of the preference dataset.
preference_data = load_dataset(
    path="trl-internal-testing/hh-rlhf-helpful-base-trl-style",
    split="train",
)

def extract_prompt(text):
    """Return the 'content' of the first message in a conversation.

    Args:
        text: Indexable sequence of message mappings, each with a 'content' key.

    Returns:
        The 'content' value of ``text[0]``.

    Raises:
        IndexError: If ``text`` is empty.
        KeyError: If the first message has no 'content' key.
    """
    opening_message = text[0]
    return opening_message['content']

# Add a 'prompt' column holding the first message of each 'chosen' conversation.
# map() merges the returned dict into the existing row, so only the new key has
# to be returned. The lambda argument is called `row` so it does not share a
# name with the `sample` dataset created just below.
preference_data_with_prompt = preference_data.map(
    lambda row: {'prompt': extract_prompt(row['chosen'])}
)

# Keep just the first row for a quick look at the result.
sample = preference_data_with_prompt.select(range(1))
print(sample['prompt'])

README.md:   0%|          | 0.00/964 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/39.9M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.16M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43835 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2354 [00:00<?, ? examples/s]

Map:   0%|          | 0/43835 [00:00<?, ? examples/s]

['Hi, I want to learn to play horseshoes. Can you teach me?']


In [17]:
# A cell only renders the value of its final expression, so the first two
# fields are shown explicitly with display(); the last one is rendered as the
# cell's normal output.
display(sample['prompt'])
display(sample['chosen'])
sample['rejected']

[[{'content': 'Hi, I want to learn to play horseshoes. Can you teach me?',
   'role': 'user'},
  {'content': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.',
   'role': 'assistant'},
  {'content': 'Okay. What else is needed to play, and what are the rules?',
   'role': 'user'},
  {'content': 'Horseshoes are either metal or plastic discs. The horseshoes come in different weights, and the lighter ones are easier to throw, so they are often the standard for beginning players.',
   'role': 'assistant'}]]

In [None]:
# Show the one-row Dataset selected above (rendered as this cell's output).
sample