### Dataset Loading and Preparation

In [4]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import (
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

### Train Reward Model

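Prepare the reward model dataset. To keep training fast, we fit the reward model on a limited subset of the IMDB data: 3,000 shuffled training examples, evaluated on 300 shuffled test examples.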

In [3]:
# Load the "imdb" dataset; its "train" and "test" splits are subsampled in the next cell.
imdb = load_dataset("imdb")

In [None]:
# Shuffle each split with a fixed seed, then keep the first 3000 training and
# 300 test examples. `select` accepts any iterable of indices, so a plain
# `range` replaces the redundant `[i for i in list(range(n))]` construction.
reduced_train_dataset = imdb["train"].shuffle(seed=42).select(range(3000))
reduced_test_dataset = imdb["test"].shuffle(seed=42).select(range(300))

In [3]:
from trl import RewardTrainer, RewardConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [10]:
# Checkpoint shared by the reward model and its tokenizer.
REWARD_CHECKPOINT = "distilbert/distilbert-base-cased"

# `num_labels` configures the classification head, so it belongs on the model,
# not on the tokenizer (where it was previously passed and did not configure
# the head). Two labels keeps the head the model was effectively built with
# and matches the argmax-based accuracy/F1 metrics and the binary IMDB labels.
# NOTE(review): if a single scalar reward logit was intended, switch to
# num_labels=1 and adapt compute_metrics accordingly.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_CHECKPOINT,
    num_labels=2,
)
reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_CHECKPOINT)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch with the reward tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry (called with ``batched=True``
            by ``Dataset.map`` in this notebook).

    Returns:
        Whatever ``reward_tokenizer`` returns for the texts, with truncation
        enabled.
    """
    texts = examples["text"]
    encoded = reward_tokenizer(texts, truncation=True)
    return encoded

In [1]:
# Tokenize both reduced splits in batched mode with the same preprocessing function.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (reduced_train_dataset, reduced_test_dataset)
)

NameError: name 'reduced_train_dataset' is not defined

In [None]:
# Collator built around the reward tokenizer; it is handed to the Trainer as
# `data_collator` (preprocessing only truncates, so padding is left to this step).
reward_collator = DataCollatorWithPadding(tokenizer=reward_tokenizer)

In [5]:
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for the reward classifier.

    The metrics are computed directly with numpy instead of reloading them
    through the deprecated ``load_metric`` on every evaluation call.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is reduced with
            argmax over its last axis to obtain predicted class ids.

    Returns:
        dict with float entries ``"accuracy"`` and ``"f1"``. F1 treats label
        ``1`` as the positive class and is ``0.0`` when there are no true
        positives, false positives, or false negatives to score.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    references = np.asarray(labels)

    # Guard the empty case so the mean does not produce NaN.
    accuracy = float(np.mean(predictions == references)) if references.size else 0.0

    predicted_pos = predictions == 1
    actual_pos = references == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    # F1 = 2*TP / (2*TP + FP + FN); defined as 0.0 when the denominator is 0.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = (2 * true_pos / denominator) if denominator else 0.0

    return {"accuracy": accuracy, "f1": float(f1)}

In [None]:
# Hyperparameters for fine-tuning the reward model. A checkpoint is saved once
# per epoch under "new_reward/".
# NOTE(review): push_to_hub=True presumably needs Hugging Face Hub credentials
# to be configured before training — confirm.
training_args = TrainingArguments(
    output_dir="new_reward/",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    push_to_hub=True,
)

# Wire the model, tokenized splits, collator and metric function into a Trainer.
reward_trainer = Trainer(
    model=reward_model,
    args=training_args,
    tokenizer=reward_tokenizer,
    data_collator=reward_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the reward model on the tokenized training subset.
reward_trainer.train()

In [None]:
# Evaluate on the tokenized test subset; compute_metrics supplies accuracy and F1.
reward_trainer.evaluate()

### SFT Model

#### Train and Eval dataset preparations

#### Model Training

In [27]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [28]:
# Load an already fine-tuned causal LM checkpoint and its tokenizer; they are
# passed to the WARP trainer as the model/tokenizer pair.
# NOTE(review): GPT-2 tokenizers typically ship without a pad token — confirm
# that WARPTrainer sets one if it batches generations.
sft_model_name = "lvwerra/gpt2-imdb"
sft_model = AutoModelForCausalLM.from_pretrained(sft_model_name)
sft_tokenizer = AutoTokenizer.from_pretrained(sft_model_name)

### WARP Training

In [None]:
from src.trainer.warp_config import WARPConfig
from src.trainer.warp_trainer import WARPTrainer

In [None]:
# WARP configuration built with no overrides (whatever defaults WARPConfig defines).
warp_config = WARPConfig()

# Hand the SFT model and the reward model, each with its own tokenizer, to the
# project's WARP trainer.
warp_trainer = WARPTrainer(
    config=warp_config,
    model=sft_model,
    tokenizer=sft_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer
)

In [None]:
# Start WARP training with the configuration and models set up in the previous cell.
warp_trainer.train()