In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
from trl import RewardTrainer, RewardConfig
import torch

In [11]:
%env WANDB_PROJECT=WARP_imdb

env: WANDB_PROJECT=WARP_imdb


In [12]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-cased")

# поменять стандартный классифаер в конце на предсказание скаляра (в нашем случае - reward)
model.classifier = torch.nn.Linear(model.pre_classifier.weight.size(1), 1)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
data = datasets.load_from_disk('../comment_pairs')

# Reward trainer Должен иметь следующие фичи на входе 
* `input_ids_chosen`
* `attention_mask_chosen`
* `input_ids_rejected` 
* `attention_mask_rejected`

**Chosen** в нашем случае - позитивный комментарий, **Rejected** - негативный

In [16]:
def tokenize_data(sample):
    tokenized_pos = tokenizer(sample['positive_comment'], padding='max_length', truncation=True, return_tensors='pt')
    tokenized_neg = tokenizer(sample['negative_comment'], padding='max_length', truncation=True, return_tensors='pt')

    sample['input_ids_chosen'] = tokenized_pos['input_ids']
    sample['attention_mask_chosen'] = tokenized_pos['attention_mask']
    sample['input_ids_rejected'] = tokenized_neg['input_ids']
    sample['attention_mask_rejected'] = tokenized_neg['attention_mask']

    return sample


data = data.map(tokenize_data, batched=True, num_proc=16)
data.set_format('torch')

##### Обучение reward модели с помощью trl 

In [17]:
reward_config = RewardConfig(
    output_dir="../reward_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="../logs",
    learning_rate=1e-5,
    do_eval=False,
    report_to='wandb',
    max_length=512,
    remove_unused_columns=False,
    logging_steps=50,
    save_steps=500,
    run_name='reward_model_train'
)

trainer = RewardTrainer(
    model=model,
    args=reward_config,
    train_dataset=data,
    tokenizer=tokenizer,
    compute_metrics=None
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnikitasergeev692[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
50,0.686
100,0.3865
150,0.1846
200,0.1521
250,0.1071
300,0.1325
350,0.116
400,0.0961
450,0.0827
500,0.0834


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=1173, training_loss=0.11806740917309877, metrics={'train_runtime': 371.847, 'train_samples_per_second': 100.848, 'train_steps_per_second': 3.155, 'total_flos': 0.0, 'train_loss': 0.11806740917309877, 'epoch': 3.0})

In [18]:
trainer.save_model('../reward_model/final_checkpoint')