In [1]:
!pip install -q transformers trl datasets wandb torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!pip install --upgrade accelerate==1.6.0
!pip install --upgrade transformers==4.51.3


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from trl import RewardTrainer, RewardConfig
from datasets import load_dataset
import warnings

warnings.filterwarnings('ignore')

2025-06-09 01:13:32.163569: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749420812.174068 1068555 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749420812.177339 1068555 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749420812.186749 1068555 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749420812.186758 1068555 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749420812.186759 1068555 computation_placer.cc:177] computation placer alr

In [7]:
wandb_logging = True
pretrained_model = "HuggingFaceTB/SmolLM2-135M-Instruct"  
reward_learning_rate = 5e-5
reward_training_cycles = 1
max_sequence_len = 512  
training_batch_size = 8  
validation_batch_size = 8  
reward_model_save_path = os.path.join(os.getcwd(), "trained_reward_model")  

reinforce_batch_dim = 4  
reinforce_learning_rate = 5e-5
reinforce_epoch_count = 1
max_generated_tokens = 512
reinforce_output_path = os.path.join(os.getcwd(), "reinforce_baseline")  

os.makedirs(reward_model_save_path, exist_ok=True)  
os.makedirs(reinforce_output_path, exist_ok=True)

In [8]:
def get_tokenizer(tokenizer_path: str):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, padding_side="left")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def extract_text(conversation):
    if isinstance(conversation, str):
        return conversation
    if isinstance(conversation, list):
        return "\n".join(
            [f"{turn['role'].capitalize()}: {turn['content']}"
             if isinstance(turn, dict)
             else str(turn)
             for turn in conversation]
        )
    return str(conversation)

def edit_reward_dataset(batch, tokenizer):
    chosen_texts = [extract_text(c) for c in batch["chosen"]]
    rejected_texts = [extract_text(r) for r in batch["rejected"]]

    kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_sequence_len,
    }

    tokenized_chosen = tokenizer(chosen_texts, **kwargs)
    tokenized_rejected = tokenizer(rejected_texts, **kwargs)

    return {
        "input_ids_chosen": tokenized_chosen["input_ids"],
        "attention_mask_chosen": tokenized_chosen["attention_mask"],
        "input_ids_rejected": tokenized_rejected["input_ids"],
        "attention_mask_rejected": tokenized_rejected["attention_mask"],
    }

def edit_reinforce_dataset(batch, tokenizer):
    prompts = []
    for conversation in batch["chosen"]:
        if isinstance(conversation, list):
            prompt = "\n".join(
                [f"{turn['role'].capitalize()}: {turn['content']}" for turn in conversation[:-1]]
            ) + "\nAssistant: "
        else:
            prompt = str(conversation) + "\nAssistant: "
        prompts.append(prompt)

    tokens = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=max_sequence_len,
    )

    return {
        k: v for k, v in tokens.items()
    }

def splitting_dataset(test_size=0.2, train_subset_sizes=None, val_subset_size=None):
    dataset = load_dataset("juyoungml/HelpSteer2-binarized")

    if "train" in dataset:
        train_dataset = dataset["train"]
        split_dataset = train_dataset.train_test_split(test_size=test_size, seed=42)
        dataset["train"] = split_dataset["train"]
        dataset["validation"] = split_dataset["test"]

    if train_subset_sizes is not None:
        dataset["train"] = dataset["train"].select(range(train_subset_sizes))

    if val_subset_size is not None and "validation" in dataset:
        dataset["validation"] = dataset["validation"].select(range(val_subset_size))

    return dataset

In [9]:
def train_reward_model(output_model_dir=reward_model_save_path):
    if wandb_logging:
        import wandb

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Используемое устройство: {device}")

    tokenizer = get_tokenizer(pretrained_model)
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model,
        num_labels=1
    ).to(device)

    for param in model.parameters():
        param.requires_grad = True

    dataset = splitting_dataset(test_size=0.2)
    
    dataset = dataset.map(lambda x: edit_reward_dataset(x, tokenizer), batched=True)

    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]

    training_args = RewardConfig(
        output_dir=output_model_dir,
        per_device_train_batch_size=training_batch_size,
        per_device_eval_batch_size=validation_batch_size,
        num_train_epochs=reward_training_cycles,
        learning_rate=reward_learning_rate,
        gradient_accumulation_steps=1,
        fp16=torch.cuda.is_available(),
        max_length=max_sequence_len,
        remove_unused_columns=True,
        report_to="wandb" if wandb_logging else "none",
        gradient_checkpointing=True,
        logging_steps=10,
    )

    trainer = RewardTrainer(
        model=model,
        args=training_args,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    print("\nНачальная оценка модели...")
    metrics = trainer.evaluate()
    print("Метрики начальной оценки:", metrics)
    if wandb_logging:
        import wandb
        wandb.log({"initial_metrics": metrics})

    print("\nНачало обучения...")
    trainer.train()

    trainer.save_model(output_model_dir)
    print(f"\nМодель сохранена в: {output_model_dir}")

    print("\nФинальная оценка модели...")
    metrics = trainer.evaluate()
    print("Метрики после обучения:", metrics)
    if wandb_logging:
        import wandb
        wandb.log({"final_metrics": metrics})
        wandb.finish()

    print("\nОбучение завершено!")

train_reward_model()

Используемое устройство: cuda


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



Начальная оценка модели...


[34m[1mwandb[0m: Currently logged in as: [33mmregorova[0m ([33mmregorova-mipt[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Метрики начальной оценки: {'eval_loss': 0.9563356637954712, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.4837144837144837, 'eval_runtime': 8.6331, 'eval_samples_per_second': 167.378, 'eval_steps_per_second': 20.966}

Начало обучения...


Step,Training Loss
10,0.8636
20,0.8737
30,0.7124
40,0.8172
50,0.9327
60,0.8072
70,0.7662
80,0.732
90,0.6718
100,0.8425



Модель сохранена в: /home/user/notebooks/Maria/tb/trained_reward_model

Финальная оценка модели...


Метрики после обучения: {'eval_loss': 0.637004554271698, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.6281163434903048, 'eval_runtime': 8.6831, 'eval_samples_per_second': 166.415, 'eval_steps_per_second': 20.845, 'epoch': 1.0}


0,1
eval/accuracy,▁█
eval/loss,█▁
eval/model_preparation_time,▁▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▅▆▃▆▆▂▃▂▃▂▂▂▂▅▂▂▄▃▂▄▂▂▂▃▂▂▁▃▂▁▄▂▄▂▂▂▃▆▂
train/learning_rate,████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁

0,1
eval/accuracy,0.62812
eval/loss,0.637
eval/model_preparation_time,0.0027
eval/runtime,8.6831
eval/samples_per_second,166.415
eval/steps_per_second,20.845
total_flos,0.0
train/epoch,1.0
train/global_step,723.0
train/grad_norm,3.93982



Обучение завершено!


In [10]:
import sys
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, DataCollatorWithPadding

In [11]:
!pip install wandb -qU
import wandb
wandb.login()


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


True

In [12]:
def edit_reinforce_dataset2(example, tokenizer):
    encoding = tokenizer(
        example["chosen"],
        padding="max_length",
        truncation=True,
        max_length=max_sequence_len,
    )
    return {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
    }

def train_reinforce(reward_model_path, output_model_dir):
    if wandb_logging:
        if IS_ON_KAGGLE:
            user_secrets = UserSecretsClient()
            my_secret = user_secrets.get_secret("wandb_api_key")
            wandb.login(key=my_secret)

        wandb.init(
            project="reinforce-test",
            config={
                "batch_size": reinforce_batch_dim,
                "learning_rate": reinforce_learning_rate,
            },
        )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    from transformers import DataCollatorWithPadding

In [13]:
def train_reinforce(reward_model_path, output_model_dir):
    wandb.init(project="huggingface", name="reinforce-run")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = get_tokenizer(pretrained_model)
    model = AutoModelForCausalLM.from_pretrained(pretrained_model).to(device)

    reward_tokenizer = get_tokenizer(pretrained_model)
    reward_model = AutoModelForSequenceClassification.from_pretrained(
        reward_model_path
    ).to(device)
    reward_model.eval()

    dataset = splitting_dataset(
        test_size=0.2, train_subset_sizes=1000, val_subset_size=300

    )
    
    train_dataset = dataset["train"].map(
        lambda x: edit_reinforce_dataset2(x, tokenizer)
    )
    val_dataset = dataset["validation"].map(
        lambda x: edit_reinforce_dataset2(x, tokenizer)
    )
    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
    val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    train_loader = DataLoader(
        train_dataset, batch_size=reinforce_batch_dim, shuffle=True, collate_fn=data_collator
    )
    val_loader = DataLoader(
        val_dataset, batch_size=reinforce_batch_dim, collate_fn=data_collator
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=reinforce_learning_rate)

    def count_rewards(generated_sequences, input_ids):
        texts = reward_tokenizer.batch_decode(
            generated_sequences, skip_special_tokens=True
        )
        inputs = reward_tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_sequence_len,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            rewards = reward_model(**inputs).logits.squeeze(-1)
        return rewards

    def evaluate(model, dataloader):
        model.eval()
        all_rewards = []
        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Evaluation"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                generated = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=max_generated_tokens,
                    pad_token_id=tokenizer.pad_token_id,
                )
                rewards = count_rewards(generated, input_ids)
                all_rewards.extend(rewards.cpu().numpy())
        return np.mean(all_rewards), all_rewards

    mean_reward_before, all_rewards_before = evaluate(model, val_loader)
    print(f"Initial mean reward: {mean_reward_before:.4f}")
    if wandb_logging:
        wandb.log(
            {
                "eval/mean_reward_before": mean_reward_before,
                "eval/all_rewards_before": all_rewards_before,
            }
        )

    total_rewards = 0.0
    step_count = 0
    baseline = 0.0

    for epoch in range(reinforce_epoch_count):
        model.train()

        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Training epoch {epoch+1}")):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_generated_tokens,
                pad_token_id=tokenizer.pad_token_id,
                return_dict_in_generate=True,
                output_scores=True,
            )

            full_sequences = outputs.sequences  # input + generated
            generated_tokens = full_sequences[:, input_ids.shape[1] :]

            rewards = count_rewards(full_sequences, input_ids)

            gen_attention_mask = torch.ones(
                generated_tokens.shape, dtype=torch.long, device=device
            )
            full_attention_mask = torch.cat([attention_mask, gen_attention_mask], dim=1)

            logits = model(full_sequences, attention_mask=full_attention_mask).logits
            log_probs = torch.log_softmax(logits[:, :-1], dim=-1)

            selected_log_probs = log_probs.gather(
                -1, generated_tokens.unsqueeze(-1)
            ).squeeze(-1)
            aggregated_log_probs = selected_log_probs.sum(dim=1)

            total_rewards += rewards.sum().item()
            step_count += rewards.shape[0]
            baseline = total_rewards / step_count
            baseline_tensor = torch.tensor(baseline, device=device)
            advantages = rewards - baseline_tensor

            loss = -(advantages * aggregated_log_probs).mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if wandb_logging:
                wandb.log(
                    {
                        "train/loss": loss.item(),
                        "train/avg_reward": rewards.mean().item(),
                        "train/avg_advantage": advantages.mean().item(),
                        "train/baseline": baseline,
                        "train/step": batch_idx,
                    }
                )

            if batch_idx % 10 == 0:
                print(
                    f"Batch {batch_idx}: loss: {loss.item():.4f}, avg_reward: {rewards.mean().item():.4f}"
                )

    mean_reward_after, all_rewards_after = evaluate(model, val_loader)
    print(f"Final mean reward: {mean_reward_after:.4f}")
    if wandb:
        wandb.log(
            {
                "eval/mean_reward_after": mean_reward_after,
                "eval/all_rewards_after": all_rewards_after,
            }
        )
        wandb.finish()

    os.makedirs(output_model_dir, exist_ok=True)
    model.save_pretrained(output_model_dir)
    print("Model saved to:", output_model_dir)

reward_model_path = reward_model_save_path 
output_model_dir = "./reinforce_with_alignment"

train_reinforce(reward_model_path, output_model_dir)

Evaluation: 100%|███████████████████████████████████████████████████████████████████████| 75/75 [07:31<00:00,  6.02s/it]


Initial mean reward: 0.0607


Training epoch 1:   0%|▎                                                                | 1/250 [00:09<37:37,  9.07s/it]

Batch 0: loss: -168.5786, avg_reward: 1.0222


Training epoch 1:   4%|██▊                                                             | 11/250 [01:09<21:44,  5.46s/it]

Batch 10: loss: -119.0837, avg_reward: 0.3842


Training epoch 1:   8%|█████▍                                                          | 21/250 [02:33<29:45,  7.80s/it]

Batch 20: loss: -6359.0674, avg_reward: -0.1524


Training epoch 1:  12%|███████▉                                                        | 31/250 [03:21<08:08,  2.23s/it]

Batch 30: loss: -0.1573, avg_reward: 0.3655


Training epoch 1:  16%|██████████▍                                                     | 41/250 [04:45<29:22,  8.43s/it]

Batch 40: loss: 5431.2979, avg_reward: 0.8311


Training epoch 1:  20%|█████████████                                                   | 51/250 [05:51<20:10,  6.08s/it]

Batch 50: loss: 3559.6919, avg_reward: 0.5628


Training epoch 1:  24%|███████████████▌                                                | 61/250 [06:23<09:44,  3.09s/it]

Batch 60: loss: 196.0574, avg_reward: 0.7266


Training epoch 1:  29%|██████████████████▍                                             | 72/250 [06:45<02:10,  1.37it/s]

Batch 70: loss: 374.0320, avg_reward: 0.9149


Training epoch 1:  33%|████████████████████▉                                           | 82/250 [06:49<00:45,  3.73it/s]

Batch 80: loss: 15.5010, avg_reward: 0.4043


Training epoch 1:  37%|███████████████████████▌                                        | 92/250 [07:00<01:46,  1.49it/s]

Batch 90: loss: 11.7135, avg_reward: 0.4674


Training epoch 1:  41%|█████████████████████████▋                                     | 102/250 [07:20<03:55,  1.59s/it]

Batch 100: loss: -18.5863, avg_reward: -0.0928


Training epoch 1:  45%|████████████████████████████▏                                  | 112/250 [07:31<01:00,  2.27it/s]

Batch 110: loss: 2.7252, avg_reward: 0.3806


Training epoch 1:  49%|██████████████████████████████▋                                | 122/250 [07:34<00:24,  5.30it/s]

Batch 120: loss: 4.3792, avg_reward: 0.1931


Training epoch 1:  53%|█████████████████████████████████▎                             | 132/250 [07:46<02:00,  1.03s/it]

Batch 130: loss: -741.0897, avg_reward: 0.1683


Training epoch 1:  56%|███████████████████████████████████▌                           | 141/250 [07:56<03:46,  2.08s/it]

Batch 140: loss: -16.9304, avg_reward: 0.0537


Training epoch 1:  60%|██████████████████████████████████████                         | 151/250 [08:25<06:01,  3.65s/it]

Batch 150: loss: -1.9141, avg_reward: 0.2622


Training epoch 1:  64%|████████████████████████████████████████▌                      | 161/250 [09:46<12:48,  8.63s/it]

Batch 160: loss: 1116.4875, avg_reward: 1.9947


Training epoch 1:  68%|███████████████████████████████████████████                    | 171/250 [11:17<11:49,  8.98s/it]

Batch 170: loss: 9.5164, avg_reward: 1.1755


Training epoch 1:  72%|█████████████████████████████████████████████▌                 | 181/250 [12:46<10:13,  8.89s/it]

Batch 180: loss: 18.8554, avg_reward: 2.1646


Training epoch 1:  76%|████████████████████████████████████████████████▏              | 191/250 [14:17<08:51,  9.00s/it]

Batch 190: loss: 17.9464, avg_reward: 1.9930


Training epoch 1:  80%|██████████████████████████████████████████████████▋            | 201/250 [15:48<07:28,  9.14s/it]

Batch 200: loss: 20.6210, avg_reward: 2.6248


Training epoch 1:  84%|█████████████████████████████████████████████████████▏         | 211/250 [17:21<06:06,  9.40s/it]

Batch 210: loss: 11.3108, avg_reward: 2.1059


Training epoch 1:  88%|███████████████████████████████████████████████████████▋       | 221/250 [18:54<04:31,  9.37s/it]

Batch 220: loss: 12.3202, avg_reward: 2.1166


Training epoch 1:  92%|██████████████████████████████████████████████████████████▏    | 231/250 [20:25<02:50,  8.95s/it]

Batch 230: loss: 17.1815, avg_reward: 2.2655


Training epoch 1:  96%|████████████████████████████████████████████████████████████▋  | 241/250 [21:57<01:23,  9.26s/it]

Batch 240: loss: 10.6713, avg_reward: 2.1545


Training epoch 1: 100%|███████████████████████████████████████████████████████████████| 250/250 [23:19<00:00,  5.60s/it]
Evaluation: 100%|███████████████████████████████████████████████████████████████████████| 75/75 [11:22<00:00,  9.10s/it]

Final mean reward: 1.8854





0,1
eval/mean_reward_after,▁
eval/mean_reward_before,▁
train/avg_advantage,▂▄▃▁▄▂▄▃▅▃▃▄▄▃▄▄▂▃▅▄▄▆▄▄▅▅▇▅▆██▇█▅█▆▅▇▇▇
train/avg_reward,▃▁▄▄▃▄▁▄▂▃▂▄▄▅▂▃▆▂▆▄▃▃▇▆▇████▆█▆▇▇█▆▇█▇█
train/baseline,▄▄▂▂▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▄▄▅▅▅▅▆▆▆▆▇▇▇██
train/loss,▇█▇▁▅█▄▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▇▇▇▇▇▇▇▇▇▇▇▇
train/step,▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇███

0,1
eval/mean_reward_after,1.88537
eval/mean_reward_before,0.06072
train/avg_advantage,0.7579
train/avg_reward,1.67465
train/baseline,0.91675
train/loss,4.90406
train/step,249.0


Model saved to: ./reinforce_with_alignment
