In [None]:
from trl import DPOTrainer, DPOConfig
import torch
import random
import numpy as np

# from datasets.dataset_dict import DatasetDict
from datasets import Dataset
from evaluate import load
import numpy as np
# from datasets import load_dataset
import pandas as pd
import os
import logging
import warnings
import json
from huggingface_hub import login
import wandb
# import transformers
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    set_seed,
)

from torch.utils.data import IterableDataset, DataLoader
from trl import DPOTrainer, DPOConfig

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
logging.getLogger("lightning.pytorch.utilities.rank_zero").setLevel(logging.FATAL)
torch.set_float32_matmul_precision('medium') 
logging.getLogger("lightning").setLevel(logging.CRITICAL)
logging.getLogger("lightning.fabric.utilities.rank_zero").setLevel(logging.CRITICAL)
logging.getLogger("lightning.pytorch.utilities.rank_zero").setLevel(logging.CRITICAL)


def set_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    set_seed(seed)
    torch.backends.cudnn.deterministic = True


set_random_seed(42)

2025-05-28 21:53:21.847210: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-28 21:53:23.291867: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
CONFIG = json.load(open('/home/jupyter/datasphere/project/tokens.json'))
tok = CONFIG["HF_TOK"]
wdb_tok = CONFIG['WANDB_API_KEY']

login(token=tok)
wandb.login(key=wdb_tok)

In [3]:
dev = '/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.dev.tsv'
train = '/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.train.tsv'
test = '/home/jupyter/datasphere/project/rugec/data/RULEC-GEC.test.tsv'

rulec_train = pd.read_csv(train, delimiter='\t')
rulec_test = pd.read_csv(test, delimiter='\t')
rulec_dev = pd.read_csv(dev, delimiter='\t')

clang8 = pd.read_csv('/home/jupyter/datasphere/project/rugec/data/clang8_source_target_ru.spacy_tokenized.tsv', sep='\t')
gera_train = pd.read_csv('/home/jupyter/datasphere/project/rugec/data/GERA.train.tsv', sep='\t')
gera_test = pd.read_csv('/home/jupyter/datasphere/project/rugec/data/GERA.test.tsv', sep='\t')
gera_dev = pd.read_csv('/home/jupyter/datasphere/project/rugec/data/GERA.dev.tsv', sep='\t')
relco = pd.read_csv('/home/jupyter/datasphere/project/rugec/data/relco_filtered.tsv', sep='\t')

# train.corrupt_sent = train.corrupt_sent.map(lambda x: str(x))
# test.corrupt_sent = test.corrupt_sent.map(lambda x: str(x))
# dev.corrupt_sent = dev.corrupt_sent.map(lambda x: str(x))
# train.correct_sent = train.correct_sent.map(lambda x: str(x))
# test.correct_sent = test.correct_sent.map(lambda x: str(x))
# dev.correct_sent = dev.correct_sent.map(lambda x: str(x))

# train_all = pd.concat([rulec_train.sample(frac=10, replace=True), clang8, relco])
train_all = pd.concat([rulec_train, clang8, relco, gera_train, gera_dev])
train_all.corrupt_sent = train_all.corrupt_sent.map(lambda x: str(x))
train_all.correct_sent = train_all.correct_sent.map(lambda x: str(x))
train_all = train_all.sample(frac=1)
fine_tune = {'train':Dataset.from_dict({'corrupt_sent':train_all['corrupt_sent'],'correct_sent' : train_all['correct_sent']}),
     'test':Dataset.from_dict({'corrupt_sent':rulec_test['corrupt_sent'],'correct_sent' : rulec_test['correct_sent']}),
      'dev':Dataset.from_dict({'corrupt_sent' : rulec_dev['corrupt_sent'], 'correct_sent':rulec_dev['correct_sent']})
     }
fine_tune

{'train': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 59947
 }),
 'test': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 5000
 }),
 'dev': Dataset({
     features: ['corrupt_sent', 'correct_sent'],
     num_rows: 2500
 })}

In [4]:
def get_model_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        return torch.device("mps")
    else:
        return torch.device("cpu")
    
device = get_model_device()
device

device(type='cuda')

In [5]:
name = 'mika5883/ft_rugec_A'
config = AutoConfig.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSeq2SeqLM.from_pretrained(name).to(device)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [None]:
comet_metric = load('comet') 

Downloading builder script: 100%|██████████| 6.97k/6.97k [00:00<00:00, 17.5MB/s]
Fetching 5 files: 100%|██████████| 5/5 [00:40<00:00,  8.16s/it]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../../../tmp/xdg_cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.


In [None]:
class OnTheFlyDpoDataset(IterableDataset):
    def __init__(self, prompts, golds, model, tokenizer, comet_metric, num_beams=6, num_beam_groups=3, batch_size=8):
        self.prompts = prompts
        self.golds = golds
        self.model = model
        self.tokenizer = tokenizer
        self.comet = comet_metric
        self.num_beams = num_beams
        self.num_beam_groups = num_beam_groups
        self.batch_size = batch_size
        self.ep = 0
        self.save_data = {}
    def __len__(self):
        return len(self.prompts)
    def __iter__(self):
        for i in range(0, len(self.prompts), self.batch_size):
            batch_prompts = self.prompts[i : i + self.batch_size]
            batch_golds   = self.golds[i : i + self.batch_size]

            inputs = self.tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=128,
            ).to(self.model.device)
            # outputs = self.model.generate(
            #     **inputs,
            #     num_beams=self.num_beams,
            #     num_return_sequences=self.num_beams,
            #     max_length=128,
            # )
            outputs = self.model.generate(
                **inputs,
                num_beams=self.num_beams,
                num_return_sequences=self.num_beams,
                num_beam_groups = self.num_beam_groups,
                diversity_penalty=1.0, 
                do_sample=False,
                max_length=128,
            )
            decoded = self.tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            beams_grouped = [
                decoded[j : j + self.num_beams]
                for j in range(0, len(decoded), self.num_beams)
            ]

            flat_srcs = [p[9:] for p in batch_prompts for _ in range(self.num_beams)]
            flat_mts  = [hyp for group in beams_grouped for hyp in group]
            flat_refs = [ref for ref in batch_golds for _ in range(self.num_beams)]
            self.comet.add_batch(predictions=flat_mts, references=flat_refs, sources=flat_srcs)
            scores = self.comet.compute()["scores"]

            grouped_scores = [
                scores[k : k + self.num_beams]
                for k in range(0, len(scores), self.num_beams)
            ]

            for prompt, group, score_list in zip(batch_prompts, beams_grouped, grouped_scores):
                seen = {}
                for hyp, sc in zip(group, score_list):
                    if hyp not in seen:
                        seen[hyp] = sc
                items = sorted(seen.items(), key=lambda x: x[1], reverse=True)
                chosen = items[0][0]
                rejected = items[1][0] if len(items) > 1 else items[0][0]
                if self.ep not in self.save_data:
                    self.save_data[self.ep] = []
                self.save_data[self.ep].append({
                    "prompt": prompt,
                    "chosen": chosen,
                    "rejected": rejected
                })
                if len(self.save_data[self.ep]) == len(self.prompts):
                    self.ep += 1
                yield {
                    "prompt":  prompt,
                    "chosen":  chosen,
                    "rejected": rejected,
                }

                
class DPODataCollator:
    def __init__(self, tokenizer, max_prompt_length=128, max_completion_length=128):
        self.tokenizer = tokenizer
        self.max_prompt_length = max_prompt_length
        self.max_completion_length = max_completion_length

    def __call__(self, batch):
        input_ids = []
        labels_chosen = []
        labels_rejected = []
        pr_am = []
        ch_am = []
        re_am = []

        for example in batch:
            row = CustomDPOTrainer.tokenize_row(
                features=example,
                processing_class=self.tokenizer,
                max_prompt_length=self.max_prompt_length,
                max_completion_length=self.max_completion_length,
                add_special_tokens=True,
            )

            prompt_ids = row["prompt_input_ids"]
            chosen_ids = row["chosen_input_ids"]
            rejected_ids = row["rejected_input_ids"]
            
            prompt_attention_mask = row["prompt_attention_mask"]
            chosen_attention_mask = row["chosen_attention_mask"]
            rejected_attention_mask = row["rejected_attention_mask"]
            
            input_ids.append(prompt_ids)
            labels_chosen.append(chosen_ids)
            labels_rejected.append(rejected_ids)
            pr_am.append(prompt_attention_mask)
            ch_am.append(chosen_attention_mask)
            re_am.append(rejected_attention_mask)
                   
        def pad(batch_ids, pad_val):
            return torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(ids) for ids in batch_ids],
                batch_first=True,
                padding_value=pad_val,
            )
        padded = {
        "prompt_input_ids": pad(input_ids, pad_val=self.tokenizer.pad_token_id),
        "chosen_input_ids": pad(labels_chosen, pad_val=self.tokenizer.pad_token_id),
        "rejected_input_ids": pad(labels_rejected, pad_val=self.tokenizer.pad_token_id),
        "prompt_attention_mask": pad(pr_am, pad_val=0),
        "chosen_attention_mask": pad(ch_am, pad_val=0),
        "rejected_attention_mask": pad(re_am, pad_val=0),
        }
        padded = {k: v.to(device) for k, v in padded.items()}
        return padded                

class CustomDPOTrainer(DPOTrainer):
    def _prepare_dataset(self, dataset, processing_class, args, dataset_name):
        # skipping maps inside
        return dataset

    @staticmethod
    def tokenize_row(features, processing_class, max_prompt_length, max_completion_length, add_special_tokens):
        tokenizer = processing_class
        prompt = tokenizer(features["prompt"],  add_special_tokens=False)
        chosen = tokenizer(features["chosen"],  add_special_tokens=False)
        rejected = tokenizer(features["rejected"], add_special_tokens=False)
        
        p_ids = prompt["input_ids"]
        c_ids = chosen["input_ids"]
        r_ids = rejected["input_ids"]
       
        prompt_attention_mask = prompt["attention_mask"]
        chosen_attention_mask = chosen["attention_mask"]
        rejected_attention_mask = rejected["attention_mask"]
        
        if add_special_tokens:
            if tokenizer.bos_token_id is not None:
                p_ids = [tokenizer.bos_token_id] + p_ids
                prompt_attention_mask = [1] + prompt_attention_mask
            if tokenizer.eos_token_id is not None:
                p_ids = p_ids + [tokenizer.eos_token_id]
                prompt_attention_mask = prompt_attention_mask + [1]

        if tokenizer.eos_token_id is not None:
            c_ids = c_ids + [tokenizer.eos_token_id]
            chosen_attention_mask = chosen_attention_mask + [1]
            r_ids = r_ids + [tokenizer.eos_token_id]
            rejected_attention_mask = rejected_attention_mask + [1]

        if max_prompt_length is not None:
            p_ids = p_ids[-max_prompt_length:]
        if max_completion_length is not None:
            c_ids = c_ids[: max_completion_length]
            r_ids = r_ids[: max_completion_length]

        return {
            "prompt_input_ids": p_ids,
            "prompt_attention_mask": prompt_attention_mask,            
            "chosen_input_ids":  c_ids,
            "chosen_attention_mask": chosen_attention_mask,            
            "rejected_input_ids": r_ids,
            "rejected_attention_mask": rejected_attention_mask,
        }

    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            collate_fn=self.data_collator,
            num_workers=self.args.dataloader_num_workers,
        )
    def get_eval_dataloader(self,eval_dataset=None):
        return DataLoader(
            self.eval_dataset,
            batch_size=self.args.per_device_train_batch_size,
            collate_fn=self.data_collator,
            num_workers=self.args.dataloader_num_workers,
        )


train_prompts, train_golds = [a for a in fine_tune['train'][:].values()]
train_prompts = ['grammar: ' + a for a in train_prompts]

# len(train_prompts), len(train_golds)
dev_prompts, dev_golds = [a for a in fine_tune['dev'][:].values()]
dev_prompts = ['grammar: ' + a for a in dev_prompts]

test_prompts, test_golds = [a for a in fine_tune['test'][:].values()]
test_prompts = ['grammar: ' + a for a in test_prompts]    
    
train_ds = OnTheFlyDpoDataset(
    train_prompts,
    train_golds,
    model,
    tokenizer,
    comet_metric,
    num_beams=6,
    batch_size=16,
    num_beam_groups=3
)

dev_ds = OnTheFlyDpoDataset(
    dev_prompts,
    dev_golds,
    model,
    tokenizer,
    comet_metric,
    num_beams=6,
    batch_size=16,
    num_beam_groups=3
)


collator = DPODataCollator(tokenizer)


In [9]:
train_prompts[0], train_golds[0]

('grammar: Первая компания – Даквал-Пули из Худ Ривер , ш .',
 'Первая компания – Даквал-Пули из Худ Ривер , ш .')

In [10]:
len(train_ds)

59947

In [11]:
(len(train_ds) // 16 // 2 + 1)# / 30

1874

In [None]:
dpo_config = DPOConfig(
    max_steps=1900, 
    beta=0.1,                           
    loss_type='bco_pair', #better sigmoid.
    learning_rate=1e-4,                
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # num_train_epochs=3,
    gradient_accumulation_steps=2,
    eval_strategy="steps",      
    eval_steps=100,                   
    save_strategy="best", 
    metric_for_best_model='loss',
    logging_steps=10,
    fp16=True,
    save_total_limit=1,
    max_length=128,                   
    generate_during_eval=False,
    remove_unused_columns=False,      # Important: keeps prompt/chosen/rejected
    # warmup_ratio=0.02,                 
    lr_scheduler_type="cosine",      
    report_to="all",        
    logging_dir="./logs",
    # padding_value=tokenizer.pad_token_id,  
    output_dir="gec_t5_dpo_A_v3",   
    run_name='gec_t5_dpo_A_bco_pair'
)

In [9]:
trainer = CustomDPOTrainer(
    model=model,
    args=dpo_config,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    data_collator=collator)

In [None]:
trainer.train()

wandb: Tracking run with wandb version 0.18.5
wandb: Run data is saved locally in /home/jupyter/work/resources/rugec/notebooks/dpo/wandb/run-20250528_215511-d7d2lotz
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run gec_t5_dpo_A_bco_pair
wandb: ⭐️ View project at https://wandb.ai/mika5883/huggingface
wandb: 🚀 View run at https://wandb.ai/mika5883/huggingface/runs/d7d2lotz
  0%|          | 0/1900 [00:00<?, ?it/s]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  0%|          

{'loss': 1.342, 'grad_norm': 1.7331138849258423, 'learning_rate': 9.999446382823013e-05, 'rewards/chosen': -0.0036153055261820555, 'rewards/rejected': -0.11398736387491226, 'rewards/accuracies': 0.6937500238418579, 'rewards/margins': 0.11037204414606094, 'logps/chosen': -1.0418598651885986, 'logps/rejected': -6.1441826820373535, 'logits/chosen': -2.378326654434204, 'logits/rejected': -2.378329038619995, 'epoch': 0.01}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  1%|          | 11/1900 [01:46<4:45:33,  9.07s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  1%|          | 12/1900 [01:55<4:42:25,  8.98s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.5737, 'grad_norm': 2.1288082599639893, 'learning_rate': 9.998024842193876e-05, 'rewards/chosen': -0.293789803981781, 'rewards/rejected': -0.47767987847328186, 'rewards/accuracies': 0.734375, 'rewards/margins': 0.18389011919498444, 'logps/chosen': -6.779936790466309, 'logps/rejected': -13.15968132019043, 'logits/chosen': -2.8013837337493896, 'logits/rejected': -2.8001739978790283, 'epoch': 0.01}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  1%|          | 21/1900 [03:25<5:07:35,  9.82s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  1%|          | 22/1900 [03:34<4:58:05,  9.52s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.3094, 'grad_norm': 1.5286844968795776, 'learning_rate': 9.9950181809607e-05, 'rewards/chosen': -0.015217083506286144, 'rewards/rejected': -0.2304154634475708, 'rewards/accuracies': 0.765625, 'rewards/margins': 0.2151983678340912, 'logps/chosen': -1.1678739786148071, 'logps/rejected': -7.8710150718688965, 'logits/chosen': -2.854144811630249, 'logits/rejected': -2.856541156768799, 'epoch': 0.02}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  2%|▏         | 31/1900 [05:02<5:24:13, 10.41s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  2%|▏         | 32/1900 [05:11<5:07:50,  9.89s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.3057, 'grad_norm': 1.146453857421875, 'learning_rate': 9.990645931631796e-05, 'rewards/chosen': 0.08272037655115128, 'rewards/rejected': -0.3342912197113037, 'rewards/accuracies': 0.721875011920929, 'rewards/margins': 0.417011559009552, 'logps/chosen': -1.832923173904419, 'logps/rejected': -10.22989273071289, 'logits/chosen': -2.897273540496826, 'logits/rejected': -2.9067845344543457, 'epoch': 0.02}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  2%|▏         | 41/1900 [06:44<5:20:39, 10.35s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  2%|▏         | 42/1900 [06:53<5:11:08, 10.05s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.3165, 'grad_norm': 2.359985589981079, 'learning_rate': 9.984909289536473e-05, 'rewards/chosen': -0.012794261798262596, 'rewards/rejected': -0.22954945266246796, 'rewards/accuracies': 0.715624988079071, 'rewards/margins': 0.2167551964521408, 'logps/chosen': -1.3334813117980957, 'logps/rejected': -7.961485862731934, 'logits/chosen': -2.9358203411102295, 'logits/rejected': -2.9358325004577637, 'epoch': 0.03}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  3%|▎         | 51/1900 [08:20<5:24:15, 10.52s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  3%|▎         | 52/1900 [08:29<5:11:17, 10.11s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.3619, 'grad_norm': 1.2429243326187134, 'learning_rate': 9.977809823015401e-05, 'rewards/chosen': 0.05875256657600403, 'rewards/rejected': -0.21471326053142548, 'rewards/accuracies': 0.75, 'rewards/margins': 0.2734658122062683, 'logps/chosen': -1.4259860515594482, 'logps/rejected': -8.677461624145508, 'logits/chosen': -3.0916411876678467, 'logits/rejected': -3.0856261253356934, 'epoch': 0.03}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  3%|▎         | 61/1900 [09:53<4:44:21,  9.28s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  3%|▎         | 62/1900 [10:04<4:56:33,  9.68s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.3201, 'grad_norm': 1.720345377922058, 'learning_rate': 9.969349472991838e-05, 'rewards/chosen': -0.013931727036833763, 'rewards/rejected': -0.2509463429450989, 'rewards/accuracies': 0.7406250238418579, 'rewards/margins': 0.23701460659503937, 'logps/chosen': -1.6610157489776611, 'logps/rejected': -8.568689346313477, 'logits/chosen': -3.0592751502990723, 'logits/rejected': -3.069071054458618, 'epoch': 0.04}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  4%|▎         | 71/1900 [11:31<4:53:01,  9.61s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  4%|▍         | 72/1900 [11:41<4:54:45,  9.67s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.3151, 'grad_norm': 1.3373539447784424, 'learning_rate': 9.959530552441005e-05, 'rewards/chosen': -0.02396850846707821, 'rewards/rejected': -0.2806222140789032, 'rewards/accuracies': 0.71875, 'rewards/margins': 0.25665372610092163, 'logps/chosen': -1.870222806930542, 'logps/rejected': -7.967897891998291, 'logits/chosen': -3.0840578079223633, 'logits/rejected': -3.0883822441101074, 'epoch': 0.04}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  4%|▍         | 81/1900 [13:08<4:53:17,  9.67s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  4%|▍         | 82/1900 [13:21<5:18:50, 10.52s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.6851, 'grad_norm': 1.3373322486877441, 'learning_rate': 9.948355745757741e-05, 'rewards/chosen': -0.36413443088531494, 'rewards/rejected': -0.48332515358924866, 'rewards/accuracies': 0.675000011920929, 'rewards/margins': 0.1191907748579979, 'logps/chosen': -6.775357246398926, 'logps/rejected': -12.22442626953125, 'logits/chosen': -3.0489273071289062, 'logits/rejected': -3.055819511413574, 'epoch': 0.05}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  5%|▍         | 91/1900 [14:49<4:51:25,  9.67s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  5%|▍         | 92/1900 [14:58<4:42:29,  9.37s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using:

{'loss': 1.3016, 'grad_norm': 1.055905818939209, 'learning_rate': 9.93582810802261e-05, 'rewards/chosen': 0.025665516033768654, 'rewards/rejected': -0.23776881396770477, 'rewards/accuracies': 0.7281249761581421, 'rewards/margins': 0.26343435049057007, 'logps/chosen': -1.1233984231948853, 'logps/rejected': -8.92518424987793, 'logits/chosen': -2.8834452629089355, 'logits/rejected': -2.8889517784118652, 'epoch': 0.05}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  0%|          | 0/157 [00:00<?, ?it/s][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  1%|▏         | 2/157 [00:05<06:32,  2.53s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  2%|▏         | 3/157 [00:10<09:12,  3.59s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  3%|▎         | 4/157 [00:17<12:44,  4.99s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  3%|▎       

{'eval_loss': 1.4505045413970947, 'eval_runtime': 798.6634, 'eval_samples_per_second': 3.13, 'eval_steps_per_second': 0.197, 'eval_rewards/chosen': 0.12580052018165588, 'eval_rewards/rejected': -0.05580322444438934, 'eval_rewards/accuracies': 0.6970541477203369, 'eval_rewards/margins': 0.18160374462604523, 'eval_logps/chosen': -1.7522999048233032, 'eval_logps/rejected': -8.0077543258667, 'eval_logits/chosen': -2.963391065597534, 'eval_logits/rejected': -2.9671361446380615, 'epoch': 0.05}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  5%|▌         | 101/1900 [30:29<131:39:19, 263.46s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  5%|▌         | 102/1900 [30:37<93:18:05, 186.81s/it] GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False

{'loss': 1.3504, 'grad_norm': 5.408817768096924, 'learning_rate': 9.921951064166684e-05, 'rewards/chosen': 0.08587377518415451, 'rewards/rejected': -0.17494723200798035, 'rewards/accuracies': 0.75, 'rewards/margins': 0.26082098484039307, 'logps/chosen': -2.2931759357452393, 'logps/rejected': -8.96303939819336, 'logits/chosen': -2.931837558746338, 'logits/rejected': -2.9277141094207764, 'epoch': 0.06}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  6%|▌         | 111/1900 [32:04<8:13:52, 16.56s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  6%|▌         | 112/1900 [32:12<6:59:57, 14.09s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.4802, 'grad_norm': 2.405055284500122, 'learning_rate': 9.90672840803519e-05, 'rewards/chosen': 0.104533851146698, 'rewards/rejected': -0.03731768950819969, 'rewards/accuracies': 0.6937500238418579, 'rewards/margins': 0.141851544380188, 'logps/chosen': -3.1759777069091797, 'logps/rejected': -9.297985076904297, 'logits/chosen': -2.946321964263916, 'logits/rejected': -2.9531266689300537, 'epoch': 0.06}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  6%|▋         | 121/1900 [33:27<4:10:52,  8.46s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  6%|▋         | 122/1900 [33:35<4:07:31,  8.35s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.3856, 'grad_norm': 1.5967676639556885, 'learning_rate': 9.890164301350318e-05, 'rewards/chosen': 0.04347836971282959, 'rewards/rejected': -0.15920259058475494, 'rewards/accuracies': 0.6625000238418579, 'rewards/margins': 0.20268091559410095, 'logps/chosen': -2.44564151763916, 'logps/rejected': -9.538412094116211, 'logits/chosen': -2.978193759918213, 'logits/rejected': -2.968479633331299, 'epoch': 0.07}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  7%|▋         | 131/1900 [35:01<4:49:19,  9.81s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  7%|▋         | 132/1900 [35:11<4:56:28, 10.06s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.4208, 'grad_norm': 13.352676391601562, 'learning_rate': 9.872263272573443e-05, 'rewards/chosen': 0.17795750498771667, 'rewards/rejected': -0.10246620327234268, 'rewards/accuracies': 0.762499988079071, 'rewards/margins': 0.28042370080947876, 'logps/chosen': -2.1795578002929688, 'logps/rejected': -9.221504211425781, 'logits/chosen': -2.9648168087005615, 'logits/rejected': -2.9682772159576416, 'epoch': 0.07}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  7%|▋         | 141/1900 [36:47<5:10:07, 10.58s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  7%|▋         | 142/1900 [36:56<4:53:27, 10.02s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.447, 'grad_norm': 1.6532777547836304, 'learning_rate': 9.855013315614725e-05, 'rewards/chosen': -0.14868628978729248, 'rewards/rejected': -0.39737609028816223, 'rewards/accuracies': 0.7437499761581421, 'rewards/margins': 0.24868977069854736, 'logps/chosen': -5.997104644775391, 'logps/rejected': -12.96562671661377, 'logits/chosen': -2.9848792552948, 'logits/rejected': -2.9823288917541504, 'epoch': 0.08}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  8%|▊         | 151/1900 [38:26<4:43:41,  9.73s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  8%|▊         | 152/1900 [38:34<4:27:53,  9.20s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.3133, 'grad_norm': 1.4087140560150146, 'learning_rate': 9.834585918739936e-05, 'rewards/chosen': -0.013261613436043262, 'rewards/rejected': -0.2484828531742096, 'rewards/accuracies': 0.71875, 'rewards/margins': 0.2352212369441986, 'logps/chosen': -1.5176520347595215, 'logps/rejected': -8.212911605834961, 'logits/chosen': -3.042861223220825, 'logits/rejected': -3.0448005199432373, 'epoch': 0.09}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  8%|▊         | 161/1900 [40:00<4:46:35,  9.89s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  9%|▊         | 162/1900 [40:10<4:43:23,  9.78s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.2936, 'grad_norm': 1.9692555665969849, 'learning_rate': 9.812836794348004e-05, 'rewards/chosen': -0.0022856860887259245, 'rewards/rejected': -0.26751071214675903, 'rewards/accuracies': 0.7281249761581421, 'rewards/margins': 0.26522502303123474, 'logps/chosen': -1.273679494857788, 'logps/rejected': -8.744241714477539, 'logits/chosen': -2.9630467891693115, 'logits/rejected': -2.9640426635742188, 'epoch': 0.09}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  9%|▉         | 171/1900 [41:35<4:32:41,  9.46s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  9%|▉         | 172/1900 [41:45<4:34:18,  9.52s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.4126, 'grad_norm': 1.0978591442108154, 'learning_rate': 9.789771888432375e-05, 'rewards/chosen': -0.1108459085226059, 'rewards/rejected': -0.35796239972114563, 'rewards/accuracies': 0.715624988079071, 'rewards/margins': 0.24711644649505615, 'logps/chosen': -4.107837677001953, 'logps/rejected': -11.062274932861328, 'logits/chosen': -2.8394811153411865, 'logits/rejected': -2.8414738178253174, 'epoch': 0.1}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 10%|▉         | 181/1900 [43:13<4:37:47,  9.70s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 10%|▉         | 182/1900 [43:24<4:48:46, 10.09s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.3165, 'grad_norm': 1.9309916496276855, 'learning_rate': 9.765397506708023e-05, 'rewards/chosen': 0.021279683336615562, 'rewards/rejected': -0.2551126480102539, 'rewards/accuracies': 0.7593749761581421, 'rewards/margins': 0.2763923108577728, 'logps/chosen': -1.7882492542266846, 'logps/rejected': -10.603620529174805, 'logits/chosen': -2.871643543243408, 'logits/rejected': -2.8691911697387695, 'epoch': 0.1}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 10%|█         | 191/1900 [44:50<4:34:05,  9.62s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 10%|█         | 192/1900 [44:59<4:27:35,  9.40s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, usin

{'loss': 1.3156, 'grad_norm': 1.869264006614685, 'learning_rate': 9.739720312887535e-05, 'rewards/chosen': -0.003554611001163721, 'rewards/rejected': -0.2623003125190735, 'rewards/accuracies': 0.699999988079071, 'rewards/margins': 0.2587457299232483, 'logps/chosen': -1.8520609140396118, 'logps/rejected': -9.107900619506836, 'logits/chosen': -2.9276041984558105, 'logits/rejected': -2.9297664165496826, 'epoch': 0.11}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  0%|          | 0/157 [00:00<?, ?it/s][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  1%|▏         | 2/157 [00:05<06:39,  2.58s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  2%|▏         | 3/157 [00:10<09:31,  3.71s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  3%|▎         | 4/157 [00:17<13:02,  5.11s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  3%|▎       

{'eval_loss': 1.3509576320648193, 'eval_runtime': 756.4811, 'eval_samples_per_second': 3.305, 'eval_steps_per_second': 0.208, 'eval_rewards/chosen': 0.06265164166688919, 'eval_rewards/rejected': -0.17284759879112244, 'eval_rewards/accuracies': 0.7165604829788208, 'eval_rewards/margins': 0.23549921810626984, 'eval_logps/chosen': -1.4308454990386963, 'eval_logps/rejected': -8.019100189208984, 'eval_logits/chosen': -2.9274184703826904, 'eval_logits/rejected': -2.9300389289855957, 'epoch': 0.11}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 11%|█         | 201/1900 [59:49<118:18:14, 250.67s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 11%|█         | 202/1900 [59:58<84:00:16, 178.10s/it] GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False

{'loss': 1.3504, 'grad_norm': 3.06105375289917, 'learning_rate': 9.712747326859315e-05, 'rewards/chosen': -0.023498181253671646, 'rewards/rejected': -0.22220472991466522, 'rewards/accuracies': 0.671875, 'rewards/margins': 0.19870653748512268, 'logps/chosen': -1.9328422546386719, 'logps/rejected': -8.998971939086914, 'logits/chosen': -2.8187854290008545, 'logits/rejected': -2.8268160820007324, 'epoch': 0.11}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 11%|█         | 211/1900 [1:01:17<7:20:56, 15.66s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 11%|█         | 212/1900 [1:01:26<6:23:35, 13.63s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.3372, 'grad_norm': 6.269824981689453, 'learning_rate': 9.684485922768422e-05, 'rewards/chosen': -0.061076659709215164, 'rewards/rejected': -0.3517860770225525, 'rewards/accuracies': 0.7250000238418579, 'rewards/margins': 0.29070940613746643, 'logps/chosen': -2.9718871116638184, 'logps/rejected': -10.495363235473633, 'logits/chosen': -2.8195300102233887, 'logits/rejected': -2.822136878967285, 'epoch': 0.12}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 12%|█▏        | 221/1900 [1:02:55<4:41:41, 10.07s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 12%|█▏        | 222/1900 [1:03:05<4:41:00, 10.05s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.3159, 'grad_norm': 4.1651930809021, 'learning_rate': 9.654943827000548e-05, 'rewards/chosen': -0.02073289081454277, 'rewards/rejected': -0.2970975935459137, 'rewards/accuracies': 0.731249988079071, 'rewards/margins': 0.27636468410491943, 'logps/chosen': -1.9388418197631836, 'logps/rejected': -9.341755867004395, 'logits/chosen': -2.8834125995635986, 'logits/rejected': -2.8824024200439453, 'epoch': 0.12}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 12%|█▏        | 231/1900 [1:04:31<4:35:22,  9.90s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 12%|█▏        | 232/1900 [1:04:40<4:29:45,  9.70s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.3379, 'grad_norm': 3.5861988067626953, 'learning_rate': 9.624129116069694e-05, 'rewards/chosen': 0.06099650263786316, 'rewards/rejected': -0.24122044444084167, 'rewards/accuracies': 0.7281249761581421, 'rewards/margins': 0.30221694707870483, 'logps/chosen': -1.7300220727920532, 'logps/rejected': -9.117856979370117, 'logits/chosen': -2.8699488639831543, 'logits/rejected': -2.8690104484558105, 'epoch': 0.13}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 13%|█▎        | 241/1900 [1:06:10<4:31:36,  9.82s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 13%|█▎        | 242/1900 [1:06:19<4:23:09,  9.52s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.4702, 'grad_norm': 6.665899753570557, 'learning_rate': 9.59205021441015e-05, 'rewards/chosen': 0.10469353199005127, 'rewards/rejected': -0.08070000261068344, 'rewards/accuracies': 0.71875, 'rewards/margins': 0.1853935271501541, 'logps/chosen': -2.095228433609009, 'logps/rejected': -9.005245208740234, 'logits/chosen': -2.8187196254730225, 'logits/rejected': -2.820932388305664, 'epoch': 0.13}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 13%|█▎        | 251/1900 [1:07:48<4:41:39, 10.25s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 13%|█▎        | 252/1900 [1:07:57<4:30:52,  9.86s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.4359, 'grad_norm': 1.7949516773223877, 'learning_rate': 9.558715892073323e-05, 'rewards/chosen': 0.08078832924365997, 'rewards/rejected': -0.1634647399187088, 'rewards/accuracies': 0.706250011920929, 'rewards/margins': 0.24425306916236877, 'logps/chosen': -2.5200119018554688, 'logps/rejected': -9.610771179199219, 'logits/chosen': -2.9178307056427, 'logits/rejected': -2.919278621673584, 'epoch': 0.14}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 14%|█▎        | 261/1900 [1:09:33<4:52:25, 10.70s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 14%|█▍        | 262/1900 [1:09:49<5:30:53, 12.12s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.6633, 'grad_norm': 9.56406307220459, 'learning_rate': 9.524135262330098e-05, 'rewards/chosen': 0.2987476587295532, 'rewards/rejected': 0.11543931812047958, 'rewards/accuracies': 0.699999988079071, 'rewards/margins': 0.18330833315849304, 'logps/chosen': -4.495893955230713, 'logps/rejected': -11.640830993652344, 'logits/chosen': -2.9249229431152344, 'logits/rejected': -2.9320712089538574, 'epoch': 0.14}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 14%|█▍        | 271/1900 [1:11:42<6:10:19, 13.64s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 14%|█▍        | 272/1900 [1:11:58<6:29:57, 14.37s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 3.4579, 'grad_norm': 19.057165145874023, 'learning_rate': 9.488317779179361e-05, 'rewards/chosen': 1.963983178138733, 'rewards/rejected': 1.9570579528808594, 'rewards/accuracies': 0.671875, 'rewards/margins': 0.00692532816901803, 'logps/chosen': -14.595129013061523, 'logps/rejected': -21.312320709228516, 'logits/chosen': -2.8814969062805176, 'logits/rejected': -2.886118173599243, 'epoch': 0.15}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 15%|█▍        | 281/1900 [1:14:12<6:50:07, 15.20s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 15%|█▍        | 282/1900 [1:14:21<5:59:44, 13.34s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 2.1291, 'grad_norm': 18.835241317749023, 'learning_rate': 9.451273234763371e-05, 'rewards/chosen': 0.6949002742767334, 'rewards/rejected': 0.5685508251190186, 'rewards/accuracies': 0.699999988079071, 'rewards/margins': 0.12634943425655365, 'logps/chosen': -9.3750638961792, 'logps/rejected': -16.35053253173828, 'logits/chosen': -2.7858822345733643, 'logits/rejected': -2.7899367809295654, 'epoch': 0.15}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 15%|█▌        | 291/1900 [1:16:11<5:11:15, 11.61s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 15%|█▌        | 292/1900 [1:16:20<4:51:47, 10.89s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.3374, 'grad_norm': 2.1093661785125732, 'learning_rate': 9.413011756690685e-05, 'rewards/chosen': -0.025317272171378136, 'rewards/rejected': -0.24628722667694092, 'rewards/accuracies': 0.721875011920929, 'rewards/margins': 0.22096996009349823, 'logps/chosen': -1.8214004039764404, 'logps/rejected': -9.357161521911621, 'logits/chosen': -2.780010461807251, 'logits/rejected': -2.783396005630493, 'epoch': 0.16}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  0%|          | 0/157 [00:00<?, ?it/s][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  1%|▏         | 2/157 [00:05<06:42,  2.60s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  2%|▏         | 3/157 [00:10<09:37,  3.75s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  3%|▎         | 4/157 [00:18<13:12,  5.18s/it][AGPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  3%|▎       

{'eval_loss': 1.609375, 'eval_runtime': 806.5525, 'eval_samples_per_second': 3.1, 'eval_steps_per_second': 0.195, 'eval_rewards/chosen': 0.3178027868270874, 'eval_rewards/rejected': 0.10459093004465103, 'eval_rewards/accuracies': 0.674761176109314, 'eval_rewards/margins': 0.21321183443069458, 'eval_logps/chosen': -1.792191982269287, 'eval_logps/rejected': -8.252504348754883, 'eval_logits/chosen': -2.852276563644409, 'eval_logits/rejected': -2.8527638912200928, 'epoch': 0.16}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 16%|█▌        | 301/1900 [1:31:09<111:35:36, 251.24s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 16%|█▌        | 302/1900 [1:31:18<79:19:11, 178.69s/it] GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: F

{'loss': 1.3385, 'grad_norm': 4.050545692443848, 'learning_rate': 9.373543805267368e-05, 'rewards/chosen': 0.021589551120996475, 'rewards/rejected': -0.27833205461502075, 'rewards/accuracies': 0.75, 'rewards/margins': 0.2999216318130493, 'logps/chosen': -2.028925657272339, 'logps/rejected': -9.600972175598145, 'logits/chosen': -2.7888801097869873, 'logits/rejected': -2.7937958240509033, 'epoch': 0.17}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 16%|█▋        | 311/1900 [1:32:51<7:48:56, 17.71s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 16%|█▋        | 312/1900 [1:33:03<7:03:09, 15.99s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.4964, 'grad_norm': 2.0754055976867676, 'learning_rate': 9.332880170637252e-05, 'rewards/chosen': 0.08100108802318573, 'rewards/rejected': -0.19720326364040375, 'rewards/accuracies': 0.7093750238418579, 'rewards/margins': 0.2782043516635895, 'logps/chosen': -5.431931495666504, 'logps/rejected': -13.580818176269531, 'logits/chosen': -2.8152778148651123, 'logits/rejected': -2.820136070251465, 'epoch': 0.17}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 17%|█▋        | 321/1900 [1:34:45<5:11:33, 11.84s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 17%|█▋        | 322/1900 [1:34:53<4:45:12, 10.84s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 1.666, 'grad_norm': 18.281597137451172, 'learning_rate': 9.291031969832026e-05, 'rewards/chosen': 0.37582749128341675, 'rewards/rejected': 0.08631499111652374, 'rewards/accuracies': 0.703125, 'rewards/margins': 0.28951239585876465, 'logps/chosen': -6.288638591766357, 'logps/rejected': -15.090845108032227, 'logits/chosen': -2.8717195987701416, 'logits/rejected': -2.8729825019836426, 'epoch': 0.18}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 17%|█▋        | 331/1900 [1:36:34<4:30:43, 10.35s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 17%|█▋        | 332/1900 [1:36:43<4:24:42, 10.13s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

{'loss': 2.3286, 'grad_norm': 17.521100997924805, 'learning_rate': 9.248010643731935e-05, 'rewards/chosen': 0.26242995262145996, 'rewards/rejected': 0.17432793974876404, 'rewards/accuracies': 0.659375011920929, 'rewards/margins': 0.08810202032327652, 'logps/chosen': -23.099464416503906, 'logps/rejected': -28.480777740478516, 'logits/chosen': -2.843937873840332, 'logits/rejected': -2.843336582183838, 'epoch': 0.18}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 18%|█▊        | 341/1900 [1:38:46<5:57:43, 13.77s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 18%|█▊        | 342/1900 [1:38:55<5:25:48, 12.55s/it]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, 

In [27]:
trainer.push_to_hub('yet_another_dpo')




spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s][A[A[A

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s][A[A
Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s][A



training_args.bin:   0%|          | 0.00/6.14k [00:00<?, ?B/s][A[A[A[A

model.safetensors:   0%|          | 90.1k/892M [00:00<17:46, 836kB/s][A[A


spiece.model:   9%|▉         | 90.1k/1.00M [00:00<00:01, 784kB/s][A[A[A

training_args.bin: 100%|██████████| 6.14k/6.14k [00:00<00:00, 25.0kB/s][A[A


model.safetensors:   1%|          | 5.53M/892M [00:00<00:52, 17.0MB/s][A[A

model.safetensors:   1%|          | 8.35M/892M [00:00<01:08, 13.0MB/s][A[A

spiece.model: 100%|██████████| 1.00M/1.00M [00:00<00:00, 1.15MB/s]B/s][A[A


model.safetensors:   2%|▏         | 16.0M/892M [00:01<01:09, 12.6MB/s][A[A

model.safetensors:   3%|▎         | 23.9M/892M [00:01<00:38, 22.5MB/s][A[A

model.safetensors:   3%|▎         | 27.3M/892M [00:01<00:57, 15.1MB/s][A[A

model.safetensors: 

CommitInfo(commit_url='https://huggingface.co/mika5883/gec_t5_dpo_A_v2/commit/44914b180551929997b38b30e1ef8568f43eb74c', commit_message='yet_another_dpo', commit_description='', oid='44914b180551929997b38b30e1ef8568f43eb74c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mika5883/gec_t5_dpo_A_v2', endpoint='https://huggingface.co', repo_type='model', repo_id='mika5883/gec_t5_dpo_A_v2'), pr_revision=None, pr_num=None)