diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md
index 39169ce768..350c07d2ac 100644
--- a/docs/source/Instruction/GRPO.md
+++ b/docs/source/Instruction/GRPO.md
@@ -43,7 +43,7 @@ swift rlhf \
     --num_train_epochs 1 \
     --per_device_train_batch_size 2 \
     --per_device_eval_batch_size 2 \
-    --learning_rate 2e-5 \
+    --learning_rate 2e-6 \
     --gradient_accumulation_steps 8 \
     --save_total_limit 2 \
     --logging_steps 5 \
@@ -69,7 +69,7 @@ swift rlhf \
     --num_train_epochs 1 \
     --per_device_train_batch_size 2 \
     --per_device_eval_batch_size 2 \
-    --learning_rate 2e-5 \
+    --learning_rate 2e-6 \
     --gradient_accumulation_steps 8 \
     --save_total_limit 2 \
     --logging_steps 5 \
diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md
index e46efd221e..28e97801ac 100644
--- a/docs/source_en/Instruction/GRPO.md
+++ b/docs/source_en/Instruction/GRPO.md
@@ -43,7 +43,7 @@ swift rlhf \
     --num_train_epochs 1 \
     --per_device_train_batch_size 2 \
     --per_device_eval_batch_size 2 \
-    --learning_rate 2e-5 \
+    --learning_rate 2e-6 \
     --gradient_accumulation_steps 16 \
     --save_total_limit 2 \
     --logging_steps 5 \
@@ -69,7 +69,7 @@ swift rlhf \
     --num_train_epochs 1 \
     --per_device_train_batch_size 2 \
     --per_device_eval_batch_size 2 \
-    --learning_rate 2e-5 \
+    --learning_rate 2e-6 \
     --gradient_accumulation_steps 16 \
     --save_total_limit 2 \
     --logging_steps 5 \
diff --git a/examples/train/grpo/grpo.py b/examples/train/grpo/grpo.py
index 8df994e5bb..193678c8a3 100644
--- a/examples/train/grpo/grpo.py
+++ b/examples/train/grpo/grpo.py
@@ -50,7 +50,6 @@ def run(self):
         # dataset
         dataset = ['AI-MO/NuminaMath-TIR']  # dataset_id or dataset_path
         data_seed = 42
-        max_new_tokens = 512
         split_dataset_ratio = 0.01  # Split validation set
         num_proc = 4  # The number of processes for data loading.
@@ -80,7 +79,7 @@ def run(self):
             reward_funcs=reward_funcs,
             split_dataset_ratio=split_dataset_ratio,
             output_dir=output_dir,
-            learning_rate=2e-5,
+            learning_rate=2e-6,
             gradient_checkpointing=True,
             weight_decay=0.1,
             lr_scheduler_type='cosine',
diff --git a/examples/train/grpo/grpo.sh b/examples/train/grpo/grpo.sh
index d66b37b476..328f7bfcd4 100644
--- a/examples/train/grpo/grpo.sh
+++ b/examples/train/grpo/grpo.sh
@@ -15,7 +15,7 @@ swift rlhf \
     --num_train_epochs 1 \
     --per_device_train_batch_size 2 \
     --per_device_eval_batch_size 2 \
-    --learning_rate 2e-5 \
+    --learning_rate 2e-6 \
     --gradient_accumulation_steps 8 \
     --save_total_limit 2 \
     --logging_steps 5 \
diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py
index ac13f24649..da42b54f27 100644
--- a/swift/trainers/rlhf_trainer/grpo_trainer.py
+++ b/swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -2,7 +2,7 @@
 # Part of the implementation is borrowed from huggingface/trl.
 import inspect
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 from unittest.mock import patch

 import torch
@@ -33,8 +33,7 @@ def __init__(self,
                  model: Optional[Union[PreTrainedModel, nn.Module]] = None,
                  ref_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
                  reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
-                 reward_funcs: List[str, callable] = None,
-                 *_args,
+                 reward_funcs: Optional[List[Union[str, Callable]]] = None,
                  **kwargs):
         args = kwargs['args']
@@ -239,7 +238,7 @@ def _prepare_inputs(self, inputs) -> Dict[str, Union[torch.Tensor, Any]]:
             if isinstance(reward_func, nn.Module):  # Module instead of PretrainedModel for compat with compiled models
                 reward_func_name = reward_func.config._name_or_path.split('/')[-1]
             else:
-                if isinstance(reward_func, callable):
+                if callable(reward_func):
                     reward_func_name = reward_func.__name__  # function
                 else:
                     reward_func_name = reward_func.__class__.__name__  # object
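
Below is a minimal, self-contained sketch, separate from the patch above, of why the two trainer-side changes are needed; the names `reward_accuracy` and `resolve_reward_name` are hypothetical and exist only for this demonstration. `typing.List` accepts a single type parameter, so `List[str, callable]` is not a valid annotation, and `isinstance(x, callable)` raises `TypeError` because `callable` is a built-in function rather than a class; `callable(x)` is the supported check.

```python
# Illustrative sketch only; not part of the patch. Names below are hypothetical.
from typing import Callable, List, Optional, Union

# The corrected annotation: each reward is either a registered name (str) or a callable.
RewardFuncs = Optional[List[Union[str, Callable]]]


def reward_accuracy(completion: str) -> float:
    """Toy reward function used only for this demonstration."""
    return float('\\boxed' in completion)


def resolve_reward_name(reward_func) -> str:
    # `isinstance(reward_func, callable)` raises TypeError, because `callable` is a
    # built-in function, not a type; `callable(reward_func)` is the supported check.
    if callable(reward_func):
        # Plain functions expose __name__; callable objects fall back to their class name.
        return getattr(reward_func, '__name__', reward_func.__class__.__name__)
    return reward_func.__class__.__name__


if __name__ == '__main__':
    reward_funcs: RewardFuncs = ['accuracy', reward_accuracy]
    names = [f if isinstance(f, str) else resolve_reward_name(f) for f in reward_funcs]
    print(names)  # -> ['accuracy', 'reward_accuracy']
```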