-
Notifications
You must be signed in to change notification settings - Fork 220
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[TorchAcc][Experimental] Integrate more models in torchacc (#683)
- Loading branch information
Showing
31 changed files
with
1,181 additions
and
13 deletions.
There are no files selected for viewing
34 changes: 34 additions & 0 deletions
34
examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/acc_lora_dp_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/usr/bin/env bash
# LoRA SFT of Baichuan2-13B-Chat with TorchAcc data parallelism on 2 GPUs.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --model_layer_cls_name BaichuanLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
34 changes: 34 additions & 0 deletions
34
examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/acc_lora_fsdp_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/usr/bin/env bash
# LoRA SFT of Baichuan2-13B-Chat with TorchAcc FSDP across 2 GPUs.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --model_layer_cls_name BaichuanLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
27 changes: 27 additions & 0 deletions
27
examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/usr/bin/env bash
# Baseline (non-TorchAcc) LoRA SFT of Baichuan2-13B-Chat on 2 GPUs,
# for comparison against the TorchAcc variants.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 2 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
35 changes: 35 additions & 0 deletions
35
examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_dp_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env bash
# LoRA SFT of ChatGLM3-6B with TorchAcc data parallelism on 2 GPUs.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --model_layer_cls_name GLMBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
35 changes: 35 additions & 0 deletions
35
examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_fsdp_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env bash
# LoRA SFT of ChatGLM3-6B with TorchAcc FSDP across 2 GPUs.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --model_layer_cls_name GLMBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
27 changes: 27 additions & 0 deletions
27
examples/pytorch/llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/usr/bin/env bash
# Baseline (non-TorchAcc) LoRA SFT of ChatGLM3-6B on 2 GPUs,
# for comparison against the TorchAcc variants.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \
# MASTER_PORT=12356 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
35 changes: 35 additions & 0 deletions
35
examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_dp_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env bash
# LoRA SFT of Llama-2-13b-chat with TorchAcc data parallelism on 2 GPUs.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
36 changes: 36 additions & 0 deletions
36
examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_fsdp_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#!/usr/bin/env bash
# LoRA SFT of Llama-2-13b-chat with TorchAcc FSDP across 2 GPUs.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 24 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
27 changes: 27 additions & 0 deletions
27
examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/usr/bin/env bash
# Baseline (non-TorchAcc) LoRA SFT of Llama-2-13b-chat on 2 GPUs,
# for comparison against the TorchAcc variants.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
37 changes: 37 additions & 0 deletions
37
examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_dp_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env bash
# LoRA SFT of Meta-Llama-3-8B-Instruct with TorchAcc data parallelism on 2 GPUs.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=21779 \
swift sft \
    --model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama3 \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
37 changes: 37 additions & 0 deletions
37
examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env bash
# LoRA SFT of Meta-Llama-3-8B-Instruct with TorchAcc FSDP across 2 GPUs.
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
# export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama3 \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
Oops, something went wrong.