[TorchAcc][Experimental] Integrate more models into torchacc (#683)
Zhikaiiii committed May 22, 2024
1 parent c8f6153 commit fdb7a4d
Showing 31 changed files with 1,181 additions and 13 deletions.
@@ -0,0 +1,34 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
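# The XLA_* settings below tune PyTorch/XLA for this run (a sketch of
# their intent, not authoritative): XLA_FLAGS raises compilation
# parallelism, caps each allocator heap at ~4.5 GB, and disables HLO
# passes that presumably conflict with TorchAcc's collective handling;
# XLA_IR_SHAPE_CACHE_SIZE enlarges the IR shape cache to cut retracing;
# XLA_ALLOCATOR_FRACTION hands 95% of GPU memory to the XLA allocator;
# XLA_EXPERIMENTAL enables experimental lowerings for nonzero/masked_select.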
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

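# --model_layer_cls_name names the transformer block class (here
# BaichuanLayer), presumably so TorchAcc can wrap and compile the model
# layer by layer. Evaluation and saving are effectively disabled
# (save_strategy=no, huge eval/save steps): this script only measures
# training throughput.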
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
--model_layer_cls_name BaichuanLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,34 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

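# Identical to the dp recipe except for --fsdp_num 2, which shards the
# model across both ranks with FSDP; the freed memory presumably allows
# the larger per-device batch size (16 vs. 12).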
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
--model_layer_cls_name BaichuanLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1

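# Native baseline: USE_TORCHACC is unset, so swift launches plain
# (non-XLA) distributed training. Use this to compare throughput
# against the TorchAcc dp/fsdp scripts above.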
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 2 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select


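# Same dp recipe as the Baichuan2 script; only the model id and the
# per-model layer class change (GLMBlock for chatglm3-6b).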
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--model_layer_cls_name GLMBlock \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select


NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--model_layer_cls_name GLMBlock \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \
# MASTER_PORT=12356 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 4 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
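# TORCHACC_TRIM_GRAPH=1 presumably splits the traced graph so that
# individual XLA compilations stay smaller; it appears only in the
# Llama recipes here.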
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

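# Llama-2 additionally needs an explicit --template_type llama; the
# remaining flags mirror the Baichuan2/chatglm3 dp recipes.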
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,36 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 24 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,37 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
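# Pinning XLA_COORDINATOR_PORT presumably avoids port clashes with the
# XLA distributed coordinator when several runs share the same host.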
export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=21779 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,37 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
# export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'