[TorchAcc][Experimental] Integrate more models into torchacc (#683)
Zhikaiiii committed May 22, 2024
1 parent c8f6153 commit fdb7a4d
Showing 31 changed files with 1,181 additions and 13 deletions.
@@ -0,0 +1,34 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
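# The XLA_* settings below tune PyTorch/XLA for this run (a sketch of
# their intent, not authoritative): XLA_FLAGS raises compilation
# parallelism, caps each allocator heap at ~4.5 GB, and disables HLO
# passes that presumably conflict with TorchAcc's collective handling;
# XLA_IR_SHAPE_CACHE_SIZE enlarges the IR shape cache to cut retracing;
# XLA_ALLOCATOR_FRACTION hands 95% of GPU memory to the XLA allocator;
# XLA_EXPERIMENTAL enables experimental lowerings for nonzero/masked_select.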
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

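# --model_layer_cls_name names the transformer block class (here
# BaichuanLayer), presumably so TorchAcc can wrap and compile the model
# layer by layer. Evaluation and saving are effectively disabled
# (save_strategy=no, huge eval/save steps): this script only measures
# training throughput.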
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
--model_layer_cls_name BaichuanLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,34 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

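# Identical to the dp recipe except for --fsdp_num 2, which shards the
# model across both ranks with FSDP; the freed memory presumably allows
# the larger per-device batch size (16 vs. 12).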
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
--model_layer_cls_name BaichuanLayer \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1

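# Native baseline: USE_TORCHACC is unset, so swift launches plain
# (non-XLA) distributed training. Use this to compare throughput
# against the TorchAcc dp/fsdp scripts above.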
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 2 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select


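# Same dp recipe as the Baichuan2 script; only the model id and the
# per-model layer class change (GLMBlock for chatglm3-6b).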
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--model_layer_cls_name GLMBlock \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select


NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--model_layer_cls_name GLMBlock \
--dataset codefuse-python-en \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \
# MASTER_PORT=12356 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path ZhipuAI/chatglm3-6b \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 4 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
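# TORCHACC_TRIM_GRAPH=1 presumably splits the traced graph so that
# individual XLA compilations stay smaller; it appears only in the
# Llama recipes here.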
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

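# Llama-2 additionally needs an explicit --template_type llama; the
# remaining flags mirror the Baichuan2/chatglm3 dp recipes.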
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,36 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 24 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model_id_or_path modelscope/Llama-2-13b-chat-ms \
--dataset codefuse-python-en \
--sft_type lora \
--dtype AUTO \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,37 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
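# Pinning XLA_COORDINATOR_PORT presumably avoids port clashes with the
# XLA distributed coordinator when several runs share the same host.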
export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=21779 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--report_to 'none'
@@ -0,0 +1,37 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
# export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--tuner_backend 'peft' \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 2 \
--report_to 'none'