# Patch summary for .circleci/config.yml (two hunks).
#
# Hunk 1 (under `commands:`): shortens the `layers` parameter default by
#   dropping every `gsm_`-prefixed layer name, and adds a new string
#   parameter `grad_sample_modes` whose default is "baseline hooks".
#   The owning command name is outside the hunk context -- presumably
#   `benchmark_layers_integration_test`; confirm against the full file.
#
# Hunk 2 (under `jobs:`): for each of the ten
#   `benchmark_layers_integration_test` steps, removes `gsm_*` entries from
#   `layers` and adds `grad_sample_modes: "baseline hooks"`. All
#   runtime/memory ratio thresholds are left untouched.
#
# NOTE(review): steps that used to pair a plain layer with a `gsm_dp*`
#   layer (e.g. "mha gsm_dpmha", "gru gsm_dpgru", "lstm gsm_dplstm",
#   "rnn gsm_dprnn") now list only the plain layer. Confirm the benchmark
#   harness still exercises the DP variant under the `hooks` mode, since
#   the existing thresholds were tuned for those pairs.
# NOTE(review): the hunk-2 header declares 52 old / 62 new lines but only
#   51 / 61 are visible here -- presumably a blank context line was lost
#   when this patch's whitespace was collapsed; verify before applying.
diff --git a/.circleci/config.yml b/.circleci/config.yml index c095373e..7c41ad3a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -247,7 +247,10 @@ commands: default: "cpu" type: string layers: - default: "mha dpmha gsm_dpmha embedding gsm_embedding instancenorm gsm_instancenorm groupnorm gsm_groupnorm layernorm gsm_layernorm lstm dplstm gsm_dplstm rnn dprnn gsm_dprnn linear gsm_linear gru dpgru gsm_dpgru" + default: "mha dpmha embedding instancenorm groupnorm layernorm lstm dplstm rnn dprnn linear gru dpgru" + type: string + grad_sample_modes: + default: "baseline hooks" type: string runtime_ratio_threshold: default: "7.0" @@ -363,52 +366,62 @@ jobs: - run_nvidia_smi - benchmark_layers_integration_test: device: "cuda" - layers: "groupnorm gsm_groupnorm instancenorm gsm_instancenorm layernorm gsm_layernorm mha dpmha" + layers: "groupnorm instancenorm layernorm mha dpmha" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "3.0" memory_ratio_threshold: "1.6" - benchmark_layers_integration_test: device: "cuda" - layers: "linear gsm_linear" + layers: "linear" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "3.6" memory_ratio_threshold: "13.0" - benchmark_layers_integration_test: device: "cuda" - layers: "mha gsm_dpmha" + layers: "mha" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "3.5" memory_ratio_threshold: "2.0" - benchmark_layers_integration_test: device: "cuda" layers: "gru dpgru" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "18.5" memory_ratio_threshold: "1.2" - benchmark_layers_integration_test: device: "cuda" - layers: "gru gsm_dpgru" + layers: "gru" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "40" memory_ratio_threshold: "1.6" - benchmark_layers_integration_test: device: "cuda" layers: "lstm dplstm" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "16.5" memory_ratio_threshold: "1.2" - benchmark_layers_integration_test: device: "cuda" - layers: "lstm 
gsm_dplstm" + layers: "lstm" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "38.0" memory_ratio_threshold: "1.8" - benchmark_layers_integration_test: device: "cuda" layers: "rnn dprnn" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "10.0" memory_ratio_threshold: "1.2" - benchmark_layers_integration_test: device: "cuda" - layers: "rnn gsm_dprnn" + layers: "rnn" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "33.0" memory_ratio_threshold: "1.2" - benchmark_layers_integration_test: device: "cuda" - layers: "embedding gsm_embedding" + layers: "embedding" + grad_sample_modes: "baseline hooks" runtime_ratio_threshold: "8.0" memory_ratio_threshold: "15.0"