diff --git a/.azure_pipelines/performance_check/best_metrics.json b/.azure_pipelines/performance_check/best_metrics.json new file mode 100644 index 0000000000..36890b63be --- /dev/null +++ b/.azure_pipelines/performance_check/best_metrics.json @@ -0,0 +1,62 @@ +{ + "bert": { + "cpu": { + "accuracy-accuracy": 0.88, + "latency-avg": { + "8272CL": 15.5, + "E5-2673": 44.5, + "8171M": 20.89, + "8370C": 18 + } + }, + "gpu": { + "accuracy-accuracy": 0.9, + "latency-avg": 1.61 + } + }, + "deberta": { + "cpu": { + "accuracy-accuracy": 0.84, + "latency-avg": { + "8272CL": 57.74, + "E5-2673": 117.00, + "8171M": 93.37, + "8370C": 0 + } + }, + "gpu": { + "accuracy-accuracy": 0.88, + "latency-avg": 8.011 + } + }, + "distilbert": { + "cpu": { + "accuracy-accuracy": 0.94, + "latency-avg": { + "8272CL": 4.5, + "E5-2673": 11.0, + "8171M": 5.9, + "8370C": 4.57 + } + }, + "gpu": { + "accuracy-accuracy": 0.94, + "latency-avg": 0.91 + } + }, + "roberta_large": { + "cpu": { + "accuracy-accuracy": 0.88, + "latency-avg": { + "8272CL": 52.38, + "E5-2673": 140.34, + "8171M": 69.33, + "8370C": 0 + } + }, + "gpu": { + "accuracy-accuracy": 0.89, + "latency-avg": 6.164 + } + } +} diff --git a/.azure_pipelines/performance_check/configs/bert.json b/.azure_pipelines/performance_check/configs/bert.json index b8ffee925d..4544781596 100644 --- a/.azure_pipelines/performance_check/configs/bert.json +++ b/.azure_pipelines/performance_check/configs/bert.json @@ -25,15 +25,13 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}} - ] + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.05}} ] }, { "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}} - ] + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 5}} ] } ] } @@ -81,7 +79,7 @@ 
"clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/bert_ptq" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/bert_ptq" } } diff --git a/.azure_pipelines/performance_check/configs/bert_gpu.json b/.azure_pipelines/performance_check/configs/bert_gpu.json index 15fc41d15a..bf809bf691 100644 --- a/.azure_pipelines/performance_check/configs/bert_gpu.json +++ b/.azure_pipelines/performance_check/configs/bert_gpu.json @@ -49,9 +49,6 @@ "type": "OrtTransformersOptimization", "disable_search": true, "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, "float16": true } }, @@ -79,7 +76,7 @@ "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], "clean_cache": true, - "cache_dir": "cache", - "output_dir" : "models/bert_gpu" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/bert_gpu" } } diff --git a/.azure_pipelines/performance_check/configs/deberta.json b/.azure_pipelines/performance_check/configs/deberta.json index 894065962d..847707936a 100644 --- a/.azure_pipelines/performance_check/configs/deberta.json +++ b/.azure_pipelines/performance_check/configs/deberta.json @@ -30,7 +30,7 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1} + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.05}} ] }, @@ -53,17 +53,12 @@ }, "transformers_optimization": { "type": "OrtTransformersOptimization", - "disable_search": true, - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } + "disable_search": true }, "quantization": { "type": "OnnxQuantization", "config": { + "quant_mode": "dynamic", "quant_preprocess": true, "data_config": "__input_model_data_config__" } @@ -87,7 +82,7 @@ "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": 
["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/microsoft-deberta" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/microsoft-deberta" } } diff --git a/.azure_pipelines/performance_check/configs/deberta_gpu.json b/.azure_pipelines/performance_check/configs/deberta_gpu.json index bb9be2e749..c0a4ba8a83 100644 --- a/.azure_pipelines/performance_check/configs/deberta_gpu.json +++ b/.azure_pipelines/performance_check/configs/deberta_gpu.json @@ -30,7 +30,7 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1} + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}} ] }, @@ -38,7 +38,7 @@ "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2} + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}} ] } ] @@ -55,9 +55,6 @@ "type": "OrtTransformersOptimization", "disable_search": true, "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, "float16": true } }, @@ -85,7 +82,7 @@ "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/microsoft-deberta_cuda" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/microsoft-deberta_cuda" } } diff --git a/.azure_pipelines/performance_check/configs/distilbert.json b/.azure_pipelines/performance_check/configs/distilbert.json index b6c9b8c7bc..4a9dbecb08 100644 --- a/.azure_pipelines/performance_check/configs/distilbert.json +++ b/.azure_pipelines/performance_check/configs/distilbert.json @@ -26,14 +26,14 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1} + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}} ] }, { "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2} 
+ {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}} ] } ] @@ -48,13 +48,7 @@ }, "transformers_optimization": { "type": "OrtTransformersOptimization", - "disable_search": true, - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } + "disable_search": true }, "quantization": { "type": "OnnxQuantization", @@ -79,10 +73,11 @@ "seed": 0 } }, + "log_severity_level": 0, "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/distilbert" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/distilbert" } } diff --git a/.azure_pipelines/performance_check/configs/distilbert_gpu.json b/.azure_pipelines/performance_check/configs/distilbert_gpu.json index a7abebbe34..28096bdd99 100644 --- a/.azure_pipelines/performance_check/configs/distilbert_gpu.json +++ b/.azure_pipelines/performance_check/configs/distilbert_gpu.json @@ -49,9 +49,6 @@ "type": "OrtTransformersOptimization", "disable_search": true, "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, "float16": true } }, @@ -79,7 +76,7 @@ "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], "clean_cache": true, - "cache_dir": "cache", - "output_dir" : "models/distilbert_cuda" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/distilbert_cuda" } } diff --git a/.azure_pipelines/performance_check/configs/roberta_large.json b/.azure_pipelines/performance_check/configs/roberta_large.json index 52c36ccc59..2d2fd8fe7f 100644 --- a/.azure_pipelines/performance_check/configs/roberta_large.json +++ b/.azure_pipelines/performance_check/configs/roberta_large.json @@ -52,13 +52,7 @@ }, "transformers_optimization": { "type": "OrtTransformersOptimization", - "disable_search": true, - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - 
"float16": false - } + "disable_search": true }, "quantization": { "type": "OnnxQuantization", @@ -86,7 +80,7 @@ "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/roberta_large" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/roberta_large" } } diff --git a/.azure_pipelines/performance_check/configs/roberta_large_gpu.json b/.azure_pipelines/performance_check/configs/roberta_large_gpu.json index f79fd8374b..ef9b2f1bef 100644 --- a/.azure_pipelines/performance_check/configs/roberta_large_gpu.json +++ b/.azure_pipelines/performance_check/configs/roberta_large_gpu.json @@ -54,9 +54,6 @@ "type": "OrtTransformersOptimization", "disable_search": true, "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, "float16": true } }, @@ -84,7 +81,7 @@ "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], "clean_cache": true, - "cache_dir": "cache", - "output_dir" : "models/roberta_large" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/roberta_large" } } diff --git a/.azure_pipelines/performance_check/run_performance_check.py b/.azure_pipelines/performance_check/run_performance_check.py index ae014bd26e..99439a7207 100644 --- a/.azure_pipelines/performance_check/run_performance_check.py +++ b/.azure_pipelines/performance_check/run_performance_check.py @@ -70,6 +70,7 @@ "label_cols": ["label"], "batch_size": 1, "max_samples": 100, + "component_kwargs": {"pre_process_data": {"align_labels": True}}, }, }, "roberta_large": { @@ -92,13 +93,13 @@ "name": "accuracy", "type": "accuracy", "backend": "huggingface_metrics", - "sub_types": [{"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}], + "sub_types": [{"name": "accuracy", "priority": 1}], } LAT_METRIC = { "name": "latency", "type": "latency", - "sub_types": [{"name": "avg", "priority": 2, "goal": 
# Relative drift tolerated against the recorded best value before a run is
# flagged as a regression (fraction of the best value, not percentage points).
_ACCURACY_TOLERANCE = 0.01
_LATENCY_TOLERANCE = 0.05

# There are 4 types of cpus in our cpu pool:
#   Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
#   Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
#   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
#   Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
# CPU latency baselines are stored per SKU, keyed by these substrings.
_KNOWN_CPU_MODELS = ("8272CL", "E5-2673", "8171M", "8370C")


def regression_check(model_name, metrics, device, cpu_info, best_metric_path=None):
    """Compare measured metrics with the recorded best values and fail on regression.

    Args:
        model_name: Top-level key in the best-metrics JSON (e.g. "bert").
        metrics: Mapping with "accuracy-accuracy" and "latency-avg" values for this run.
        device: Second-level key in the best-metrics JSON ("cpu" or "gpu").
        cpu_info: Text searched for a known CPU SKU substring (main() passes `lscpu` output).
        best_metric_path: Optional path to the best-metrics JSON. Defaults to
            "best_metrics.json" next to this file.

    Returns:
        None. The check is skipped with a message when the best-metrics file does not exist.

    Raises:
        AssertionError: If any checked metric regressed beyond its tolerance.
        KeyError: If the model, device or a metric is missing from the inputs.
    """
    if best_metric_path is None:
        best_metric_path = Path(__file__).absolute().parent / "best_metrics.json"
    best_metric_path = Path(best_metric_path)
    if not best_metric_path.exists():
        print(f"Best metrics file {best_metric_path} does not exist, skip regression check")
        return

    with best_metric_path.open("r") as f:
        best_metrics = json.load(f)[model_name][device]

    regression_res = {}
    for metric_name in ("accuracy-accuracy", "latency-avg"):
        best_metric = best_metrics[metric_name]
        if device == "cpu" and metric_name == "latency-avg":
            # CPU latency depends on the machine, so pick the baseline for this SKU.
            cpu_model = next((sku for sku in _KNOWN_CPU_MODELS if sku in cpu_info), None)
            if cpu_model is None:
                # Skip only this metric: results gathered so far must still be enforced.
                print(f"Unknown cpu type {cpu_info}, skip {metric_name} regression check")
                continue
            best_metric = best_metric[cpu_model]
        if best_metric is None or best_metric == 0:
            # 0 is the "no baseline collected yet" placeholder in the metrics file.
            print(f"No data found for {metric_name}, skip its regression check")
            continue
        no_regression, percentage_change = regression_cal(
            best_metric, metrics[metric_name], metric_name.startswith("accuracy")
        )
        regression_res[metric_name] = {
            "best_metric": best_metric,
            "actual_metric": metrics[metric_name],
            "no_regression": no_regression,
            "percentage_change": percentage_change,
        }
    print(f"Regression check result: {regression_res}")

    failures = [
        f"Regression check failed for {metric_name} metric with "
        f"{metric_value['percentage_change']:+.2%} change "
        f"(best {metric_value['best_metric']}, actual {metric_value['actual_metric']})"
        for metric_name, metric_value in regression_res.items()
        if not metric_value["no_regression"]
    ]
    if failures:
        # Raised explicitly (not via `assert`) so the gate still works under `python -O`.
        raise AssertionError("; ".join(failures))


def regression_cal(best_metric, real_metric, is_acc):
    """Decide whether `real_metric` regressed relative to `best_metric`.

    Accuracy is higher-is-better and latency is lower-is-better. A run passes
    when it is worse than the best value by at most the metric's tolerance
    (1% of the best value for accuracy, 5% for latency).

    Args:
        best_metric: Recorded best value. Must be non-zero.
        real_metric: Value measured in this run.
        is_acc: True for accuracy-style metrics, False for latency-style metrics.

    Returns:
        Tuple `(no_regression, percentage_change)`, where `percentage_change` is the
        signed relative change as a fraction, e.g. -0.05 for a 5% drop.

    Raises:
        ZeroDivisionError: If `best_metric` is 0.
    """
    delta = real_metric - best_metric
    percentage_change = delta / best_metric
    # Express the change as "how much worse", so one comparison serves both directions.
    degradation = -delta if is_acc else delta
    tolerance = _ACCURACY_TOLERANCE if is_acc else _LATENCY_TOLERANCE
    no_regression = degradation <= tolerance * abs(best_metric)
    return no_regression, percentage_change
+356,7 @@ def main(): nvidia_smi = subprocess.check_output(["nvidia-smi"]) print(nvidia_smi.decode("utf-8")) print_perf_table(metric_res, device) + regression_check(model_name, metric_res["olive"], device, lscpu) if __name__ == "__main__": diff --git a/olive/engine/engine.py b/olive/engine/engine.py index 9a0102a9de..732145e5a8 100644 --- a/olive/engine/engine.py +++ b/olive/engine/engine.py @@ -748,18 +748,19 @@ def resolve_goals( for sub_type_name, goal in sub_type_goals.items(): # TODO(trajep): make the logic cleaner resolved_goal_value = None - baseline_sub_type = baseline.get_value(metric_name, sub_type_name) - multiplier = multipliers[metric_name][sub_type_name] - if goal.type == "threshold": - resolved_goal_value = goal.value - elif goal.type == "max-degradation": - resolved_goal_value = baseline_sub_type - multiplier * goal.value - elif goal.type == "min-improvement": - resolved_goal_value = baseline_sub_type + multiplier * goal.value - elif goal.type == "percent-max-degradation": - resolved_goal_value = baseline_sub_type * (1 - multiplier * goal.value / 100) - elif goal.type == "percent-min-improvement": - resolved_goal_value = baseline_sub_type * (1 + multiplier * goal.value / 100) + if goal is not None: + baseline_sub_type = baseline.get_value(metric_name, sub_type_name) + multiplier = multipliers[metric_name][sub_type_name] + if goal.type == "threshold": + resolved_goal_value = goal.value + elif goal.type == "max-degradation": + resolved_goal_value = baseline_sub_type - multiplier * goal.value + elif goal.type == "min-improvement": + resolved_goal_value = baseline_sub_type + multiplier * goal.value + elif goal.type == "percent-max-degradation": + resolved_goal_value = baseline_sub_type * (1 - multiplier * goal.value / 100) + elif goal.type == "percent-min-improvement": + resolved_goal_value = baseline_sub_type * (1 + multiplier * goal.value / 100) resolved_goals[joint_metric_key(metric_name, sub_type_name)] = resolved_goal_value if len(resolved_goals) > 
0: diff --git a/olive/model/hf_mappings.py b/olive/model/hf_mappings.py index 71eeea8965..754c2b2420 100644 --- a/olive/model/hf_mappings.py +++ b/olive/model/hf_mappings.py @@ -72,14 +72,16 @@ # To extend following list/map from huggingface config # there is the priority order: NUM_HEADS_NAMES[0] and HIDDEN_SIZE_NAMES[0] are the first choice # which means user can override the value in config file -NUM_HEADS_NAMES = ["num_heads", "num_attention_heads", "n_head", "encoder_attention_heads"] -HIDDEN_SIZE_NAMES = ["hidden_size", "d_model", "n_embd"] +NUM_HEADS_NAMES = ["num_heads", "num_attention_heads", "n_head", "n_heads", "encoder_attention_heads"] +HIDDEN_SIZE_NAMES = ["hidden_size", "dim", "d_model", "n_embd"] MODEL_TYPE_MAPPING = { "whisper": "bart", "camembert": "bert", "deberta": "bert", "deberta-v2": "bert", + "distilbert": "bert", "gpt_neox": "gpt2", "gpt-j": "gpt2", "llama": "gpt2", + "roberta": "bert", }