From 2b648f1fd020e428d577db0c81a25c900c757454 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Wed, 14 Jun 2023 18:42:15 +0000 Subject: [PATCH 01/67] added code in utils.py-docker --- olive/systems/docker/utils.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/olive/systems/docker/utils.py b/olive/systems/docker/utils.py index 011bffebb7..f58ac0bbb3 100644 --- a/olive/systems/docker/utils.py +++ b/olive/systems/docker/utils.py @@ -82,8 +82,17 @@ def create_metric_volumes_list(metrics: List[Metric], container_root_path: Path, def create_model_mount(model: OliveModel, container_root_path: Path): - model_mount_path = str(container_root_path / Path(model.model_path).name) - model_mount_str = f"{str(Path(model.model_path).resolve())}:{model_mount_path}" + model_resource_path = None + if not model.model_resource_path: + model_resource_path = None + elif model.model_resource_path.is_local_resource() or model.model_resource_path.is_string_name(): + model_resource_path = model.model_resource_path + else: + assert model.local_model_path, "local model path not set" + model_resource_path = model.local_model_path + model_path = model_resource_path.get_path() + model_mount_path = str(container_root_path / Path(model_path).name) + model_mount_str = f"{str(Path(model_path).resolve())}:{model_mount_path}" model_mount_str_list = [model_mount_str] if model.framework == Framework.PYTORCH: From 0a9be4b0a01b9dbe1d986715a505f8e0ecd3ad87 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Wed, 21 Jun 2023 18:23:11 +0000 Subject: [PATCH 02/67] created scripts and test file to run on CI --- .../olive-perf-monitoring-template.yaml | 56 ++++++++++++++ .azure_pipelines/olive-ci.yaml | 9 +++ .azure_pipelines/perfmonitoring-ci .yaml | 46 +++++++++++ Makefile | 4 + perf_monitoring/bert_workflow_cpu.json | 77 +++++++++++++++++++ perf_monitoring/requirements.txt | 10 +++ .../test_perf_monitoring_bert_cpu.py | 28 +++++++ perf_monitoring/utils.py | 29 
+++++++ scripts/perf_monitoring.bat | 29 +++++++ scripts/perf_monitoring.sh | 26 +++++++ 10 files changed, 314 insertions(+) create mode 100644 .azure_pipelines/job_templates/olive-perf-monitoring-template.yaml create mode 100644 .azure_pipelines/perfmonitoring-ci .yaml create mode 100644 perf_monitoring/bert_workflow_cpu.json create mode 100644 perf_monitoring/requirements.txt create mode 100644 perf_monitoring/test_perf_monitoring_bert_cpu.py create mode 100644 perf_monitoring/utils.py create mode 100644 scripts/perf_monitoring.bat create mode 100644 scripts/perf_monitoring.sh diff --git a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml new file mode 100644 index 0000000000..e86a5820f5 --- /dev/null +++ b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml @@ -0,0 +1,56 @@ +# Olive performance monitoring template on Azure DevOps + +parameters: + name: '' + pool: '' + +jobs: +- job: ${{parameters.name}}_Examples_performance_monitoring_olive + timeoutInMinutes: 300 + pool: + name: ${{ parameters.pool}} + strategy: + matrix: + ${{ insert }}: ${{ parameters.examples }} + variables: + WINDOWS: ${{ parameters.windows }} + runCodesignValidationInjection: false + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: 3.8 + displayName: Use Python 3.8 + + - script: make install-olive PIPELINE=True INSTALL_EXTRAS=[cpu] + displayName: Install Olive + + - task: AzureCLI@1 + inputs: + azureSubscription: $(OLIVE_RG_SERVICE_CONNECTION) + scriptLocation: 'inlineScript' + inlineScript: make perf-monitoring PIPELINE=True WINDOWS=$(WINDOWS) PERF_MONITORING_SCRIPT_NAME=$(perfMonitoringScriptName) + displayName: performance monitoring + env: + OLIVEWHEELS_STORAGE_CONNECTION_STRING: $(olive-wheels-storage-connection-string) + WORKSPACE_SUBSCRIPTION_ID: $(workspace-subscription-id) + WORKSPACE_RESOURCE_GROUP: $(workspace-resource-group) + WORKSPACE_NAME: $(workspace-name) + + 
- task: ComponentGovernanceComponentDetection@0 + inputs: + scanType: 'Register' + verbosity: 'Verbose' + alertWarningLevel: 'High' + displayName: Component Detection + + - task: PublishTestResults@2 + condition: succeededOrFailed() + inputs: + testResultsFiles: '**/*TestOlive*.xml' + testRunTitle: '$(Build.BuildNumber)[$(Agent.JobName)]' + displayName: Upload pipeline run test results + + - script: make clean WINDOWS=$(WINDOWS) + condition: always() + displayName: Clean remaining artifacts diff --git a/.azure_pipelines/olive-ci.yaml b/.azure_pipelines/olive-ci.yaml index 4ecceb9742..806d60f00f 100644 --- a/.azure_pipelines/olive-ci.yaml +++ b/.azure_pipelines/olive-ci.yaml @@ -80,3 +80,12 @@ jobs: display_name: Test Build Docs pool: $(OLIVE_POOL_UBUNTU2004) publish_docs: false + +- template: job_templates/olive-perf-monitoring-template.yaml + parameters: + name: Windows_CI + pool: $(OLIVE_POOL_WIN2019) + windows: True + examples: + bert: + perfMonitoringScriptName: perf_monitoring_bert_cpu.py diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml new file mode 100644 index 0000000000..f266e4ceff --- /dev/null +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -0,0 +1,46 @@ +trigger: + branches: + include: + - main + paths: + exclude: + - docs/* + - examples/README.md + - examples/**/README.md + - README.md + - CONTRIBUTING.md + - LICENSE +pr: + branches: + include: + - main + paths: + exclude: + - docs/* + - examples/README.md + - examples/**/README.md + - README.md + - CONTRIBUTING.md + - LICENSE + +variables: + ComponentDetection.Timeout: 2400 + +jobs: +- template: job_templates/olive-perf-monitoring-template.yaml + parameters: + name: Windows_CI + pool: $(OLIVE_POOL_WIN2019) + windows: True + examples: + bert: + perfMonitoringScriptName: perf_monitoring_bert_cpu.py + +- template: job_templates/olive-perf-monitoring-template.yaml + parameters: + name: Linux_CI + pool: $(OLIVE_POOL_UBUNTU2004) + windows: False + examples: 
+ bert: + perfMonitoringScriptName: perf_monitoring_bert_cpu.py diff --git a/Makefile b/Makefile index b52204dfb2..b135ffc8b7 100644 --- a/Makefile +++ b/Makefile @@ -48,3 +48,7 @@ test-examples: .PHONY: clean clean: git clean -dfX + +.PHONY: perf-monitoring +perf-monitoring: + $(PERFORMANCE_MONITORING_CMD) $(PIPELINE) $(CURRENT_DIR) $(PERF_MONITORING_SCRIPT_NAME) diff --git a/perf_monitoring/bert_workflow_cpu.json b/perf_monitoring/bert_workflow_cpu.json new file mode 100644 index 0000000000..0f85ea7a43 --- /dev/null +++ b/perf_monitoring/bert_workflow_cpu.json @@ -0,0 +1,77 @@ +{ + "input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "Intel/bert-base-uncased-mrpc", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "mrpc", + "split": "validation", + "input_cols": ["sentence1", "sentence2"], + "label_cols": ["label"], + "batch_size": 1 + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "sub_types": [ + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}, + {"name": "f1"} + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}, + {"name": "max"}, + {"name": "min"} + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "exhaustive" + + }, + "evaluator": "common_evaluator", + "execution_providers": 
["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/bert_workflow_cpu" + } +} diff --git a/perf_monitoring/requirements.txt b/perf_monitoring/requirements.txt new file mode 100644 index 0000000000..5585a17093 --- /dev/null +++ b/perf_monitoring/requirements.txt @@ -0,0 +1,10 @@ +azure-ai-ml +azure-identity +datasets +evaluate +docker +onnxruntime +neural-compressor +scipy +scikit-learn +transformers diff --git a/perf_monitoring/test_perf_monitoring_bert_cpu.py b/perf_monitoring/test_perf_monitoring_bert_cpu.py new file mode 100644 index 0000000000..d9ac8f710e --- /dev/null +++ b/perf_monitoring/test_perf_monitoring_bert_cpu.py @@ -0,0 +1,28 @@ +import os +from pathlib import Path + +import pytest +from utils import check_search_output, patch_config + + +@pytest.fixture(scope="module", autouse=True) +def setup(): + """setup any state specific to the execution of the given module.""" + cur_dir = Path(__file__).resolve().parent.parent + example_dir = cur_dir / "perf_monitoring" + os.chdir(example_dir) + yield + os.chdir(cur_dir) + + +@pytest.mark.parametrize( + "olive_json", + ["bert_workflow_cpu.json"], +) +def test_bert(olive_json): + print(olive_json) + from olive.workflows import run as olive_run + + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + check_search_output(footprint) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py new file mode 100644 index 0000000000..604968d65b --- /dev/null +++ b/perf_monitoring/utils.py @@ -0,0 +1,29 @@ +import json + + +def check_search_output(footprints): + """Check if the search output is valid.""" + assert footprints, "footprints is empty. The search must have failed for all accelerator specs." + for footprint in footprints.values(): + assert footprint.nodes + for v in footprint.nodes.values(): + assert all([metric_result.value > 0 for metric_result in v.metrics.value.values()]) + + +def check_no_search_output(outputs): + assert outputs, "outputs is empty. 
The run must have failed for all accelerator specs." + for output in outputs.values(): + output_metrics = output["metrics"] + for item in output_metrics.values(): + assert item.value > 0 + + +def patch_config(config_json_path: str): + """Load the config json file and patch it with default search algorithm (exhaustive)""" + with open(config_json_path, "r") as fin: + olive_config = json.load(fin) + # set default logger severity + olive_config["engine"]["log_severity_level"] = 0 + # set clean cache + olive_config["engine"]["clean_cache"] = True + return olive_config diff --git a/scripts/perf_monitoring.bat b/scripts/perf_monitoring.bat new file mode 100644 index 0000000000..11b008d251 --- /dev/null +++ b/scripts/perf_monitoring.bat @@ -0,0 +1,29 @@ +REM ------------------------------------------------------------------------- +REM Copyright (c) Microsoft Corporation. All rights reserved. +REM Licensed under the MIT License. +REM -------------------------------------------------------------------------- +@echo off + +set PIPELINE=%1 +set ROOT_DIR=%2 +set PERF_MONITORING_SCRIPT_NAME=%3 + +if "%PIPELINE%"=="True" ( + call olive-venv\\Scripts\\activate.bat || goto :error +) + +rem install pytest +call python -m pip install pytest + +rem performance monitoring +call echo "performance monitoring examples" +call python -m pip install -r %ROOT_DIR%\\perf_monitoring\\requirements.txt || goto :error + +call python -m pytest -v -s --log-cli-level=WARNING --junitxml=%ROOT_DIR%\\logs\\performance-monitoring-TestOlive.xml^ + %ROOT_DIR%\\perf_monitoring\\test_%PERF_MONITORING_SCRIPT_NAME%.py || goto :error + +goto :EOF + +:error +echo Failed with error #%errorlevel%. 
+exit /b %errorlevel% diff --git a/scripts/perf_monitoring.sh b/scripts/perf_monitoring.sh new file mode 100644 index 0000000000..9df637b8b3 --- /dev/null +++ b/scripts/perf_monitoring.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +set -eoux pipefail + +PIPELINE=$1 +ROOT_DIR=$2 +PERF_MONITORING_SCRIPT_NAME=$3 + +echo $PIPELINE +if [[ "$PIPELINE" == "True" ]]; then + set +x + source olive-venv/bin/activate + set -x +fi + +# install pytest +python -m pip install pytest + +# performance monitoring +echo "performance monitoring examples" +python -m pip install -r $ROOT_DIR/perf_monitoring/requirements.txt + +python -m pytest -v -s --log-cli-level=WARNING --junitxml=$ROOT_DIR/logs/performance-monitoring-TestOlive.xml $ROOT_DIR/perf_monitoring/test_$PERF_MONITORING_SCRIPT_NAME.py From f1d0f631393969ad71d6f0a4862beae3ed484dc7 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 22 Jun 2023 21:21:27 +0000 Subject: [PATCH 03/67] fixed bug in Makefile --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index b135ffc8b7..1318e920a0 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,7 @@ ifeq ($(WINDOWS), True) INSTALL_OLIVE_CMD = "scripts\\install_olive.bat" TEST_CMD = "scripts\\test.bat" TEST_EXAMPLES_CMD = "scripts\\test_examples.bat" + PERFORMANCE_MONITORING_CMD = "scripts\\performance_monitoring.bat" OVERWRITE_VERSION = "python scripts\\overwrite_version.py --version $(VERSION)" else CURRENT_DIR = ${CURDIR} @@ -18,6 +19,7 @@ else INSTALL_OLIVE_CMD = bash scripts/install_olive.sh TEST_CMD = bash scripts/test.sh TEST_EXAMPLES_CMD = bash scripts/test_examples.sh + PERFORMANCE_MONITORING_CMD = bash scripts/performance_monitoring.sh OVERWRITE_VERSION = python scripts/overwrite_version.py --version 
$(VERSION) endif From 876329be0dbfd5f7923e689fd40027b0a1a662e3 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 22 Jun 2023 21:29:20 +0000 Subject: [PATCH 04/67] fixed bug in Makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1318e920a0..3b0adee7d2 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ ifeq ($(WINDOWS), True) INSTALL_OLIVE_CMD = "scripts\\install_olive.bat" TEST_CMD = "scripts\\test.bat" TEST_EXAMPLES_CMD = "scripts\\test_examples.bat" - PERFORMANCE_MONITORING_CMD = "scripts\\performance_monitoring.bat" + PERFORMANCE_MONITORING_CMD = "scripts\\perf_monitoring.bat" OVERWRITE_VERSION = "python scripts\\overwrite_version.py --version $(VERSION)" else CURRENT_DIR = ${CURDIR} @@ -19,7 +19,7 @@ else INSTALL_OLIVE_CMD = bash scripts/install_olive.sh TEST_CMD = bash scripts/test.sh TEST_EXAMPLES_CMD = bash scripts/test_examples.sh - PERFORMANCE_MONITORING_CMD = bash scripts/performance_monitoring.sh + PERFORMANCE_MONITORING_CMD = bash scripts/perf_monitoring.sh OVERWRITE_VERSION = python scripts/overwrite_version.py --version $(VERSION) endif From 94837c73de995488554cf27abef9332186320fde Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 22 Jun 2023 23:11:52 +0000 Subject: [PATCH 05/67] fixing bug in Makefile --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 3b0adee7d2..6a22d8421f 100644 --- a/Makefile +++ b/Makefile @@ -52,5 +52,6 @@ clean: git clean -dfX .PHONY: perf-monitoring +perf-monitoring: logs/ perf-monitoring: $(PERFORMANCE_MONITORING_CMD) $(PIPELINE) $(CURRENT_DIR) $(PERF_MONITORING_SCRIPT_NAME) From ae8692f4b5894ba06b1bb9cb58b48a19faa42e9b Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 22 Jun 2023 23:21:44 +0000 Subject: [PATCH 06/67] fixing bug in Makefile --- .azure_pipelines/perfmonitoring-ci .yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure_pipelines/perfmonitoring-ci 
.yaml b/.azure_pipelines/perfmonitoring-ci .yaml index f266e4ceff..0df719f0b7 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -34,7 +34,7 @@ jobs: windows: True examples: bert: - perfMonitoringScriptName: perf_monitoring_bert_cpu.py + perfMonitoringScriptName: perf_monitoring_bert_cpu - template: job_templates/olive-perf-monitoring-template.yaml parameters: @@ -43,4 +43,4 @@ jobs: windows: False examples: bert: - perfMonitoringScriptName: perf_monitoring_bert_cpu.py + perfMonitoringScriptName: perf_monitoring_bert_cpu From 4c52adf9ba632732ae7b4fd52b82a237d6b92988 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 22 Jun 2023 23:58:51 +0000 Subject: [PATCH 07/67] fixing bug in Makefile --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 6a22d8421f..3b0adee7d2 100644 --- a/Makefile +++ b/Makefile @@ -52,6 +52,5 @@ clean: git clean -dfX .PHONY: perf-monitoring -perf-monitoring: logs/ perf-monitoring: $(PERFORMANCE_MONITORING_CMD) $(PIPELINE) $(CURRENT_DIR) $(PERF_MONITORING_SCRIPT_NAME) From 3602c4ee72dcf64ce5fcdcffb4c9a9bd4616b07b Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Sat, 24 Jun 2023 19:22:28 +0000 Subject: [PATCH 08/67] changed search algorithm in json --- Makefile | 1 + perf_monitoring/bert_workflow_cpu.json | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3b0adee7d2..05fa691334 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ PIPELINE ?= False INSTALL_DEV_MODE ?= False EXAMPLE_FOLDER ?= EXAMPLE_NAME ?= +PERF_MONITORING_SCRIPT_NAME ?= INSTALL_EXTRAS ?= VERSION ?= ifeq ($(WINDOWS), True) diff --git a/perf_monitoring/bert_workflow_cpu.json b/perf_monitoring/bert_workflow_cpu.json index 0f85ea7a43..e37b7d34eb 100644 --- a/perf_monitoring/bert_workflow_cpu.json +++ b/perf_monitoring/bert_workflow_cpu.json @@ -66,8 +66,11 @@ "engine": { "search_strategy": { "execution_order": "joint", - 
"search_algorithm": "exhaustive" - + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } }, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], From 6a8e79b29c07d0f25fa9c22413640c455da379dd Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Wed, 28 Jun 2023 10:43:55 +0000 Subject: [PATCH 09/67] testing for best metrics on vm --- perf_monitoring/bert_workflow_cpu.json | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/perf_monitoring/bert_workflow_cpu.json b/perf_monitoring/bert_workflow_cpu.json index e37b7d34eb..6cb92e6655 100644 --- a/perf_monitoring/bert_workflow_cpu.json +++ b/perf_monitoring/bert_workflow_cpu.json @@ -24,17 +24,15 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}, - {"name": "f1"} + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}} + ] }, { "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}, - {"name": "max"}, - {"name": "min"} + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}} ] } ] @@ -66,11 +64,7 @@ "engine": { "search_strategy": { "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } + "search_algorithm": "exhaustive" }, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], From 6293ac4b01f603a8d38aceb4cda9dfd559f35ccc Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 29 Jun 2023 04:05:07 +0000 Subject: [PATCH 10/67] w wrote script for extracting best model --- perf_monitoring/bert_workflow_cpu.json | 10 +++-- .../test_perf_monitoring_bert_cpu.py | 4 +- perf_monitoring/utils.py | 38 ++++++++++++++++++- 3 files changed, 46 insertions(+), 6 deletions(-) 
diff --git a/perf_monitoring/bert_workflow_cpu.json b/perf_monitoring/bert_workflow_cpu.json index 6cb92e6655..bff54468de 100644 --- a/perf_monitoring/bert_workflow_cpu.json +++ b/perf_monitoring/bert_workflow_cpu.json @@ -24,7 +24,7 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}} + {"name": "accuracy", "priority": 1} ] }, @@ -32,7 +32,7 @@ "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}} + {"name": "avg", "priority": 2} ] } ] @@ -64,7 +64,11 @@ "engine": { "search_strategy": { "execution_order": "joint", - "search_algorithm": "exhaustive" + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } }, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], diff --git a/perf_monitoring/test_perf_monitoring_bert_cpu.py b/perf_monitoring/test_perf_monitoring_bert_cpu.py index d9ac8f710e..ee8953ba83 100644 --- a/perf_monitoring/test_perf_monitoring_bert_cpu.py +++ b/perf_monitoring/test_perf_monitoring_bert_cpu.py @@ -2,7 +2,7 @@ from pathlib import Path import pytest -from utils import check_search_output, patch_config +from utils import extract_best_models, patch_config @pytest.fixture(scope="module", autouse=True) @@ -25,4 +25,4 @@ def test_bert(olive_json): olive_config = patch_config(olive_json) footprint = olive_run(olive_config) - check_search_output(footprint) + extract_best_models(footprint) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 604968d65b..3500095e83 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -25,5 +25,41 @@ def patch_config(config_json_path: str): # set default logger severity olive_config["engine"]["log_severity_level"] = 0 # set clean cache - olive_config["engine"]["clean_cache"] = True + olive_config["engine"]["clean_cache"] = 
False return olive_config + + +def extract_best_models(footprint): + footprint = list(footprint.values())[0] + metrics_of_interest = ["accuracy-accuracy", "latency-avg"] + # gather the metrics from all pareto frontier nodes + all_metrics = [] + # we iterate over the nodes in the pareto frontier + for node in footprint.nodes.values(): + metrics = [] + # collecting the metrics of interest + for name in metrics_of_interest: + # (value of metric * direction of comparison) + # now higher is better for all metrics + metrics.append(node.metrics.value[name].value * node.metrics.cmp_direction[name]) + all_metrics.append(metrics) + # sort the metrics + # this sorts it + sorted_metrics = sorted(all_metrics, reverse=True) + # get best metrics + # last one is the best + best_metrics = sorted_metrics[0] + save_best_metrics(best_metrics) + + +def save_best_metrics(best_metrics): + # open best metrics json + with open("best_metrics.json") as f: + data = json.load(f) + print(data[0], data[1]) + print(best_metrics[0], best_metrics[1]) + if best_metrics[0] > data[0] and best_metrics[1] < data[1]: + best_metrics = data + # save best metrics to json + with open("best_metrics.json", "w") as f: + json.dump(best_metrics, f) From 13ba055b926c08fc7788c7b544f7ffb7de51ffc0 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 29 Jun 2023 04:13:56 +0000 Subject: [PATCH 11/67] w wrote script for extracting best model --- perf_monitoring/best_metrics.json | 1 + perf_monitoring/readme.md | 7 + perf_monitoring/testtt.ipynb | 288 ++++++++++++++++++++++++++++++ 3 files changed, 296 insertions(+) create mode 100644 perf_monitoring/best_metrics.json create mode 100644 perf_monitoring/readme.md create mode 100644 perf_monitoring/testtt.ipynb diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json new file mode 100644 index 0000000000..87adabe561 --- /dev/null +++ b/perf_monitoring/best_metrics.json @@ -0,0 +1 @@ +[0.1, -20] diff --git a/perf_monitoring/readme.md 
b/perf_monitoring/readme.md new file mode 100644 index 0000000000..7e7e785873 --- /dev/null +++ b/perf_monitoring/readme.md @@ -0,0 +1,7 @@ +## Extracting best models + +-The file named testtt.ipynb is a place holder file to test and see the structure of my outputs so i could write my script +-The file named best_models.py is the script that extracts the best models from the output of the script and compared with the metrics of the original model, if the metrics of the best model is better than the original model, the best model is saved in a json named best_metrics.json. +-I added a function called extract best models in my utils file to extract the best models from the output of the script and compared with the metrics of the original model, if the metrics of the best model is better than the original model, the best model is saved in a json named best_metrics.json. +### How to run the script +Use pytest -s -v to run the script with the script name. diff --git a/perf_monitoring/testtt.ipynb b/perf_monitoring/testtt.ipynb new file mode 100644 index 0000000000..e17b1aa1fc --- /dev/null +++ b/perf_monitoring/testtt.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/emmanuel/.conda/envs/emmanuel-onnx/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-06-29 00:10:57,266] [DEBUG] [engine.py:577:resolve_goals] Resolving goals: {'accuracy': {'accuracy': None}, 'latency': {'avg': None}}\n", + "[2023-06-29 00:10:57,267] [DEBUG] [engine.py:596:resolve_goals] No baseline got as no goal is provided the the goal is threshold\n", + "[2023-06-29 00:10:57,277] [DEBUG] [engine.py:498:run_search] Step 1 with search point {'OnnxConversion': {}, 'OrtTransformersOptimization': {}, 'OnnxQuantization': {'quant_mode': 'static', 'calibrate_method': 'MinMax', 'quant_format': 'QOperator', 'MatMulConstBOnly': False, 'weight_type': 'QUInt8', 'activation_type': 'QUInt8', 'per_channel': False, 'reduce_range': False, 'optimize_model': True, 'quant_preprocess': True}, 'OrtPerfTuning': {}} ...\n", + "[2023-06-29 00:10:57,277] [INFO] [engine.py:837:_run_pass] Running pass OnnxConversion\n", + "[2023-06-29 00:10:57,278] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", + "[2023-06-29 00:10:57,279] [DEBUG] [__init__.py:582:get_dummy_inputs] Using hf_config.dataset to get dummy inputs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/emmanuel/.conda/envs/emmanuel-onnx/lib/python3.8/site-packages/optuna/samplers/_tpe/sampler.py:278: ExperimentalWarning: ``multivariate`` option is an experimental feature. The interface can change in the future.\n", + " warnings.warn(\n", + "/home/emmanuel/.conda/envs/emmanuel-onnx/lib/python3.8/site-packages/optuna/samplers/_tpe/sampler.py:289: ExperimentalWarning: ``group`` option is an experimental feature. 
The interface can change in the future.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-06-29 00:10:57,717] [DEBUG] [conversion.py:73:_run_for_config] Using hf config to get io_config for the model.\n", + "============= Diagnostic Run torch.onnx.export version 2.0.1+cu117 =============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n", + "[2023-06-29 00:11:06,068] [INFO] [engine.py:837:_run_pass] Running pass OrtTransformersOptimization\n", + "[2023-06-29 00:11:06,070] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", + "[2023-06-29 00:11:24,090] [INFO] [engine.py:837:_run_pass] Running pass OnnxQuantization\n", + "[2023-06-29 00:11:24,094] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", + "[2023-06-29 00:11:24,095] [INFO] [quantization.py:333:_run_for_config] Preprocessing model for quantization\n", + "[2023-06-29 00:12:08,588] [INFO] [engine.py:837:_run_pass] Running pass OrtPerfTuning\n", + "[2023-06-29 00:12:08,592] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. 
No need to prepare\n", + "[2023-06-29 00:12:10,386] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 0), ('ort_opt_level', 99), ('io_bind', False)]\n", + "[2023-06-29 00:12:22,750] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 1), ('ort_opt_level', 99), ('io_bind', False)]\n", + "[2023-06-29 00:12:44,367] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 208.2179\n", + "[2023-06-29 00:12:44,369] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 34.99482\n", + "[2023-06-29 00:12:44,369] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 39.99097\n", + "[2023-06-29 00:12:44,370] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 35.78461\n", + "[2023-06-29 00:12:44,370] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 27.77902\n", + "[2023-06-29 00:12:44,370] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.60358\n", + "[2023-06-29 00:12:44,371] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.29312\n", + "[2023-06-29 00:12:44,371] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.40254\n", + "[2023-06-29 00:12:44,372] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 31.82336\n", + "[2023-06-29 00:12:44,372] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 205.92308\n", + "[2023-06-29 00:12:44,372] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 30.58837\n", + "[2023-06-29 00:12:44,373] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 39.44375\n", + "[2023-06-29 00:12:44,373] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 31.32929\n", + "[2023-06-29 00:12:44,373] [INFO] [perf_tuning.py:115:tune_onnx_model] Best result: {'test_name': \"execution_provider_[('CPUExecutionProvider', {})]_session_options_{'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 
'inter_op_num_threads': 1, 'intra_op_num_threads': None}__io_bind_False\", 'execution_provider': [('CPUExecutionProvider', {})], 'session_options': {'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': None}, 'io_bind': False, 'latency_ms': 27.77902}\n", + "[2023-06-29 00:12:44,376] [DEBUG] [engine.py:964:_evaluate_model] Evaluating model ...\n", + "[2023-06-29 00:12:44,377] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", + "[2023-06-29 00:12:58,223] [DEBUG] [engine.py:819:_run_passes] Signal: {'accuracy-accuracy': 0.8357843137254902, 'latency-avg': 28.00311}\n", + "[2023-06-29 00:12:58,228] [DEBUG] [engine.py:498:run_search] Step 2 with search point {'OnnxConversion': {}, 'OrtTransformersOptimization': {}, 'OnnxQuantization': {'quant_mode': 'dynamic', 'calibrate_method': , 'quant_format': , 'MatMulConstBOnly': True, 'weight_type': 'QUInt8', 'activation_type': , 'per_channel': True, 'reduce_range': True, 'optimize_model': True, 'quant_preprocess': True}, 'OrtPerfTuning': {}} ...\n", + "[2023-06-29 00:12:58,228] [INFO] [engine.py:837:_run_pass] Running pass OnnxConversion\n", + "[2023-06-29 00:12:58,230] [DEBUG] [engine.py:845:_run_pass] Loading model from cache ...\n", + "[2023-06-29 00:12:58,233] [INFO] [engine.py:837:_run_pass] Running pass OrtTransformersOptimization\n", + "[2023-06-29 00:12:58,235] [DEBUG] [engine.py:845:_run_pass] Loading model from cache ...\n", + "[2023-06-29 00:12:58,237] [INFO] [engine.py:837:_run_pass] Running pass OnnxQuantization\n", + "[2023-06-29 00:12:58,241] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. 
No need to prepare\n", + "[2023-06-29 00:12:58,244] [INFO] [quantization.py:336:_run_for_config] Already processed model for quantization, skipping preprocessing\n", + "[2023-06-29 00:13:29,778] [INFO] [engine.py:837:_run_pass] Running pass OrtPerfTuning\n", + "[2023-06-29 00:13:29,782] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", + "[2023-06-29 00:13:31,522] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 0), ('ort_opt_level', 99), ('io_bind', False)]\n", + "[2023-06-29 00:13:42,631] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 1), ('ort_opt_level', 99), ('io_bind', False)]\n", + "[2023-06-29 00:14:01,500] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 205.27577\n", + "[2023-06-29 00:14:01,501] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 29.17117\n", + "[2023-06-29 00:14:01,502] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.52153\n", + "[2023-06-29 00:14:01,502] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 28.28242\n", + "[2023-06-29 00:14:01,503] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 25.07949\n", + "[2023-06-29 00:14:01,503] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 38.26937\n", + "[2023-06-29 00:14:01,504] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.00565\n", + "[2023-06-29 00:14:01,504] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 35.17373\n", + "[2023-06-29 00:14:01,504] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 30.54845\n", + "[2023-06-29 00:14:01,505] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 202.92866\n", + "[2023-06-29 00:14:01,505] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 26.6119\n", + "[2023-06-29 00:14:01,506] [DEBUG] [perf_tuning.py:112:tune_onnx_model] 
Tuning result: 38.80204\n", + "[2023-06-29 00:14:01,506] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 29.87976\n", + "[2023-06-29 00:14:01,507] [INFO] [perf_tuning.py:115:tune_onnx_model] Best result: {'test_name': \"execution_provider_[('CPUExecutionProvider', {})]_session_options_{'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': None}__io_bind_False\", 'execution_provider': [('CPUExecutionProvider', {})], 'session_options': {'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': None}, 'io_bind': False, 'latency_ms': 25.07949}\n", + "[2023-06-29 00:14:01,509] [DEBUG] [engine.py:964:_evaluate_model] Evaluating model ...\n", + "[2023-06-29 00:14:01,510] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", + "[2023-06-29 00:14:15,862] [DEBUG] [engine.py:819:_run_passes] Signal: {'accuracy-accuracy': 0.8455882352941176, 'latency-avg': 24.17589}\n", + "[2023-06-29 00:14:15,867] [DEBUG] [engine.py:498:run_search] Step 3 with search point {'OnnxConversion': {}, 'OrtTransformersOptimization': {}, 'OnnxQuantization': {'quant_mode': 'dynamic', 'calibrate_method': , 'quant_format': , 'MatMulConstBOnly': True, 'weight_type': 'QInt8', 'activation_type': , 'per_channel': True, 'reduce_range': True, 'optimize_model': False, 'quant_preprocess': True}, 'OrtPerfTuning': {}} ...\n", + "[2023-06-29 00:14:15,867] [INFO] [engine.py:837:_run_pass] Running pass OnnxConversion\n", + "[2023-06-29 00:14:15,868] [DEBUG] [engine.py:845:_run_pass] Loading model from cache ...\n", + "[2023-06-29 00:14:15,871] [INFO] [engine.py:837:_run_pass] Running pass OrtTransformersOptimization\n", + "[2023-06-29 00:14:15,872] [DEBUG] [engine.py:845:_run_pass] Loading model from cache ...\n", + "[2023-06-29 00:14:15,875] [INFO] [engine.py:837:_run_pass] Running pass 
OnnxQuantization\n", + "[2023-06-29 00:14:15,878] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", + "[2023-06-29 00:14:15,880] [INFO] [quantization.py:336:_run_for_config] Already processed model for quantization, skipping preprocessing\n", + "[2023-06-29 00:14:45,593] [INFO] [engine.py:837:_run_pass] Running pass OrtPerfTuning\n", + "[2023-06-29 00:14:45,596] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", + "[2023-06-29 00:14:47,326] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 0), ('ort_opt_level', 99), ('io_bind', False)]\n", + "[2023-06-29 00:14:58,233] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 1), ('ort_opt_level', 99), ('io_bind', False)]\n", + "[2023-06-29 00:15:17,108] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 196.05226\n", + "[2023-06-29 00:15:17,109] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 26.61709\n", + "[2023-06-29 00:15:17,110] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 36.43117\n", + "[2023-06-29 00:15:17,110] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 28.41241\n", + "[2023-06-29 00:15:17,110] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 29.891\n", + "[2023-06-29 00:15:17,111] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 33.73918\n", + "[2023-06-29 00:15:17,111] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 33.83653\n", + "[2023-06-29 00:15:17,111] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 34.48713\n", + "[2023-06-29 00:15:17,112] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 30.04336\n", + "[2023-06-29 00:15:17,112] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 197.81958\n", + "[2023-06-29 00:15:17,113] [DEBUG] 
[perf_tuning.py:112:tune_onnx_model] Tuning result: 25.34003\n", + "[2023-06-29 00:15:17,113] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 38.36773\n", + "[2023-06-29 00:15:17,113] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 27.06558\n", + "[2023-06-29 00:15:17,114] [INFO] [perf_tuning.py:115:tune_onnx_model] Best result: {'test_name': \"execution_provider_[('CPUExecutionProvider', {})]_session_options_{'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': 12}__io_bind_False\", 'execution_provider': [('CPUExecutionProvider', {})], 'session_options': {'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': 12}, 'io_bind': False, 'latency_ms': 25.34003}\n", + "[2023-06-29 00:15:17,117] [DEBUG] [engine.py:964:_evaluate_model] Evaluating model ...\n", + "[2023-06-29 00:15:17,117] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. 
No need to prepare\n", + "[2023-06-29 00:15:30,319] [DEBUG] [engine.py:819:_run_passes] Signal: {'accuracy-accuracy': 0.8455882352941176, 'latency-avg': 24.7194}\n", + "[2023-06-29 00:15:30,324] [INFO] [footprint.py:167:get_pareto_frontier] pareto frontier points: 5_OrtPerfTuning-4-600614b69719e936ca21efbf07971aec {'accuracy-accuracy': 0.8455882352941176, 'latency-avg': 24.17589}\n", + "[2023-06-29 00:15:30,324] [INFO] [engine.py:513:run_search] Output all 1 models\n", + "[2023-06-29 00:15:30,325] [INFO] [engine.py:337:run] No packaging config provided, skip packaging artifacts\n" + ] + } + ], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "from utils import check_search_output, patch_config\n", + "\n", + "\n", + "from olive.workflows import run as olive_run\n", + "\n", + "olive_config = patch_config(\"bert_workflow_cpu.json\")\n", + "footprint = olive_run(olive_config)\n", + "check_search_output(footprint)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "value=MetricResult(__root__={'accuracy-accuracy': SubMetricResult(value=0.8455882352941176, priority=1, higher_is_better=True), 'latency-avg': SubMetricResult(value=24.17589, priority=2, higher_is_better=False)}) cmp_direction={'accuracy-accuracy': 1, 'latency-avg': -1} is_goals_met=True\n", + "('value', MetricResult(__root__={'accuracy-accuracy': SubMetricResult(value=0.8455882352941176, priority=1, higher_is_better=True), 'latency-avg': SubMetricResult(value=24.17589, priority=2, higher_is_better=False)}))\n", + "('cmp_direction', {'accuracy-accuracy': 1, 'latency-avg': -1})\n", + "('is_goals_met', True)\n" + ] + } + ], + "source": [ + "footprint\n", + "#get first item from dict\n", + "fooott = list(footprint.values())[0] \n", + "\n", + "for node in fooott.nodes.values():\n", + " print(node.metrics)\n", + " for node in node.metrics:\n", + " print(node)\n", + " # 
print(type(node.metrics.value))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "from olive.engine.footprint import Footprint" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "pf = Footprint.from_file(\"models/bert_workflow_cpu/cpu-cpu_pareto_frontier_footprints.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.8455882352941176, -24.17589]" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics_of_interest = ['accuracy-accuracy', 'latency-avg']\n", + "# gather the metrics from all pareto frontier nodes\n", + "all_metrics = []\n", + "# we iterate over the nodes in the pareto frontier\n", + "for node in pf.nodes.values():\n", + " metrics = []\n", + " # collecting the metrics of interest\n", + " for name in metrics_of_interest:\n", + " # (value of metric * direction of comparison)\n", + " # now higher is better for all metrics\n", + " metrics.append(node.metrics.value[name].value * node.metrics.cmp_direction[name])\n", + " all_metrics.append(metrics)\n", + "# sort the metrics\n", + "# this sorts it\n", + "sorted_metrics = sorted(all_metrics, reverse=True)\n", + "# get best metrics\n", + "# last one is the best\n", + "best_metrics = sorted_metrics[0]\n", + "best_metrics\n" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1 -20\n", + "0.8455882352941176 -24.17589\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "#open best metrics json\n", + "with open('best_metrics.json') as f:\n", + " data = json.load(f)\n", + " print(data[0], data[1])\n", + " print(best_metrics[0], best_metrics[1])\n", + " if best_metrics[0] > data[0] and best_metrics[1] < data[1]:\n", + " 
best_metrics = data\n", + "#save best metrics to json\n", + "with open('best_metrics.json', 'w') as f:\n", + " json.dump(best_metrics, f)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "emmanuel-onnx", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From adecf5cfaa19bcc859026cf434ee7a5b0690ae82 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 6 Jul 2023 02:52:29 +0000 Subject: [PATCH 12/67] fixing error in CamemBERT test --- perf_monitoring/bert_workflow_cpu.json | 78 ------------------- perf_monitoring/best_metrics.json | 6 +- perf_monitoring/requirements.txt | 3 + .../test_perf_monitoring_bert_cpu.py | 28 ------- perf_monitoring/utils.py | 31 +++++--- 5 files changed, 29 insertions(+), 117 deletions(-) delete mode 100644 perf_monitoring/bert_workflow_cpu.json delete mode 100644 perf_monitoring/test_perf_monitoring_bert_cpu.py diff --git a/perf_monitoring/bert_workflow_cpu.json b/perf_monitoring/bert_workflow_cpu.json deleted file mode 100644 index bff54468de..0000000000 --- a/perf_monitoring/bert_workflow_cpu.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "Intel/bert-base-uncased-mrpc", - "task": "text-classification", - "dataset": { - "data_name":"glue", - "subset": "mrpc", - "split": "validation", - "input_cols": ["sentence1", "sentence2"], - "label_cols": ["label"], - "batch_size": 1 - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": 
"latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/bert_workflow_cpu" - } -} diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index 87adabe561..b35cf90eb6 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1 +1,5 @@ -[0.1, -20] +{ + "bert": [], + "CamemBERT": [] + +} diff --git a/perf_monitoring/requirements.txt b/perf_monitoring/requirements.txt index 5585a17093..dc90059d04 100644 --- a/perf_monitoring/requirements.txt +++ b/perf_monitoring/requirements.txt @@ -8,3 +8,6 @@ neural-compressor scipy scikit-learn transformers +sentencepiece +evaluate +seqeval diff --git a/perf_monitoring/test_perf_monitoring_bert_cpu.py b/perf_monitoring/test_perf_monitoring_bert_cpu.py deleted file mode 100644 index ee8953ba83..0000000000 --- a/perf_monitoring/test_perf_monitoring_bert_cpu.py +++ /dev/null @@ -1,28 +0,0 @@ -import os -from pathlib import Path - -import pytest -from utils import extract_best_models, patch_config - - -@pytest.fixture(scope="module", autouse=True) -def setup(): - """setup any state specific to the execution of the given module.""" - cur_dir = Path(__file__).resolve().parent.parent - example_dir = cur_dir / "perf_monitoring" - os.chdir(example_dir) 
- yield - os.chdir(cur_dir) - - -@pytest.mark.parametrize( - "olive_json", - ["bert_workflow_cpu.json"], -) -def test_bert(olive_json): - print(olive_json) - from olive.workflows import run as olive_run - - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 3500095e83..ce716570cb 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -25,11 +25,11 @@ def patch_config(config_json_path: str): # set default logger severity olive_config["engine"]["log_severity_level"] = 0 # set clean cache - olive_config["engine"]["clean_cache"] = False + olive_config["engine"]["clean_cache"] = True return olive_config -def extract_best_models(footprint): +def extract_best_models(footprint, model_name): footprint = list(footprint.values())[0] metrics_of_interest = ["accuracy-accuracy", "latency-avg"] # gather the metrics from all pareto frontier nodes @@ -49,17 +49,28 @@ def extract_best_models(footprint): # get best metrics # last one is the best best_metrics = sorted_metrics[0] - save_best_metrics(best_metrics) + print("Best metrics: ", best_metrics) + compared_metric = compare_metrics(best_metrics, model_name) + print("Compared metrics: ", compared_metric) -def save_best_metrics(best_metrics): +def no_regression(actual, expected, rel_tol): + if actual > expected: + return True + return abs(actual - expected) <= rel_tol * abs(expected) + + +def compare_metrics(best_metrics, model_name): # open best metrics json with open("best_metrics.json") as f: data = json.load(f) - print(data[0], data[1]) + model_data = data[model_name] + if len(model_data) == 0: + print("No data in best_metrics.json") + return {"accuracy": True, "latency": True} + print(model_data[0], model_data[1]) print(best_metrics[0], best_metrics[1]) - if best_metrics[0] > data[0] and best_metrics[1] < data[1]: - best_metrics = data - # save best metrics to json - with 
open("best_metrics.json", "w") as f: - json.dump(best_metrics, f) + return { + "accuracy": no_regression(best_metrics[0], model_data[0], 0.05), + "latency": no_regression(best_metrics[1], model_data[1], 0.05), + } From fb26ff539d8052cac7c9941adf3f7b88a715e6a6 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 6 Jul 2023 02:53:42 +0000 Subject: [PATCH 13/67] fixing error in CamemBERT test --- .../perf_models/CamemBERT/cpu_config.json | 112 ++++++++++ .../perf_models/CamemBERT/user_script.py | 194 ++++++++++++++++++ .../perf_models/bert/bert_workflow_cpu.json | 78 +++++++ .../test_perf_monitoring_models_cpu.py | 41 ++++ 4 files changed, 425 insertions(+) create mode 100644 perf_monitoring/perf_models/CamemBERT/cpu_config.json create mode 100644 perf_monitoring/perf_models/CamemBERT/user_script.py create mode 100644 perf_monitoring/perf_models/bert/bert_workflow_cpu.json create mode 100644 perf_monitoring/test_perf_monitoring_models_cpu.py diff --git a/perf_monitoring/perf_models/CamemBERT/cpu_config.json b/perf_monitoring/perf_models/CamemBERT/cpu_config.json new file mode 100644 index 0000000000..20d089bdeb --- /dev/null +++ b/perf_monitoring/perf_models/CamemBERT/cpu_config.json @@ -0,0 +1,112 @@ +{ + "verbose": true, + "input_model": { + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "Jean-Baptiste/camembert-ner", + "task": "text-classification", + "dataset": { + "data_name":"Jean-Baptiste/wikiner_fr", + "split": "test", + "input_cols": ["input_ids", "attention_mask"], + "label_cols": ["label"], + "batch_size": 1 + } + } + + } + }, + + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "sub_types": [ + {"name": "accuracy", "priority": 1} + + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + 
"input_names": [ + "input_ids", + "attention_mask" + ], + "input_shapes": [ + [ + 1, + 128 + ], + [ + 1, + 128 + ] + ], + "input_types": [ + "int64", + "int64" + ], + "output_names": [ + "output" + ], + "dynamic_axes": { + "input_ids": { + "0": "batch_size", + "1": "seq_length" + }, + "attention_mask": { + "0": "batch_size", + "1": "seq_length" + } + }, + "target_opset": 17 + } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxDynamicQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 5, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "clean_cache": false, + "cache_dir": "cache", + "output_dir" : "models/CamemBERT_workflow_cpu.json" + } +} diff --git a/perf_monitoring/perf_models/CamemBERT/user_script.py b/perf_monitoring/perf_models/CamemBERT/user_script.py new file mode 100644 index 0000000000..3adb5ad0e9 --- /dev/null +++ b/perf_monitoring/perf_models/CamemBERT/user_script.py @@ -0,0 +1,194 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import evaluate +import numpy +import torch +from datasets import load_dataset +from onnxruntime.quantization import CalibrationDataReader +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate +from tqdm import tqdm +from transformers import AutoTokenizer, CamembertForTokenClassification + +from olive.constants import Framework + +# https://huggingface.co/Jean-Baptiste/camembert-ner +model_name = "Jean-Baptiste/camembert-ner" +dataset_name = "Jean-Baptiste/wikiner_fr" +split = "test" + + +class CalibrationDataLoader(CalibrationDataReader): + def __init__(self, dataloader, post_func, num_samplers=100): + self.dataloader = dataloader + self.iter = iter(dataloader) + self.post_func = post_func + self.counter = 0 + self.num_samplers = num_samplers + + def get_next(self): + if self.counter >= self.num_samplers: + return None + self.counter += 1 + if self.iter is None: + self.iter = iter(self.dataloader) + try: + return self.post_func(next(self.iter)) + except StopIteration: + return None + + def rewind(self): + self.iter = None + self.counter = 0 + + +# -------------------- model ------------------- +def load_model(model_path=None): + model = CamembertForTokenClassification.from_pretrained(model_name) + model = model.to("cpu") + return model + + +# -------------------- dataset ------------------- +def align_labels_with_tokens(labels, word_ids): + new_labels = [] + current_word = None + for word_id in word_ids: + if word_id != current_word: + # Start of a new word! 
+ current_word = word_id + label = 0 if word_id is None else labels[word_id] + new_labels.append(label) + elif word_id is None: + # Special token + new_labels.append(0) + else: + # Same word as previous token + label = labels[word_id] + # If the label is B-XXX we change it to I-XXX + if label % 2 == 1: + label += 1 + new_labels.append(label) + + return new_labels + + +def tokenize_and_align_labels(examples): + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenized_inputs = tokenizer( + examples["tokens"], + truncation=True, + padding=True, + is_split_into_words=True, + add_special_tokens=False, + return_tensors="pt", + ) + all_labels = examples["ner_tags"] + new_labels = [] + for i, labels in enumerate(all_labels): + word_ids = tokenized_inputs.word_ids(i) + new_labels.append(align_labels_with_tokens(labels, word_ids)) + + tokenized_inputs["labels"] = torch.LongTensor(new_labels) + return tokenized_inputs + + +def create_evaluation_dataset(): + dataset = load_dataset(dataset_name, split=split) + tokenized_datasets = dataset.map( + tokenize_and_align_labels, + batched=True, + remove_columns=dataset.column_names, + ) + tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) + + class _Dateset(Dataset): + def __init__(self, dataset): + self.dataset = dataset + + def __getitem__(self, index): + return self.dataset[index], self.dataset[index]["labels"] + + def __len__(self): + # return 5 + return len(self.dataset) + + return _Dateset(tokenized_datasets) + + +def create_dataloader(data_dir="", batch_size=2): + def _collate_fn(batch): + batch = default_collate(batch) + return batch + + dataset = create_evaluation_dataset() + return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) + + +def create_cali_dataloader(): + def _post_func(sampler): + return sampler + + dataloader = create_dataloader() + cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) + return cali_dataloader + + +# 
-------------------- post process ------------------- +def _convert_idx_to_ner_tags(labels): + id2label = {0: "O", 1: "I-LOC", 2: "I-PER", 3: "I-MISC", 4: "I-ORG"} + return [id2label[t.item()] for t in labels] + + +def post_process(model_output, model): + if model.framework == Framework.ONNX: + logits = model_output[0] + else: + logits = model_output.logits + predicted_token_class_ids = logits.argmax(-1) + predicted_tokens_classes = _convert_idx_to_ner_tags(predicted_token_class_ids[0]) + return predicted_tokens_classes + + +# -------------------- evaluations ------------------- +def _evaluate(pre, ref, computer_func=None): + if computer_func is None: + return None + return computer_func.compute(predictions=pre, references=ref) + + +def evaluate_accuracy_gpu(model, data_dir, batch_size, device="gpu"): + evaluate_accuracy(model, data_dir, batch_size, device=device) + + +def evaluate_accuracy(model, data_dir, batch_size, device): + prepared_model = model.prepare_session(inference_settings=None, device=device) + dataloader = create_dataloader(batch_size=batch_size) + seqeval = evaluate.load("seqeval") + + pre = [] + ref = [] + + for item in tqdm(dataloader): + for v in item[-1]: + ref.append(_convert_idx_to_ner_tags(v)) + + item = item[0] + if model.framework == Framework.ONNX: + input_ids = numpy.ascontiguousarray(item["input_ids"].cpu().numpy()) + attention_mask = numpy.ascontiguousarray(item["attention_mask"].cpu().numpy()) + input = {"input_ids": input_ids, "attention_mask": attention_mask} + ort_outputs = prepared_model.run(None, input) + outputs = post_process(ort_outputs, model) + pre.append(outputs) + + elif model.framework == Framework.PYTORCH: + with torch.no_grad(): + ort_outputs = prepared_model(input_ids=item["input_ids"], attention_mask=item["attention_mask"]) + outputs = post_process(ort_outputs, model) + pre.append(outputs) + _rls = _evaluate(pre, ref, seqeval) + rls = _rls["overall_accuracy"] + return rls diff --git 
a/perf_monitoring/perf_models/bert/bert_workflow_cpu.json b/perf_monitoring/perf_models/bert/bert_workflow_cpu.json new file mode 100644 index 0000000000..bff54468de --- /dev/null +++ b/perf_monitoring/perf_models/bert/bert_workflow_cpu.json @@ -0,0 +1,78 @@ +{ + "input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "Intel/bert-base-uncased-mrpc", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "mrpc", + "split": "validation", + "input_cols": ["sentence1", "sentence2"], + "label_cols": ["label"], + "batch_size": 1 + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "sub_types": [ + {"name": "accuracy", "priority": 1} + + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/bert_workflow_cpu" + } +} diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py new file mode 100644 index 0000000000..e6742f4465 --- /dev/null +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -0,0 +1,41 @@ +import os +from pathlib import Path + +import pytest +from utils import extract_best_models, 
patch_config + + +@pytest.fixture(scope="module", autouse=True) +def setup(): + """setup any state specific to the execution of the given module.""" + cur_dir = Path(__file__).resolve().parent.parent + example_dir = cur_dir / "perf_monitoring" + os.chdir(example_dir) + yield + os.chdir(cur_dir) + + +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/bert/bert_workflow_cpu.json"], +# ) +# def test_bert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run + +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "bert") + + +@pytest.mark.parametrize( + "olive_json", + ["perf_models/CamemBERT/cpu_config.json"], +) +def test_bert(olive_json): + print(olive_json) + from olive.workflows import run as olive_run + + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "CamemBERT") From 0fd885b9b0937254092ca145008ca2498fef097b Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 6 Jul 2023 13:24:43 +0000 Subject: [PATCH 14/67] updated camembert to use user_script --- .../perf_models/CamemBERT/cpu_config.json | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/perf_monitoring/perf_models/CamemBERT/cpu_config.json b/perf_monitoring/perf_models/CamemBERT/cpu_config.json index 20d089bdeb..fca05d33cd 100644 --- a/perf_monitoring/perf_models/CamemBERT/cpu_config.json +++ b/perf_monitoring/perf_models/CamemBERT/cpu_config.json @@ -3,39 +3,39 @@ "input_model": { "type": "PyTorchModel", "config": { - "hf_config": { - "model_name": "Jean-Baptiste/camembert-ner", - "task": "text-classification", - "dataset": { - "data_name":"Jean-Baptiste/wikiner_fr", - "split": "test", - "input_cols": ["input_ids", "attention_mask"], - "label_cols": ["label"], - "batch_size": 1 + "model_path": null, + "is_file": false, + "model_loader": "load_model", + "model_script": "user_script.py" } - } - } + }, 
"evaluators": { "common_evaluator": { - "metrics":[ + "metrics": [ { "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] + "type": "custom", + "is_first_priority": true, + "user_config": { + "evaluate_func": "evaluate_accuracy", + "user_script": "user_script.py", + "batch_size": 1 + }, + "goal": {"type": "max-degradation", "value": 0.01} }, { "name": "latency", "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] + "sub_type": "avg", + "user_config": { + "user_script": "user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + }, + "goal": {"type": "percent-min-improvement", "value": 20} } ] } @@ -91,7 +91,12 @@ "type": "OnnxDynamicQuantization" }, "perf_tuning": { - "type": "OrtPerfTuning" + "type": "OrtPerfTuning", + "config": { + "user_script": "user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } } }, "engine": { @@ -105,7 +110,6 @@ }, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], - "clean_cache": false, "cache_dir": "cache", "output_dir" : "models/CamemBERT_workflow_cpu.json" } From 356634e8aa8864174e20bc94e431b61031d160f4 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 13 Jul 2023 19:18:02 +0000 Subject: [PATCH 15/67] fixing error in text-classification models test --- .../perf_models/CamemBERT/cpu_config.json | 90 +++++----- .../perf_models/CamemBERT/user_script.py | 8 +- .../cpu_config.json | 104 +++++++++++ .../user_script.py | 155 +++++++++++++++++ .../cpu_config.json | 105 ++++++++++++ .../main.py | 7 + .../user_script.py | 153 +++++++++++++++++ .../cpu_config.json | 105 ++++++++++++ .../user_script.py | 155 +++++++++++++++++ .../roberta-large-mnli/cpu_config.json | 105 ++++++++++++ .../roberta-large-mnli/user_script.py | 155 +++++++++++++++++ .../cpu_config.json | 105 ++++++++++++ .../user_script.py | 161 ++++++++++++++++++ 
perf_monitoring/requirements.txt | 1 + .../test_perf_monitoring_models_cpu.py | 66 ++++++- perf_monitoring/utils.py | 7 +- 16 files changed, 1422 insertions(+), 60 deletions(-) create mode 100644 perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json create mode 100644 perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py create mode 100644 perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json create mode 100644 perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/main.py create mode 100644 perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py create mode 100644 perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json create mode 100644 perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py create mode 100644 perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json create mode 100644 perf_monitoring/perf_models/roberta-large-mnli/user_script.py create mode 100644 perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json create mode 100644 perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py diff --git a/perf_monitoring/perf_models/CamemBERT/cpu_config.json b/perf_monitoring/perf_models/CamemBERT/cpu_config.json index fca05d33cd..37f9c0b67a 100644 --- a/perf_monitoring/perf_models/CamemBERT/cpu_config.json +++ b/perf_monitoring/perf_models/CamemBERT/cpu_config.json @@ -3,80 +3,69 @@ "input_model": { "type": "PyTorchModel", "config": { - "model_path": null, - "is_file": false, "model_loader": "load_model", - "model_script": "user_script.py" + "model_script": "./perf_models/CamemBERT/user_script.py", + "io_config" : { + "input_names": ["input_ids", "attention_mask"], + "input_shapes": [[1, 128], [1, 128]], + "input_types": ["int64", "int64"], + "output_names": ["output"], + "dynamic_axes": { + "input_ids": {"0": "batch_size", "1": "seq_length"}, + 
"attention_mask": {"0": "batch_size", "1": "seq_length"} } - - + } + } }, - "evaluators": { "common_evaluator": { "metrics": [ { "name": "accuracy", "type": "custom", + "sub_types": [ + { + "name": "accuracy_custom", + "priority": 1, + "higher_is_better": true, + "goal": { + "type": "max-degradation", + "value": 0.01 + } + } + ], "is_first_priority": true, "user_config": { "evaluate_func": "evaluate_accuracy", - "user_script": "user_script.py", + "user_script": "./perf_models/CamemBERT/user_script.py", "batch_size": 1 - }, - "goal": {"type": "max-degradation", "value": 0.01} + } }, { "name": "latency", "type": "latency", - "sub_type": "avg", + "sub_types": [ + { + "name": "avg", + "priority": 2, + "goal": { + "type": "percent-min-improvement", + "value": 20 + } + } + ], "user_config": { - "user_script": "user_script.py", + "user_script": "./perf_models/CamemBERT/user_script.py", "dataloader_func": "create_dataloader", "batch_size": 1 - }, - "goal": {"type": "percent-min-improvement", "value": 20} + } } ] } }, "passes": { "conversion": { - "type": "OnnxConversion", - "config": { - "input_names": [ - "input_ids", - "attention_mask" - ], - "input_shapes": [ - [ - 1, - 128 - ], - [ - 1, - 128 - ] - ], - "input_types": [ - "int64", - "int64" - ], - "output_names": [ - "output" - ], - "dynamic_axes": { - "input_ids": { - "0": "batch_size", - "1": "seq_length" - }, - "attention_mask": { - "0": "batch_size", - "1": "seq_length" - } - }, - "target_opset": 17 - } + "type": "OnnxConversion" }, "transformers_optimization": { "type": "OrtTransformersOptimization", @@ -93,7 +82,7 @@ "perf_tuning": { "type": "OrtPerfTuning", "config": { - "user_script": "user_script.py", + "user_script": "./perf_models/CamemBERT/user_script.py", "dataloader_func": "create_dataloader", "batch_size": 1 } @@ -109,8 +98,7 @@ } }, "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", - "output_dir" : "models/CamemBERT_workflow_cpu.json" + 
"output_dir": "models/CamemBERT_workflow_cpu.json" } } diff --git a/perf_monitoring/perf_models/CamemBERT/user_script.py b/perf_monitoring/perf_models/CamemBERT/user_script.py index 3adb5ad0e9..7e20074572 100644 --- a/perf_monitoring/perf_models/CamemBERT/user_script.py +++ b/perf_monitoring/perf_models/CamemBERT/user_script.py @@ -112,8 +112,8 @@ def __getitem__(self, index): return self.dataset[index], self.dataset[index]["labels"] def __len__(self): - # return 5 - return len(self.dataset) + return 5 + # return len(self.dataset) return _Dateset(tokenized_datasets) @@ -160,10 +160,10 @@ def _evaluate(pre, ref, computer_func=None): def evaluate_accuracy_gpu(model, data_dir, batch_size, device="gpu"): - evaluate_accuracy(model, data_dir, batch_size, device=device) + evaluate_accuracy(model, data_dir, batch_size, device=device, ep=None) -def evaluate_accuracy(model, data_dir, batch_size, device): +def evaluate_accuracy(model, data_dir, batch_size, device, ep): prepared_model = model.prepare_session(inference_settings=None, device=device) dataloader = create_dataloader(batch_size=batch_size) seqeval = evaluate.load("seqeval") diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json new file mode 100644 index 0000000000..956a5a9ff9 --- /dev/null +++ b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json @@ -0,0 +1,104 @@ +{ + "verbose": true, + "input_model": { + "type": "PyTorchModel", + "config": { + "model_loader": "load_model", + "model_script": "./perf_models/bertweet-base-sentiment-analysis/user_script.py", + "io_config" : { + "input_names": ["input_ids", "attention_mask"], + "input_shapes": [[1, 128], [1, 128]], + "input_types": ["int64", "int64"], + "output_names": ["output"], + "dynamic_axes": { + "input_ids": {"0": "batch_size", "1": "seq_length"}, + "attention_mask": {"0": "batch_size", "1": "seq_length"} + } + } + } + }, + 
"evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "custom", + "sub_types": [ + { + "name": "accuracy_custom", + "priority": 1, + "higher_is_better": true, + "goal": { + "type": "max-degradation", + "value": 0.01 + } + } + ], + "is_first_priority": true, + "user_config":{ + "evaluate_func": "eval_accuracy", + "user_script": "./perf_models/bertweet-base-sentiment-analysis/user_script.py", + "batch_size": 1 + } + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + { + "name": "avg", + "priority": 2, + "goal": { + "type": "percent-min-improvement", + "value": 20 + } + } + ], + "user_config": { + "user_script": "./perf_models/bertweet-base-sentiment-analysis/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion" + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxDynamicQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning", + "config": { + "user_script": "./perf_models/bertweet-base-sentiment-analysis/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 5, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "models/bertweet-base-sentiment-analysis_workflow_cpu.json" + } +} diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py new file mode 100644 index 0000000000..577da6a156 --- /dev/null +++ b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py @@ -0,0 +1,155 @@ +# 
------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import torch +from datasets import load_dataset +from onnxruntime.quantization import CalibrationDataReader +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate +from transformers import AutoModelForSequenceClassification, AutoTokenizer + +from olive.constants import Framework +from olive.evaluator.accuracy import AccuracyScore +from olive.model import OliveModel + +# https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis +model_name = "finiteautomata/bertweet-base-sentiment-analysis" +# dataset_name = "mteb/tweet_sentiment_extraction" +dataset_name = "cardiffnlp/tweet_sentiment_multilingual" +subset = "english" +split = "test" + + +class CalibrationDataLoader(CalibrationDataReader): + def __init__(self, dataloader, post_func, num_samplers=100): + self.dataloader = dataloader + self.iter = iter(dataloader) + self.post_func = post_func + self.counter = 0 + self.num_samplers = num_samplers + + def get_next(self): + if self.counter >= self.num_samplers: + return None + self.counter += 1 + if self.iter is None: + self.iter = iter(self.dataloader) + try: + return self.post_func(next(self.iter)) + except StopIteration: + return None + + def rewind(self): + self.iter = None + self.counter = 0 + + +# -------------------- model ------------------- +def load_model(model_path=None): + model = AutoModelForSequenceClassification.from_pretrained(model_name) + model = model.to("cpu") + return model + + +# -------------------- dataset ------------------- +def tokenize_and_align_labels(examples): + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenized_inputs = tokenizer( + examples["text"], + truncation=True, + padding=True, + return_tensors="pt", + ) + 
tokenized_inputs["labels"] = examples["label"] + return tokenized_inputs + + +def create_evaluation_dataset(): + dataset = load_dataset(dataset_name, subset, split=split) + tokenized_datasets = dataset.map( + tokenize_and_align_labels, + batched=True, + remove_columns=dataset.column_names, + ) + tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"]) + + class _Dateset(Dataset): + def __init__(self, dataset): + self.dataset = dataset + + def __getitem__(self, index): + return self.dataset[index], self.dataset[index]["labels"] + + def __len__(self): + return 5 + # return len(self.dataset) + + return _Dateset(tokenized_datasets) + + +def create_dataloader(data_dir="", batch_size=2): + def _collate_fn(batch): + batch = default_collate(batch) + return batch + + dataset = create_evaluation_dataset() + return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) + + +def create_cali_dataloader(): + def _post_func(sampler): + return sampler + + dataloader = create_dataloader() + cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) + return cali_dataloader + + +def post_process(output): + import torch + import transformers + + if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): + preds = torch.argmax(output.logits, dim=-1) + else: + preds = torch.argmax(output, dim=-1) + print(preds.tolist(), "this is preds") + return preds + + +def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): + dataloader = create_dataloader(data_dir, batch_size) + print(dataloader, "this is dataloader") + preds = [] + target = [] + sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) + if model.framework == Framework.ONNX: + input_names = [i.name for i in sess.get_inputs()] + output_names = [o.name for o in sess.get_outputs()] + for inputs, labels in dataloader: + if isinstance(inputs, 
dict): + input_dict = {k: inputs[k].tolist() for k in inputs.keys()} + else: + inputs = inputs.tolist() + input_dict = dict(zip(input_names, [inputs])) + res = sess.run(input_feed=input_dict, output_names=None) + if len(output_names) == 1: + result = torch.Tensor(res[0]) + else: + result = torch.Tensor(res) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + elif model.framework == Framework.PYTORCH: + for inputs, labels in dataloader: + print(inputs, "this is inputs", labels, "this is labels") + if isinstance(inputs, dict): + result = sess(**inputs) + else: + result = sess(inputs) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + print(preds, "this is preds") + return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json new file mode 100644 index 0000000000..6ff0d6bfbb --- /dev/null +++ b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json @@ -0,0 +1,105 @@ +{ + "verbose": true, + "input_model": { + "type": "PyTorchModel", + "config": { + "model_loader": "load_model", + "model_script": "./perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py", + "io_config" : { + "input_names": ["input_ids", "attention_mask"], + "input_shapes": [[1, 128], [1, 128]], + "input_types": ["int64", "int64"], + "output_names": ["output"], + "dynamic_axes": { + "input_ids": {"0": "batch_size", "1": "seq_length"}, + "attention_mask": {"0": "batch_size", "1": "seq_length"} + } + } + } + }, + + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "custom", + "sub_types": [ + { + "name": "accuracy_custom", + "priority": 1, + "higher_is_better": true, + "goal": { + "type": "max-degradation", + "value": 0.01 
+ } + } + ], + "is_first_priority": true, + "user_config": { + "evaluate_func": "eval_accuracy", + "user_script": "./perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py", + "batch_size": 1 + } + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + { + "name": "avg", + "priority": 2, + "goal": { + "type": "percent-min-improvement", + "value": 20 + } + } + ], + "user_config": { + "user_script": "./perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion" + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxDynamicQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning", + "config": { + "user_script": "./perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 5, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "models/distilbert-base-uncased-finetuned-sst-2-english_workflow_cpu.json" + } +} diff --git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/main.py b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/main.py new file mode 100644 index 0000000000..ec02da659b --- /dev/null +++ b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/main.py @@ -0,0 +1,7 @@ +from olive.workflows import run as olive_run + +config = "./cpu_config.json" +# config = "./gpu_config.json" +config = "./aml_cpu_config.json" +rls = olive_run(config) +print(rls) diff 
--git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py new file mode 100644 index 0000000000..e151087beb --- /dev/null +++ b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py @@ -0,0 +1,153 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import torch +from datasets import load_dataset +from onnxruntime.quantization import CalibrationDataReader +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate +from transformers import AutoModelForSequenceClassification, AutoTokenizer + +from olive.constants import Framework +from olive.evaluator.accuracy import AccuracyScore +from olive.model import OliveModel + +# https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis +model_name = "distilbert-base-uncased-finetuned-sst-2-english" +dataset_name = "mteb/tweet_sentiment_extraction" +dataset_name = "glue" +subset = "sst2" +split = "validation" + + +class CalibrationDataLoader(CalibrationDataReader): + def __init__(self, dataloader, post_func, num_samplers=100): + self.dataloader = dataloader + self.iter = iter(dataloader) + self.post_func = post_func + self.counter = 0 + self.num_samplers = num_samplers + + def get_next(self): + if self.counter >= self.num_samplers: + return None + self.counter += 1 + if self.iter is None: + self.iter = iter(self.dataloader) + try: + return self.post_func(next(self.iter)) + except StopIteration: + return None + + def rewind(self): + self.iter = None + self.counter = 0 + + +# -------------------- model ------------------- +def load_model(model_path=None): + model = 
AutoModelForSequenceClassification.from_pretrained(model_name) + return model + + +# -------------------- dataset ------------------- +def tokenize_and_align_labels(examples): + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenized_inputs = tokenizer( + examples["sentence"], + truncation=True, + padding=True, + return_tensors="pt", + ) + # pre process + + tokenized_inputs["labels"] = examples["label"] + return tokenized_inputs + + +def create_evaluation_dataset(): + dataset = load_dataset(dataset_name, subset, split=split) + tokenized_datasets = dataset.map( + tokenize_and_align_labels, + batched=True, + remove_columns=dataset.column_names, + ) + tokenized_datasets.set_format("torch", columns=tokenized_datasets.column_names) + + class _Dateset(Dataset): + def __init__(self, dataset): + self.dataset = dataset + + def __getitem__(self, index): + return self.dataset[index], self.dataset[index]["labels"] + + def __len__(self): + return 5 + + # return len(self.dataset) + + return _Dateset(tokenized_datasets) + + +def create_dataloader(data_dir="", batch_size=2): + def _collate_fn(batch): + batch = default_collate(batch) + return batch + + dataset = create_evaluation_dataset() + return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) + + +def create_cali_dataloader(): + def _post_func(sampler): + return sampler + + dataloader = create_dataloader() + cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) + return cali_dataloader + + +def post_process(output): + import torch + import transformers + + if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): + preds = torch.argmax(output.logits, dim=-1) + else: + preds = torch.argmax(output, dim=-1) + return preds + + +def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): + dataloader = create_dataloader(data_dir, batch_size) + preds = [] + target = [] + sess = model.prepare_session(inference_settings=None, 
device=device, execution_providers=execution_providers) + if model.framework == Framework.ONNX: + input_names = [i.name for i in sess.get_inputs()] + output_names = [o.name for o in sess.get_outputs()] + for inputs, labels in dataloader: + if isinstance(inputs, dict): + input_dict = {k: inputs[k].tolist() for k in inputs.keys()} + else: + inputs = inputs.tolist() + input_dict = dict(zip(input_names, [inputs])) + res = sess.run(input_feed=input_dict, output_names=None) + if len(output_names) == 1: + result = torch.Tensor(res[0]) + else: + result = torch.Tensor(res) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + elif model.framework == Framework.PYTORCH: + for inputs, labels in dataloader: + if isinstance(inputs, dict): + result = sess(**inputs) + else: + result = sess(inputs) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json new file mode 100644 index 0000000000..c475f1ecb3 --- /dev/null +++ b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json @@ -0,0 +1,105 @@ +{ + "verbose": true, + "input_model": { + "type": "PyTorchModel", + "config": { + "model_loader": "load_model", + "model_script": "./perf_models/microsoft-deberta-base-mnli/user_script.py", + "io_config" : { + "input_names": ["input_ids", "attention_mask"], + "input_shapes": [[1, 128], [1, 128]], + "input_types": ["int64", "int64"], + "output_names": ["output"], + "dynamic_axes": { + "input_ids": {"0": "batch_size", "1": "seq_length"}, + "attention_mask": {"0": "batch_size", "1": "seq_length"} + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "custom", + "sub_types": [ + { + "name": "accuracy_custom", + "priority": 
1, + "higher_is_better": true, + "goal": { + "type": "max-degradation", + "value": 0.01 + } + } + ], + "is_first_priority": true, + "user_config":{ + "evaluate_func": "eval_accuracy", + "user_script": "./perf_models/microsoft-deberta-base-mnli/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + { + "name": "avg", + "priority": 2, + "goal": { + "type": "percent-min-improvement", + "value": 20 + } + } + ], + "user_config": { + "user_script": "./perf_models/microsoft-deberta-base-mnli/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion" + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxDynamicQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning", + "config": { + "user_script": "./perf_models/microsoft-deberta-base-mnli/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 5, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "models/microsoft-deberta-base-mnli_workflow_cpu.json" + } +} diff --git a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py new file mode 100644 index 0000000000..3bf18b2515 --- /dev/null +++ b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py @@ -0,0 +1,155 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import torch +from datasets import load_dataset +from onnxruntime.quantization import CalibrationDataReader +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate +from transformers import AutoModelForSequenceClassification, AutoTokenizer + +from olive.constants import Framework +from olive.evaluator.accuracy import AccuracyScore +from olive.model import OliveModel + +# https://huggingface.co/microsoft/deberta-base-mnli +model_name = "microsoft/deberta-base-mnli" +dataset_name = "glue" +subset = "mnli_matched" +split = "validation" + + +class CalibrationDataLoader(CalibrationDataReader): + def __init__(self, dataloader, post_func, num_samplers=100): + self.dataloader = dataloader + self.iter = iter(dataloader) + self.post_func = post_func + self.counter = 0 + self.num_samplers = num_samplers + + def get_next(self): + if self.counter >= self.num_samplers: + return None + self.counter += 1 + if self.iter is None: + self.iter = iter(self.dataloader) + try: + return self.post_func(next(self.iter)) + except StopIteration: + return None + + def rewind(self): + self.iter = None + self.counter = 0 + + +# -------------------- model ------------------- +def load_model(model_path=None): + model = AutoModelForSequenceClassification.from_pretrained(model_name) + return model + + +# -------------------- dataset ------------------- +def tokenize_and_align_labels(examples): + if isinstance(examples["label"], list): + label = list(map(lambda x: 2 - x, examples["label"])) + elif isinstance(examples["label"], int): + label = 2 - examples["label"] + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenized_inputs = tokenizer( + examples["premise"], + examples["hypothesis"], + truncation=True, + padding=True, + return_tensors="pt", + ) + # pre process + tokenized_inputs["labels"] = torch.LongTensor(label) + return tokenized_inputs + + +def 
create_evaluation_dataset(): + dataset = load_dataset(dataset_name, subset, split=split) + tokenized_datasets = dataset.map( + tokenize_and_align_labels, + batched=True, + remove_columns=dataset.column_names, + ) + tokenized_datasets.set_format("torch", columns=tokenized_datasets.column_names) + + class _Dateset(Dataset): + def __init__(self, dataset): + self.dataset = dataset + + def __getitem__(self, index): + return self.dataset[index], self.dataset[index]["labels"] + + def __len__(self): + return 5 + # return len(self.dataset) + + return _Dateset(tokenized_datasets) + + +def create_dataloader(data_dir="", batch_size=2): + def _collate_fn(batch): + batch = default_collate(batch) + return batch + + dataset = create_evaluation_dataset() + return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) + + +def create_cali_dataloader(): + def _post_func(sampler): + return sampler + + dataloader = create_dataloader() + cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) + return cali_dataloader + + +def post_process(output): + import torch + import transformers + + if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): + preds = torch.argmax(output.logits, dim=-1) + else: + preds = torch.argmax(output, dim=-1) + return preds + + +def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): + dataloader = create_dataloader(data_dir, batch_size) + preds = [] + target = [] + sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) + if model.framework == Framework.ONNX: + input_names = [i.name for i in sess.get_inputs()] + output_names = [o.name for o in sess.get_outputs()] + for inputs, labels in dataloader: + if isinstance(inputs, dict): + input_dict = {k: inputs[k].tolist() for k in inputs.keys()} + else: + inputs = inputs.tolist() + input_dict = dict(zip(input_names, [inputs])) + res = sess.run(input_feed=input_dict, 
output_names=None) + if len(output_names) == 1: + result = torch.Tensor(res[0]) + else: + result = torch.Tensor(res) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + elif model.framework == Framework.PYTORCH: + for inputs, labels in dataloader: + if isinstance(inputs, dict): + result = sess(**inputs) + else: + result = sess(inputs) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json b/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json new file mode 100644 index 0000000000..eefcf36843 --- /dev/null +++ b/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json @@ -0,0 +1,105 @@ +{ + "verbose": true, + "input_model": { + "type": "PyTorchModel", + "config": { + "model_loader": "load_model", + "model_script": "./perf_models/roberta-large-mnli/user_script.py", + "io_config" : { + "input_names": ["input_ids", "attention_mask"], + "input_shapes": [[1, 128], [1, 128]], + "input_types": ["int64", "int64"], + "output_names": ["output"], + "dynamic_axes": { + "input_ids": {"0": "batch_size", "1": "seq_length"}, + "attention_mask": {"0": "batch_size", "1": "seq_length"} + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "custom", + "sub_types": [ + { + "name": "accuracy_custom", + "priority": 1, + "higher_is_better": true, + "goal": { + "type": "max-degradation", + "value": 0.01 + } + } + ], + "is_first_priority": true, + "user_config":{ + "evaluate_func": "eval_accuracy", + "user_script": "./perf_models/roberta-large-mnli/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + { + "name": "avg", + "priority": 2, + "goal": { + "type": "percent-min-improvement", + "value": 20 + } 
+ } + ], + "user_config": { + "user_script": "./perf_models/roberta-large-mnli/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion" + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxDynamicQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning", + "config": { + "user_script": "./perf_models/roberta-large-mnli/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 5, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "models/roberta-large-mnli_workflow_cpu.json" + } +} diff --git a/perf_monitoring/perf_models/roberta-large-mnli/user_script.py b/perf_monitoring/perf_models/roberta-large-mnli/user_script.py new file mode 100644 index 0000000000..d0edf764ad --- /dev/null +++ b/perf_monitoring/perf_models/roberta-large-mnli/user_script.py @@ -0,0 +1,155 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import torch +from datasets import load_dataset +from onnxruntime.quantization import CalibrationDataReader +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate +from transformers import AutoModelForSequenceClassification, AutoTokenizer + +from olive.constants import Framework +from olive.evaluator.accuracy import AccuracyScore +from olive.model import OliveModel + +# https://huggingface.co/roberta-large-mnli +model_name = "roberta-large-mnli" +dataset_name = "glue" +subset = "mnli_matched" +split = "validation" + + +class CalibrationDataLoader(CalibrationDataReader): + def __init__(self, dataloader, post_func, num_samplers=100): + self.dataloader = dataloader + self.iter = iter(dataloader) + self.post_func = post_func + self.counter = 0 + self.num_samplers = num_samplers + + def get_next(self): + if self.counter >= self.num_samplers: + return None + self.counter += 1 + if self.iter is None: + self.iter = iter(self.dataloader) + try: + return self.post_func(next(self.iter)) + except StopIteration: + return None + + def rewind(self): + self.iter = None + self.counter = 0 + + +# -------------------- model ------------------- +def load_model(model_path=None): + model = AutoModelForSequenceClassification.from_pretrained(model_name) + return model + + +# -------------------- dataset ------------------- +def tokenize_and_align_labels(examples): + if isinstance(examples["label"], list): + label = list(map(lambda x: 2 - x, examples["label"])) + elif isinstance(examples["label"], int): + label = 2 - examples["label"] + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenized_inputs = tokenizer( + examples["premise"], + examples["hypothesis"], + truncation=True, + padding=True, + return_tensors="pt", + ) + # pre process + tokenized_inputs["labels"] = torch.LongTensor(label) + return tokenized_inputs + + +def 
create_evaluation_dataset(): + dataset = load_dataset(dataset_name, subset, split=split) + tokenized_datasets = dataset.map( + tokenize_and_align_labels, + batched=True, + remove_columns=dataset.column_names, + ) + tokenized_datasets.set_format("torch", columns=tokenized_datasets.column_names) + + class _Dateset(Dataset): + def __init__(self, dataset): + self.dataset = dataset + + def __getitem__(self, index): + return self.dataset[index], self.dataset[index]["labels"] + + def __len__(self): + return 5 + # return len(self.dataset) + + return _Dateset(tokenized_datasets) + + +def create_dataloader(data_dir="", batch_size=2): + def _collate_fn(batch): + batch = default_collate(batch) + return batch + + dataset = create_evaluation_dataset() + return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) + + +def create_cali_dataloader(): + def _post_func(sampler): + return sampler + + dataloader = create_dataloader() + cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) + return cali_dataloader + + +def post_process(output): + import torch + import transformers + + if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): + preds = torch.argmax(output.logits, dim=-1) + else: + preds = torch.argmax(output, dim=-1) + return preds + + +def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): + dataloader = create_dataloader(data_dir, batch_size) + preds = [] + target = [] + sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) + if model.framework == Framework.ONNX: + input_names = [i.name for i in sess.get_inputs()] + output_names = [o.name for o in sess.get_outputs()] + for inputs, labels in dataloader: + if isinstance(inputs, dict): + input_dict = {k: inputs[k].tolist() for k in inputs.keys()} + else: + inputs = inputs.tolist() + input_dict = dict(zip(input_names, [inputs])) + res = sess.run(input_feed=input_dict, 
output_names=None) + if len(output_names) == 1: + result = torch.Tensor(res[0]) + else: + result = torch.Tensor(res) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + elif model.framework == Framework.PYTORCH: + for inputs, labels in dataloader: + if isinstance(inputs, dict): + result = sess(**inputs) + else: + result = sess(inputs) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json b/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json new file mode 100644 index 0000000000..c45ff20284 --- /dev/null +++ b/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json @@ -0,0 +1,105 @@ +{ + "verbose": true, + "input_model": { + "type": "PyTorchModel", + "config": { + "model_loader": "load_model", + "model_script": "./perf_models/roberta-large-openai-detector/user_script.py", + "io_config" : { + "input_names": ["input_ids", "attention_mask"], + "input_shapes": [[1, 128], [1, 128]], + "input_types": ["int64", "int64"], + "output_names": ["output"], + "dynamic_axes": { + "input_ids": {"0": "batch_size", "1": "seq_length"}, + "attention_mask": {"0": "batch_size", "1": "seq_length"} + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "custom", + "sub_types": [ + { + "name": "accuracy_custom", + "priority": 1, + "higher_is_better": true, + "goal": { + "type": "max-degradation", + "value": 0.01 + } + } + ], + "is_first_priority": true, + "user_config":{ + "evaluate_func": "eval_accuracy", + "user_script": "./perf_models/roberta-large-openai-detector/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + { + "name": "avg", + "priority": 2, + "goal": { 
+ "type": "percent-min-improvement", + "value": 20 + } + } + ], + "user_config": { + "user_script": "./perf_models/roberta-large-openai-detector/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion" + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxDynamicQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning", + "config": { + "user_script": "./perf_models/roberta-large-openai-detector/user_script.py", + "dataloader_func": "create_dataloader", + "batch_size": 1 + } + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 5, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "models/roberta-large-openai-detector_workflow_cpu.json" + } +} diff --git a/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py b/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py new file mode 100644 index 0000000000..995055e191 --- /dev/null +++ b/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py @@ -0,0 +1,161 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import json +import os + +import torch +from onnxruntime.quantization import CalibrationDataReader +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate +from transformers import AutoModelForSequenceClassification, AutoTokenizer + +from olive.constants import Framework +from olive.evaluator.accuracy import AccuracyScore +from olive.model import OliveModel + +# https://huggingface.co/roberta-large-openai-detector +model_name = "roberta-large-openai-detector" +dataset_name = "glue" +subset = "mnli_matched" +split = "validation" + + +class CalibrationDataLoader(CalibrationDataReader): + def __init__(self, dataloader, post_func, num_samplers=100): + self.dataloader = dataloader + self.iter = iter(dataloader) + self.post_func = post_func + self.counter = 0 + self.num_samplers = num_samplers + + def get_next(self): + if self.counter >= self.num_samplers: + return None + self.counter += 1 + if self.iter is None: + self.iter = iter(self.dataloader) + try: + return self.post_func(next(self.iter)) + except StopIteration: + return None + + def rewind(self): + self.iter = None + self.counter = 0 + + +# -------------------- model ------------------- +def load_model(model_path=None): + model = AutoModelForSequenceClassification.from_pretrained(model_name) + return model + + +# -------------------- dataset ------------------- + + +def create_evaluation_dataset(dataset_dir): + tokenizer = AutoTokenizer.from_pretrained(model_name) + rls_ordered = [] + for item, label in [("small-117M.valid.jsonl", 0), ("webtext.valid.jsonl", 1)]: + valid_file = os.path.join(dataset_dir, item) + with open(valid_file, "r") as f: + for line in f: + line = json.loads(line) + input = tokenizer(line["text"], return_tensors="pt", padding=True, truncation=True) + rls_ordered.append((input, label)) + + rls = [] + for i in range(len(rls_ordered) // 2): + rls.append( + { + 
"input_ids": rls_ordered[i][0].input_ids[0], + "attention_mask": rls_ordered[i][0].attention_mask[0], + "labels": rls_ordered[i][1], + } + ) + next_i = i + len(rls_ordered) // 2 + rls.append( + { + "input_ids": rls_ordered[next_i][0].input_ids[0], + "attention_mask": rls_ordered[next_i][0].attention_mask[0], + "labels": rls_ordered[next_i][1], + } + ) + + class _Dateset(Dataset): + def __init__(self, dataset): + self.dataset = dataset + + def __getitem__(self, index): + return self.dataset[index], self.dataset[index]["labels"] + + def __len__(self): + return 5 + # return len(self.dataset) + + return _Dateset(rls) + + +def create_dataloader(data_dir="data", batch_size=2): + def _collate_fn(batch): + batch = default_collate(batch) + return batch + + dataset = create_evaluation_dataset(data_dir) + return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) + + +def create_cali_dataloader(): + def _post_func(sampler): + return sampler + + dataloader = create_dataloader() + cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) + return cali_dataloader + + +def post_process(output): + import torch + import transformers + + if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): + pre = torch.argmax(output.logits, dim=-1) + else: + pre = torch.argmax(output, dim=-1) + return pre + + +def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): + dataloader = create_dataloader(data_dir, batch_size) + preds = [] + target = [] + sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) + if model.framework == Framework.ONNX: + input_names = [i.name for i in sess.get_inputs()] + output_names = [o.name for o in sess.get_outputs()] + for inputs, labels in dataloader: + if isinstance(inputs, dict): + input_dict = {k: inputs[k].tolist() for k in inputs.keys()} + else: + inputs = inputs.tolist() + input_dict = dict(zip(input_names, 
[inputs])) + res = sess.run(input_feed=input_dict, output_names=None) + if len(output_names) == 1: + result = torch.Tensor(res[0]) + else: + result = torch.Tensor(res) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + elif model.framework == Framework.PYTORCH: + for inputs, labels in dataloader: + if isinstance(inputs, dict): + result = sess(**inputs) + else: + result = sess(inputs) + outputs = post_process(result) + preds.extend(outputs.tolist()) + target.extend(labels.data.tolist()) + return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/requirements.txt b/perf_monitoring/requirements.txt index dc90059d04..0aa2a6f1d3 100644 --- a/perf_monitoring/requirements.txt +++ b/perf_monitoring/requirements.txt @@ -11,3 +11,4 @@ transformers sentencepiece evaluate seqeval +emoji==0.6.0 diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index e6742f4465..31899962a6 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -28,9 +28,34 @@ def setup(): # extract_best_models(footprint, "bert") +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], +# ) +# def test_bert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run + +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") + +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/CamemBERT/cpu_config.json"], +# ) +# def test_bert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run + +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "CamemBERT") + + @pytest.mark.parametrize( "olive_json", - 
["perf_models/CamemBERT/cpu_config.json"], + ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], ) def test_bert(olive_json): print(olive_json) @@ -38,4 +63,41 @@ def test_bert(olive_json): olive_config = patch_config(olive_json) footprint = olive_run(olive_config) - extract_best_models(footprint, "CamemBERT") + extract_best_models(footprint, "bertweet-base-sentiment-analysis") + + +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], +# ) +# def test_bert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run + +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "microsoft-deberta-base-mnli") + +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/roberta-large-mnli/cpu_config.json"], +# ) +# def test_bert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run + +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "roberta-large-mnli") + +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/roberta-large-openai-detector/cpu_config.json"], +# ) +# def test_bert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run + +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "roberta-large-openai-detector") diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index ce716570cb..39061a71d4 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -30,8 +30,9 @@ def patch_config(config_json_path: str): def extract_best_models(footprint, model_name): - footprint = list(footprint.values())[0] - metrics_of_interest = ["accuracy-accuracy", "latency-avg"] + print("Footprint: ", footprint) + footprint = list(footprint.values())[0] # + metrics_of_interest = ["accuracy-accuracy_custom", "latency-avg"] # 
gather the metrics from all pareto frontier nodes all_metrics = [] # we iterate over the nodes in the pareto frontier @@ -54,7 +55,7 @@ def extract_best_models(footprint, model_name): print("Compared metrics: ", compared_metric) -def no_regression(actual, expected, rel_tol): +def no_regression(actual, expected, rel_tol): # check for tolerance if actual > expected: return True return abs(actual - expected) <= rel_tol * abs(expected) From bc607532020b6f7b99fdba673b2cec7590165dfc Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 08:56:34 +0000 Subject: [PATCH 16/67] pushing to test on pipeline --- perf_monitoring/best_metrics.json | 11 ++++++----- .../user_script.py | 7 +++++-- .../user_script.py | 10 ++++++---- .../microsoft-deberta-base-mnli/user_script.py | 7 +++++-- .../perf_models/roberta-large-mnli/user_script.py | 7 +++++-- .../roberta-large-openai-detector/cpu_config.json | 3 ++- .../roberta-large-openai-detector/user_script.py | 5 +++++ perf_monitoring/test_perf_monitoring_models_cpu.py | 3 +++ perf_monitoring/utils.py | 14 +++++++++++++- 9 files changed, 50 insertions(+), 17 deletions(-) diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index b35cf90eb6..5596b4bd70 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1,5 +1,6 @@ -{ - "bert": [], - "CamemBERT": [] - -} +{"bert": [], +"CamemBERT": [], +"bertweet-base-sentiment-analysis": [1.0, -16.03689], + "distilbert-base-uncased-finetuned-sst-2-english": [1.0, -8.61203], + "microsoft-deberta-base-mnli": [1.0, -111.79317], + "roberta-large-mnli": [1.0, -146.89287]} diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py index 577da6a156..dd3d4b8d54 100644 --- a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py +++ 
b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py @@ -72,14 +72,17 @@ def create_evaluation_dataset(): batched=True, remove_columns=dataset.column_names, ) - tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"]) + tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) class _Dateset(Dataset): def __init__(self, dataset): self.dataset = dataset def __getitem__(self, index): - return self.dataset[index], self.dataset[index]["labels"] + labels = self.dataset[index]["labels"] + inputs = {k: self.dataset[index][k] for k in self.dataset[index].keys() if k != "labels"} + return inputs, labels + # return self.dataset[index], self.dataset[index]["labels"] def __len__(self): return 5 diff --git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py index e151087beb..e2f05308e6 100644 --- a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py +++ b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py @@ -73,19 +73,21 @@ def create_evaluation_dataset(): batched=True, remove_columns=dataset.column_names, ) - tokenized_datasets.set_format("torch", columns=tokenized_datasets.column_names) + tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) class _Dateset(Dataset): def __init__(self, dataset): self.dataset = dataset def __getitem__(self, index): - return self.dataset[index], self.dataset[index]["labels"] + labels = self.dataset[index]["labels"] + inputs = {k: self.dataset[index][k] for k in self.dataset[index].keys() if k != "labels"} + return inputs, labels + # return self.dataset[index], self.dataset[index]["labels"] def __len__(self): return 5 - - # return len(self.dataset) + # return len(self.dataset) return 
_Dateset(tokenized_datasets) diff --git a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py index 3bf18b2515..2d72fd7ea7 100644 --- a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py +++ b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py @@ -76,14 +76,17 @@ def create_evaluation_dataset(): batched=True, remove_columns=dataset.column_names, ) - tokenized_datasets.set_format("torch", columns=tokenized_datasets.column_names) + tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) class _Dateset(Dataset): def __init__(self, dataset): self.dataset = dataset def __getitem__(self, index): - return self.dataset[index], self.dataset[index]["labels"] + labels = self.dataset[index]["labels"] + inputs = {k: self.dataset[index][k] for k in self.dataset[index].keys() if k != "labels"} + return inputs, labels + # return self.dataset[index], self.dataset[index]["labels"] def __len__(self): return 5 diff --git a/perf_monitoring/perf_models/roberta-large-mnli/user_script.py b/perf_monitoring/perf_models/roberta-large-mnli/user_script.py index d0edf764ad..366cce76c2 100644 --- a/perf_monitoring/perf_models/roberta-large-mnli/user_script.py +++ b/perf_monitoring/perf_models/roberta-large-mnli/user_script.py @@ -76,14 +76,17 @@ def create_evaluation_dataset(): batched=True, remove_columns=dataset.column_names, ) - tokenized_datasets.set_format("torch", columns=tokenized_datasets.column_names) + tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) class _Dateset(Dataset): def __init__(self, dataset): self.dataset = dataset def __getitem__(self, index): - return self.dataset[index], self.dataset[index]["labels"] + labels = self.dataset[index]["labels"] + inputs = {k: self.dataset[index][k] for k in self.dataset[index].keys() if k != "labels"} + return inputs, labels + # return 
self.dataset[index], self.dataset[index]["labels"] def __len__(self): return 5 diff --git a/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json b/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json index c45ff20284..ba9ffe39fb 100644 --- a/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json +++ b/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json @@ -39,7 +39,8 @@ "evaluate_func": "eval_accuracy", "user_script": "./perf_models/roberta-large-openai-detector/user_script.py", "dataloader_func": "create_dataloader", - "batch_size": 1 + "batch_size": 1, + "data_dir": "data" } }, { diff --git a/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py b/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py index 995055e191..6e21bd484c 100644 --- a/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py +++ b/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py @@ -56,6 +56,7 @@ def load_model(model_path=None): def create_evaluation_dataset(dataset_dir): + print(f"Dataset directory: {dataset_dir}") tokenizer = AutoTokenizer.from_pretrained(model_name) rls_ordered = [] for item, label in [("small-117M.valid.jsonl", 0), ("webtext.valid.jsonl", 1)]: @@ -104,6 +105,7 @@ def _collate_fn(batch): return batch dataset = create_evaluation_dataset(data_dir) + print(f"Data directory: {data_dir}") return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) @@ -129,6 +131,7 @@ def post_process(output): def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): dataloader = create_dataloader(data_dir, batch_size) + print(dataloader, "this is dataloader") preds = [] target = [] sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) @@ -151,6 +154,7 @@ def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_pro 
target.extend(labels.data.tolist()) elif model.framework == Framework.PYTORCH: for inputs, labels in dataloader: + print(inputs, "this is inputs", labels, "this is labels") if isinstance(inputs, dict): result = sess(**inputs) else: @@ -158,4 +162,5 @@ def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_pro outputs = post_process(result) preds.extend(outputs.tolist()) target.extend(labels.data.tolist()) + print(preds, "this is preds") return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 31899962a6..2d8677ede3 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -40,6 +40,7 @@ def setup(): # footprint = olive_run(olive_config) # extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") + # @pytest.mark.parametrize( # "olive_json", # ["perf_models/CamemBERT/cpu_config.json"], @@ -78,6 +79,7 @@ def test_bert(olive_json): # footprint = olive_run(olive_config) # extract_best_models(footprint, "microsoft-deberta-base-mnli") + # @pytest.mark.parametrize( # "olive_json", # ["perf_models/roberta-large-mnli/cpu_config.json"], @@ -90,6 +92,7 @@ def test_bert(olive_json): # footprint = olive_run(olive_config) # extract_best_models(footprint, "roberta-large-mnli") + # @pytest.mark.parametrize( # "olive_json", # ["perf_models/roberta-large-openai-detector/cpu_config.json"], diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 39061a71d4..ac36f8800d 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -65,13 +65,25 @@ def compare_metrics(best_metrics, model_name): # open best metrics json with open("best_metrics.json") as f: data = json.load(f) + + if model_name in data: model_data = data[model_name] if len(model_data) == 0: print("No data in best_metrics.json") return {"accuracy": True, "latency": True} 
print(model_data[0], model_data[1]) print(best_metrics[0], best_metrics[1]) - return { + comparison_result = { "accuracy": no_regression(best_metrics[0], model_data[0], 0.05), "latency": no_regression(best_metrics[1], model_data[1], 0.05), } + else: + print(f"{model_name} not found in best_metrics.json, creating new entry...") + data[model_name] = best_metrics + comparison_result = {"accuracy": True, "latency": True} + + # Save the updated data back to the file + with open("best_metrics.json", "w") as f: + json.dump(data, f) + + return comparison_result From 182862d3b64b0e97949f323299db0a9233f8f002 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 09:13:37 +0000 Subject: [PATCH 17/67] fixed script name error in makefile --- .azure_pipelines/perfmonitoring-ci .yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index 0df719f0b7..de771e42f8 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -34,7 +34,7 @@ jobs: windows: True examples: bert: - perfMonitoringScriptName: perf_monitoring_bert_cpu + perfMonitoringScriptName: test_perf_monitoring_models_cpu - template: job_templates/olive-perf-monitoring-template.yaml parameters: @@ -43,4 +43,4 @@ jobs: windows: False examples: bert: - perfMonitoringScriptName: perf_monitoring_bert_cpu + perfMonitoringScriptName: test_perf_monitoring_models_cpu From ea7adb31c79494c0e6a38323a9ff815af68e0414 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 09:18:18 +0000 Subject: [PATCH 18/67] fixed script name error in makefile --- .azure_pipelines/perfmonitoring-ci .yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index de771e42f8..f43fd48f1a 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ 
b/.azure_pipelines/perfmonitoring-ci .yaml @@ -34,7 +34,7 @@ jobs: windows: True examples: bert: - perfMonitoringScriptName: test_perf_monitoring_models_cpu + perfMonitoringScriptName: perf_monitoring_models_cpu - template: job_templates/olive-perf-monitoring-template.yaml parameters: @@ -43,4 +43,4 @@ jobs: windows: False examples: bert: - perfMonitoringScriptName: test_perf_monitoring_models_cpu + perfMonitoringScriptName: perf_monitoring_models_cpu From 5ad2066c55e8e25bb0e61b53f0bc93d2cc218e3e Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 09:44:45 +0000 Subject: [PATCH 19/67] fixed olive run error --- perf_monitoring/best_metrics.json | 7 +------ .../bertweet-base-sentiment-analysis/user_script.py | 4 ---- perf_monitoring/test_perf_monitoring_models_cpu.py | 5 ++--- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index 5596b4bd70..ada84d68be 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1,6 +1 @@ -{"bert": [], -"CamemBERT": [], -"bertweet-base-sentiment-analysis": [1.0, -16.03689], - "distilbert-base-uncased-finetuned-sst-2-english": [1.0, -8.61203], - "microsoft-deberta-base-mnli": [1.0, -111.79317], - "roberta-large-mnli": [1.0, -146.89287]} +{"bert": [], "CamemBERT": [], "bertweet-base-sentiment-analysis": [1.0, -16.03689], "distilbert-base-uncased-finetuned-sst-2-english": [1.0, -8.61203], "microsoft-deberta-base-mnli": [1.0, -111.79317], "roberta-large-mnli": [1.0, -146.89287]} diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py index dd3d4b8d54..bf7841bd41 100644 --- a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py +++ b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py @@ -117,13 +117,11 @@ def post_process(output): preds = 
torch.argmax(output.logits, dim=-1) else: preds = torch.argmax(output, dim=-1) - print(preds.tolist(), "this is preds") return preds def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): dataloader = create_dataloader(data_dir, batch_size) - print(dataloader, "this is dataloader") preds = [] target = [] sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) @@ -146,7 +144,6 @@ def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_pro target.extend(labels.data.tolist()) elif model.framework == Framework.PYTORCH: for inputs, labels in dataloader: - print(inputs, "this is inputs", labels, "this is labels") if isinstance(inputs, dict): result = sess(**inputs) else: @@ -154,5 +151,4 @@ def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_pro outputs = post_process(result) preds.extend(outputs.tolist()) target.extend(labels.data.tolist()) - print(preds, "this is preds") return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 2d8677ede3..360f9dd0dd 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -4,6 +4,8 @@ import pytest from utils import extract_best_models, patch_config +from olive.workflows import run as olive_run + @pytest.fixture(scope="module", autouse=True) def setup(): @@ -59,9 +61,6 @@ def setup(): ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], ) def test_bert(olive_json): - print(olive_json) - from olive.workflows import run as olive_run - olive_config = patch_config(olive_json) footprint = olive_run(olive_config) extract_best_models(footprint, "bertweet-base-sentiment-analysis") From e0f096d9e1d22369ca8b4a5830e8b1c97002ee95 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 09:55:36 +0000 Subject: 
[PATCH 20/67] testing 2 models --- .../test_perf_monitoring_models_cpu.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 360f9dd0dd..a98a84ca5f 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -4,8 +4,6 @@ import pytest from utils import extract_best_models, patch_config -from olive.workflows import run as olive_run - @pytest.fixture(scope="module", autouse=True) def setup(): @@ -17,22 +15,35 @@ def setup(): os.chdir(cur_dir) -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/bert/bert_workflow_cpu.json"], -# ) -# def test_bert(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run +@pytest.mark.parametrize( + "olive_json", + ["perf_models/bert/bert_workflow_cpu.json"], +) +def test_bert(olive_json): + print(olive_json) + from olive.workflows import run as olive_run -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "bert") + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "bert") + + +@pytest.mark.parametrize( + "olive_json", + ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], +) +def test_distilbert(olive_json): + print(olive_json) + from olive.workflows import run as olive_run + + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") # @pytest.mark.parametrize( # "olive_json", -# ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], +# ["perf_models/CamemBERT/cpu_config.json"], # ) # def test_bert(olive_json): # print(olive_json) @@ -40,30 +51,19 @@ def setup(): # olive_config = patch_config(olive_json) # 
footprint = olive_run(olive_config) -# extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") +# extract_best_models(footprint, "CamemBERT") # @pytest.mark.parametrize( # "olive_json", -# ["perf_models/CamemBERT/cpu_config.json"], +# ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], # ) # def test_bert(olive_json): -# print(olive_json) # from olive.workflows import run as olive_run # olive_config = patch_config(olive_json) # footprint = olive_run(olive_config) -# extract_best_models(footprint, "CamemBERT") - - -@pytest.mark.parametrize( - "olive_json", - ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], -) -def test_bert(olive_json): - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "bertweet-base-sentiment-analysis") +# extract_best_models(footprint, "bertweet-base-sentiment-analysis") # @pytest.mark.parametrize( From 0ab5f57e29462c9ab5053a69e6d3c78035e9da66 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 10:02:13 +0000 Subject: [PATCH 21/67] testing bert model --- .../test_perf_monitoring_models_cpu.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index a98a84ca5f..3b7b3f47a1 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -28,17 +28,17 @@ def test_bert(olive_json): extract_best_models(footprint, "bert") -@pytest.mark.parametrize( - "olive_json", - ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], -) -def test_distilbert(olive_json): - print(olive_json) - from olive.workflows import run as olive_run +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], +# ) +# def test_distilbert(olive_json): +# 
print(olive_json) +# from olive.workflows import run as olive_run - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") # @pytest.mark.parametrize( From bf366cf33452d013da461e3240f8e74195bcb4e8 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 17:10:54 +0000 Subject: [PATCH 22/67] fixing error on bert model --- perf_monitoring/testtt.ipynb | 288 ----------------------------------- perf_monitoring/utils.py | 2 +- 2 files changed, 1 insertion(+), 289 deletions(-) delete mode 100644 perf_monitoring/testtt.ipynb diff --git a/perf_monitoring/testtt.ipynb b/perf_monitoring/testtt.ipynb deleted file mode 100644 index e17b1aa1fc..0000000000 --- a/perf_monitoring/testtt.ipynb +++ /dev/null @@ -1,288 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/emmanuel/.conda/envs/emmanuel-onnx/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-06-29 00:10:57,266] [DEBUG] [engine.py:577:resolve_goals] Resolving goals: {'accuracy': {'accuracy': None}, 'latency': {'avg': None}}\n", - "[2023-06-29 00:10:57,267] [DEBUG] [engine.py:596:resolve_goals] No baseline got as no goal is provided the the goal is threshold\n", - "[2023-06-29 00:10:57,277] [DEBUG] [engine.py:498:run_search] Step 1 with search point {'OnnxConversion': {}, 'OrtTransformersOptimization': {}, 'OnnxQuantization': {'quant_mode': 'static', 'calibrate_method': 'MinMax', 'quant_format': 'QOperator', 'MatMulConstBOnly': False, 'weight_type': 'QUInt8', 'activation_type': 'QUInt8', 'per_channel': False, 'reduce_range': False, 'optimize_model': True, 'quant_preprocess': True}, 'OrtPerfTuning': {}} ...\n", - "[2023-06-29 00:10:57,277] [INFO] [engine.py:837:_run_pass] Running pass OnnxConversion\n", - "[2023-06-29 00:10:57,278] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", - "[2023-06-29 00:10:57,279] [DEBUG] [__init__.py:582:get_dummy_inputs] Using hf_config.dataset to get dummy inputs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/emmanuel/.conda/envs/emmanuel-onnx/lib/python3.8/site-packages/optuna/samplers/_tpe/sampler.py:278: ExperimentalWarning: ``multivariate`` option is an experimental feature. The interface can change in the future.\n", - " warnings.warn(\n", - "/home/emmanuel/.conda/envs/emmanuel-onnx/lib/python3.8/site-packages/optuna/samplers/_tpe/sampler.py:289: ExperimentalWarning: ``group`` option is an experimental feature. 
The interface can change in the future.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-06-29 00:10:57,717] [DEBUG] [conversion.py:73:_run_for_config] Using hf config to get io_config for the model.\n", - "============= Diagnostic Run torch.onnx.export version 2.0.1+cu117 =============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n", - "[2023-06-29 00:11:06,068] [INFO] [engine.py:837:_run_pass] Running pass OrtTransformersOptimization\n", - "[2023-06-29 00:11:06,070] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", - "[2023-06-29 00:11:24,090] [INFO] [engine.py:837:_run_pass] Running pass OnnxQuantization\n", - "[2023-06-29 00:11:24,094] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", - "[2023-06-29 00:11:24,095] [INFO] [quantization.py:333:_run_for_config] Preprocessing model for quantization\n", - "[2023-06-29 00:12:08,588] [INFO] [engine.py:837:_run_pass] Running pass OrtPerfTuning\n", - "[2023-06-29 00:12:08,592] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. 
No need to prepare\n", - "[2023-06-29 00:12:10,386] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 0), ('ort_opt_level', 99), ('io_bind', False)]\n", - "[2023-06-29 00:12:22,750] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 1), ('ort_opt_level', 99), ('io_bind', False)]\n", - "[2023-06-29 00:12:44,367] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 208.2179\n", - "[2023-06-29 00:12:44,369] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 34.99482\n", - "[2023-06-29 00:12:44,369] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 39.99097\n", - "[2023-06-29 00:12:44,370] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 35.78461\n", - "[2023-06-29 00:12:44,370] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 27.77902\n", - "[2023-06-29 00:12:44,370] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.60358\n", - "[2023-06-29 00:12:44,371] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.29312\n", - "[2023-06-29 00:12:44,371] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.40254\n", - "[2023-06-29 00:12:44,372] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 31.82336\n", - "[2023-06-29 00:12:44,372] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 205.92308\n", - "[2023-06-29 00:12:44,372] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 30.58837\n", - "[2023-06-29 00:12:44,373] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 39.44375\n", - "[2023-06-29 00:12:44,373] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 31.32929\n", - "[2023-06-29 00:12:44,373] [INFO] [perf_tuning.py:115:tune_onnx_model] Best result: {'test_name': \"execution_provider_[('CPUExecutionProvider', {})]_session_options_{'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 
'inter_op_num_threads': 1, 'intra_op_num_threads': None}__io_bind_False\", 'execution_provider': [('CPUExecutionProvider', {})], 'session_options': {'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': None}, 'io_bind': False, 'latency_ms': 27.77902}\n", - "[2023-06-29 00:12:44,376] [DEBUG] [engine.py:964:_evaluate_model] Evaluating model ...\n", - "[2023-06-29 00:12:44,377] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", - "[2023-06-29 00:12:58,223] [DEBUG] [engine.py:819:_run_passes] Signal: {'accuracy-accuracy': 0.8357843137254902, 'latency-avg': 28.00311}\n", - "[2023-06-29 00:12:58,228] [DEBUG] [engine.py:498:run_search] Step 2 with search point {'OnnxConversion': {}, 'OrtTransformersOptimization': {}, 'OnnxQuantization': {'quant_mode': 'dynamic', 'calibrate_method': , 'quant_format': , 'MatMulConstBOnly': True, 'weight_type': 'QUInt8', 'activation_type': , 'per_channel': True, 'reduce_range': True, 'optimize_model': True, 'quant_preprocess': True}, 'OrtPerfTuning': {}} ...\n", - "[2023-06-29 00:12:58,228] [INFO] [engine.py:837:_run_pass] Running pass OnnxConversion\n", - "[2023-06-29 00:12:58,230] [DEBUG] [engine.py:845:_run_pass] Loading model from cache ...\n", - "[2023-06-29 00:12:58,233] [INFO] [engine.py:837:_run_pass] Running pass OrtTransformersOptimization\n", - "[2023-06-29 00:12:58,235] [DEBUG] [engine.py:845:_run_pass] Loading model from cache ...\n", - "[2023-06-29 00:12:58,237] [INFO] [engine.py:837:_run_pass] Running pass OnnxQuantization\n", - "[2023-06-29 00:12:58,241] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. 
No need to prepare\n", - "[2023-06-29 00:12:58,244] [INFO] [quantization.py:336:_run_for_config] Already processed model for quantization, skipping preprocessing\n", - "[2023-06-29 00:13:29,778] [INFO] [engine.py:837:_run_pass] Running pass OrtPerfTuning\n", - "[2023-06-29 00:13:29,782] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", - "[2023-06-29 00:13:31,522] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 0), ('ort_opt_level', 99), ('io_bind', False)]\n", - "[2023-06-29 00:13:42,631] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 1), ('ort_opt_level', 99), ('io_bind', False)]\n", - "[2023-06-29 00:14:01,500] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 205.27577\n", - "[2023-06-29 00:14:01,501] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 29.17117\n", - "[2023-06-29 00:14:01,502] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.52153\n", - "[2023-06-29 00:14:01,502] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 28.28242\n", - "[2023-06-29 00:14:01,503] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 25.07949\n", - "[2023-06-29 00:14:01,503] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 38.26937\n", - "[2023-06-29 00:14:01,504] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 37.00565\n", - "[2023-06-29 00:14:01,504] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 35.17373\n", - "[2023-06-29 00:14:01,504] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 30.54845\n", - "[2023-06-29 00:14:01,505] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 202.92866\n", - "[2023-06-29 00:14:01,505] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 26.6119\n", - "[2023-06-29 00:14:01,506] [DEBUG] [perf_tuning.py:112:tune_onnx_model] 
Tuning result: 38.80204\n", - "[2023-06-29 00:14:01,506] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 29.87976\n", - "[2023-06-29 00:14:01,507] [INFO] [perf_tuning.py:115:tune_onnx_model] Best result: {'test_name': \"execution_provider_[('CPUExecutionProvider', {})]_session_options_{'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': None}__io_bind_False\", 'execution_provider': [('CPUExecutionProvider', {})], 'session_options': {'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': None}, 'io_bind': False, 'latency_ms': 25.07949}\n", - "[2023-06-29 00:14:01,509] [DEBUG] [engine.py:964:_evaluate_model] Evaluating model ...\n", - "[2023-06-29 00:14:01,510] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", - "[2023-06-29 00:14:15,862] [DEBUG] [engine.py:819:_run_passes] Signal: {'accuracy-accuracy': 0.8455882352941176, 'latency-avg': 24.17589}\n", - "[2023-06-29 00:14:15,867] [DEBUG] [engine.py:498:run_search] Step 3 with search point {'OnnxConversion': {}, 'OrtTransformersOptimization': {}, 'OnnxQuantization': {'quant_mode': 'dynamic', 'calibrate_method': , 'quant_format': , 'MatMulConstBOnly': True, 'weight_type': 'QInt8', 'activation_type': , 'per_channel': True, 'reduce_range': True, 'optimize_model': False, 'quant_preprocess': True}, 'OrtPerfTuning': {}} ...\n", - "[2023-06-29 00:14:15,867] [INFO] [engine.py:837:_run_pass] Running pass OnnxConversion\n", - "[2023-06-29 00:14:15,868] [DEBUG] [engine.py:845:_run_pass] Loading model from cache ...\n", - "[2023-06-29 00:14:15,871] [INFO] [engine.py:837:_run_pass] Running pass OrtTransformersOptimization\n", - "[2023-06-29 00:14:15,872] [DEBUG] [engine.py:845:_run_pass] Loading model from cache ...\n", - "[2023-06-29 00:14:15,875] [INFO] [engine.py:837:_run_pass] Running pass 
OnnxQuantization\n", - "[2023-06-29 00:14:15,878] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", - "[2023-06-29 00:14:15,880] [INFO] [quantization.py:336:_run_for_config] Already processed model for quantization, skipping preprocessing\n", - "[2023-06-29 00:14:45,593] [INFO] [engine.py:837:_run_pass] Running pass OrtPerfTuning\n", - "[2023-06-29 00:14:45,596] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. No need to prepare\n", - "[2023-06-29 00:14:47,326] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 0), ('ort_opt_level', 99), ('io_bind', False)]\n", - "[2023-06-29 00:14:58,233] [INFO] [perf_tuning.py:106:tune_onnx_model] Run tuning for: [('provider', 'CPUExecutionProvider'), ('execution_mode', 1), ('ort_opt_level', 99), ('io_bind', False)]\n", - "[2023-06-29 00:15:17,108] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 196.05226\n", - "[2023-06-29 00:15:17,109] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 26.61709\n", - "[2023-06-29 00:15:17,110] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 36.43117\n", - "[2023-06-29 00:15:17,110] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 28.41241\n", - "[2023-06-29 00:15:17,110] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 29.891\n", - "[2023-06-29 00:15:17,111] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 33.73918\n", - "[2023-06-29 00:15:17,111] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 33.83653\n", - "[2023-06-29 00:15:17,111] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 34.48713\n", - "[2023-06-29 00:15:17,112] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 30.04336\n", - "[2023-06-29 00:15:17,112] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 197.81958\n", - "[2023-06-29 00:15:17,113] [DEBUG] 
[perf_tuning.py:112:tune_onnx_model] Tuning result: 25.34003\n", - "[2023-06-29 00:15:17,113] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 38.36773\n", - "[2023-06-29 00:15:17,113] [DEBUG] [perf_tuning.py:112:tune_onnx_model] Tuning result: 27.06558\n", - "[2023-06-29 00:15:17,114] [INFO] [perf_tuning.py:115:tune_onnx_model] Best result: {'test_name': \"execution_provider_[('CPUExecutionProvider', {})]_session_options_{'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': 12}__io_bind_False\", 'execution_provider': [('CPUExecutionProvider', {})], 'session_options': {'execution_mode': 1, 'graph_optimization_level': 99, 'extra_session_config': None, 'inter_op_num_threads': 1, 'intra_op_num_threads': 12}, 'io_bind': False, 'latency_ms': 25.34003}\n", - "[2023-06-29 00:15:17,117] [DEBUG] [engine.py:964:_evaluate_model] Evaluating model ...\n", - "[2023-06-29 00:15:17,117] [DEBUG] [engine.py:703:_prepare_non_local_model] Model path is None, local or string name. 
No need to prepare\n", - "[2023-06-29 00:15:30,319] [DEBUG] [engine.py:819:_run_passes] Signal: {'accuracy-accuracy': 0.8455882352941176, 'latency-avg': 24.7194}\n", - "[2023-06-29 00:15:30,324] [INFO] [footprint.py:167:get_pareto_frontier] pareto frontier points: 5_OrtPerfTuning-4-600614b69719e936ca21efbf07971aec {'accuracy-accuracy': 0.8455882352941176, 'latency-avg': 24.17589}\n", - "[2023-06-29 00:15:30,324] [INFO] [engine.py:513:run_search] Output all 1 models\n", - "[2023-06-29 00:15:30,325] [INFO] [engine.py:337:run] No packaging config provided, skip packaging artifacts\n" - ] - } - ], - "source": [ - "import os\n", - "from pathlib import Path\n", - "\n", - "from utils import check_search_output, patch_config\n", - "\n", - "\n", - "from olive.workflows import run as olive_run\n", - "\n", - "olive_config = patch_config(\"bert_workflow_cpu.json\")\n", - "footprint = olive_run(olive_config)\n", - "check_search_output(footprint)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "value=MetricResult(__root__={'accuracy-accuracy': SubMetricResult(value=0.8455882352941176, priority=1, higher_is_better=True), 'latency-avg': SubMetricResult(value=24.17589, priority=2, higher_is_better=False)}) cmp_direction={'accuracy-accuracy': 1, 'latency-avg': -1} is_goals_met=True\n", - "('value', MetricResult(__root__={'accuracy-accuracy': SubMetricResult(value=0.8455882352941176, priority=1, higher_is_better=True), 'latency-avg': SubMetricResult(value=24.17589, priority=2, higher_is_better=False)}))\n", - "('cmp_direction', {'accuracy-accuracy': 1, 'latency-avg': -1})\n", - "('is_goals_met', True)\n" - ] - } - ], - "source": [ - "footprint\n", - "#get first item from dict\n", - "fooott = list(footprint.values())[0] \n", - "\n", - "for node in fooott.nodes.values():\n", - " print(node.metrics)\n", - " for node in node.metrics:\n", - " print(node)\n", - " # 
print(type(node.metrics.value))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "from olive.engine.footprint import Footprint" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "pf = Footprint.from_file(\"models/bert_workflow_cpu/cpu-cpu_pareto_frontier_footprints.json\")" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0.8455882352941176, -24.17589]" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "metrics_of_interest = ['accuracy-accuracy', 'latency-avg']\n", - "# gather the metrics from all pareto frontier nodes\n", - "all_metrics = []\n", - "# we iterate over the nodes in the pareto frontier\n", - "for node in pf.nodes.values():\n", - " metrics = []\n", - " # collecting the metrics of interest\n", - " for name in metrics_of_interest:\n", - " # (value of metric * direction of comparison)\n", - " # now higher is better for all metrics\n", - " metrics.append(node.metrics.value[name].value * node.metrics.cmp_direction[name])\n", - " all_metrics.append(metrics)\n", - "# sort the metrics\n", - "# this sorts it\n", - "sorted_metrics = sorted(all_metrics, reverse=True)\n", - "# get best metrics\n", - "# last one is the best\n", - "best_metrics = sorted_metrics[0]\n", - "best_metrics\n" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1 -20\n", - "0.8455882352941176 -24.17589\n" - ] - } - ], - "source": [ - "import json\n", - "\n", - "#open best metrics json\n", - "with open('best_metrics.json') as f:\n", - " data = json.load(f)\n", - " print(data[0], data[1])\n", - " print(best_metrics[0], best_metrics[1])\n", - " if best_metrics[0] > data[0] and best_metrics[1] < data[1]:\n", - " 
best_metrics = data\n", - "#save best metrics to json\n", - "with open('best_metrics.json', 'w') as f:\n", - " json.dump(best_metrics, f)\n", - " " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "emmanuel-onnx", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.16" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index ac36f8800d..0b9d92f231 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -31,7 +31,7 @@ def patch_config(config_json_path: str): def extract_best_models(footprint, model_name): print("Footprint: ", footprint) - footprint = list(footprint.values())[0] # + footprint = list(footprint.values())[0] metrics_of_interest = ["accuracy-accuracy_custom", "latency-avg"] # gather the metrics from all pareto frontier nodes all_metrics = [] From 3baf6998f196cd3c9c68a223d616433bc61e57d2 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 17:54:03 +0000 Subject: [PATCH 23/67] making changes --- .../perf_models/CamemBERT/user_script.py | 2 +- .../user_script.py | 2 +- .../test_perf_monitoring_models_cpu.py | 52 +++++++++---------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/perf_monitoring/perf_models/CamemBERT/user_script.py b/perf_monitoring/perf_models/CamemBERT/user_script.py index 7e20074572..b0a8eb83d2 100644 --- a/perf_monitoring/perf_models/CamemBERT/user_script.py +++ b/perf_monitoring/perf_models/CamemBERT/user_script.py @@ -118,7 +118,7 @@ def __len__(self): return _Dateset(tokenized_datasets) -def create_dataloader(data_dir="", batch_size=2): +def create_dataloader(data_dir="", batch_size=2, model_framework=None): def _collate_fn(batch): batch = 
default_collate(batch) return batch diff --git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py index e2f05308e6..19a4efcbb6 100644 --- a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py +++ b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py @@ -92,7 +92,7 @@ def __len__(self): return _Dateset(tokenized_datasets) -def create_dataloader(data_dir="", batch_size=2): +def create_dataloader(data_dir="", batch_size=2, model_framework=None): def _collate_fn(batch): batch = default_collate(batch) return batch diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 3b7b3f47a1..9e45929618 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -15,43 +15,43 @@ def setup(): os.chdir(cur_dir) -@pytest.mark.parametrize( - "olive_json", - ["perf_models/bert/bert_workflow_cpu.json"], -) -def test_bert(olive_json): - print(olive_json) - from olive.workflows import run as olive_run - - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "bert") - - # @pytest.mark.parametrize( # "olive_json", -# ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], +# ["perf_models/bert/bert_workflow_cpu.json"], # ) -# def test_distilbert(olive_json): +# def test_bert(olive_json): # print(olive_json) # from olive.workflows import run as olive_run # olive_config = patch_config(olive_json) # footprint = olive_run(olive_config) -# extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") +# extract_best_models(footprint, "bert") -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/CamemBERT/cpu_config.json"], -# ) -# def 
test_bert(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run +@pytest.mark.parametrize( + "olive_json", + ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], +) +def test_distilbert(olive_json): + print(olive_json) + from olive.workflows import run as olive_run -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "CamemBERT") + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") + + +@pytest.mark.parametrize( + "olive_json", + ["perf_models/CamemBERT/cpu_config.json"], +) +def test_Camembert(olive_json): + print(olive_json) + from olive.workflows import run as olive_run + + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "CamemBERT") # @pytest.mark.parametrize( From 4dffbf31c554c7cc4a037a5d2cb289947ebe2c71 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 17 Jul 2023 21:01:28 +0000 Subject: [PATCH 24/67] testing 5 models on pipeline --- .../user_script.py | 2 +- .../user_script.py | 2 +- .../roberta-large-mnli/user_script.py | 2 +- .../user_script.py | 2 +- .../test_perf_monitoring_models_cpu.py | 58 +++++++++---------- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py index bf7841bd41..cd1928b427 100644 --- a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py +++ b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py @@ -91,7 +91,7 @@ def __len__(self): return _Dateset(tokenized_datasets) -def create_dataloader(data_dir="", batch_size=2): +def create_dataloader(data_dir="", batch_size=2, model_framework=None): def _collate_fn(batch): batch = 
default_collate(batch) return batch diff --git a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py index 2d72fd7ea7..a121cf43c5 100644 --- a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py +++ b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py @@ -95,7 +95,7 @@ def __len__(self): return _Dateset(tokenized_datasets) -def create_dataloader(data_dir="", batch_size=2): +def create_dataloader(data_dir="", batch_size=2, model_framework=None): def _collate_fn(batch): batch = default_collate(batch) return batch diff --git a/perf_monitoring/perf_models/roberta-large-mnli/user_script.py b/perf_monitoring/perf_models/roberta-large-mnli/user_script.py index 366cce76c2..48f5c64c79 100644 --- a/perf_monitoring/perf_models/roberta-large-mnli/user_script.py +++ b/perf_monitoring/perf_models/roberta-large-mnli/user_script.py @@ -95,7 +95,7 @@ def __len__(self): return _Dateset(tokenized_datasets) -def create_dataloader(data_dir="", batch_size=2): +def create_dataloader(data_dir="", batch_size=2, model_framework=None): def _collate_fn(batch): batch = default_collate(batch) return batch diff --git a/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py b/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py index 6e21bd484c..97504ef529 100644 --- a/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py +++ b/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py @@ -99,7 +99,7 @@ def __len__(self): return _Dateset(rls) -def create_dataloader(data_dir="data", batch_size=2): +def create_dataloader(data_dir="data", batch_size=2, model_framework=None): def _collate_fn(batch): batch = default_collate(batch) return batch diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 9e45929618..5ccfb9b822 100644 --- 
a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -54,42 +54,42 @@ def test_Camembert(olive_json): extract_best_models(footprint, "CamemBERT") -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], -# ) -# def test_bert(olive_json): -# from olive.workflows import run as olive_run +@pytest.mark.parametrize( + "olive_json", + ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], +) +def test_bertweet(olive_json): + from olive.workflows import run as olive_run -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "bertweet-base-sentiment-analysis") + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "bertweet-base-sentiment-analysis") -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], -# ) -# def test_bert(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run +@pytest.mark.parametrize( + "olive_json", + ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], +) +def test_microsoft(olive_json): + print(olive_json) + from olive.workflows import run as olive_run -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "microsoft-deberta-base-mnli") + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "microsoft-deberta-base-mnli") -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/roberta-large-mnli/cpu_config.json"], -# ) -# def test_bert(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run +@pytest.mark.parametrize( + "olive_json", + ["perf_models/roberta-large-mnli/cpu_config.json"], +) +def test_roberta_mnli(olive_json): + print(olive_json) + from olive.workflows 
import run as olive_run -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "roberta-large-mnli") + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "roberta-large-mnli") # @pytest.mark.parametrize( From 83069208e1832843ad1900c80d45331ff87be485 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 21 Jul 2023 09:22:55 +0000 Subject: [PATCH 25/67] testing env variable on pipeline --- .azure_pipelines/perfmonitoring-ci .yaml | 6 +- Makefile | 3 +- .../cpu_config.json | 81 ++++++---------- .../test_perf_monitoring_models_cpu.py | 92 +++++++++---------- perf_monitoring/utils.py | 2 +- scripts/perf_monitoring.bat | 3 +- scripts/perf_monitoring.sh | 3 +- 7 files changed, 85 insertions(+), 105 deletions(-) diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index f43fd48f1a..d4113d12d1 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -29,18 +29,20 @@ variables: jobs: - template: job_templates/olive-perf-monitoring-template.yaml parameters: - name: Windows_CI + name: bert Windows_CI pool: $(OLIVE_POOL_WIN2019) windows: True examples: bert: perfMonitoringScriptName: perf_monitoring_models_cpu + perfMonitoringScriptFunction: test_bert - template: job_templates/olive-perf-monitoring-template.yaml parameters: - name: Linux_CI + name: bert Linux_CI pool: $(OLIVE_POOL_UBUNTU2004) windows: False examples: bert: perfMonitoringScriptName: perf_monitoring_models_cpu + perfMonitoringScriptFunction: test_bert diff --git a/Makefile b/Makefile index 5a15c12c2c..90a38569db 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ INSTALL_DEV_MODE ?= False EXAMPLE_FOLDER ?= EXAMPLE_NAME ?= PERF_MONITORING_SCRIPT_NAME ?= +PERF_MONITORING_SCRIPT_FUNCTION ?= INSTALL_EXTRAS ?= VERSION ?= ifeq ($(WINDOWS), True) @@ -58,4 +59,4 @@ clean: .PHONY: perf-monitoring 
perf-monitoring: - $(PERFORMANCE_MONITORING_CMD) $(PIPELINE) $(CURRENT_DIR) $(PERF_MONITORING_SCRIPT_NAME) + $(PERFORMANCE_MONITORING_CMD) $(PIPELINE) $(CURRENT_DIR) $(PERF_MONITORING_SCRIPT_NAME) $(PERF_MONITORING_SCRIPT_FUNCTION) diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json index 956a5a9ff9..daff4580f0 100644 --- a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json +++ b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json @@ -1,71 +1,50 @@ { - "verbose": true, "input_model": { "type": "PyTorchModel", "config": { - "model_loader": "load_model", - "model_script": "./perf_models/bertweet-base-sentiment-analysis/user_script.py", - "io_config" : { - "input_names": ["input_ids", "attention_mask"], - "input_shapes": [[1, 128], [1, 128]], - "input_types": ["int64", "int64"], - "output_names": ["output"], - "dynamic_axes": { - "input_ids": {"0": "batch_size", "1": "seq_length"}, - "attention_mask": {"0": "batch_size", "1": "seq_length"} + "hf_config": { + "model_name": "finiteautomata/bertweet-base-sentiment-analysis", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "mrpc", + "split": "validation", + "input_cols": ["input_ids", "token_type_ids", "attention_mask"], + "label_cols": ["label"], + "batch_size": 1 } } + } }, "evaluators": { "common_evaluator": { - "metrics": [ + "metrics":[ { "name": "accuracy", - "type": "custom", + "type": "accuracy", + "backend": "huggingface_metrics", "sub_types": [ - { - "name": "accuracy_custom", - "priority": 1, - "higher_is_better": true, - "goal": { - "type": "max-degradation", - "value": 0.01 - } - } - ], - "is_first_priority": true, - "user_config":{ - "evaluate_func": "eval_accuracy", - "user_script": "./perf_models/bertweet-base-sentiment-analysis/user_script.py", - "batch_size": 1 - } + {"name": "accuracy", "priority": 1} + + 
] }, { "name": "latency", "type": "latency", "sub_types": [ - { - "name": "avg", - "priority": 2, - "goal": { - "type": "percent-min-improvement", - "value": 20 - } - } - ], - "user_config": { - "user_script": "./perf_models/bertweet-base-sentiment-analysis/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + {"name": "avg", "priority": 2} + ] } ] } }, "passes": { "conversion": { - "type": "OnnxConversion" + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } }, "transformers_optimization": { "type": "OrtTransformersOptimization", @@ -77,15 +56,10 @@ } }, "quantization": { - "type": "OnnxDynamicQuantization" + "type": "OnnxQuantization" }, "perf_tuning": { - "type": "OrtPerfTuning", - "config": { - "user_script": "./perf_models/bertweet-base-sentiment-analysis/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + "type": "OrtPerfTuning" } }, "engine": { @@ -93,12 +67,13 @@ "execution_order": "joint", "search_algorithm": "tpe", "search_algorithm_config": { - "num_samples": 5, + "num_samples": 3, "seed": 0 } }, "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", - "output_dir": "models/bertweet-base-sentiment-analysis_workflow_cpu.json" + "output_dir" : "models/bertweet_workflow_cpu" } } diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 5ccfb9b822..7048a4e756 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -15,43 +15,43 @@ def setup(): os.chdir(cur_dir) -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/bert/bert_workflow_cpu.json"], -# ) -# def test_bert(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run - -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "bert") - - 
@pytest.mark.parametrize( "olive_json", - ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], + ["perf_models/bert/bert_workflow_cpu.json"], ) -def test_distilbert(olive_json): +def test_bert(olive_json): print(olive_json) from olive.workflows import run as olive_run olive_config = patch_config(olive_json) footprint = olive_run(olive_config) - extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") + extract_best_models(footprint, "bert") -@pytest.mark.parametrize( - "olive_json", - ["perf_models/CamemBERT/cpu_config.json"], -) -def test_Camembert(olive_json): - print(olive_json) - from olive.workflows import run as olive_run +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], +# ) +# def test_distilbert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run + +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "CamemBERT") + +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/CamemBERT/cpu_config.json"], +# ) +# def test_Camembert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run + +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "CamemBERT") @pytest.mark.parametrize( @@ -66,30 +66,30 @@ def test_bertweet(olive_json): extract_best_models(footprint, "bertweet-base-sentiment-analysis") -@pytest.mark.parametrize( - "olive_json", - ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], -) -def test_microsoft(olive_json): - print(olive_json) - from olive.workflows import run as olive_run +# @pytest.mark.parametrize( +# "olive_json", +# 
["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], +# ) +# def test_microsoft(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "microsoft-deberta-base-mnli") +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "microsoft-deberta-base-mnli") -@pytest.mark.parametrize( - "olive_json", - ["perf_models/roberta-large-mnli/cpu_config.json"], -) -def test_roberta_mnli(olive_json): - print(olive_json) - from olive.workflows import run as olive_run +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/roberta-large-mnli/cpu_config.json"], +# ) +# def test_roberta_mnli(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "roberta-large-mnli") +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "roberta-large-mnli") # @pytest.mark.parametrize( diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 0b9d92f231..2f9d84921b 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -32,7 +32,7 @@ def patch_config(config_json_path: str): def extract_best_models(footprint, model_name): print("Footprint: ", footprint) footprint = list(footprint.values())[0] - metrics_of_interest = ["accuracy-accuracy_custom", "latency-avg"] + metrics_of_interest = ["accuracy-accuracy", "latency-avg"] # gather the metrics from all pareto frontier nodes all_metrics = [] # we iterate over the nodes in the pareto frontier diff --git a/scripts/perf_monitoring.bat b/scripts/perf_monitoring.bat index 11b008d251..7171ffe65a 100644 --- a/scripts/perf_monitoring.bat +++ b/scripts/perf_monitoring.bat @@ -7,6 +7,7 @@ REM 
-------------------------------------------------------------------------- set PIPELINE=%1 set ROOT_DIR=%2 set PERF_MONITORING_SCRIPT_NAME=%3 +set PERF_MONITORING_SCRIPT_FUNCTION=%4 if "%PIPELINE%"=="True" ( call olive-venv\\Scripts\\activate.bat || goto :error @@ -20,7 +21,7 @@ call echo "performance monitoring examples" call python -m pip install -r %ROOT_DIR%\\perf_monitoring\\requirements.txt || goto :error call python -m pytest -v -s --log-cli-level=WARNING --junitxml=%ROOT_DIR%\\logs\\performance-monitoring-TestOlive.xml^ - %ROOT_DIR%\\perf_monitoring\\test_%PERF_MONITORING_SCRIPT_NAME%.py || goto :error + %ROOT_DIR%\\perf_monitoring\\test_%PERF_MONITORING_SCRIPT_NAME%.py::%PERF_MONITORING_SCRIPT_FUNCTION% || goto :error goto :EOF diff --git a/scripts/perf_monitoring.sh b/scripts/perf_monitoring.sh index 9df637b8b3..4e06e0b10b 100644 --- a/scripts/perf_monitoring.sh +++ b/scripts/perf_monitoring.sh @@ -8,6 +8,7 @@ set -eoux pipefail PIPELINE=$1 ROOT_DIR=$2 PERF_MONITORING_SCRIPT_NAME=$3 +PERF_MONITORING_SCRIPT_FUNCTION=$4 echo $PIPELINE if [[ "$PIPELINE" == "True" ]]; then @@ -23,4 +24,4 @@ python -m pip install pytest echo "performance monitoring examples" python -m pip install -r $ROOT_DIR/perf_monitoring/requirements.txt -python -m pytest -v -s --log-cli-level=WARNING --junitxml=$ROOT_DIR/logs/performance-monitoring-TestOlive.xml $ROOT_DIR/perf_monitoring/test_$PERF_MONITORING_SCRIPT_NAME.py +python -m pytest -v -s --log-cli-level=WARNING --junitxml=$ROOT_DIR/logs/performance-monitoring-TestOlive.xml $ROOT_DIR/perf_monitoring/test_$PERF_MONITORING_SCRIPT_NAME.py::$PERF_MONITORING_SCRIPT_FUNCTION From 215a1346ad473a2bded76ec2a1ab7d73565e79db Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 21 Jul 2023 09:25:04 +0000 Subject: [PATCH 26/67] testing env variable on pipeline --- .azure_pipelines/perfmonitoring-ci .yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure_pipelines/perfmonitoring-ci .yaml 
b/.azure_pipelines/perfmonitoring-ci .yaml index d4113d12d1..8ad92ac57d 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -29,7 +29,7 @@ variables: jobs: - template: job_templates/olive-perf-monitoring-template.yaml parameters: - name: bert Windows_CI + name: Windows_CI pool: $(OLIVE_POOL_WIN2019) windows: True examples: @@ -39,7 +39,7 @@ jobs: - template: job_templates/olive-perf-monitoring-template.yaml parameters: - name: bert Linux_CI + name: Linux_CI pool: $(OLIVE_POOL_UBUNTU2004) windows: False examples: From af855fcab6c2fccece23c7ee5f83ebbd8d46e8b4 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 21 Jul 2023 09:37:28 +0000 Subject: [PATCH 27/67] testing env variable on pipeline --- .../job_templates/olive-perf-monitoring-template.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml index e86a5820f5..8a2df35e7c 100644 --- a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml +++ b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml @@ -29,7 +29,7 @@ jobs: inputs: azureSubscription: $(OLIVE_RG_SERVICE_CONNECTION) scriptLocation: 'inlineScript' - inlineScript: make perf-monitoring PIPELINE=True WINDOWS=$(WINDOWS) PERF_MONITORING_SCRIPT_NAME=$(perfMonitoringScriptName) + inlineScript: make perf-monitoring PIPELINE=True WINDOWS=$(WINDOWS) PERF_MONITORING_SCRIPT_NAME=$(perfMonitoringScriptName) PERF_MONITORING_SCRIPT_FUNCTION=$(perfMonitoringScriptFunction) displayName: performance monitoring env: OLIVEWHEELS_STORAGE_CONNECTION_STRING: $(olive-wheels-storage-connection-string) @@ -51,6 +51,13 @@ jobs: testRunTitle: '$(Build.BuildNumber)[$(Agent.JobName)]' displayName: Upload pipeline run test results + - task: PublishPipelineArtifact@1 + inputs: + path: $(Build.SourcesDirectory)/perf_monitoring_results + 
artifactName: best_metrics + artifactType: pipeline + displayName: Publish models + - script: make clean WINDOWS=$(WINDOWS) condition: always() displayName: Clean remaining artifacts From 24da9bc583616a0670add6b4a1dd01768c9cc56c Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 24 Jul 2023 18:44:18 +0000 Subject: [PATCH 28/67] staging commits --- .azure_pipelines/perfmonitoring-ci .yaml | 4 ++++ .../cpu_config.json | 8 ++++---- .../test_perf_monitoring_models_cpu.py | 20 +++++++++---------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index 8ad92ac57d..088254df19 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -46,3 +46,7 @@ jobs: bert: perfMonitoringScriptName: perf_monitoring_models_cpu perfMonitoringScriptFunction: test_bert + + bertweet: + perfMonitoringScriptName: perf_monitoring_models_cpu + perfMonitoringScriptFunction: test_bertweet diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json index daff4580f0..0221f579fa 100644 --- a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json +++ b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json @@ -6,10 +6,10 @@ "model_name": "finiteautomata/bertweet-base-sentiment-analysis", "task": "text-classification", "dataset": { - "data_name":"glue", - "subset": "mrpc", - "split": "validation", - "input_cols": ["input_ids", "token_type_ids", "attention_mask"], + "data_name":"cardiffnlp/tweet_sentiment_multilingual", + "subset": "english", + "split": "test", + "input_cols": ["text"], "label_cols": ["label"], "batch_size": 1 } diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 7048a4e756..4c8d0b9553 100644 --- 
a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -15,17 +15,17 @@ def setup(): os.chdir(cur_dir) -@pytest.mark.parametrize( - "olive_json", - ["perf_models/bert/bert_workflow_cpu.json"], -) -def test_bert(olive_json): - print(olive_json) - from olive.workflows import run as olive_run +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/bert/bert_workflow_cpu.json"], +# ) +# def test_bert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "bert") +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "bert") # @pytest.mark.parametrize( From 5cde01672959f4b651fee0860e0a86c6cd27f343 Mon Sep 17 00:00:00 2001 From: Jambay Kinley Date: Mon, 24 Jul 2023 20:59:32 +0000 Subject: [PATCH 29/67] expose data_files in hf load_dataset --- olive/data/component/load_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/olive/data/component/load_dataset.py b/olive/data/component/load_dataset.py index 56e4e30604..c646daf671 100644 --- a/olive/data/component/load_dataset.py +++ b/olive/data/component/load_dataset.py @@ -23,7 +23,7 @@ def simple_dataset(input_data, label_cols=None, **kwargs): @Registry.register_dataset() -def huggingface_dataset(data_name=None, subset=None, split="validation", **kwargs): +def huggingface_dataset(data_name=None, subset=None, split="validation", data_files=None, **kwargs): """ This function is used to create a dataset from huggingface datasets """ @@ -34,7 +34,7 @@ def huggingface_dataset(data_name=None, subset=None, split="validation", **kwarg from datasets import load_dataset assert data_name is not None, "Please specify the data name" - return load_dataset(path=data_name, name=subset, split=split, **kwargs) + return load_dataset(path=data_name, 
name=subset, split=split, data_files=data_files, **kwargs) @Registry.register_dataset() From 210f709f2483c5acc6422c6dd05f83ac049e108f Mon Sep 17 00:00:00 2001 From: Jambay Kinley Date: Mon, 24 Jul 2023 22:24:02 +0000 Subject: [PATCH 30/67] add max_samples to huggingface data config --- olive/data/component/dataset.py | 9 +++++++-- olive/data/component/pre_process_data.py | 8 ++++---- olive/data/template.py | 2 ++ olive/passes/onnx/conversion.py | 3 ++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/olive/data/component/dataset.py b/olive/data/component/dataset.py index 5665f54442..f05eae19aa 100644 --- a/olive/data/component/dataset.py +++ b/olive/data/component/dataset.py @@ -19,18 +19,23 @@ class BaseDataset(Dataset): The data should be a list or dict of numpy arrays or torch tensors """ - def __init__(self, data, label_cols=None, **kwargs): + def __init__(self, data, label_cols=None, max_samples=None, **kwargs): """ This function is used to initialize the dataset """ self.data = data self.label_cols = label_cols or [] + self.max_samples = max_samples def __len__(self): """ This function is used to return the length of the dataset """ - return len(self.data) + num_samples = len(self.data) + if self.max_samples is not None: + # if max_samples is not None, return the min of num_samples and max_samples + num_samples = min(num_samples, self.max_samples) + return num_samples def __getitem__(self, index): data = {k: v for k, v in self.data[index].items() if k not in self.label_cols} diff --git a/olive/data/component/pre_process_data.py b/olive/data/component/pre_process_data.py index 0dc37f2520..6c27fee5f5 100644 --- a/olive/data/component/pre_process_data.py +++ b/olive/data/component/pre_process_data.py @@ -43,7 +43,7 @@ def _huggingface_pre_precess_helper(dataset, model_name, input_cols, label_cols, @Registry.register_pre_process() -def huggingface_pre_process(_dataset, model_name, input_cols, label_cols, **kwargs): +def 
huggingface_pre_process(_dataset, model_name, input_cols, label_cols, max_samples=None, **kwargs): """Pre-process data. Args: @@ -73,11 +73,11 @@ def _tokenizer_and_align_labels(examples): _dataset, model_name, input_cols, label_cols, _tokenizer_and_align_labels, **kwargs ) # label_cols is ["label"] since we added label_cols[0] as "label" to tokenized_inputs - return BaseDataset(tokenized_datasets, label_cols=["label"]) + return BaseDataset(tokenized_datasets, label_cols=["label"], max_samples=max_samples) @Registry.register_pre_process() -def ner_huggingface_preprocess(_dataset, model_name, input_cols, label_cols, **kwargs): +def ner_huggingface_preprocess(_dataset, model_name, input_cols, label_cols, max_samples=None, **kwargs): """ Pre-process data for ner task. """ @@ -125,4 +125,4 @@ def _tokenizer_and_align_labels(examples): tokenized_datasets = _huggingface_pre_precess_helper( _dataset, model_name, input_cols, label_cols, _tokenizer_and_align_labels, **kwargs ) - return BaseDataset(tokenized_datasets, label_cols=["label"]) + return BaseDataset(tokenized_datasets, label_cols=["label"], max_samples=max_samples) diff --git a/olive/data/template.py b/olive/data/template.py index 0a1377510d..f5c5e82f50 100644 --- a/olive/data/template.py +++ b/olive/data/template.py @@ -39,9 +39,11 @@ def huggingface_data_config_template(model_name, task, **kwargs) -> DataConfig: - `data_name`: str, data name in huggingface dataset, e.g.: "glue", "squad" - `subset`: str, subset of data, e.g.: "train", "validation", "test" - `split`: str, split of data, e.g.: "train", "validation", "test" + - `data_files`: str | list | dict, path to source data file(s). 
- `input_cols`: list, input columns of data - `label_cols`: list, label columns of data - `batch_size`: int, batch size of data + - `max_samples`: int, maximum number of samples in the dataset and other arguments in - olive.data.component.load_dataset.huggingface_dataset - olive.data.component.pre_process_data.huggingface_pre_process diff --git a/olive/passes/onnx/conversion.py b/olive/passes/onnx/conversion.py index 0fa82fe3c7..2cc0b123de 100644 --- a/olive/passes/onnx/conversion.py +++ b/olive/passes/onnx/conversion.py @@ -118,7 +118,8 @@ def _run_for_config( if isinstance(dummy_inputs, dict): dummy_input_keys = set(dummy_inputs.keys()) unused_keys = dummy_input_keys - set(input_names) - logger.debug(f"Removing unused dummy inputs: {unused_keys}") + if unused_keys: + logger.debug(f"Removing unused dummy inputs: {unused_keys}") for key in unused_keys: del dummy_inputs[key] From ff8ec9eeff22c50f1d1ad9089f3cdf7f4464f211 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 24 Jul 2023 23:49:57 +0000 Subject: [PATCH 31/67] changes --- .../perf_models/CamemBERT/cpu_config.json | 78 +++++++---------- .../cpu_config.json | 82 +++++++----------- .../cpu_config.json | 83 +++++++------------ .../test_perf_monitoring_models_cpu.py | 40 ++++----- 4 files changed, 105 insertions(+), 178 deletions(-) diff --git a/perf_monitoring/perf_models/CamemBERT/cpu_config.json b/perf_monitoring/perf_models/CamemBERT/cpu_config.json index 37f9c0b67a..dc668bb6e5 100644 --- a/perf_monitoring/perf_models/CamemBERT/cpu_config.json +++ b/perf_monitoring/perf_models/CamemBERT/cpu_config.json @@ -1,64 +1,47 @@ { - "verbose": true, - "input_model": { + "input_model":{ "type": "PyTorchModel", "config": { - "model_loader": "load_model", - "model_script": "./perf_models/CamemBERT/user_script.py", - "io_config" : { - "input_names": ["input_ids", "attention_mask"], - "input_shapes": [[1, 128], [1, 128]], - "input_types": ["int64", "int64"], - "output_names": ["output"], - "dynamic_axes": { - 
"input_ids": {"0": "batch_size", "1": "seq_length"}, - "attention_mask": {"0": "batch_size", "1": "seq_length"} + "hf_config": { + "model_name": "Jean-Baptiste/camembert-ner", + "task": "ner", + "dataset": { + "data_name":"Jean-Baptiste/wikiner_fr", + "split": "test", + "input_cols": ["tokens"], + "label_cols": ["ner_tags"], + "batch_size": 1 } } } }, "evaluators": { "common_evaluator": { - "metrics": [ + "metrics":[ { "name": "accuracy", - "type": "custom", + "type": "accuracy", "sub_types": [ { - "name": "accuracy_custom", + "name": "accuracy_score", "priority": 1, - "higher_is_better": true, - "goal": { - "type": "max-degradation", - "value": 0.01 + "goal": {"type": "max-degradation", "value": 0.01}, + "metric_config": { + "task": "multiclass", + "num_classes": "5", + "top_k": 1 } } - ], - "is_first_priority": true, - "user_config": { - "evaluate_func": "evaluate_accuracy", - "user_script": "./perf_models/CamemBERT/user_script.py", - "batch_size": 1 - } + ] }, { "name": "latency", "type": "latency", "sub_types": [ - { - "name": "avg", - "priority": 2, - "goal": { - "type": "percent-min-improvement", - "value": 20 - } - } - ], - "user_config": { - "user_script": "./perf_models/CamemBERT/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}, + {"name": "max"}, + {"name": "min"} + ] } ] } @@ -77,28 +60,25 @@ } }, "quantization": { - "type": "OnnxDynamicQuantization" + "type": "OnnxQuantization" }, "perf_tuning": { - "type": "OrtPerfTuning", - "config": { - "user_script": "./perf_models/CamemBERT/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + "type": "OrtPerfTuning" } }, "engine": { + "log_severity_level": 0, "search_strategy": { "execution_order": "joint", "search_algorithm": "tpe", "search_algorithm_config": { - "num_samples": 5, + "num_samples": 3, "seed": 0 } }, "evaluator": "common_evaluator", + "clean_cache": 
true, "cache_dir": "cache", - "output_dir": "models/CamemBERT_workflow_cpu.json" + "output_dir" : "models/camembert_workflow_cpu" } } diff --git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json index 6ff0d6bfbb..50547b2094 100644 --- a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json +++ b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json @@ -1,18 +1,17 @@ { - "verbose": true, - "input_model": { + "input_model":{ "type": "PyTorchModel", "config": { - "model_loader": "load_model", - "model_script": "./perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py", - "io_config" : { - "input_names": ["input_ids", "attention_mask"], - "input_shapes": [[1, 128], [1, 128]], - "input_types": ["int64", "int64"], - "output_names": ["output"], - "dynamic_axes": { - "input_ids": {"0": "batch_size", "1": "seq_length"}, - "attention_mask": {"0": "batch_size", "1": "seq_length"} + "hf_config": { + "model_name": "distilbert-base-uncased-finetuned-sst-2-english", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "sst2", + "split": "validation", + "input_cols": ["sentence"], + "label_cols": ["label"], + "batch_size": 1 } } } @@ -20,53 +19,32 @@ "evaluators": { "common_evaluator": { - "metrics": [ + "metrics":[ { "name": "accuracy", - "type": "custom", + "type": "accuracy", + "backend": "huggingface_metrics", "sub_types": [ - { - "name": "accuracy_custom", - "priority": 1, - "higher_is_better": true, - "goal": { - "type": "max-degradation", - "value": 0.01 - } - } - ], - "is_first_priority": true, - "user_config": { - "evaluate_func": "eval_accuracy", - "user_script": "./perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py", - "batch_size": 1 - } + {"name": "accuracy", "priority": 1} + + ] }, { 
"name": "latency", "type": "latency", "sub_types": [ - { - "name": "avg", - "priority": 2, - "goal": { - "type": "percent-min-improvement", - "value": 20 - } - } - ], - "user_config": { - "user_script": "./perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + {"name": "avg", "priority": 2} + ] } ] } }, "passes": { "conversion": { - "type": "OnnxConversion" + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } }, "transformers_optimization": { "type": "OrtTransformersOptimization", @@ -78,15 +56,10 @@ } }, "quantization": { - "type": "OnnxDynamicQuantization" + "type": "OnnxQuantization" }, "perf_tuning": { - "type": "OrtPerfTuning", - "config": { - "user_script": "./perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + "type": "OrtPerfTuning" } }, "engine": { @@ -94,12 +67,13 @@ "execution_order": "joint", "search_algorithm": "tpe", "search_algorithm_config": { - "num_samples": 5, + "num_samples": 3, "seed": 0 } }, "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", - "output_dir": "models/distilbert-base-uncased-finetuned-sst-2-english_workflow_cpu.json" + "output_dir" : "models/bert_workflow_cpu" } } diff --git a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json index c475f1ecb3..35c24b3b11 100644 --- a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json +++ b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json @@ -1,72 +1,49 @@ { - "verbose": true, - "input_model": { + "input_model":{ "type": "PyTorchModel", "config": { - "model_loader": "load_model", - "model_script": "./perf_models/microsoft-deberta-base-mnli/user_script.py", - "io_config" : { - "input_names": ["input_ids", 
"attention_mask"], - "input_shapes": [[1, 128], [1, 128]], - "input_types": ["int64", "int64"], - "output_names": ["output"], - "dynamic_axes": { - "input_ids": {"0": "batch_size", "1": "seq_length"}, - "attention_mask": {"0": "batch_size", "1": "seq_length"} + "hf_config": { + "model_name": "microsoft/deberta-base-mnli", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "mnli_matched", + "split": "validation", + "input_cols": ["sentence1", "sentence2"], + "label_cols": ["label"], + "batch_size": 1 } } } }, "evaluators": { "common_evaluator": { - "metrics": [ + "metrics":[ { "name": "accuracy", - "type": "custom", + "type": "accuracy", + "backend": "huggingface_metrics", "sub_types": [ - { - "name": "accuracy_custom", - "priority": 1, - "higher_is_better": true, - "goal": { - "type": "max-degradation", - "value": 0.01 - } - } - ], - "is_first_priority": true, - "user_config":{ - "evaluate_func": "eval_accuracy", - "user_script": "./perf_models/microsoft-deberta-base-mnli/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + {"name": "accuracy", "priority": 1} + + ] }, { "name": "latency", "type": "latency", "sub_types": [ - { - "name": "avg", - "priority": 2, - "goal": { - "type": "percent-min-improvement", - "value": 20 - } - } - ], - "user_config": { - "user_script": "./perf_models/microsoft-deberta-base-mnli/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + {"name": "avg", "priority": 2} + ] } ] } }, "passes": { "conversion": { - "type": "OnnxConversion" + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } }, "transformers_optimization": { "type": "OrtTransformersOptimization", @@ -78,15 +55,10 @@ } }, "quantization": { - "type": "OnnxDynamicQuantization" + "type": "OnnxQuantization" }, "perf_tuning": { - "type": "OrtPerfTuning", - "config": { - "user_script": "./perf_models/microsoft-deberta-base-mnli/user_script.py", - "dataloader_func": 
"create_dataloader", - "batch_size": 1 - } + "type": "OrtPerfTuning" } }, "engine": { @@ -94,12 +66,13 @@ "execution_order": "joint", "search_algorithm": "tpe", "search_algorithm_config": { - "num_samples": 5, + "num_samples": 3, "seed": 0 } }, "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", - "output_dir": "models/microsoft-deberta-base-mnli_workflow_cpu.json" + "output_dir" : "models/deberta_workflow_cpu" } } diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 4c8d0b9553..4f60634f7c 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -41,42 +41,42 @@ def setup(): # extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/CamemBERT/cpu_config.json"], -# ) -# def test_Camembert(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run - -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "CamemBERT") - - @pytest.mark.parametrize( "olive_json", - ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], + ["perf_models/CamemBERT/cpu_config.json"], ) -def test_bertweet(olive_json): +def test_Camembert(olive_json): + print(olive_json) from olive.workflows import run as olive_run olive_config = patch_config(olive_json) footprint = olive_run(olive_config) - extract_best_models(footprint, "bertweet-base-sentiment-analysis") + extract_best_models(footprint, "CamemBERT") # @pytest.mark.parametrize( # "olive_json", -# ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], +# ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], # ) -# def test_microsoft(olive_json): -# print(olive_json) +# def test_bertweet(olive_json): # from olive.workflows import run as olive_run # 
olive_config = patch_config(olive_json) # footprint = olive_run(olive_config) -# extract_best_models(footprint, "microsoft-deberta-base-mnli") +# extract_best_models(footprint, "bertweet-base-sentiment-analysis") + + +@pytest.mark.parametrize( + "olive_json", + ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], +) +def test_microsoft(olive_json): + print(olive_json) + from olive.workflows import run as olive_run + + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "microsoft-deberta-base-mnli") # @pytest.mark.parametrize( From 86dbc282ccf1dd34f871ab44d93fa3fd5d8e4036 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Tue, 25 Jul 2023 17:42:03 +0000 Subject: [PATCH 32/67] changes --- .../perf_models/CamemBERT/cpu_config.json | 138 +++++++-------- .../roberta-large-mnli/cpu_config.json | 83 +++------ .../cpu_config.json | 106 ----------- .../user_script.py | 166 ------------------ .../test_perf_monitoring_models_cpu.py | 52 +++--- 5 files changed, 116 insertions(+), 429 deletions(-) delete mode 100644 perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json delete mode 100644 perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py diff --git a/perf_monitoring/perf_models/CamemBERT/cpu_config.json b/perf_monitoring/perf_models/CamemBERT/cpu_config.json index dc668bb6e5..110800836e 100644 --- a/perf_monitoring/perf_models/CamemBERT/cpu_config.json +++ b/perf_monitoring/perf_models/CamemBERT/cpu_config.json @@ -1,84 +1,70 @@ { - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "Jean-Baptiste/camembert-ner", - "task": "ner", - "dataset": { - "data_name":"Jean-Baptiste/wikiner_fr", - "split": "test", - "input_cols": ["tokens"], - "label_cols": ["ner_tags"], - "batch_size": 1 - } +"input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "Jean-Baptiste/camembert-ner", + "task": "ner", + 
"dataset": { + "data_name":"Jean-Baptiste/wikiner_fr", + "split": "test", + "input_cols": ["tokens"], + "label_cols": ["ner_tags"], + "batch_size": 1, + "max_samples": 10 } } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "sub_types": [ - { - "name": "accuracy_score", - "priority": 1, - "goal": {"type": "max-degradation", "value": 0.01}, - "metric_config": { - "task": "multiclass", - "num_classes": "5", - "top_k": 1 - } + } +}, +"evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "sub_types": [ + { + "name": "accuracy_score", + "priority": 1, + "metric_config": { + "task": "multiclass", + "num_classes": "5", + "top_k": 1 } - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}, - {"name": "max"}, - {"name": "min"} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion" - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false + } + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" + ] + } +}, +"passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 } - }, - "engine": { - "log_severity_level": 0, - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, - "evaluator": "common_evaluator", - "clean_cache": true, - "cache_dir": "cache", - "output_dir" : "models/camembert_workflow_cpu" } +}, +"engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + 
"num_samples": 3, + "seed": 0 + } + }, + "log_severity_level": 0, + "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/camembert_workflow_cpu" +} } diff --git a/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json b/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json index eefcf36843..6760514c0c 100644 --- a/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json +++ b/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json @@ -1,72 +1,49 @@ { - "verbose": true, - "input_model": { + "input_model":{ "type": "PyTorchModel", "config": { - "model_loader": "load_model", - "model_script": "./perf_models/roberta-large-mnli/user_script.py", - "io_config" : { - "input_names": ["input_ids", "attention_mask"], - "input_shapes": [[1, 128], [1, 128]], - "input_types": ["int64", "int64"], - "output_names": ["output"], - "dynamic_axes": { - "input_ids": {"0": "batch_size", "1": "seq_length"}, - "attention_mask": {"0": "batch_size", "1": "seq_length"} + "hf_config": { + "model_name": "roberta-large-mnli", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "mnli_matched", + "split": "validation", + "input_cols": ["premise"], + "label_cols": ["label"], + "batch_size": 1 } } } }, "evaluators": { "common_evaluator": { - "metrics": [ + "metrics":[ { "name": "accuracy", - "type": "custom", + "type": "accuracy", + "backend": "huggingface_metrics", "sub_types": [ - { - "name": "accuracy_custom", - "priority": 1, - "higher_is_better": true, - "goal": { - "type": "max-degradation", - "value": 0.01 - } - } - ], - "is_first_priority": true, - "user_config":{ - "evaluate_func": "eval_accuracy", - "user_script": "./perf_models/roberta-large-mnli/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + {"name": "accuracy", "priority": 1} + + ] }, { "name": "latency", "type": "latency", "sub_types": [ - { - "name": "avg", - 
"priority": 2, - "goal": { - "type": "percent-min-improvement", - "value": 20 - } - } - ], - "user_config": { - "user_script": "./perf_models/roberta-large-mnli/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + {"name": "avg", "priority": 2} + ] } ] } }, "passes": { "conversion": { - "type": "OnnxConversion" + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } }, "transformers_optimization": { "type": "OrtTransformersOptimization", @@ -78,15 +55,10 @@ } }, "quantization": { - "type": "OnnxDynamicQuantization" + "type": "OnnxQuantization" }, "perf_tuning": { - "type": "OrtPerfTuning", - "config": { - "user_script": "./perf_models/roberta-large-mnli/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } + "type": "OrtPerfTuning" } }, "engine": { @@ -94,12 +66,13 @@ "execution_order": "joint", "search_algorithm": "tpe", "search_algorithm_config": { - "num_samples": 5, + "num_samples": 3, "seed": 0 } }, "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", - "output_dir": "models/roberta-large-mnli_workflow_cpu.json" + "output_dir" : "models/roberta_workflow_cpu" } } diff --git a/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json b/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json deleted file mode 100644 index ba9ffe39fb..0000000000 --- a/perf_monitoring/perf_models/roberta-large-openai-detector/cpu_config.json +++ /dev/null @@ -1,106 +0,0 @@ -{ - "verbose": true, - "input_model": { - "type": "PyTorchModel", - "config": { - "model_loader": "load_model", - "model_script": "./perf_models/roberta-large-openai-detector/user_script.py", - "io_config" : { - "input_names": ["input_ids", "attention_mask"], - "input_shapes": [[1, 128], [1, 128]], - "input_types": ["int64", "int64"], - "output_names": ["output"], - "dynamic_axes": { - "input_ids": {"0": "batch_size", "1": "seq_length"}, - "attention_mask": 
{"0": "batch_size", "1": "seq_length"} - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics": [ - { - "name": "accuracy", - "type": "custom", - "sub_types": [ - { - "name": "accuracy_custom", - "priority": 1, - "higher_is_better": true, - "goal": { - "type": "max-degradation", - "value": 0.01 - } - } - ], - "is_first_priority": true, - "user_config":{ - "evaluate_func": "eval_accuracy", - "user_script": "./perf_models/roberta-large-openai-detector/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1, - "data_dir": "data" - } - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - { - "name": "avg", - "priority": 2, - "goal": { - "type": "percent-min-improvement", - "value": 20 - } - } - ], - "user_config": { - "user_script": "./perf_models/roberta-large-openai-detector/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion" - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxDynamicQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning", - "config": { - "user_script": "./perf_models/roberta-large-openai-detector/user_script.py", - "dataloader_func": "create_dataloader", - "batch_size": 1 - } - } - }, - "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 5, - "seed": 0 - } - }, - "evaluator": "common_evaluator", - "cache_dir": "cache", - "output_dir": "models/roberta-large-openai-detector_workflow_cpu.json" - } -} diff --git a/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py b/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py deleted file mode 100644 index 97504ef529..0000000000 --- 
a/perf_monitoring/perf_models/roberta-large-openai-detector/user_script.py +++ /dev/null @@ -1,166 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import json -import os - -import torch -from onnxruntime.quantization import CalibrationDataReader -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.dataloader import default_collate -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -from olive.constants import Framework -from olive.evaluator.accuracy import AccuracyScore -from olive.model import OliveModel - -# https://huggingface.co/roberta-large-openai-detector -model_name = "roberta-large-openai-detector" -dataset_name = "glue" -subset = "mnli_matched" -split = "validation" - - -class CalibrationDataLoader(CalibrationDataReader): - def __init__(self, dataloader, post_func, num_samplers=100): - self.dataloader = dataloader - self.iter = iter(dataloader) - self.post_func = post_func - self.counter = 0 - self.num_samplers = num_samplers - - def get_next(self): - if self.counter >= self.num_samplers: - return None - self.counter += 1 - if self.iter is None: - self.iter = iter(self.dataloader) - try: - return self.post_func(next(self.iter)) - except StopIteration: - return None - - def rewind(self): - self.iter = None - self.counter = 0 - - -# -------------------- model ------------------- -def load_model(model_path=None): - model = AutoModelForSequenceClassification.from_pretrained(model_name) - return model - - -# -------------------- dataset ------------------- - - -def create_evaluation_dataset(dataset_dir): - print(f"Dataset directory: {dataset_dir}") - tokenizer = AutoTokenizer.from_pretrained(model_name) - rls_ordered = [] - for item, label in [("small-117M.valid.jsonl", 0), ("webtext.valid.jsonl", 1)]: - 
valid_file = os.path.join(dataset_dir, item) - with open(valid_file, "r") as f: - for line in f: - line = json.loads(line) - input = tokenizer(line["text"], return_tensors="pt", padding=True, truncation=True) - rls_ordered.append((input, label)) - - rls = [] - for i in range(len(rls_ordered) // 2): - rls.append( - { - "input_ids": rls_ordered[i][0].input_ids[0], - "attention_mask": rls_ordered[i][0].attention_mask[0], - "labels": rls_ordered[i][1], - } - ) - next_i = i + len(rls_ordered) // 2 - rls.append( - { - "input_ids": rls_ordered[next_i][0].input_ids[0], - "attention_mask": rls_ordered[next_i][0].attention_mask[0], - "labels": rls_ordered[next_i][1], - } - ) - - class _Dateset(Dataset): - def __init__(self, dataset): - self.dataset = dataset - - def __getitem__(self, index): - return self.dataset[index], self.dataset[index]["labels"] - - def __len__(self): - return 5 - # return len(self.dataset) - - return _Dateset(rls) - - -def create_dataloader(data_dir="data", batch_size=2, model_framework=None): - def _collate_fn(batch): - batch = default_collate(batch) - return batch - - dataset = create_evaluation_dataset(data_dir) - print(f"Data directory: {data_dir}") - return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) - - -def create_cali_dataloader(): - def _post_func(sampler): - return sampler - - dataloader = create_dataloader() - cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) - return cali_dataloader - - -def post_process(output): - import torch - import transformers - - if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): - pre = torch.argmax(output.logits, dim=-1) - else: - pre = torch.argmax(output, dim=-1) - return pre - - -def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): - dataloader = create_dataloader(data_dir, batch_size) - print(dataloader, "this is dataloader") - preds = [] - target = [] - sess = 
model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) - if model.framework == Framework.ONNX: - input_names = [i.name for i in sess.get_inputs()] - output_names = [o.name for o in sess.get_outputs()] - for inputs, labels in dataloader: - if isinstance(inputs, dict): - input_dict = {k: inputs[k].tolist() for k in inputs.keys()} - else: - inputs = inputs.tolist() - input_dict = dict(zip(input_names, [inputs])) - res = sess.run(input_feed=input_dict, output_names=None) - if len(output_names) == 1: - result = torch.Tensor(res[0]) - else: - result = torch.Tensor(res) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - elif model.framework == Framework.PYTORCH: - for inputs, labels in dataloader: - print(inputs, "this is inputs", labels, "this is labels") - if isinstance(inputs, dict): - result = sess(**inputs) - else: - result = sess(inputs) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - print(preds, "this is preds") - return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 4f60634f7c..83a75b7470 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -41,17 +41,17 @@ def setup(): # extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") -@pytest.mark.parametrize( - "olive_json", - ["perf_models/CamemBERT/cpu_config.json"], -) -def test_Camembert(olive_json): - print(olive_json) - from olive.workflows import run as olive_run +# @pytest.mark.parametrize( +# "olive_json", +# ["perf_models/CamemBERT/cpu_config.json"], +# ) +# def test_Camembert(olive_json): +# print(olive_json) +# from olive.workflows import run as olive_run - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - 
extract_best_models(footprint, "CamemBERT") +# olive_config = patch_config(olive_json) +# footprint = olive_run(olive_config) +# extract_best_models(footprint, "CamemBERT") # @pytest.mark.parametrize( @@ -66,30 +66,30 @@ def test_Camembert(olive_json): # extract_best_models(footprint, "bertweet-base-sentiment-analysis") -@pytest.mark.parametrize( - "olive_json", - ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], -) -def test_microsoft(olive_json): - print(olive_json) - from olive.workflows import run as olive_run - - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "microsoft-deberta-base-mnli") - - # @pytest.mark.parametrize( # "olive_json", -# ["perf_models/roberta-large-mnli/cpu_config.json"], +# ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], # ) -# def test_roberta_mnli(olive_json): +# def test_microsoft(olive_json): # print(olive_json) # from olive.workflows import run as olive_run # olive_config = patch_config(olive_json) # footprint = olive_run(olive_config) -# extract_best_models(footprint, "roberta-large-mnli") +# extract_best_models(footprint, "microsoft-deberta-base-mnli") + + +@pytest.mark.parametrize( + "olive_json", + ["perf_models/roberta-large-mnli/cpu_config.json"], +) +def test_roberta_mnli(olive_json): + print(olive_json) + from olive.workflows import run as olive_run + + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "roberta-large-mnli") # @pytest.mark.parametrize( From 8f3f6eac9f04c3142d3ace675a4a412ae6d4a7b1 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Tue, 25 Jul 2023 18:33:34 +0000 Subject: [PATCH 33/67] fixing merge conflict --- olive/data/component/dataset.py | 9 ++------- olive/data/component/load_dataset.py | 4 ++-- olive/data/component/pre_process_data.py | 8 ++++---- olive/passes/onnx/conversion.py | 3 +-- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git 
a/olive/data/component/dataset.py b/olive/data/component/dataset.py index f05eae19aa..5665f54442 100644 --- a/olive/data/component/dataset.py +++ b/olive/data/component/dataset.py @@ -19,23 +19,18 @@ class BaseDataset(Dataset): The data should be a list or dict of numpy arrays or torch tensors """ - def __init__(self, data, label_cols=None, max_samples=None, **kwargs): + def __init__(self, data, label_cols=None, **kwargs): """ This function is used to initialize the dataset """ self.data = data self.label_cols = label_cols or [] - self.max_samples = max_samples def __len__(self): """ This function is used to return the length of the dataset """ - num_samples = len(self.data) - if self.max_samples is not None: - # if max_samples is not None, return the min of num_samples and max_samples - num_samples = min(num_samples, self.max_samples) - return num_samples + return len(self.data) def __getitem__(self, index): data = {k: v for k, v in self.data[index].items() if k not in self.label_cols} diff --git a/olive/data/component/load_dataset.py b/olive/data/component/load_dataset.py index c646daf671..56e4e30604 100644 --- a/olive/data/component/load_dataset.py +++ b/olive/data/component/load_dataset.py @@ -23,7 +23,7 @@ def simple_dataset(input_data, label_cols=None, **kwargs): @Registry.register_dataset() -def huggingface_dataset(data_name=None, subset=None, split="validation", data_files=None, **kwargs): +def huggingface_dataset(data_name=None, subset=None, split="validation", **kwargs): """ This function is used to create a dataset from huggingface datasets """ @@ -34,7 +34,7 @@ def huggingface_dataset(data_name=None, subset=None, split="validation", data_fi from datasets import load_dataset assert data_name is not None, "Please specify the data name" - return load_dataset(path=data_name, name=subset, split=split, data_files=data_files, **kwargs) + return load_dataset(path=data_name, name=subset, split=split, **kwargs) @Registry.register_dataset() diff --git 
a/olive/data/component/pre_process_data.py b/olive/data/component/pre_process_data.py index 6c27fee5f5..0dc37f2520 100644 --- a/olive/data/component/pre_process_data.py +++ b/olive/data/component/pre_process_data.py @@ -43,7 +43,7 @@ def _huggingface_pre_precess_helper(dataset, model_name, input_cols, label_cols, @Registry.register_pre_process() -def huggingface_pre_process(_dataset, model_name, input_cols, label_cols, max_samples=None, **kwargs): +def huggingface_pre_process(_dataset, model_name, input_cols, label_cols, **kwargs): """Pre-process data. Args: @@ -73,11 +73,11 @@ def _tokenizer_and_align_labels(examples): _dataset, model_name, input_cols, label_cols, _tokenizer_and_align_labels, **kwargs ) # label_cols is ["label"] since we added label_cols[0] as "label" to tokenized_inputs - return BaseDataset(tokenized_datasets, label_cols=["label"], max_samples=max_samples) + return BaseDataset(tokenized_datasets, label_cols=["label"]) @Registry.register_pre_process() -def ner_huggingface_preprocess(_dataset, model_name, input_cols, label_cols, max_samples=None, **kwargs): +def ner_huggingface_preprocess(_dataset, model_name, input_cols, label_cols, **kwargs): """ Pre-process data for ner task. 
""" @@ -125,4 +125,4 @@ def _tokenizer_and_align_labels(examples): tokenized_datasets = _huggingface_pre_precess_helper( _dataset, model_name, input_cols, label_cols, _tokenizer_and_align_labels, **kwargs ) - return BaseDataset(tokenized_datasets, label_cols=["label"], max_samples=max_samples) + return BaseDataset(tokenized_datasets, label_cols=["label"]) diff --git a/olive/passes/onnx/conversion.py b/olive/passes/onnx/conversion.py index 2cc0b123de..4a7ad08ee6 100644 --- a/olive/passes/onnx/conversion.py +++ b/olive/passes/onnx/conversion.py @@ -118,8 +118,7 @@ def _run_for_config( if isinstance(dummy_inputs, dict): dummy_input_keys = set(dummy_inputs.keys()) unused_keys = dummy_input_keys - set(input_names) - if unused_keys: - logger.debug(f"Removing unused dummy inputs: {unused_keys}") + logger.debug(f"Unused dummy inputs: {unused_keys}") for key in unused_keys: del dummy_inputs[key] From 680037fee9fca1d54f146f81e2a460dda6294a1d Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Tue, 25 Jul 2023 18:51:27 +0000 Subject: [PATCH 34/67] fixing merge conflict --- olive/data/template.py | 4 ++-- olive/passes/onnx/conversion.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/olive/data/template.py b/olive/data/template.py index ec5258002b..f5c5e82f50 100644 --- a/olive/data/template.py +++ b/olive/data/template.py @@ -39,11 +39,11 @@ def huggingface_data_config_template(model_name, task, **kwargs) -> DataConfig: - `data_name`: str, data name in huggingface dataset, e.g.: "glue", "squad" - `subset`: str, subset of data, e.g.: "train", "validation", "test" - `split`: str, split of data, e.g.: "train", "validation", "test" - + - `data_files`: str | list | dict, path to source data file(s). 
- `input_cols`: list, input columns of data - `label_cols`: list, label columns of data - `batch_size`: int, batch size of data - + - `max_samples`: int, maximum number of samples in the dataset and other arguments in - olive.data.component.load_dataset.huggingface_dataset - olive.data.component.pre_process_data.huggingface_pre_process diff --git a/olive/passes/onnx/conversion.py b/olive/passes/onnx/conversion.py index 631c733f62..29db6b227b 100644 --- a/olive/passes/onnx/conversion.py +++ b/olive/passes/onnx/conversion.py @@ -118,7 +118,8 @@ def _run_for_config( if isinstance(dummy_inputs, dict): dummy_input_keys = set(dummy_inputs.keys()) unused_keys = dummy_input_keys - set(input_names) - logger.debug(f"Removing unused dummy inputs: {unused_keys}") + if unused_keys: + logger.debug(f"Removing unused dummy inputs: {unused_keys}") for key in unused_keys: del dummy_inputs[key] From 445f0000b29cc6484ec2352d06c0fb12b78933db Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Wed, 26 Jul 2023 23:24:54 +0000 Subject: [PATCH 35/67] setting up env variable --- .../olive-perf-monitoring-template.yaml | 4 +- .azure_pipelines/perfmonitoring-ci .yaml | 4 +- Makefile | 3 +- perf_monitoring/best_metrics.json | 2 +- .../perf_models/CamemBERT/cpu_config.json | 70 ------- .../perf_models/CamemBERT/user_script.py | 194 ------------------ .../perf_models/bert/bert_workflow_cpu.json | 78 ------- .../cpu_config.json | 79 ------- .../user_script.py | 154 -------------- .../cpu_config.json | 79 ------- .../main.py | 7 - .../user_script.py | 155 -------------- .../cpu_config.json | 78 ------- .../user_script.py | 158 -------------- .../roberta-large-mnli/cpu_config.json | 78 ------- .../roberta-large-mnli/user_script.py | 158 -------------- .../test_perf_monitoring_models_cpu.py | 40 ++-- perf_monitoring/utils.py | 9 +- scripts/perf_monitoring.bat | 3 +- scripts/perf_monitoring.sh | 4 +- 20 files changed, 31 insertions(+), 1326 deletions(-) delete mode 100644 
perf_monitoring/perf_models/CamemBERT/cpu_config.json delete mode 100644 perf_monitoring/perf_models/CamemBERT/user_script.py delete mode 100644 perf_monitoring/perf_models/bert/bert_workflow_cpu.json delete mode 100644 perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json delete mode 100644 perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py delete mode 100644 perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json delete mode 100644 perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/main.py delete mode 100644 perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py delete mode 100644 perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json delete mode 100644 perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py delete mode 100644 perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json delete mode 100644 perf_monitoring/perf_models/roberta-large-mnli/user_script.py diff --git a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml index 8a2df35e7c..c86d3a6768 100644 --- a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml +++ b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml @@ -29,13 +29,15 @@ jobs: inputs: azureSubscription: $(OLIVE_RG_SERVICE_CONNECTION) scriptLocation: 'inlineScript' - inlineScript: make perf-monitoring PIPELINE=True WINDOWS=$(WINDOWS) PERF_MONITORING_SCRIPT_NAME=$(perfMonitoringScriptName) PERF_MONITORING_SCRIPT_FUNCTION=$(perfMonitoringScriptFunction) + inlineScript: make perf-monitoring PIPELINE=True WINDOWS=$(WINDOWS) PERF_MONITORING_SCRIPT_NAME=$(perfMonitoringScriptName) displayName: performance monitoring env: OLIVEWHEELS_STORAGE_CONNECTION_STRING: $(olive-wheels-storage-connection-string) WORKSPACE_SUBSCRIPTION_ID: $(workspace-subscription-id) 
WORKSPACE_RESOURCE_GROUP: $(workspace-resource-group) WORKSPACE_NAME: $(workspace-name) + TEST_MODEL: $(testModel) + - task: ComponentGovernanceComponentDetection@0 inputs: diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index 088254df19..067ce4789d 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -45,8 +45,8 @@ jobs: examples: bert: perfMonitoringScriptName: perf_monitoring_models_cpu - perfMonitoringScriptFunction: test_bert + testModel: bert bertweet: perfMonitoringScriptName: perf_monitoring_models_cpu - perfMonitoringScriptFunction: test_bertweet + testModel: bertweet diff --git a/Makefile b/Makefile index 90a38569db..5a15c12c2c 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,6 @@ INSTALL_DEV_MODE ?= False EXAMPLE_FOLDER ?= EXAMPLE_NAME ?= PERF_MONITORING_SCRIPT_NAME ?= -PERF_MONITORING_SCRIPT_FUNCTION ?= INSTALL_EXTRAS ?= VERSION ?= ifeq ($(WINDOWS), True) @@ -59,4 +58,4 @@ clean: .PHONY: perf-monitoring perf-monitoring: - $(PERFORMANCE_MONITORING_CMD) $(PIPELINE) $(CURRENT_DIR) $(PERF_MONITORING_SCRIPT_NAME) $(PERF_MONITORING_SCRIPT_FUNCTION) + $(PERFORMANCE_MONITORING_CMD) $(PIPELINE) $(CURRENT_DIR) $(PERF_MONITORING_SCRIPT_NAME) diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index ada84d68be..bd5ed0a3f5 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1 +1 @@ -{"bert": [], "CamemBERT": [], "bertweet-base-sentiment-analysis": [1.0, -16.03689], "distilbert-base-uncased-finetuned-sst-2-english": [1.0, -8.61203], "microsoft-deberta-base-mnli": [1.0, -111.79317], "roberta-large-mnli": [1.0, -146.89287]} +{"bert": [], "CamemBERT": [], "bertweet-base-sentiment-analysis": [1.0, -16.03689], "distilbert-base-uncased-finetuned-sst-2-english": [1.0, -8.61203], "microsoft-deberta-base-mnli": [1.0, -111.79317], "roberta-large-mnli": [1.0, -146.89287], "cpu_models": [-96.3853, 
0.9933962225914001]} diff --git a/perf_monitoring/perf_models/CamemBERT/cpu_config.json b/perf_monitoring/perf_models/CamemBERT/cpu_config.json deleted file mode 100644 index 110800836e..0000000000 --- a/perf_monitoring/perf_models/CamemBERT/cpu_config.json +++ /dev/null @@ -1,70 +0,0 @@ -{ -"input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "Jean-Baptiste/camembert-ner", - "task": "ner", - "dataset": { - "data_name":"Jean-Baptiste/wikiner_fr", - "split": "test", - "input_cols": ["tokens"], - "label_cols": ["ner_tags"], - "batch_size": 1, - "max_samples": 10 - } - } - } -}, -"evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "sub_types": [ - { - "name": "accuracy_score", - "priority": 1, - "metric_config": { - "task": "multiclass", - "num_classes": "5", - "top_k": 1 - } - } - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } -}, -"passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - } -}, -"engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, - "log_severity_level": 0, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/camembert_workflow_cpu" -} -} diff --git a/perf_monitoring/perf_models/CamemBERT/user_script.py b/perf_monitoring/perf_models/CamemBERT/user_script.py deleted file mode 100644 index b0a8eb83d2..0000000000 --- a/perf_monitoring/perf_models/CamemBERT/user_script.py +++ /dev/null @@ -1,194 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import evaluate -import numpy -import torch -from datasets import load_dataset -from onnxruntime.quantization import CalibrationDataReader -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.dataloader import default_collate -from tqdm import tqdm -from transformers import AutoTokenizer, CamembertForTokenClassification - -from olive.constants import Framework - -# https://huggingface.co/Jean-Baptiste/camembert-ner -model_name = "Jean-Baptiste/camembert-ner" -dataset_name = "Jean-Baptiste/wikiner_fr" -split = "test" - - -class CalibrationDataLoader(CalibrationDataReader): - def __init__(self, dataloader, post_func, num_samplers=100): - self.dataloader = dataloader - self.iter = iter(dataloader) - self.post_func = post_func - self.counter = 0 - self.num_samplers = num_samplers - - def get_next(self): - if self.counter >= self.num_samplers: - return None - self.counter += 1 - if self.iter is None: - self.iter = iter(self.dataloader) - try: - return self.post_func(next(self.iter)) - except StopIteration: - return None - - def rewind(self): - self.iter = None - self.counter = 0 - - -# -------------------- model ------------------- -def load_model(model_path=None): - model = CamembertForTokenClassification.from_pretrained(model_name) - model = model.to("cpu") - return model - - -# -------------------- dataset ------------------- -def align_labels_with_tokens(labels, word_ids): - new_labels = [] - current_word = None - for word_id in word_ids: - if word_id != current_word: - # Start of a new word! 
- current_word = word_id - label = 0 if word_id is None else labels[word_id] - new_labels.append(label) - elif word_id is None: - # Special token - new_labels.append(0) - else: - # Same word as previous token - label = labels[word_id] - # If the label is B-XXX we change it to I-XXX - if label % 2 == 1: - label += 1 - new_labels.append(label) - - return new_labels - - -def tokenize_and_align_labels(examples): - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenized_inputs = tokenizer( - examples["tokens"], - truncation=True, - padding=True, - is_split_into_words=True, - add_special_tokens=False, - return_tensors="pt", - ) - all_labels = examples["ner_tags"] - new_labels = [] - for i, labels in enumerate(all_labels): - word_ids = tokenized_inputs.word_ids(i) - new_labels.append(align_labels_with_tokens(labels, word_ids)) - - tokenized_inputs["labels"] = torch.LongTensor(new_labels) - return tokenized_inputs - - -def create_evaluation_dataset(): - dataset = load_dataset(dataset_name, split=split) - tokenized_datasets = dataset.map( - tokenize_and_align_labels, - batched=True, - remove_columns=dataset.column_names, - ) - tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) - - class _Dateset(Dataset): - def __init__(self, dataset): - self.dataset = dataset - - def __getitem__(self, index): - return self.dataset[index], self.dataset[index]["labels"] - - def __len__(self): - return 5 - # return len(self.dataset) - - return _Dateset(tokenized_datasets) - - -def create_dataloader(data_dir="", batch_size=2, model_framework=None): - def _collate_fn(batch): - batch = default_collate(batch) - return batch - - dataset = create_evaluation_dataset() - return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) - - -def create_cali_dataloader(): - def _post_func(sampler): - return sampler - - dataloader = create_dataloader() - cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) - return 
cali_dataloader - - -# -------------------- post process ------------------- -def _convert_idx_to_ner_tags(labels): - id2label = {0: "O", 1: "I-LOC", 2: "I-PER", 3: "I-MISC", 4: "I-ORG"} - return [id2label[t.item()] for t in labels] - - -def post_process(model_output, model): - if model.framework == Framework.ONNX: - logits = model_output[0] - else: - logits = model_output.logits - predicted_token_class_ids = logits.argmax(-1) - predicted_tokens_classes = _convert_idx_to_ner_tags(predicted_token_class_ids[0]) - return predicted_tokens_classes - - -# -------------------- evaluations ------------------- -def _evaluate(pre, ref, computer_func=None): - if computer_func is None: - return None - return computer_func.compute(predictions=pre, references=ref) - - -def evaluate_accuracy_gpu(model, data_dir, batch_size, device="gpu"): - evaluate_accuracy(model, data_dir, batch_size, device=device, ep=None) - - -def evaluate_accuracy(model, data_dir, batch_size, device, ep): - prepared_model = model.prepare_session(inference_settings=None, device=device) - dataloader = create_dataloader(batch_size=batch_size) - seqeval = evaluate.load("seqeval") - - pre = [] - ref = [] - - for item in tqdm(dataloader): - for v in item[-1]: - ref.append(_convert_idx_to_ner_tags(v)) - - item = item[0] - if model.framework == Framework.ONNX: - input_ids = numpy.ascontiguousarray(item["input_ids"].cpu().numpy()) - attention_mask = numpy.ascontiguousarray(item["attention_mask"].cpu().numpy()) - input = {"input_ids": input_ids, "attention_mask": attention_mask} - ort_outputs = prepared_model.run(None, input) - outputs = post_process(ort_outputs, model) - pre.append(outputs) - - elif model.framework == Framework.PYTORCH: - with torch.no_grad(): - ort_outputs = prepared_model(input_ids=item["input_ids"], attention_mask=item["attention_mask"]) - outputs = post_process(ort_outputs, model) - pre.append(outputs) - _rls = _evaluate(pre, ref, seqeval) - rls = _rls["overall_accuracy"] - return rls diff --git 
a/perf_monitoring/perf_models/bert/bert_workflow_cpu.json b/perf_monitoring/perf_models/bert/bert_workflow_cpu.json deleted file mode 100644 index bff54468de..0000000000 --- a/perf_monitoring/perf_models/bert/bert_workflow_cpu.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "Intel/bert-base-uncased-mrpc", - "task": "text-classification", - "dataset": { - "data_name":"glue", - "subset": "mrpc", - "split": "validation", - "input_cols": ["sentence1", "sentence2"], - "label_cols": ["label"], - "batch_size": 1 - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/bert_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json deleted file mode 100644 index 0221f579fa..0000000000 --- a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/cpu_config.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - 
"input_model": { - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "finiteautomata/bertweet-base-sentiment-analysis", - "task": "text-classification", - "dataset": { - "data_name":"cardiffnlp/tweet_sentiment_multilingual", - "subset": "english", - "split": "test", - "input_cols": ["text"], - "label_cols": ["label"], - "batch_size": 1 - } - } - - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/bertweet_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py b/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py deleted file mode 100644 index cd1928b427..0000000000 --- a/perf_monitoring/perf_models/bertweet-base-sentiment-analysis/user_script.py +++ /dev/null @@ -1,154 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import torch -from datasets import load_dataset -from onnxruntime.quantization import CalibrationDataReader -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.dataloader import default_collate -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -from olive.constants import Framework -from olive.evaluator.accuracy import AccuracyScore -from olive.model import OliveModel - -# https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis -model_name = "finiteautomata/bertweet-base-sentiment-analysis" -# dataset_name = "mteb/tweet_sentiment_extraction" -dataset_name = "cardiffnlp/tweet_sentiment_multilingual" -subset = "english" -split = "test" - - -class CalibrationDataLoader(CalibrationDataReader): - def __init__(self, dataloader, post_func, num_samplers=100): - self.dataloader = dataloader - self.iter = iter(dataloader) - self.post_func = post_func - self.counter = 0 - self.num_samplers = num_samplers - - def get_next(self): - if self.counter >= self.num_samplers: - return None - self.counter += 1 - if self.iter is None: - self.iter = iter(self.dataloader) - try: - return self.post_func(next(self.iter)) - except StopIteration: - return None - - def rewind(self): - self.iter = None - self.counter = 0 - - -# -------------------- model ------------------- -def load_model(model_path=None): - model = AutoModelForSequenceClassification.from_pretrained(model_name) - model = model.to("cpu") - return model - - -# -------------------- dataset ------------------- -def tokenize_and_align_labels(examples): - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenized_inputs = tokenizer( - examples["text"], - truncation=True, - padding=True, - return_tensors="pt", - ) - tokenized_inputs["labels"] = examples["label"] - return tokenized_inputs - - -def create_evaluation_dataset(): - dataset = load_dataset(dataset_name, subset, split=split) - 
tokenized_datasets = dataset.map( - tokenize_and_align_labels, - batched=True, - remove_columns=dataset.column_names, - ) - tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) - - class _Dateset(Dataset): - def __init__(self, dataset): - self.dataset = dataset - - def __getitem__(self, index): - labels = self.dataset[index]["labels"] - inputs = {k: self.dataset[index][k] for k in self.dataset[index].keys() if k != "labels"} - return inputs, labels - # return self.dataset[index], self.dataset[index]["labels"] - - def __len__(self): - return 5 - # return len(self.dataset) - - return _Dateset(tokenized_datasets) - - -def create_dataloader(data_dir="", batch_size=2, model_framework=None): - def _collate_fn(batch): - batch = default_collate(batch) - return batch - - dataset = create_evaluation_dataset() - return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) - - -def create_cali_dataloader(): - def _post_func(sampler): - return sampler - - dataloader = create_dataloader() - cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) - return cali_dataloader - - -def post_process(output): - import torch - import transformers - - if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): - preds = torch.argmax(output.logits, dim=-1) - else: - preds = torch.argmax(output, dim=-1) - return preds - - -def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): - dataloader = create_dataloader(data_dir, batch_size) - preds = [] - target = [] - sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) - if model.framework == Framework.ONNX: - input_names = [i.name for i in sess.get_inputs()] - output_names = [o.name for o in sess.get_outputs()] - for inputs, labels in dataloader: - if isinstance(inputs, dict): - input_dict = {k: inputs[k].tolist() for k in inputs.keys()} - else: - inputs = 
inputs.tolist() - input_dict = dict(zip(input_names, [inputs])) - res = sess.run(input_feed=input_dict, output_names=None) - if len(output_names) == 1: - result = torch.Tensor(res[0]) - else: - result = torch.Tensor(res) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - elif model.framework == Framework.PYTORCH: - for inputs, labels in dataloader: - if isinstance(inputs, dict): - result = sess(**inputs) - else: - result = sess(inputs) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json deleted file mode 100644 index 50547b2094..0000000000 --- a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "distilbert-base-uncased-finetuned-sst-2-english", - "task": "text-classification", - "dataset": { - "data_name":"glue", - "subset": "sst2", - "split": "validation", - "input_cols": ["sentence"], - "label_cols": ["label"], - "batch_size": 1 - } - } - } - }, - - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - 
"quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/bert_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/main.py b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/main.py deleted file mode 100644 index ec02da659b..0000000000 --- a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/main.py +++ /dev/null @@ -1,7 +0,0 @@ -from olive.workflows import run as olive_run - -config = "./cpu_config.json" -# config = "./gpu_config.json" -config = "./aml_cpu_config.json" -rls = olive_run(config) -print(rls) diff --git a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py b/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py deleted file mode 100644 index 19a4efcbb6..0000000000 --- a/perf_monitoring/perf_models/distilbert-base-uncased-finetuned-sst-2-english/user_script.py +++ /dev/null @@ -1,155 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import torch -from datasets import load_dataset -from onnxruntime.quantization import CalibrationDataReader -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.dataloader import default_collate -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -from olive.constants import Framework -from olive.evaluator.accuracy import AccuracyScore -from olive.model import OliveModel - -# https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis -model_name = "distilbert-base-uncased-finetuned-sst-2-english" -dataset_name = "mteb/tweet_sentiment_extraction" -dataset_name = "glue" -subset = "sst2" -split = "validation" - - -class CalibrationDataLoader(CalibrationDataReader): - def __init__(self, dataloader, post_func, num_samplers=100): - self.dataloader = dataloader - self.iter = iter(dataloader) - self.post_func = post_func - self.counter = 0 - self.num_samplers = num_samplers - - def get_next(self): - if self.counter >= self.num_samplers: - return None - self.counter += 1 - if self.iter is None: - self.iter = iter(self.dataloader) - try: - return self.post_func(next(self.iter)) - except StopIteration: - return None - - def rewind(self): - self.iter = None - self.counter = 0 - - -# -------------------- model ------------------- -def load_model(model_path=None): - model = AutoModelForSequenceClassification.from_pretrained(model_name) - return model - - -# -------------------- dataset ------------------- -def tokenize_and_align_labels(examples): - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenized_inputs = tokenizer( - examples["sentence"], - truncation=True, - padding=True, - return_tensors="pt", - ) - # pre process - - tokenized_inputs["labels"] = examples["label"] - return tokenized_inputs - - -def create_evaluation_dataset(): - dataset = load_dataset(dataset_name, subset, split=split) - tokenized_datasets = dataset.map( - 
tokenize_and_align_labels, - batched=True, - remove_columns=dataset.column_names, - ) - tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) - - class _Dateset(Dataset): - def __init__(self, dataset): - self.dataset = dataset - - def __getitem__(self, index): - labels = self.dataset[index]["labels"] - inputs = {k: self.dataset[index][k] for k in self.dataset[index].keys() if k != "labels"} - return inputs, labels - # return self.dataset[index], self.dataset[index]["labels"] - - def __len__(self): - return 5 - # return len(self.dataset) - - return _Dateset(tokenized_datasets) - - -def create_dataloader(data_dir="", batch_size=2, model_framework=None): - def _collate_fn(batch): - batch = default_collate(batch) - return batch - - dataset = create_evaluation_dataset() - return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) - - -def create_cali_dataloader(): - def _post_func(sampler): - return sampler - - dataloader = create_dataloader() - cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) - return cali_dataloader - - -def post_process(output): - import torch - import transformers - - if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): - preds = torch.argmax(output.logits, dim=-1) - else: - preds = torch.argmax(output, dim=-1) - return preds - - -def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): - dataloader = create_dataloader(data_dir, batch_size) - preds = [] - target = [] - sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) - if model.framework == Framework.ONNX: - input_names = [i.name for i in sess.get_inputs()] - output_names = [o.name for o in sess.get_outputs()] - for inputs, labels in dataloader: - if isinstance(inputs, dict): - input_dict = {k: inputs[k].tolist() for k in inputs.keys()} - else: - inputs = inputs.tolist() - input_dict = 
dict(zip(input_names, [inputs])) - res = sess.run(input_feed=input_dict, output_names=None) - if len(output_names) == 1: - result = torch.Tensor(res[0]) - else: - result = torch.Tensor(res) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - elif model.framework == Framework.PYTORCH: - for inputs, labels in dataloader: - if isinstance(inputs, dict): - result = sess(**inputs) - else: - result = sess(inputs) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json deleted file mode 100644 index 35c24b3b11..0000000000 --- a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/cpu_config.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "microsoft/deberta-base-mnli", - "task": "text-classification", - "dataset": { - "data_name":"glue", - "subset": "mnli_matched", - "split": "validation", - "input_cols": ["sentence1", "sentence2"], - "label_cols": ["label"], - "batch_size": 1 - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" 
- } - }, - "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/deberta_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py b/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py deleted file mode 100644 index a121cf43c5..0000000000 --- a/perf_monitoring/perf_models/microsoft-deberta-base-mnli/user_script.py +++ /dev/null @@ -1,158 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import torch -from datasets import load_dataset -from onnxruntime.quantization import CalibrationDataReader -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.dataloader import default_collate -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -from olive.constants import Framework -from olive.evaluator.accuracy import AccuracyScore -from olive.model import OliveModel - -# https://huggingface.co/microsoft/deberta-base-mnli -model_name = "microsoft/deberta-base-mnli" -dataset_name = "glue" -subset = "mnli_matched" -split = "validation" - - -class CalibrationDataLoader(CalibrationDataReader): - def __init__(self, dataloader, post_func, num_samplers=100): - self.dataloader = dataloader - self.iter = iter(dataloader) - self.post_func = post_func - self.counter = 0 - self.num_samplers = num_samplers - - def get_next(self): - if self.counter >= self.num_samplers: - return None - self.counter += 1 - if self.iter is None: - self.iter = iter(self.dataloader) - try: - return self.post_func(next(self.iter)) - except StopIteration: - 
return None - - def rewind(self): - self.iter = None - self.counter = 0 - - -# -------------------- model ------------------- -def load_model(model_path=None): - model = AutoModelForSequenceClassification.from_pretrained(model_name) - return model - - -# -------------------- dataset ------------------- -def tokenize_and_align_labels(examples): - if isinstance(examples["label"], list): - label = list(map(lambda x: 2 - x, examples["label"])) - elif isinstance(examples["label"], int): - label = 2 - examples["label"] - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenized_inputs = tokenizer( - examples["premise"], - examples["hypothesis"], - truncation=True, - padding=True, - return_tensors="pt", - ) - # pre process - tokenized_inputs["labels"] = torch.LongTensor(label) - return tokenized_inputs - - -def create_evaluation_dataset(): - dataset = load_dataset(dataset_name, subset, split=split) - tokenized_datasets = dataset.map( - tokenize_and_align_labels, - batched=True, - remove_columns=dataset.column_names, - ) - tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) - - class _Dateset(Dataset): - def __init__(self, dataset): - self.dataset = dataset - - def __getitem__(self, index): - labels = self.dataset[index]["labels"] - inputs = {k: self.dataset[index][k] for k in self.dataset[index].keys() if k != "labels"} - return inputs, labels - # return self.dataset[index], self.dataset[index]["labels"] - - def __len__(self): - return 5 - # return len(self.dataset) - - return _Dateset(tokenized_datasets) - - -def create_dataloader(data_dir="", batch_size=2, model_framework=None): - def _collate_fn(batch): - batch = default_collate(batch) - return batch - - dataset = create_evaluation_dataset() - return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) - - -def create_cali_dataloader(): - def _post_func(sampler): - return sampler - - dataloader = create_dataloader() - cali_dataloader = 
CalibrationDataLoader(create_dataloader(dataloader, _post_func)) - return cali_dataloader - - -def post_process(output): - import torch - import transformers - - if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): - preds = torch.argmax(output.logits, dim=-1) - else: - preds = torch.argmax(output, dim=-1) - return preds - - -def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): - dataloader = create_dataloader(data_dir, batch_size) - preds = [] - target = [] - sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) - if model.framework == Framework.ONNX: - input_names = [i.name for i in sess.get_inputs()] - output_names = [o.name for o in sess.get_outputs()] - for inputs, labels in dataloader: - if isinstance(inputs, dict): - input_dict = {k: inputs[k].tolist() for k in inputs.keys()} - else: - inputs = inputs.tolist() - input_dict = dict(zip(input_names, [inputs])) - res = sess.run(input_feed=input_dict, output_names=None) - if len(output_names) == 1: - result = torch.Tensor(res[0]) - else: - result = torch.Tensor(res) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - elif model.framework == Framework.PYTORCH: - for inputs, labels in dataloader: - if isinstance(inputs, dict): - result = sess(**inputs) - else: - result = sess(inputs) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json b/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json deleted file mode 100644 index 6760514c0c..0000000000 --- a/perf_monitoring/perf_models/roberta-large-mnli/cpu_config.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "roberta-large-mnli", - "task": 
"text-classification", - "dataset": { - "data_name":"glue", - "subset": "mnli_matched", - "split": "validation", - "input_cols": ["premise"], - "label_cols": ["label"], - "batch_size": 1 - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/roberta_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/roberta-large-mnli/user_script.py b/perf_monitoring/perf_models/roberta-large-mnli/user_script.py deleted file mode 100644 index 48f5c64c79..0000000000 --- a/perf_monitoring/perf_models/roberta-large-mnli/user_script.py +++ /dev/null @@ -1,158 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import torch -from datasets import load_dataset -from onnxruntime.quantization import CalibrationDataReader -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.dataloader import default_collate -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -from olive.constants import Framework -from olive.evaluator.accuracy import AccuracyScore -from olive.model import OliveModel - -# https://huggingface.co/roberta-large-mnli -model_name = "roberta-large-mnli" -dataset_name = "glue" -subset = "mnli_matched" -split = "validation" - - -class CalibrationDataLoader(CalibrationDataReader): - def __init__(self, dataloader, post_func, num_samplers=100): - self.dataloader = dataloader - self.iter = iter(dataloader) - self.post_func = post_func - self.counter = 0 - self.num_samplers = num_samplers - - def get_next(self): - if self.counter >= self.num_samplers: - return None - self.counter += 1 - if self.iter is None: - self.iter = iter(self.dataloader) - try: - return self.post_func(next(self.iter)) - except StopIteration: - return None - - def rewind(self): - self.iter = None - self.counter = 0 - - -# -------------------- model ------------------- -def load_model(model_path=None): - model = AutoModelForSequenceClassification.from_pretrained(model_name) - return model - - -# -------------------- dataset ------------------- -def tokenize_and_align_labels(examples): - if isinstance(examples["label"], list): - label = list(map(lambda x: 2 - x, examples["label"])) - elif isinstance(examples["label"], int): - label = 2 - examples["label"] - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenized_inputs = tokenizer( - examples["premise"], - examples["hypothesis"], - truncation=True, - padding=True, - return_tensors="pt", - ) - # pre process - tokenized_inputs["labels"] = torch.LongTensor(label) - return tokenized_inputs - - -def 
create_evaluation_dataset(): - dataset = load_dataset(dataset_name, subset, split=split) - tokenized_datasets = dataset.map( - tokenize_and_align_labels, - batched=True, - remove_columns=dataset.column_names, - ) - tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) - - class _Dateset(Dataset): - def __init__(self, dataset): - self.dataset = dataset - - def __getitem__(self, index): - labels = self.dataset[index]["labels"] - inputs = {k: self.dataset[index][k] for k in self.dataset[index].keys() if k != "labels"} - return inputs, labels - # return self.dataset[index], self.dataset[index]["labels"] - - def __len__(self): - return 5 - # return len(self.dataset) - - return _Dateset(tokenized_datasets) - - -def create_dataloader(data_dir="", batch_size=2, model_framework=None): - def _collate_fn(batch): - batch = default_collate(batch) - return batch - - dataset = create_evaluation_dataset() - return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate_fn) - - -def create_cali_dataloader(): - def _post_func(sampler): - return sampler - - dataloader = create_dataloader() - cali_dataloader = CalibrationDataLoader(create_dataloader(dataloader, _post_func)) - return cali_dataloader - - -def post_process(output): - import torch - import transformers - - if isinstance(output, transformers.modeling_outputs.SequenceClassifierOutput): - preds = torch.argmax(output.logits, dim=-1) - else: - preds = torch.argmax(output, dim=-1) - return preds - - -def eval_accuracy(model: OliveModel, data_dir, batch_size, device, execution_providers): - dataloader = create_dataloader(data_dir, batch_size) - preds = [] - target = [] - sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) - if model.framework == Framework.ONNX: - input_names = [i.name for i in sess.get_inputs()] - output_names = [o.name for o in sess.get_outputs()] - for inputs, labels in dataloader: - if isinstance(inputs, 
dict): - input_dict = {k: inputs[k].tolist() for k in inputs.keys()} - else: - inputs = inputs.tolist() - input_dict = dict(zip(input_names, [inputs])) - res = sess.run(input_feed=input_dict, output_names=None) - if len(output_names) == 1: - result = torch.Tensor(res[0]) - else: - result = torch.Tensor(res) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - elif model.framework == Framework.PYTORCH: - for inputs, labels in dataloader: - if isinstance(inputs, dict): - result = sess(**inputs) - else: - result = sess(inputs) - outputs = post_process(result) - preds.extend(outputs.tolist()) - target.extend(labels.data.tolist()) - return AccuracyScore().measure(preds, target) diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 83a75b7470..8f32bcfa97 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -41,17 +41,18 @@ def setup(): # extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/CamemBERT/cpu_config.json"], -# ) -# def test_Camembert(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run +@pytest.mark.parametrize( + "olive_json", + [f"perf_models/cpu_models/{os.environ['TEST_MODEL']}_cpu_config.json"], +) +def test_models(olive_json): -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "CamemBERT") + print(olive_json) + from olive.workflows import run as olive_run + + olive_config = patch_config(olive_json) + footprint = olive_run(olive_config) + extract_best_models(footprint, "cpu_models") # @pytest.mark.parametrize( @@ -79,27 +80,14 @@ def setup(): # extract_best_models(footprint, "microsoft-deberta-base-mnli") -@pytest.mark.parametrize( - "olive_json", - 
["perf_models/roberta-large-mnli/cpu_config.json"], -) -def test_roberta_mnli(olive_json): - print(olive_json) - from olive.workflows import run as olive_run - - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, "roberta-large-mnli") - - # @pytest.mark.parametrize( # "olive_json", -# ["perf_models/roberta-large-openai-detector/cpu_config.json"], +# ["perf_models/roberta-large-mnli/cpu_config.json"], # ) -# def test_bert(olive_json): +# def test_roberta_mnli(olive_json): # print(olive_json) # from olive.workflows import run as olive_run # olive_config = patch_config(olive_json) # footprint = olive_run(olive_config) -# extract_best_models(footprint, "roberta-large-openai-detector") +# extract_best_models(footprint, "roberta-large-mnli") diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 2f9d84921b..e4ddeab94a 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -32,7 +32,11 @@ def patch_config(config_json_path: str): def extract_best_models(footprint, model_name): print("Footprint: ", footprint) footprint = list(footprint.values())[0] - metrics_of_interest = ["accuracy-accuracy", "latency-avg"] + print( + "Footprint: ", + footprint, + ) + metrics_of_interest = ["accuracy-accuracy", "latency-avg", "accuracy-accuracy_score"] # gather the metrics from all pareto frontier nodes all_metrics = [] # we iterate over the nodes in the pareto frontier @@ -42,7 +46,8 @@ def extract_best_models(footprint, model_name): for name in metrics_of_interest: # (value of metric * direction of comparison) # now higher is better for all metrics - metrics.append(node.metrics.value[name].value * node.metrics.cmp_direction[name]) + if name in node.metrics.value: + metrics.append(node.metrics.value[name].value * node.metrics.cmp_direction[name]) all_metrics.append(metrics) # sort the metrics # this sorts it diff --git a/scripts/perf_monitoring.bat b/scripts/perf_monitoring.bat index 
7171ffe65a..11b008d251 100644 --- a/scripts/perf_monitoring.bat +++ b/scripts/perf_monitoring.bat @@ -7,7 +7,6 @@ REM -------------------------------------------------------------------------- set PIPELINE=%1 set ROOT_DIR=%2 set PERF_MONITORING_SCRIPT_NAME=%3 -set PERF_MONITORING_SCRIPT_FUNCTION=%4 if "%PIPELINE%"=="True" ( call olive-venv\\Scripts\\activate.bat || goto :error @@ -21,7 +20,7 @@ call echo "performance monitoring examples" call python -m pip install -r %ROOT_DIR%\\perf_monitoring\\requirements.txt || goto :error call python -m pytest -v -s --log-cli-level=WARNING --junitxml=%ROOT_DIR%\\logs\\performance-monitoring-TestOlive.xml^ - %ROOT_DIR%\\perf_monitoring\\test_%PERF_MONITORING_SCRIPT_NAME%.py::%PERF_MONITORING_SCRIPT_FUNCTION% || goto :error + %ROOT_DIR%\\perf_monitoring\\test_%PERF_MONITORING_SCRIPT_NAME%.py || goto :error goto :EOF diff --git a/scripts/perf_monitoring.sh b/scripts/perf_monitoring.sh index 4e06e0b10b..9ae4e00003 100644 --- a/scripts/perf_monitoring.sh +++ b/scripts/perf_monitoring.sh @@ -8,7 +8,7 @@ set -eoux pipefail PIPELINE=$1 ROOT_DIR=$2 PERF_MONITORING_SCRIPT_NAME=$3 -PERF_MONITORING_SCRIPT_FUNCTION=$4 + echo $PIPELINE if [[ "$PIPELINE" == "True" ]]; then @@ -24,4 +24,4 @@ python -m pip install pytest echo "performance monitoring examples" python -m pip install -r $ROOT_DIR/perf_monitoring/requirements.txt -python -m pytest -v -s --log-cli-level=WARNING --junitxml=$ROOT_DIR/logs/performance-monitoring-TestOlive.xml $ROOT_DIR/perf_monitoring/test_$PERF_MONITORING_SCRIPT_NAME.py::$PERF_MONITORING_SCRIPT_FUNCTION +python -m pytest -v -s --log-cli-level=WARNING --junitxml=$ROOT_DIR/logs/performance-monitoring-TestOlive.xml $ROOT_DIR/perf_monitoring/test_$PERF_MONITORING_SCRIPT_NAME.py From 8e7e21a205a89ab66c283f1274a6ac4b73d3181d Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Wed, 26 Jul 2023 23:50:40 +0000 Subject: [PATCH 36/67] adding extra files --- .azure_pipelines/olive-ci.yaml | 9 --- 
.azure_pipelines/perfmonitoring-ci .yaml | 6 +- .../cpu_models/bert_cpu_config.json | 78 ++++++++++++++++++ .../cpu_models/bertweet_cpu_config.json | 79 +++++++++++++++++++ .../cpu_models/camembert_cpu_config.json | 70 ++++++++++++++++ .../cpu_models/distilbert_cpu_config.json | 79 +++++++++++++++++++ .../microsoft-deberta_cpu_config.json | 79 +++++++++++++++++++ .../cpu_models/roberta_cpu_config.json | 79 +++++++++++++++++++ 8 files changed, 469 insertions(+), 10 deletions(-) create mode 100644 perf_monitoring/perf_models/cpu_models/bert_cpu_config.json create mode 100644 perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json create mode 100644 perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json create mode 100644 perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json create mode 100644 perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json create mode 100644 perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json diff --git a/.azure_pipelines/olive-ci.yaml b/.azure_pipelines/olive-ci.yaml index 251325f259..0a9e111c20 100644 --- a/.azure_pipelines/olive-ci.yaml +++ b/.azure_pipelines/olive-ci.yaml @@ -91,12 +91,3 @@ jobs: display_name: Test Build Docs pool: $(OLIVE_POOL_UBUNTU2004) publish_docs: false - -- template: job_templates/olive-perf-monitoring-template.yaml - parameters: - name: Windows_CI - pool: $(OLIVE_POOL_WIN2019) - windows: True - examples: - bert: - perfMonitoringScriptName: perf_monitoring_bert_cpu.py diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index 067ce4789d..e0fd8b72f8 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -35,7 +35,11 @@ jobs: examples: bert: perfMonitoringScriptName: perf_monitoring_models_cpu - perfMonitoringScriptFunction: test_bert + testModel: bert + + bertweet: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: bertweet - template: 
job_templates/olive-perf-monitoring-template.yaml parameters: diff --git a/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json new file mode 100644 index 0000000000..bff54468de --- /dev/null +++ b/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json @@ -0,0 +1,78 @@ +{ + "input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "Intel/bert-base-uncased-mrpc", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "mrpc", + "split": "validation", + "input_cols": ["sentence1", "sentence2"], + "label_cols": ["label"], + "batch_size": 1 + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "sub_types": [ + {"name": "accuracy", "priority": 1} + + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/bert_workflow_cpu" + } +} diff --git a/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json b/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json new file mode 100644 index 0000000000..0221f579fa --- /dev/null +++ 
b/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json @@ -0,0 +1,79 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "finiteautomata/bertweet-base-sentiment-analysis", + "task": "text-classification", + "dataset": { + "data_name":"cardiffnlp/tweet_sentiment_multilingual", + "subset": "english", + "split": "test", + "input_cols": ["text"], + "label_cols": ["label"], + "batch_size": 1 + } + } + + } + }, + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "sub_types": [ + {"name": "accuracy", "priority": 1} + + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/bertweet_workflow_cpu" + } +} diff --git a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json new file mode 100644 index 0000000000..74e8a55b98 --- /dev/null +++ b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json @@ -0,0 +1,70 @@ +{ + "input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "Jean-Baptiste/camembert-ner", + "task": "ner", + "dataset": { + 
"data_name":"Jean-Baptiste/wikiner_fr", + "split": "test", + "input_cols": ["tokens"], + "label_cols": ["ner_tags"], + "batch_size": 1, + "max_samples": 10 + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "sub_types": [ + { + "name": "accuracy_score", + "priority": 1, + "metric_config": { + "task": "multiclass", + "num_classes": "5", + "top_k": 1 + } + } + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } + }, + "log_severity_level": 0, + "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/bert_workflow_cpu" + } +} diff --git a/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json new file mode 100644 index 0000000000..50547b2094 --- /dev/null +++ b/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json @@ -0,0 +1,79 @@ +{ + "input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "distilbert-base-uncased-finetuned-sst-2-english", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "sst2", + "split": "validation", + "input_cols": ["sentence"], + "label_cols": ["label"], + "batch_size": 1 + } + } + } + }, + + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "sub_types": [ + {"name": "accuracy", "priority": 1} + + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] + } + ] + } + }, + 
"passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/bert_workflow_cpu" + } +} diff --git a/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json b/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json new file mode 100644 index 0000000000..cff81b5491 --- /dev/null +++ b/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json @@ -0,0 +1,79 @@ +{ + "input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "microsoft/deberta-base-mnli", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "mnli_matched", + "split": "validation", + "input_cols": ["premise"], + "label_cols": ["label"], + "batch_size": 1, + "max_samples": 10 + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "sub_types": [ + {"name": "accuracy", "priority": 1} + + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false 
+ } + }, + "quantization": { + "type": "OnnxQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } + }, + "evaluator": "common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/deberta_workflow_cpu" + } +} diff --git a/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json b/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json new file mode 100644 index 0000000000..5c706ac4b2 --- /dev/null +++ b/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json @@ -0,0 +1,79 @@ +{ + "input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "roberta-large-mnli", + "task": "text-classification", + "dataset": { + "data_name":"glue", + "subset": "mnli_matched", + "split": "validation", + "input_cols": ["premise"], + "label_cols": ["label"], + "batch_size": 1, + "max_samples": 10 + } + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics":[ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "sub_types": [ + {"name": "accuracy", "priority": 1} + + ] + }, + { + "name": "latency", + "type": "latency", + "sub_types": [ + {"name": "avg", "priority": 2} + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "config": { + "target_opset": 13 + } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" + } + }, + "engine": { + "search_strategy": { + "execution_order": "joint", + "search_algorithm": "tpe", + "search_algorithm_config": { + "num_samples": 3, + "seed": 0 + } + }, + "evaluator": 
"common_evaluator", + "execution_providers": ["CPUExecutionProvider"], + "cache_dir": "cache", + "output_dir" : "models/roberta_workflow_cpu" + } +} From bd4dd28ee8bdaad50e5b4922b97d39993840cdd9 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Wed, 26 Jul 2023 23:52:20 +0000 Subject: [PATCH 37/67] adding extra files --- perf_monitoring/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index e4ddeab94a..a698a02d5e 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -36,7 +36,7 @@ def extract_best_models(footprint, model_name): "Footprint: ", footprint, ) - metrics_of_interest = ["accuracy-accuracy", "latency-avg", "accuracy-accuracy_score"] + metrics_of_interest = ["accuracy-accuracy", "accuracy-accuracy_score", "latency-avg"] # gather the metrics from all pareto frontier nodes all_metrics = [] # we iterate over the nodes in the pareto frontier From f7dec68cbc3a1af830195199801e4b82333bf3e9 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 27 Jul 2023 00:11:11 +0000 Subject: [PATCH 38/67] adding extra files --- .../olive-perf-monitoring-template.yaml | 7 ------- .azure_pipelines/perfmonitoring-ci .yaml | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml index c86d3a6768..25fd4e84b4 100644 --- a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml +++ b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml @@ -53,13 +53,6 @@ jobs: testRunTitle: '$(Build.BuildNumber)[$(Agent.JobName)]' displayName: Upload pipeline run test results - - task: PublishPipelineArtifact@1 - inputs: - path: $(Build.SourcesDirectory)/perf_monitoring_results - artifactName: best_metrics - artifactType: pipeline - displayName: Publish models - - script: make clean WINDOWS=$(WINDOWS) condition: 
always() displayName: Clean remaining artifacts diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index e0fd8b72f8..5b0c906624 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -54,3 +54,18 @@ jobs: bertweet: perfMonitoringScriptName: perf_monitoring_models_cpu testModel: bertweet + camembert: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: camembert + + distilbert: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: distilbert + + microsoft-deberta: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: microsoft-deberta + + roberta: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: roberta From 812a264441e12da8dff6b7f8a4bebfb8e3412180 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 27 Jul 2023 00:13:29 +0000 Subject: [PATCH 39/67] adding extra files --- .azure_pipelines/perfmonitoring-ci .yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index 5b0c906624..3db278344a 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -40,6 +40,21 @@ jobs: bertweet: perfMonitoringScriptName: perf_monitoring_models_cpu testModel: bertweet + camembert: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: camembert + + distilbert: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: distilbert + + microsoft-deberta: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: microsoft-deberta + + roberta: + perfMonitoringScriptName: perf_monitoring_models_cpu + testModel: roberta - template: job_templates/olive-perf-monitoring-template.yaml parameters: From 754a8898fb2116962da019f4e7ae12a5f493015b Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 27 Jul 2023 17:44:53 +0000 Subject: [PATCH 40/67] cleaning 
up code --- perf_monitoring/best_metrics.json | 29 +++++++- .../test_perf_monitoring_models_cpu.py | 74 +------------------ perf_monitoring/utils.py | 33 ++++++++- 3 files changed, 62 insertions(+), 74 deletions(-) diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index bd5ed0a3f5..f1cae41c64 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1 +1,28 @@ -{"bert": [], "CamemBERT": [], "bertweet-base-sentiment-analysis": [1.0, -16.03689], "distilbert-base-uncased-finetuned-sst-2-english": [1.0, -8.61203], "microsoft-deberta-base-mnli": [1.0, -111.79317], "roberta-large-mnli": [1.0, -146.89287], "cpu_models": [-96.3853, 0.9933962225914001]} +{ + "bert": [], + "CamemBERT": [], + "bertweet-base-sentiment-analysis": [ + 1.0, + -16.03689 + ], + "distilbert-base-uncased-finetuned-sst-2-english": [ + 1.0, + -8.61203 + ], + "microsoft-deberta-base-mnli": [ + 1.0, + -111.79317 + ], + "roberta-large-mnli": [ + 1.0, + -146.89287 + ], + "cpu_models": [ + -96.3853, + 0.9933962225914001 + ], + "bertweet": [ + 0.735632183908046, + -14.61082 + ] +} diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 8f32bcfa97..63eaf85051 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -15,79 +15,13 @@ def setup(): os.chdir(cur_dir) -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/bert/bert_workflow_cpu.json"], -# ) -# def test_bert(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run - -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "bert") - - -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/distilbert-base-uncased-finetuned-sst-2-english/cpu_config.json"], -# ) -# def test_distilbert(olive_json): -# print(olive_json) -# from olive.workflows import run as 
olive_run - -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "distilbert-base-uncased-finetuned-sst-2-english") - - -@pytest.mark.parametrize( - "olive_json", - [f"perf_models/cpu_models/{os.environ['TEST_MODEL']}_cpu_config.json"], -) -def test_models(olive_json): +def test_models(): + model_name = os.environ["TEST_MODEL"] + olive_json = f"perf_models/cpu_models/{model_name}_cpu_config.json" print(olive_json) from olive.workflows import run as olive_run olive_config = patch_config(olive_json) footprint = olive_run(olive_config) - extract_best_models(footprint, "cpu_models") - - -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/bertweet-base-sentiment-analysis/cpu_config.json"], -# ) -# def test_bertweet(olive_json): -# from olive.workflows import run as olive_run - -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "bertweet-base-sentiment-analysis") - - -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/microsoft-deberta-base-mnli/cpu_config.json"], -# ) -# def test_microsoft(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run - -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "microsoft-deberta-base-mnli") - - -# @pytest.mark.parametrize( -# "olive_json", -# ["perf_models/roberta-large-mnli/cpu_config.json"], -# ) -# def test_roberta_mnli(olive_json): -# print(olive_json) -# from olive.workflows import run as olive_run - -# olive_config = patch_config(olive_json) -# footprint = olive_run(olive_config) -# extract_best_models(footprint, "roberta-large-mnli") + extract_best_models(footprint, model_name) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index a698a02d5e..611c628da4 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -75,20 +75,47 @@ def compare_metrics(best_metrics, 
model_name): model_data = data[model_name] if len(model_data) == 0: print("No data in best_metrics.json") - return {"accuracy": True, "latency": True} + return {"accuracy": True, "latency": True, "accuracy_percentage_change": 0, "latency_percentage_change": 0} print(model_data[0], model_data[1]) print(best_metrics[0], best_metrics[1]) + + accuracy_percentage_change = ((best_metrics[0] - model_data[0]) / model_data[0]) * 100 + latency_percentage_change = ((best_metrics[1] - model_data[1]) / model_data[1]) * 100 + comparison_result = { "accuracy": no_regression(best_metrics[0], model_data[0], 0.05), "latency": no_regression(best_metrics[1], model_data[1], 0.05), + "accuracy_percentage_change": accuracy_percentage_change, + "latency_percentage_change": latency_percentage_change, + "accuracy_better": "same" + if accuracy_percentage_change == 0 + else "higher" + if accuracy_percentage_change > 0 + else "lower", + "latency_better": "same" + if latency_percentage_change == 0 + else "lower" + if latency_percentage_change > 0 + else "higher", } + + with open("model_output.txt", "w") as f: + f.write(f"Accuracy percentage change: {accuracy_percentage_change}\n") + f.write(f"Latency percentage change: {latency_percentage_change}\n") + f.write(f"Is accuracy better?: {comparison_result['accuracy_better']}\n") + f.write(f"Is latency better?: {comparison_result['latency_better']}\n") else: print(f"{model_name} not found in best_metrics.json, creating new entry...") data[model_name] = best_metrics - comparison_result = {"accuracy": True, "latency": True} + comparison_result = { + "accuracy": True, + "latency": True, + "accuracy_percentage_change": 0, + "latency_percentage_change": 0, + } # Save the updated data back to the file with open("best_metrics.json", "w") as f: - json.dump(data, f) + json.dump(data, f, indent=4) return comparison_result From 54d9b1a2594a41f49a2493bfab54ecc47226f6ca Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 27 Jul 2023 18:02:28 +0000 
Subject: [PATCH 41/67] cleaning up pipeline --- .../olive-perf-monitoring-template.yaml | 2 +- .azure_pipelines/perfmonitoring-ci .yaml | 12 ------------ perf_monitoring/utils.py | 17 ----------------- 3 files changed, 1 insertion(+), 30 deletions(-) diff --git a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml index 25fd4e84b4..4e14107380 100644 --- a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml +++ b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml @@ -29,7 +29,7 @@ jobs: inputs: azureSubscription: $(OLIVE_RG_SERVICE_CONNECTION) scriptLocation: 'inlineScript' - inlineScript: make perf-monitoring PIPELINE=True WINDOWS=$(WINDOWS) PERF_MONITORING_SCRIPT_NAME=$(perfMonitoringScriptName) + inlineScript: make perf-monitoring PIPELINE=True WINDOWS=$(WINDOWS) PERF_MONITORING_SCRIPT_NAME=perf_monitoring_models_cpu displayName: performance monitoring env: OLIVEWHEELS_STORAGE_CONNECTION_STRING: $(olive-wheels-storage-connection-string) diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml index 3db278344a..8ddd605c77 100644 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ b/.azure_pipelines/perfmonitoring-ci .yaml @@ -34,26 +34,20 @@ jobs: windows: True examples: bert: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: bert bertweet: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: bertweet camembert: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: camembert distilbert: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: distilbert microsoft-deberta: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: microsoft-deberta roberta: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: roberta - template: job_templates/olive-perf-monitoring-template.yaml @@ -63,24 +57,18 @@ jobs: windows: False examples: bert: - 
perfMonitoringScriptName: perf_monitoring_models_cpu testModel: bert bertweet: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: bertweet camembert: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: camembert distilbert: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: distilbert microsoft-deberta: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: microsoft-deberta roberta: - perfMonitoringScriptName: perf_monitoring_models_cpu testModel: roberta diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 611c628da4..b7e4657091 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -1,23 +1,6 @@ import json -def check_search_output(footprints): - """Check if the search output is valid.""" - assert footprints, "footprints is empty. The search must have failed for all accelerator specs." - for footprint in footprints.values(): - assert footprint.nodes - for v in footprint.nodes.values(): - assert all([metric_result.value > 0 for metric_result in v.metrics.value.values()]) - - -def check_no_search_output(outputs): - assert outputs, "outputs is empty. The run must have failed for all accelerator specs." 
- for output in outputs.values(): - output_metrics = output["metrics"] - for item in output_metrics.values(): - assert item.value > 0 - - def patch_config(config_json_path: str): """Load the config json file and patch it with default search algorithm (exhaustive)""" with open(config_json_path, "r") as fin: From 4465d28944bdc4633c85d5f3fa9b872535c667ea Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 27 Jul 2023 23:43:36 +0000 Subject: [PATCH 42/67] final changes made --- .azure_pipelines/perfmonitoring-ci .yaml | 74 ----------- perf_monitoring/best_metrics.json | 16 +-- .../cpu_models/camembert_cpu_config.json | 2 +- .../cpu_models/distilbert_cpu_config.json | 2 +- perf_monitoring/readme.md | 116 +++++++++++++++++- perf_monitoring/requirements.txt | 4 - .../test_perf_monitoring_models_cpu.py | 1 - perf_monitoring/utils.py | 22 +--- 8 files changed, 121 insertions(+), 116 deletions(-) delete mode 100644 .azure_pipelines/perfmonitoring-ci .yaml diff --git a/.azure_pipelines/perfmonitoring-ci .yaml b/.azure_pipelines/perfmonitoring-ci .yaml deleted file mode 100644 index 8ddd605c77..0000000000 --- a/.azure_pipelines/perfmonitoring-ci .yaml +++ /dev/null @@ -1,74 +0,0 @@ -trigger: - branches: - include: - - main - paths: - exclude: - - docs/* - - examples/README.md - - examples/**/README.md - - README.md - - CONTRIBUTING.md - - LICENSE -pr: - branches: - include: - - main - paths: - exclude: - - docs/* - - examples/README.md - - examples/**/README.md - - README.md - - CONTRIBUTING.md - - LICENSE - -variables: - ComponentDetection.Timeout: 2400 - -jobs: -- template: job_templates/olive-perf-monitoring-template.yaml - parameters: - name: Windows_CI - pool: $(OLIVE_POOL_WIN2019) - windows: True - examples: - bert: - testModel: bert - - bertweet: - testModel: bertweet - camembert: - testModel: camembert - - distilbert: - testModel: distilbert - - microsoft-deberta: - testModel: microsoft-deberta - - roberta: - testModel: roberta - -- template: 
job_templates/olive-perf-monitoring-template.yaml - parameters: - name: Linux_CI - pool: $(OLIVE_POOL_UBUNTU2004) - windows: False - examples: - bert: - testModel: bert - - bertweet: - testModel: bertweet - camembert: - testModel: camembert - - distilbert: - testModel: distilbert - - microsoft-deberta: - testModel: microsoft-deberta - - roberta: - testModel: roberta diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index f1cae41c64..f057b9ea94 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1,26 +1,18 @@ { "bert": [], - "CamemBERT": [], - "bertweet-base-sentiment-analysis": [ - 1.0, - -16.03689 - ], - "distilbert-base-uncased-finetuned-sst-2-english": [ + "camemBERT": [], + "distilbert": [ 1.0, -8.61203 ], - "microsoft-deberta-base-mnli": [ + "microsoft-deberta": [ 1.0, -111.79317 ], - "roberta-large-mnli": [ + "roberta": [ 1.0, -146.89287 ], - "cpu_models": [ - -96.3853, - 0.9933962225914001 - ], "bertweet": [ 0.735632183908046, -14.61082 diff --git a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json index 74e8a55b98..2d73fed4bf 100644 --- a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json @@ -65,6 +65,6 @@ "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", - "output_dir" : "models/bert_workflow_cpu" + "output_dir" : "models/camembert_workflow_cpu" } } diff --git a/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json index 50547b2094..8b2d068cc0 100644 --- a/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json @@ -74,6 +74,6 @@ "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], 
"cache_dir": "cache", - "output_dir" : "models/bert_workflow_cpu" + "output_dir" : "models/distilbert_workflow_cpu" } } diff --git a/perf_monitoring/readme.md b/perf_monitoring/readme.md index 7e7e785873..ae6a9f458b 100644 --- a/perf_monitoring/readme.md +++ b/perf_monitoring/readme.md @@ -1,7 +1,111 @@ -## Extracting best models +# Performance Monitoring in Azure DevOps with Olive --The file named testtt.ipynb is a place holder file to test and see the structure of my outputs so i could write my script --The file named best_models.py is the script that extracts the best models from the output of the script and compared with the metrics of the original model, if the metrics of the best model is better than the original model, the best model is saved in a json named best_metrics.json. --I added a function called extract best models in my utils file to extract the best models from the output of the script and compared with the metrics of the original model, if the metrics of the best model is better than the original model, the best model is saved in a json named best_metrics.json. -### How to run the script -Use pytest -s -v to run the script with the script name. +Contains Python scripts and Azure DevOps YAML files for performance monitoring of Olive models. The scripts and YAML files help you compare the performance of different models and ensure no regression occurs over time. + +## Contents + +### Azure DevOps YAML Files + +The YAML files define Azure DevOps pipelines for automated testing and performance monitoring. + +### -olive-perf-monitoring-template.yaml + +This YAML file defines a pipeline template for performance monitoring of the models. It uses Python 3.8, installs Olive, runs the performance monitoring script, detects and registers components, publishes test results, and cleans up. + +### -perfmonitoring-ci.yaml + +This YAML file defines a CI pipeline triggered on changes to the main branch, excluding changes only to documentation and README files. 
It runs the performance monitoring template for several models on both Windows and Linux environments. + +### Python Scripts +The Python scripts utils.py and test_perf_monitoring.py perform the main tasks of model performance comparison. They load model configurations, run the models, and compare the performance metrics. + +The utils.py script contains several utility functions, including functions to: + +-Patch the model configuration JSON file + +-Extract the best metrics from the model's performance footprint + +-Compare the metrics against previous best metrics + +-Assert whether performance has not regressed + +-The test_perf_monitoring.py script uses pytest to set up a testing environment, run the model, and check the results. + +### -Bash and Batch Scripts +Two additional scripts, perf_monitoring.sh for Unix-like systems and perf_monitoring.bat for Windows, are used for the performance monitoring tasks. They perform the following tasks: + +-Set environment variables based on pipeline parameters + +-Activate the Python virtual environment if in a pipeline + +-Install pytest + +-Install the necessary Python packages for performance monitoring + +-Run the performance monitoring Python script with pytest and capture the results + + +## Models + +The performance monitoring setup utilizes the following models, each configured via their respective JSON files: + +1. **BERT** (`bert_cpu_config.json`): BERT (Bidirectional Encoder Representations from Transformers) is a state-of-the-art transformer model for a wide range of NLP tasks. + +2. **RoBERTa** (`roberta_cpu_config.json`): RoBERTa is a variant of BERT that uses a different training approach for improved performance. + +3. **DeBERTa** (`microsoft-deberta_cpu_config.json`): DeBERTa (Decoding-enhanced BERT with disentangled attention) improves the BERT and RoBERTa models through a two-step disentangled attention mechanism. + +4. 
**CamemBERT** (`camembert_cpu_config.json`): A BERT-based model specifically trained for French language tasks. + +5. **DistilBERT** (`distilbert_cpu_config.json`): DistilBERT is a smaller, faster, cheaper, lighter version of BERT, trained using knowledge distillation. + +6. **BERTweet** (`bertweet_cpu_config.json`): BERTweet is a BERT-based model specifically fine-tuned for English Twitter sentiment analysis tasks. + +Each JSON file contains configurations for the input model, evaluators, passes, and the engine. + +The **input_model** section specifies the type of model (PyTorchModel in these cases), and the model's configuration, including the Hugging Face (hf) configuration for the model name, task, and dataset. + +The **evaluators** section defines the metrics to be used for evaluation, such as accuracy and latency. + +The **passes** section includes the type and configuration of optimization and conversion processes to be performed on the model. + +The **engine** section specifies the search strategy for performance tuning, the evaluator to use, the execution providers, and the directories for caching and output. + + +## USAGE (Locally and on CI) + +**On CI Pipeline** + +Set up your Python environment with Python 3.8 and the necessary packages. + +Define the necessary environment variables, pools, and connection strings in your Azure DevOps environment. + +Adjust the paths and model names in the Python scripts and YAML files to match your specific requirements. + +Manually start the pipelines, or push a change to your repository to trigger them automatically. + +Check the results in the Azure DevOps portal. 
+ +**Running the Scripts Locally** + +After setting up your environment and familiarizing yourself with the configuration files, you can run the performance monitoring script: + +```bash +python -m pytest -v -s test_perf_monitoring_models_cpu.py +``` + +This command starts pytest, which runs all the test cases in the test_perf_monitoring_models_cpu.py script. + + +## Comparison Metrics and Process +Performance comparison takes into account the following metrics: + +**Accuracy** - Calculated as the number of correct predictions divided by the total number of predictions. + +**Latency** - Measured in seconds, calculated as the time taken to run the model on a specific test set. + +The metrics from each model run are compared with a set of reference metrics stored in a **best_metrics.json** file using the **compared_metrics** function in **utils.py**. This function calculates the percentage change in each metric relative to the stored "best" metrics. + +If the function detects a regression (increase in latency or decrease in accuracy) exceeding a predefined threshold, it raises an error, causing the Azure pipeline to fail. + +The **best_metrics.json** file is updated with the new metrics if the function does not detect a regression. 
diff --git a/perf_monitoring/requirements.txt b/perf_monitoring/requirements.txt index 0aa2a6f1d3..4c661a4e84 100644 --- a/perf_monitoring/requirements.txt +++ b/perf_monitoring/requirements.txt @@ -1,10 +1,6 @@ -azure-ai-ml -azure-identity datasets evaluate -docker onnxruntime -neural-compressor scipy scikit-learn transformers diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 63eaf85051..673a791844 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -18,7 +18,6 @@ def setup(): def test_models(): model_name = os.environ["TEST_MODEL"] olive_json = f"perf_models/cpu_models/{model_name}_cpu_config.json" - print(olive_json) from olive.workflows import run as olive_run diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index b7e4657091..4f5abeb97f 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -63,30 +63,19 @@ def compare_metrics(best_metrics, model_name): print(best_metrics[0], best_metrics[1]) accuracy_percentage_change = ((best_metrics[0] - model_data[0]) / model_data[0]) * 100 - latency_percentage_change = ((best_metrics[1] - model_data[1]) / model_data[1]) * 100 + latency_percentage_change = -((best_metrics[1] - model_data[1]) / model_data[1]) * 100 comparison_result = { "accuracy": no_regression(best_metrics[0], model_data[0], 0.05), "latency": no_regression(best_metrics[1], model_data[1], 0.05), "accuracy_percentage_change": accuracy_percentage_change, "latency_percentage_change": latency_percentage_change, - "accuracy_better": "same" - if accuracy_percentage_change == 0 - else "higher" - if accuracy_percentage_change > 0 - else "lower", - "latency_better": "same" - if latency_percentage_change == 0 - else "lower" - if latency_percentage_change > 0 - else "higher", } - with open("model_output.txt", "w") as f: - f.write(f"Accuracy percentage change: {accuracy_percentage_change}\n") - 
f.write(f"Latency percentage change: {latency_percentage_change}\n") - f.write(f"Is accuracy better?: {comparison_result['accuracy_better']}\n") - f.write(f"Is latency better?: {comparison_result['latency_better']}\n") + # Assert that both accuracy and latency are True + assert comparison_result["accuracy"], "accuracy must be True" + assert comparison_result["latency"], "latency must be True" + else: print(f"{model_name} not found in best_metrics.json, creating new entry...") data[model_name] = best_metrics @@ -100,5 +89,4 @@ def compare_metrics(best_metrics, model_name): # Save the updated data back to the file with open("best_metrics.json", "w") as f: json.dump(data, f, indent=4) - return comparison_result From c8716cd91dd2caafe8b1091678a2c9d71e979091 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Thu, 27 Jul 2023 23:47:07 +0000 Subject: [PATCH 43/67] final changes made --- .azure_pipelines/perfmonitoring-ci.yaml | 57 +++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 .azure_pipelines/perfmonitoring-ci.yaml diff --git a/.azure_pipelines/perfmonitoring-ci.yaml b/.azure_pipelines/perfmonitoring-ci.yaml new file mode 100644 index 0000000000..c4e8fa0102 --- /dev/null +++ b/.azure_pipelines/perfmonitoring-ci.yaml @@ -0,0 +1,57 @@ +trigger: + batch: true + branches: + include: + - main + paths: + exclude: + - docs/* + - examples/README.md + - examples/**/README.md + - README.md + - CONTRIBUTING.md + - LICENSE +pr: none + + +variables: + ComponentDetection.Timeout: 2400 + +jobs: +- template: job_templates/olive-perf-monitoring-template.yaml + parameters: + name: Windows_CI + pool: $(OLIVE_POOL_WIN2019) + windows: True + examples: + bert: + testModel: bert + bertweet: + testModel: bertweet + camembert: + testModel: camembert + distilbert: + testModel: distilbert + microsoft-deberta: + testModel: microsoft-deberta + roberta: + testModel: roberta + +- template: job_templates/olive-perf-monitoring-template.yaml + parameters: + name: 
Linux_CI + pool: $(OLIVE_POOL_UBUNTU2004) + windows: False + examples: + bert: + testModel: bert + bertweet: + testModel: bertweet + camembert: + testModel: camembert + distilbert: + testModel: distilbert + microsoft-deberta: + testModel: microsoft-deberta + roberta: + testModel: roberta From 839eb50e2c423002322dfb7d4a16e6041b1260d6 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 28 Jul 2023 00:17:05 +0000 Subject: [PATCH 44/67] final changes made --- perf_monitoring/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/perf_monitoring/requirements.txt b/perf_monitoring/requirements.txt index 4c661a4e84..5826720bcd 100644 --- a/perf_monitoring/requirements.txt +++ b/perf_monitoring/requirements.txt @@ -8,3 +8,4 @@ sentencepiece evaluate seqeval emoji==0.6.0 +psutil From ae197be0f66ff5a93c67681f181659f7b3f3f7f6 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 4 Aug 2023 00:06:09 +0000 Subject: [PATCH 45/67] made changes to models and metrics --- perf_monitoring/best_metrics.json | 23 +++++++++++-------- .../microsoft-deberta_cpu_config.json | 2 +- .../test_perf_monitoring_models_cpu.py | 1 + perf_monitoring/utils.py | 23 ++++++++++++++++++- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index f057b9ea94..78693d03e4 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1,20 +1,25 @@ { - "bert": [], - "camemBERT": [], - "distilbert": [ + "bert": [ 1.0, - -8.61203 + -104.38034], + "camemBERT": [ + 0.9933962225914001, + -64.11745 + ], + "distilbert": [ + 0.9002293577981652, + -10.57228 ], "microsoft-deberta": [ - 1.0, - -111.79317 + 0.3, + -51.1691 ], "roberta": [ - 1.0, - -146.89287 + 0.3, + -72.22182 ], "bertweet": [ 0.735632183908046, - -14.61082 + -11.18819 ] } diff --git a/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json 
b/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json index cff81b5491..e1b6a00635 100644 --- a/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json @@ -74,6 +74,6 @@ "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", - "output_dir" : "models/deberta_workflow_cpu" + "output_dir" : "models/microsoft-deberta_workflow_cpu" } } diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py index 673a791844..d803a8014c 100644 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ b/perf_monitoring/test_perf_monitoring_models_cpu.py @@ -16,6 +16,7 @@ def setup(): def test_models(): + model_name = os.environ["TEST_MODEL"] olive_json = f"perf_models/cpu_models/{model_name}_cpu_config.json" print(olive_json) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 4f5abeb97f..968b9790b2 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -8,7 +8,7 @@ def patch_config(config_json_path: str): # set default logger severity olive_config["engine"]["log_severity_level"] = 0 # set clean cache - olive_config["engine"]["clean_cache"] = True + olive_config["engine"]["clean_cache"] = False return olive_config @@ -41,6 +41,8 @@ def extract_best_models(footprint, model_name): print("Best metrics: ", best_metrics) compared_metric = compare_metrics(best_metrics, model_name) print("Compared metrics: ", compared_metric) + compared_new_input_metric = compare_input_metrics(best_metrics, model_name) + print("Compared new input metrics: ", compared_new_input_metric) def no_regression(actual, expected, rel_tol): # check for tolerance @@ -90,3 +92,22 @@ def compare_metrics(best_metrics, model_name): with open("best_metrics.json", "w") as f: json.dump(data, f, indent=4) return comparison_result + + +def 
compare_input_metrics(best_metrics, model_name): + # open best metrics json + with open(f"models/{model_name}_workflow_cpu/cpu-cpu_input_model_metrics.json") as f: + data = json.load(f) + + accuracy = data["accuracy-accuracy"]["value"] + latency = data["latency-avg"]["value"] + accuracy_percentage_change = ((best_metrics[0] - accuracy) / accuracy) * 100 + latency_percentage_change = -((best_metrics[1] - latency) / latency) * 100 + + comparison_result = { + "accuracy": no_regression(best_metrics[0], accuracy, 0.05), + "latency": no_regression(best_metrics[1], latency, 0.05), + "accuracy_percentage_change": accuracy_percentage_change, + "latency_percentage_change": latency_percentage_change, + } + return comparison_result From 7404da567bb06a552ded086268c2ea801ec28a2a Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 4 Aug 2023 02:54:41 +0000 Subject: [PATCH 46/67] made changes to models tolerance --- perf_monitoring/best_metrics.json | 13 +++++++------ perf_monitoring/utils.py | 17 ++++++++++------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index 78693d03e4..e0091bd717 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1,10 +1,7 @@ { "bert": [ - 1.0, - -104.38034], - "camemBERT": [ - 0.9933962225914001, - -64.11745 + 0.8455882352941176, + -23.43271 ], "distilbert": [ 0.9002293577981652, @@ -20,6 +17,10 @@ ], "bertweet": [ 0.735632183908046, - -11.18819 + -11.18819 + ], + "camembert": [ + 0.9933962225914001, + -97.38994 ] } diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 968b9790b2..ddbc5ccf08 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -8,7 +8,7 @@ def patch_config(config_json_path: str): # set default logger severity olive_config["engine"]["log_severity_level"] = 0 # set clean cache - olive_config["engine"]["clean_cache"] = False + olive_config["engine"]["clean_cache"] = True 
return olive_config @@ -68,8 +68,8 @@ def compare_metrics(best_metrics, model_name): latency_percentage_change = -((best_metrics[1] - model_data[1]) / model_data[1]) * 100 comparison_result = { - "accuracy": no_regression(best_metrics[0], model_data[0], 0.05), - "latency": no_regression(best_metrics[1], model_data[1], 0.05), + "accuracy": no_regression(best_metrics[0], model_data[0], 0.09), + "latency": no_regression(best_metrics[1], model_data[1], 0.095), "accuracy_percentage_change": accuracy_percentage_change, "latency_percentage_change": latency_percentage_change, } @@ -98,15 +98,18 @@ def compare_input_metrics(best_metrics, model_name): # open best metrics json with open(f"models/{model_name}_workflow_cpu/cpu-cpu_input_model_metrics.json") as f: data = json.load(f) - - accuracy = data["accuracy-accuracy"]["value"] + if "accuracy-accuracy" in data: + accuracy = data["accuracy-accuracy"]["value"] + else: + accuracy = data["accuracy-accuracy_score"]["value"] + # accuracy = data["accuracy-accuracy"]["value"] latency = data["latency-avg"]["value"] accuracy_percentage_change = ((best_metrics[0] - accuracy) / accuracy) * 100 latency_percentage_change = -((best_metrics[1] - latency) / latency) * 100 comparison_result = { - "accuracy": no_regression(best_metrics[0], accuracy, 0.05), - "latency": no_regression(best_metrics[1], latency, 0.05), + "accuracy": no_regression(best_metrics[0], accuracy, 0.09), + "latency": no_regression(best_metrics[1], latency, 0.095), "accuracy_percentage_change": accuracy_percentage_change, "latency_percentage_change": latency_percentage_change, } From 579325edd948a522a5856983dfc4edf89cd8265c Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 4 Aug 2023 21:11:06 +0000 Subject: [PATCH 47/67] updated models search --- .../perf_models/cpu_models/bert_cpu_config.json | 12 +++--------- .../perf_models/cpu_models/bertweet_cpu_config.json | 12 +++--------- .../perf_models/cpu_models/camembert_cpu_config.json | 11 ++--------- 
.../cpu_models/distilbert_cpu_config.json | 12 +++--------- .../cpu_models/microsoft-deberta_cpu_config.json | 11 ++--------- .../perf_models/cpu_models/roberta_cpu_config.json | 11 ++--------- 6 files changed, 15 insertions(+), 54 deletions(-) diff --git a/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json index bff54468de..8eead050f6 100644 --- a/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json @@ -11,7 +11,8 @@ "split": "validation", "input_cols": ["sentence1", "sentence2"], "label_cols": ["label"], - "batch_size": 1 + "batch_size": 1, + "max_samples": 100 } } } @@ -62,14 +63,7 @@ } }, "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, + "search_strategy":true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", diff --git a/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json b/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json index 0221f579fa..7c4e6341b8 100644 --- a/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json @@ -11,7 +11,8 @@ "split": "test", "input_cols": ["text"], "label_cols": ["label"], - "batch_size": 1 + "batch_size": 1, + "max_samples": 100 } } @@ -63,14 +64,7 @@ } }, "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, + "search_strategy":true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", diff --git a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json index 2d73fed4bf..973cdda1d7 
100644 --- a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json @@ -11,7 +11,7 @@ "input_cols": ["tokens"], "label_cols": ["ner_tags"], "batch_size": 1, - "max_samples": 10 + "max_samples": 100 } } } @@ -53,14 +53,7 @@ } }, "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, + "search_strategy": true, "log_severity_level": 0, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], diff --git a/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json index 8b2d068cc0..3d5eb2bb91 100644 --- a/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json @@ -11,7 +11,8 @@ "split": "validation", "input_cols": ["sentence"], "label_cols": ["label"], - "batch_size": 1 + "batch_size": 1, + "max_samples": 100 } } } @@ -63,14 +64,7 @@ } }, "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, + "search_strategy": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", diff --git a/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json b/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json index e1b6a00635..42eaa1c8af 100644 --- a/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json @@ -12,7 +12,7 @@ "input_cols": ["premise"], "label_cols": ["label"], "batch_size": 1, - "max_samples": 10 + "max_samples": 100 } } } @@ -63,14 +63,7 @@ } }, "engine": { - "search_strategy": { - "execution_order": "joint", - 
"search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, + "search_strategy": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", diff --git a/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json b/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json index 5c706ac4b2..e85e00c922 100644 --- a/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json @@ -12,7 +12,7 @@ "input_cols": ["premise"], "label_cols": ["label"], "batch_size": 1, - "max_samples": 10 + "max_samples": 100 } } } @@ -63,14 +63,7 @@ } }, "engine": { - "search_strategy": { - "execution_order": "joint", - "search_algorithm": "tpe", - "search_algorithm_config": { - "num_samples": 3, - "seed": 0 - } - }, + "search_strategy":true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], "cache_dir": "cache", From 9c05d3a8ccf9bdc5d0349fd23037c2bd950ea3f3 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 4 Aug 2023 21:43:02 +0000 Subject: [PATCH 48/67] updated models and fixed error --- olive/passes/onnx/quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/olive/passes/onnx/quantization.py b/olive/passes/onnx/quantization.py index 18a28d2ee1..dd697e96d5 100644 --- a/olive/passes/onnx/quantization.py +++ b/olive/passes/onnx/quantization.py @@ -397,7 +397,7 @@ def _run_for_config( use_external_data_format=True, **run_config, ) - except AttributeError as e: + except (AttributeError, ValueError) as e: raise OlivePassException("quantize_static failed.") from e else: try: @@ -407,7 +407,7 @@ def _run_for_config( use_external_data_format=True, **run_config, ) - except AttributeError as e: + except (AttributeError, ValueError) as e: raise OlivePassException("quantize_dynamic failed.") from e # load the model From db38fa8989b6d768d1657efe4f01aeb924bf83d8 Mon 
Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Fri, 4 Aug 2023 21:46:29 +0000 Subject: [PATCH 49/67] updated models and fixed error --- .../cpu_models/camembert_cpu_config.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json index 973cdda1d7..a95ae83782 100644 --- a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json @@ -50,6 +50,21 @@ "config": { "target_opset": 13 } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxQuantization" + }, + "perf_tuning": { + "type": "OrtPerfTuning" } }, "engine": { From 1fed3ae79e42c1f7ae0358c447d90f6f0e045696 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Sat, 5 Aug 2023 03:13:00 +0000 Subject: [PATCH 50/67] updated samples for roberta --- perf_monitoring/best_metrics.json | 24 +++++++++---------- .../cpu_models/roberta_cpu_config.json | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index e0091bd717..1debdc8c66 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -1,26 +1,26 @@ { "bert": [ - 0.8455882352941176, - -23.43271 + 0.91, + -21.68454 ], "distilbert": [ - 0.9002293577981652, - -10.57228 + 0.94, + -5.60627 ], "microsoft-deberta": [ - 0.3, - -51.1691 + 0.34, + -49.40051 ], "roberta": [ - 0.3, - -72.22182 + 0.34, + -423.74402 ], "bertweet": [ - 0.735632183908046, - -11.18819 + 0.8, + -9.84033 ], "camembert": [ - 0.9933962225914001, - -97.38994 + 0.9918867945671082, + -37.94092 ] } diff --git a/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json 
b/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json index e85e00c922..226e626bf6 100644 --- a/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json +++ b/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json @@ -12,7 +12,7 @@ "input_cols": ["premise"], "label_cols": ["label"], "batch_size": 1, - "max_samples": 100 + "max_samples": 50 } } } From 27a27cc1bdb546df234a5cf55b6693d4fa1d57f1 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Sun, 6 Aug 2023 19:38:51 +0000 Subject: [PATCH 51/67] updated tolerance code --- perf_monitoring/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index ddbc5ccf08..de5bb364f7 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -45,8 +45,10 @@ def extract_best_models(footprint, model_name): print("Compared new input metrics: ", compared_new_input_metric) -def no_regression(actual, expected, rel_tol): # check for tolerance - if actual > expected: +def no_regression(actual, expected, rel_tol, higher_is_better): # check for tolerance + if higher_is_better and actual > expected: + return True + elif not higher_is_better and actual < expected: return True return abs(actual - expected) <= rel_tol * abs(expected) From ae2683ae18b54dd46ab69ece0a82664f77e0b13c Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Sun, 6 Aug 2023 20:07:16 +0000 Subject: [PATCH 52/67] updated tolerance --- perf_monitoring/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index de5bb364f7..45f6dafe7d 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -110,8 +110,8 @@ def compare_input_metrics(best_metrics, model_name): latency_percentage_change = -((best_metrics[1] - latency) / latency) * 100 comparison_result = { - "accuracy": no_regression(best_metrics[0], accuracy, 0.09), - "latency": no_regression(best_metrics[1], latency, 
0.095), + "accuracy": no_regression(best_metrics[0], accuracy, 0.09, True), + "latency": no_regression(best_metrics[1], latency, 0.095, False), "accuracy_percentage_change": accuracy_percentage_change, "latency_percentage_change": latency_percentage_change, } From 1c775ce0344152262b01ba05a01768eaa95fafc9 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Sun, 6 Aug 2023 20:55:12 +0000 Subject: [PATCH 53/67] updated toleranc --- perf_monitoring/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 45f6dafe7d..8532e9d896 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -70,8 +70,8 @@ def compare_metrics(best_metrics, model_name): latency_percentage_change = -((best_metrics[1] - model_data[1]) / model_data[1]) * 100 comparison_result = { - "accuracy": no_regression(best_metrics[0], model_data[0], 0.09), - "latency": no_regression(best_metrics[1], model_data[1], 0.095), + "accuracy": no_regression(best_metrics[0], model_data[0], 0.09, True), + "latency": no_regression(best_metrics[1], model_data[1], 0.095, False), "accuracy_percentage_change": accuracy_percentage_change, "latency_percentage_change": latency_percentage_change, } From a26087b717f6cbf60811f1bcf3d23ea9194c1549 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 7 Aug 2023 19:45:51 +0000 Subject: [PATCH 54/67] updated utils --- perf_monitoring/best_metrics.json | 4 ++-- perf_monitoring/utils.py | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json index 1debdc8c66..9652f297fc 100644 --- a/perf_monitoring/best_metrics.json +++ b/perf_monitoring/best_metrics.json @@ -17,10 +17,10 @@ ], "bertweet": [ 0.8, - -9.84033 + -10.97 ], "camembert": [ 0.9918867945671082, - -37.94092 + -217.06503 ] } diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 8532e9d896..61629cafe1 100644 
--- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -45,10 +45,8 @@ def extract_best_models(footprint, model_name): print("Compared new input metrics: ", compared_new_input_metric) -def no_regression(actual, expected, rel_tol, higher_is_better): # check for tolerance - if higher_is_better and actual > expected: - return True - elif not higher_is_better and actual < expected: +def no_regression(actual, expected, rel_tol): # check for tolerance + if actual > expected: return True return abs(actual - expected) <= rel_tol * abs(expected) @@ -70,8 +68,8 @@ def compare_metrics(best_metrics, model_name): latency_percentage_change = -((best_metrics[1] - model_data[1]) / model_data[1]) * 100 comparison_result = { - "accuracy": no_regression(best_metrics[0], model_data[0], 0.09, True), - "latency": no_regression(best_metrics[1], model_data[1], 0.095, False), + "accuracy": no_regression(best_metrics[0], model_data[0], 0.09), + "latency": no_regression(best_metrics[1], model_data[1], 0.095), "accuracy_percentage_change": accuracy_percentage_change, "latency_percentage_change": latency_percentage_change, } @@ -97,21 +95,24 @@ def compare_metrics(best_metrics, model_name): def compare_input_metrics(best_metrics, model_name): - # open best metrics json with open(f"models/{model_name}_workflow_cpu/cpu-cpu_input_model_metrics.json") as f: data = json.load(f) + print("Contents of cpu-cpu_input_model_metrics.json:") + print(json.dumps(data, indent=4)) if "accuracy-accuracy" in data: accuracy = data["accuracy-accuracy"]["value"] else: accuracy = data["accuracy-accuracy_score"]["value"] # accuracy = data["accuracy-accuracy"]["value"] latency = data["latency-avg"]["value"] + # print latency and accuracy values on next line + print("accuracy: ", accuracy, "latency: ", latency) + # now print the input model metrics on the next line + print("input model metrics: ", best_metrics[0], best_metrics[1]) accuracy_percentage_change = ((best_metrics[0] - accuracy) / accuracy) * 100 
latency_percentage_change = -((best_metrics[1] - latency) / latency) * 100 comparison_result = { - "accuracy": no_regression(best_metrics[0], accuracy, 0.09, True), - "latency": no_regression(best_metrics[1], latency, 0.095, False), "accuracy_percentage_change": accuracy_percentage_change, "latency_percentage_change": latency_percentage_change, } From 0d5cd97cc64ddb1d7a885c3955974d8f3de762a0 Mon Sep 17 00:00:00 2001 From: Emmanuel Assumang Date: Mon, 7 Aug 2023 20:43:04 +0000 Subject: [PATCH 55/67] updates made to utils --- perf_monitoring/utils.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py index 61629cafe1..231f72940b 100644 --- a/perf_monitoring/utils.py +++ b/perf_monitoring/utils.py @@ -97,18 +97,14 @@ def compare_metrics(best_metrics, model_name): def compare_input_metrics(best_metrics, model_name): with open(f"models/{model_name}_workflow_cpu/cpu-cpu_input_model_metrics.json") as f: data = json.load(f) - print("Contents of cpu-cpu_input_model_metrics.json:") - print(json.dumps(data, indent=4)) if "accuracy-accuracy" in data: accuracy = data["accuracy-accuracy"]["value"] else: accuracy = data["accuracy-accuracy_score"]["value"] - # accuracy = data["accuracy-accuracy"]["value"] - latency = data["latency-avg"]["value"] + + latency = data["latency-avg"]["value"] * -1 # print latency and accuracy values on next line print("accuracy: ", accuracy, "latency: ", latency) - # now print the input model metrics on the next line - print("input model metrics: ", best_metrics[0], best_metrics[1]) accuracy_percentage_change = ((best_metrics[0] - accuracy) / accuracy) * 100 latency_percentage_change = -((best_metrics[1] - latency) / latency) * 100 From 56afa705fc81e3b87cfebfde0cc0f2f3dd05ef86 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Thu, 28 Sep 2023 05:52:23 +0000 Subject: [PATCH 56/67] save --- .../performance_check/best_metrics.json | 42 ++++++++++++ .../run_performance_check.py | 20 
+++ examples/bert/bert_cuda_gpu.json | 30 ++++++++ olive/azureml/azureml_client.py | 16 +++++ olive/model/__init__.py | 27 +++++--- olive/model/hf_utils.py | 1 + olive/resource_path.py | 68 ++++++++++++++++++- perf_monitoring/best_metrics.json | 26 ------- 8 files changed, 190 insertions(+), 40 deletions(-) create mode 100644 .azure_pipelines/performance_check/best_metrics.json delete mode 100644 perf_monitoring/best_metrics.json diff --git a/.azure_pipelines/performance_check/best_metrics.json b/.azure_pipelines/performance_check/best_metrics.json new file mode 100644 index 0000000000..31dde5a112 --- /dev/null +++ b/.azure_pipelines/performance_check/best_metrics.json @@ -0,0 +1,42 @@ +{ + "bert": { + "cpu": { + "accuracy-accuracy": 0.88, + "latency-avg": 27.1239 + }, + "gpu": { + "accuracy-accuracy": 0.89, + "latency-avg": 1.61 + } + }, + "deberta": { + "cpu": { + "accuracy-accuracy": 0.92, + "latency-avg": 4.21 + }, + "gpu": { + "accuracy-accuracy": 0.92, + "latency-avg": 4.21 + } + }, + "distilbert": { + "cpu": { + "accuracy-accuracy": 0.92, + "latency-avg": 4.21 + }, + "gpu": { + "accuracy-accuracy": 0.92, + "latency-avg": 4.21 + } + }, + "roberta_large": { + "cpu": { + "accuracy-accuracy": 0.92, + "latency-avg": 4.21 + }, + "gpu": { + "accuracy-accuracy": 0.92, + "latency-avg": 4.21 + } + } +} diff --git a/.azure_pipelines/performance_check/run_performance_check.py b/.azure_pipelines/performance_check/run_performance_check.py index 9246cd81fd..fd1a2dce77 100644 --- a/.azure_pipelines/performance_check/run_performance_check.py +++ b/.azure_pipelines/performance_check/run_performance_check.py @@ -70,6 +70,7 @@ "label_cols": ["label"], "batch_size": 1, "max_samples": 100, + "component_kwargs": {"pre_process_data": {"align_labels": True}}, }, }, "roberta_large": { @@ -237,11 +238,11 @@ def run_perf_comparison(cur_dir, model_name, device, model_root_path, test_num): olive_config = f"{model_name}.json" if device == "cpu" else f"{model_name}_gpu.json" 
olive_config_path = cur_dir / "configs" / olive_config run_with_config("olive", olive_config_path, metric_res) - print(metric_res) for model, v in metric_res.items(): for metric_name, metric_value_list in v.items(): vsum = sum(float(v) for v in metric_value_list) - metric_res[model][metric_name] = vsum / len(metric_value_list) + metric_res[model][metric_name] = round((vsum / len(metric_value_list)), 4) + print(metric_res) return metric_res @@ -256,6 +257,20 @@ def print_perf_table(metric_res, device): print(table) +def regression_check(metric, device): + metric_data_path = Path(__file__).absolute().parent / "best_metrics.json" + if not metric_data_path.exists(): + metric_data_path.touch() + with open(metric_data_path) as f: + data = json.load(f) + + +def no_regression(actual, expected, rel_tol): + if actual > expected: + return True + return abs(actual - expected) <= rel_tol * abs(expected) + + def main(): args = get_args() model_name = args.model_name @@ -294,6 +309,7 @@ def main(): nvidia_smi = subprocess.check_output(["nvidia-smi"]) print(nvidia_smi.decode("utf-8")) print_perf_table(metric_res, device) + regression_check(metric_res["olive"], device) if __name__ == "__main__": diff --git a/examples/bert/bert_cuda_gpu.json b/examples/bert/bert_cuda_gpu.json index bdd9f25748..080f738e10 100644 --- a/examples/bert/bert_cuda_gpu.json +++ b/examples/bert/bert_cuda_gpu.json @@ -16,6 +16,30 @@ } } }, + "systems": { + "aml_gpu_system": { + "type": "AzureML", + "config": { + "accelerators": ["GPU"], + "aml_compute": "gpu-cluster", + "aml_docker_config": { + "base_image": "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04", + "conda_file_path": "conda_gpu.yaml" + } + } + }, + "aml_arc_system": { + "type": "AzureML", + "config": { + "accelerators": ["NPU"], + "aml_compute": "olive-intel-npu", + "aml_docker_config": { + "base_image": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04", + "conda_file_path": "conda.yaml" + } + } + } + }, "evaluators": { 
"common_evaluator": { "metrics":[ @@ -74,8 +98,14 @@ "seed": 0 } }, + "packaging_config": { + "type": "Zipfile", + "name": "OutputModels" + }, "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], + "host": "local_system", + "target": "local_system", "cache_dir": "cache", "output_dir" : "models/bert_cuda" } diff --git a/olive/azureml/azureml_client.py b/olive/azureml/azureml_client.py index 3263fbfd6d..3834fbb789 100644 --- a/olive/azureml/azureml_client.py +++ b/olive/azureml/azureml_client.py @@ -101,6 +101,22 @@ def create_client(self): credential=self._get_credentials(), path=self.aml_config_path, read_timeout=self.read_timeout ) + def create_registry_client(self, registry_name: str): + """Create an MLClient instance.""" + from azure.ai.ml import MLClient + + # set logger level to error to avoid too many logs from azure sdk + logging.getLogger("azure.ai.ml").setLevel(logging.ERROR) + logging.getLogger("azure.identity").setLevel(logging.ERROR) + + return MLClient( + credential=self._get_credentials(), + subscription_id=self.subscription_id, + resource_group_name=self.resource_group, + registry_name=registry_name, + read_timeout=self.read_timeout, + ) + def _get_credentials(self): """Get credentials for MLClient. 
diff --git a/olive/model/__init__.py b/olive/model/__init__.py index 81475cdef1..b26487aa96 100644 --- a/olive/model/__init__.py +++ b/olive/model/__init__.py @@ -542,22 +542,22 @@ def load_model(self, rank: int = None) -> torch.nn.Module: if self.model is not None: return self.model + # Load special path or format model -> load model from hf config -> load normal path model if self.model_loader is not None: user_module_loader = UserModuleLoader(self.model_script, self.script_dir) model = user_module_loader.call_object(self.model_loader, self.model_path) + elif self.model_file_format == ModelFileFormat.PYTORCH_TORCH_SCRIPT: + model = torch.jit.load(self.model_path) + elif self.model_file_format == ModelFileFormat.PYTORCH_MLFLOW_MODEL: + model = self.load_mlflow_model() elif self.hf_config and (self.hf_config.model_class or self.hf_config.task): model = self.hf_config.load_model(self.model_path) + elif self.model_file_format == ModelFileFormat.PYTORCH_ENTIRE_MODEL: + model = torch.load(self.model_path) + elif self.model_file_format == ModelFileFormat.PYTORCH_STATE_DICT: + raise ValueError("Please use customized model loader to load state dict of model.") else: - if self.model_file_format == ModelFileFormat.PYTORCH_ENTIRE_MODEL: - model = torch.load(self.model_path) - elif self.model_file_format == ModelFileFormat.PYTORCH_TORCH_SCRIPT: - model = torch.jit.load(self.model_path) - elif self.model_file_format == ModelFileFormat.PYTORCH_MLFLOW_MODEL: - model = self.load_mlflow_model() - elif self.model_file_format == ModelFileFormat.PYTORCH_STATE_DICT: - raise ValueError("Please use customized model loader to load state dict of model.") - else: - raise ValueError(f"Unsupported model file format: {self.model_file_format}") + raise ValueError(f"Unsupported model file format: {self.model_file_format}") # we only have peft adapters for now adapter_path = self.get_resource("adapter_path") @@ -571,6 +571,7 @@ def load_model(self, rank: int = None) -> torch.nn.Module: return 
model def load_mlflow_model(self): + logger.info(f"Loading MLFlow model from {self.model_path}") tmp_dir = tempfile.TemporaryDirectory(prefix="mlflow_tmp") tmp_dir_path = Path(tmp_dir.name) @@ -670,7 +671,11 @@ def get_dummy_inputs(self): def get_hf_model_config(self): if self.hf_config is None: raise ValueError("HF model_config is not available") - return self.hf_config.load_model_config(self.model_path) + return ( + self.hf_config.load_model_config(self.hf_config.model_name) + if self.model_file_format == ModelFileFormat.PYTORCH_MLFLOW_MODEL + else self.hf_config.load_model_config(self.model_path) + ) @property def components(self) -> List[str]: diff --git a/olive/model/hf_utils.py b/olive/model/hf_utils.py index c33618b163..7991c01b54 100644 --- a/olive/model/hf_utils.py +++ b/olive/model/hf_utils.py @@ -215,6 +215,7 @@ def task_or_model_class_required(cls, v, values): def load_model(self, model_path: str = None): """Load model from model_path or model_name.""" model_name_or_path = model_path or self.model_name + logger.info(f"Loading Huggingface model from {model_name_or_path}") loading_args = self.model_loading_args.get_loading_args() if self.model_loading_args else {} if self.task: model = load_huggingface_model_from_task(self.task, model_name_or_path, **loading_args) diff --git a/olive/resource_path.py b/olive/resource_path.py index ad15e9a4a0..c83a6de147 100644 --- a/olive/resource_path.py +++ b/olive/resource_path.py @@ -26,6 +26,7 @@ class ResourceType(str, Enum): LocalFolder = "folder" StringName = "string_name" AzureMLModel = "azureml_model" + AzureMLRegistryModel = "azureml_registry_model" AzureMLDatastore = "azureml_datastore" AzureMLJobOutput = "azureml_job_output" @@ -34,7 +35,12 @@ def __str__(self) -> str: LOCAL_RESOURCE_TYPES = [ResourceType.LocalFile, ResourceType.LocalFolder] -AZUREML_RESOURCE_TYPES = [ResourceType.AzureMLModel, ResourceType.AzureMLDatastore, ResourceType.AzureMLJobOutput] +AZUREML_RESOURCE_TYPES = [ + 
ResourceType.AzureMLModel, + ResourceType.AzureMLRegistryModel, + ResourceType.AzureMLDatastore, + ResourceType.AzureMLJobOutput, +] class ResourcePath(AutoConfigClass): @@ -344,6 +350,66 @@ def save_to_dir(self, dir_path: Union[Path, str], name: str = None, overwrite: b return str(new_path) +class AzureMLRegistryModel(ResourcePath): + """AzureML Model resource path""" + + name = ResourceType.AzureMLRegistryModel + + @staticmethod + def _default_config() -> Dict[str, Any]: + return { + "azureml_client": ConfigParam( + type_=AzureMLClientConfig, required=True, description="AzureML client config." + ), + "registry_name": ConfigParam(type_=str, required=True, description="Name of the registry."), + "name": ConfigParam(type_=str, required=True, description="Name of the model."), + "version": ConfigParam(type_=Union[int, str], required=True, description="Version of the model."), + } + + def get_path(self) -> str: + return ( + f"azureml://registries/{self.config.registry_name}/models/{self.config.name}/versions/{self.config.version}" + ) + + def save_to_dir(self, dir_path: Union[Path, str], name: str = None, overwrite: bool = False) -> str: + # directory to save the resource to + dir_path = Path(dir_path).resolve() + dir_path.mkdir(parents=True, exist_ok=True) + + # azureml client + ml_client = self.config.azureml_client.create_registry_client(self.config.registry_name) + + # azureml model + model = ml_client.models.get(self.config.name, version=self.config.version) + model_path = Path(model.path) + + # path to save the resource to + if name: + new_path_name = Path(name).with_suffix(model_path.suffix).name + else: + new_path_name = model_path.name + new_path = dir_path / new_path_name + _overwrite_helper(new_path, overwrite) + + # download the resource to the new path + logger.debug(f"Downloading model {self.config.name} version {self.config.version} to {new_path}.") + from azure.core.exceptions import ServiceResponseError + + with 
tempfile.TemporaryDirectory(dir=dir_path, prefix="olive_tmp") as temp_dir: + temp_dir = Path(temp_dir) + retry_func( + ml_client.models.download, + [self.config.name], + {"version": self.config.version, "download_path": temp_dir}, + max_tries=self.config.azureml_client.max_operation_retries, + delay=self.config.azureml_client.operation_retry_interval, + exceptions=ServiceResponseError, + ) + new_path.parent.mkdir(parents=True, exist_ok=True) + shutil.move(temp_dir / self.config.name / model_path.name, new_path) + return str(new_path) + + def _datastore_url_validator(v, values, **kwargs): aml_info_ready = all([values.get("azureml_client"), values.get("datastore_name"), values.get("relative_path")]) if not v and not aml_info_ready: diff --git a/perf_monitoring/best_metrics.json b/perf_monitoring/best_metrics.json deleted file mode 100644 index 9652f297fc..0000000000 --- a/perf_monitoring/best_metrics.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "bert": [ - 0.91, - -21.68454 - ], - "distilbert": [ - 0.94, - -5.60627 - ], - "microsoft-deberta": [ - 0.34, - -49.40051 - ], - "roberta": [ - 0.34, - -423.74402 - ], - "bertweet": [ - 0.8, - -10.97 - ], - "camembert": [ - 0.9918867945671082, - -217.06503 - ] -} From da35cbe4dd7c4575df401da025ae0b1e62caf34f Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 10 Oct 2023 06:25:40 +0000 Subject: [PATCH 57/67] merge regression check pipeline to performance check pipeline --- .../performance_check/best_metrics.json | 47 ++++--- .../performance_check/configs/bert.json | 8 +- .../performance_check/configs/bert_gpu.json | 4 +- .../performance_check/configs/deberta.json | 8 +- .../configs/deberta_gpu.json | 8 +- .../performance_check/configs/distilbert.json | 9 +- .../configs/distilbert_gpu.json | 4 +- .../configs/roberta_large.json | 4 +- .../configs/roberta_large_gpu.json | 4 +- .../run_performance_check.py | 82 ++++++++++--- .../cpu_models/bert_cpu_config.json | 72 ----------- .../cpu_models/bertweet_cpu_config.json | 73 ----------- 
.../cpu_models/camembert_cpu_config.json | 78 ------------ .../cpu_models/distilbert_cpu_config.json | 73 ----------- .../microsoft-deberta_cpu_config.json | 72 ----------- .../cpu_models/roberta_cpu_config.json | 72 ----------- perf_monitoring/readme.md | 111 ----------------- perf_monitoring/requirements.txt | 11 -- .../test_perf_monitoring_models_cpu.py | 27 ---- perf_monitoring/utils.py | 115 ------------------ 20 files changed, 121 insertions(+), 761 deletions(-) delete mode 100644 perf_monitoring/perf_models/cpu_models/bert_cpu_config.json delete mode 100644 perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json delete mode 100644 perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json delete mode 100644 perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json delete mode 100644 perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json delete mode 100644 perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json delete mode 100644 perf_monitoring/readme.md delete mode 100644 perf_monitoring/requirements.txt delete mode 100644 perf_monitoring/test_perf_monitoring_models_cpu.py delete mode 100644 perf_monitoring/utils.py diff --git a/.azure_pipelines/performance_check/best_metrics.json b/.azure_pipelines/performance_check/best_metrics.json index 31dde5a112..0fa269b795 100644 --- a/.azure_pipelines/performance_check/best_metrics.json +++ b/.azure_pipelines/performance_check/best_metrics.json @@ -2,41 +2,58 @@ "bert": { "cpu": { "accuracy-accuracy": 0.88, - "latency-avg": 27.1239 + "latency-avg": { + "8272CL": 13.96, + "E5-2673": 30.0, + "8171M": 20.89 + } }, "gpu": { - "accuracy-accuracy": 0.89, + "accuracy-accuracy": 0.9, "latency-avg": 1.61 } }, "deberta": { "cpu": { - "accuracy-accuracy": 0.92, - "latency-avg": 4.21 + "accuracy-accuracy": 0.84, + "latency-avg": { + "8272CL": 57.74, + "E5-2673": 117.00, + "8171M": 93.37 + } }, "gpu": { - "accuracy-accuracy": 0.92, - "latency-avg": 4.21 + "accuracy-accuracy": 0.88, 
+ "latency-avg": 8.011 } }, "distilbert": { "cpu": { - "accuracy-accuracy": 0.92, - "latency-avg": 4.21 + "accuracy-accuracy": 0.94, + "latency-avg": { + "8272CL": 4.5, + "E5-2673": 11.0, + "8171M": 5.6, + "8370C": 4.57 + } }, "gpu": { - "accuracy-accuracy": 0.92, - "latency-avg": 4.21 + "accuracy-accuracy": 0.94, + "latency-avg": 0.831 } }, "roberta_large": { "cpu": { - "accuracy-accuracy": 0.92, - "latency-avg": 4.21 + "accuracy-accuracy": 0.88, + "latency-avg": { + "8272CL": 52.38, + "E5-2673": 140.34, + "8171M": 69.33 + } }, "gpu": { - "accuracy-accuracy": 0.92, - "latency-avg": 4.21 + "accuracy-accuracy": 0.89, + "latency-avg": 6.164 } - }, + } } diff --git a/.azure_pipelines/performance_check/configs/bert.json b/.azure_pipelines/performance_check/configs/bert.json index b8ffee925d..e00801760d 100644 --- a/.azure_pipelines/performance_check/configs/bert.json +++ b/.azure_pipelines/performance_check/configs/bert.json @@ -25,14 +25,14 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}} + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.05}} ] }, { "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}} + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 10}} ] } ] @@ -81,7 +81,7 @@ "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/bert_ptq" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/bert_ptq" } } diff --git a/.azure_pipelines/performance_check/configs/bert_gpu.json b/.azure_pipelines/performance_check/configs/bert_gpu.json index 15fc41d15a..b6e23fb929 100644 --- a/.azure_pipelines/performance_check/configs/bert_gpu.json +++ 
b/.azure_pipelines/performance_check/configs/bert_gpu.json @@ -79,7 +79,7 @@ "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], "clean_cache": true, - "cache_dir": "cache", - "output_dir" : "models/bert_gpu" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/bert_gpu" } } diff --git a/.azure_pipelines/performance_check/configs/deberta.json b/.azure_pipelines/performance_check/configs/deberta.json index 894065962d..8bd3f94bff 100644 --- a/.azure_pipelines/performance_check/configs/deberta.json +++ b/.azure_pipelines/performance_check/configs/deberta.json @@ -30,7 +30,7 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1} + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.05}} ] }, @@ -38,7 +38,7 @@ "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2} + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 1}} ] } ] @@ -87,7 +87,7 @@ "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/microsoft-deberta" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/microsoft-deberta" } } diff --git a/.azure_pipelines/performance_check/configs/deberta_gpu.json b/.azure_pipelines/performance_check/configs/deberta_gpu.json index bb9be2e749..9333761689 100644 --- a/.azure_pipelines/performance_check/configs/deberta_gpu.json +++ b/.azure_pipelines/performance_check/configs/deberta_gpu.json @@ -30,7 +30,7 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1} + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}} ] }, @@ -38,7 +38,7 @@ "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2} + {"name": "avg", "priority": 2, "goal": {"type": 
"percent-min-improvement", "value": 20}} ] } ] @@ -85,7 +85,7 @@ "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/microsoft-deberta_cuda" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/microsoft-deberta_cuda" } } diff --git a/.azure_pipelines/performance_check/configs/distilbert.json b/.azure_pipelines/performance_check/configs/distilbert.json index b6c9b8c7bc..005a2f5c0e 100644 --- a/.azure_pipelines/performance_check/configs/distilbert.json +++ b/.azure_pipelines/performance_check/configs/distilbert.json @@ -26,14 +26,14 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1} + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}} ] }, { "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2} + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}} ] } ] @@ -79,10 +79,11 @@ "seed": 0 } }, + "log_severity_level": 0, "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/distilbert" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/distilbert" } } diff --git a/.azure_pipelines/performance_check/configs/distilbert_gpu.json b/.azure_pipelines/performance_check/configs/distilbert_gpu.json index a7abebbe34..051b8cd40e 100644 --- a/.azure_pipelines/performance_check/configs/distilbert_gpu.json +++ b/.azure_pipelines/performance_check/configs/distilbert_gpu.json @@ -79,7 +79,7 @@ "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], "clean_cache": true, - "cache_dir": "cache", - "output_dir" : "models/distilbert_cuda" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/distilbert_cuda" } } diff --git 
a/.azure_pipelines/performance_check/configs/roberta_large.json b/.azure_pipelines/performance_check/configs/roberta_large.json index 52c36ccc59..1b489893b0 100644 --- a/.azure_pipelines/performance_check/configs/roberta_large.json +++ b/.azure_pipelines/performance_check/configs/roberta_large.json @@ -86,7 +86,7 @@ "clean_cache": true, "evaluator": "common_evaluator", "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/roberta_large" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/roberta_large" } } diff --git a/.azure_pipelines/performance_check/configs/roberta_large_gpu.json b/.azure_pipelines/performance_check/configs/roberta_large_gpu.json index f79fd8374b..6700088b0d 100644 --- a/.azure_pipelines/performance_check/configs/roberta_large_gpu.json +++ b/.azure_pipelines/performance_check/configs/roberta_large_gpu.json @@ -84,7 +84,7 @@ "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], "clean_cache": true, - "cache_dir": "cache", - "output_dir" : "models/roberta_large" + "cache_dir": "run_cache/olive/cache", + "output_dir" : "run_cache/olive/roberta_large" } } diff --git a/.azure_pipelines/performance_check/run_performance_check.py b/.azure_pipelines/performance_check/run_performance_check.py index 02d5118020..a9e812a6be 100644 --- a/.azure_pipelines/performance_check/run_performance_check.py +++ b/.azure_pipelines/performance_check/run_performance_check.py @@ -93,13 +93,13 @@ "name": "accuracy", "type": "accuracy", "backend": "huggingface_metrics", - "sub_types": [{"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}], + "sub_types": [{"name": "accuracy", "priority": 1}], } LAT_METRIC = { "name": "latency", "type": "latency", - "sub_types": [{"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}], + "sub_types": [{"name": "avg", "priority": 2}], } @@ -156,8 +156,10 @@ def 
export_optimum_dynamic_quantization(onnx_model_path, model_root_path): def run_with_config(tool, olive_config, metric_res): + print(f"Start evaluating {tool} model") outputs = olive_run(olive_config) if tool == "olive": + print(next(iter(outputs.values())).nodes.values()) metric = str(next(iter(next(iter(outputs.values())).nodes.values())).metrics.value) else: metric = str(next(iter(outputs.values()))) @@ -238,6 +240,7 @@ def run_perf_comparison(cur_dir, model_name, device, model_root_path, test_num): olive_config = f"{model_name}.json" if device == "cpu" else f"{model_name}_gpu.json" olive_config_path = cur_dir / "configs" / olive_config run_with_config("olive", olive_config_path, metric_res) + print(metric_res) for model, v in metric_res.items(): for metric_name, metric_value_list in v.items(): vsum = sum(float(v) for v in metric_value_list) @@ -257,18 +260,62 @@ def print_perf_table(metric_res, device): print(table) -def regression_check(metric, device): - metric_data_path = Path(__file__).absolute().parent / "best_metrics.json" - if not metric_data_path.exists(): - metric_data_path.touch() - with open(metric_data_path) as f: - data = json.load(f) - - -def no_regression(actual, expected, rel_tol): - if actual > expected: - return True - return abs(actual - expected) <= rel_tol * abs(expected) +def regression_check(model_name, metrics, device, cpu_info): + best_metric_path = Path(__file__).absolute().parent / "best_metrics.json" + if not best_metric_path.exists(): + print(f"Best metrics file {best_metric_path} does not exist, skip regression check") + return + metrics_of_interest = ["accuracy-accuracy", "latency-avg"] + regression_res = {} + with open(best_metric_path) as f: + best_metric_json = json.load(f) + best_metrics = best_metric_json[model_name][device] + for metric_name in metrics_of_interest: + # There are 3 types of cpus in our cpu pool + # Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz + # Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz + # Intel(R) Xeon(R) 
Platinum 8171M CPU @ 2.60GHz、 + best_metric = best_metrics[metric_name] + if device == "cpu" and metric_name == "latency-avg": + if "8272CL" in cpu_info: + best_metric = best_metric["8272CL"] + elif "E5-2673" in cpu_info: + best_metric = best_metric["E5-2673"] + elif "8171M" in cpu_info: + best_metric = best_metric["8171M"] + elif "8370C" in cpu_info: + best_metric = best_metric["8370C"] + else: + print(f"Unknown cpu type {cpu_info}, skip regression check") + return + if best_metric == 0: + print("No data found, skip regression check") + return + no_regression, percentage_change = regression_cal( + best_metric, metrics[metric_name], metric_name.startswith("accuracy") + ) + regression_res[metric_name] = { + "best_metric": best_metric, + "actual_metric": metrics[metric_name], + "no_regression": no_regression, + "percentage_change": percentage_change, + } + print(f"Regression check result: {regression_res}") + for metric_name, metric_value in regression_res.items(): + assert metric_value["no_regression"], ( + f"Regression check failed for {metric_name} metric with {metric_value['percentage_change']}" + "percentage change" + ) + + +def regression_cal(best_metric, real_metric, is_acc): + percentage_change = (real_metric - best_metric) / best_metric + tolerance = 0.01 if is_acc else 0.05 + diff = real_metric - best_metric + if is_acc: + diff = -diff + no_regression = diff <= tolerance * abs(best_metric) + return no_regression, percentage_change def main(): @@ -297,10 +344,9 @@ def main(): export_optimum_dynamic_quantization(onnx_model_path, model_root_path) metric_res = run_perf_comparison(cur_dir, model_name, device, model_root_path, test_num) - + lscpu = subprocess.check_output(["lscpu"]).decode("utf-8") + print(lscpu) if device == "cpu": - lscpu = subprocess.check_output(["lscpu"]) - print(lscpu.decode("utf-8")) import psutil process = [(proc.name(), proc.cpu_percent()) for proc in psutil.process_iter()] @@ -309,7 +355,7 @@ def main(): nvidia_smi = 
subprocess.check_output(["nvidia-smi"]) print(nvidia_smi.decode("utf-8")) print_perf_table(metric_res, device) - regression_check(metric_res["olive"], device) + regression_check(model_name, metric_res["olive"], device, lscpu) if __name__ == "__main__": diff --git a/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json deleted file mode 100644 index 8eead050f6..0000000000 --- a/perf_monitoring/perf_models/cpu_models/bert_cpu_config.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "Intel/bert-base-uncased-mrpc", - "task": "text-classification", - "dataset": { - "data_name":"glue", - "subset": "mrpc", - "split": "validation", - "input_cols": ["sentence1", "sentence2"], - "label_cols": ["label"], - "batch_size": 1, - "max_samples": 100 - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy":true, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/bert_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json b/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json deleted file mode 100644 index 
7c4e6341b8..0000000000 --- a/perf_monitoring/perf_models/cpu_models/bertweet_cpu_config.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "input_model": { - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "finiteautomata/bertweet-base-sentiment-analysis", - "task": "text-classification", - "dataset": { - "data_name":"cardiffnlp/tweet_sentiment_multilingual", - "subset": "english", - "split": "test", - "input_cols": ["text"], - "label_cols": ["label"], - "batch_size": 1, - "max_samples": 100 - } - } - - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy":true, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/bertweet_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json deleted file mode 100644 index a95ae83782..0000000000 --- a/perf_monitoring/perf_models/cpu_models/camembert_cpu_config.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "Jean-Baptiste/camembert-ner", - "task": "ner", - "dataset": { - "data_name":"Jean-Baptiste/wikiner_fr", - "split": "test", - "input_cols": ["tokens"], - 
"label_cols": ["ner_tags"], - "batch_size": 1, - "max_samples": 100 - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "sub_types": [ - { - "name": "accuracy_score", - "priority": 1, - "metric_config": { - "task": "multiclass", - "num_classes": "5", - "top_k": 1 - } - } - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy": true, - "log_severity_level": 0, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/camembert_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json b/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json deleted file mode 100644 index 3d5eb2bb91..0000000000 --- a/perf_monitoring/perf_models/cpu_models/distilbert_cpu_config.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "distilbert-base-uncased-finetuned-sst-2-english", - "task": "text-classification", - "dataset": { - "data_name":"glue", - "subset": "sst2", - "split": "validation", - "input_cols": ["sentence"], - "label_cols": ["label"], - "batch_size": 1, - "max_samples": 100 - } - } - } - }, - - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": 
"latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy": true, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/distilbert_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json b/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json deleted file mode 100644 index 42eaa1c8af..0000000000 --- a/perf_monitoring/perf_models/cpu_models/microsoft-deberta_cpu_config.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "microsoft/deberta-base-mnli", - "task": "text-classification", - "dataset": { - "data_name":"glue", - "subset": "mnli_matched", - "split": "validation", - "input_cols": ["premise"], - "label_cols": ["label"], - "batch_size": 1, - "max_samples": 100 - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - 
"quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy": true, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/microsoft-deberta_workflow_cpu" - } -} diff --git a/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json b/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json deleted file mode 100644 index 226e626bf6..0000000000 --- a/perf_monitoring/perf_models/cpu_models/roberta_cpu_config.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "input_model":{ - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "roberta-large-mnli", - "task": "text-classification", - "dataset": { - "data_name":"glue", - "subset": "mnli_matched", - "split": "validation", - "input_cols": ["premise"], - "label_cols": ["label"], - "batch_size": 1, - "max_samples": 50 - } - } - } - }, - "evaluators": { - "common_evaluator": { - "metrics":[ - { - "name": "accuracy", - "type": "accuracy", - "backend": "huggingface_metrics", - "sub_types": [ - {"name": "accuracy", "priority": 1} - - ] - }, - { - "name": "latency", - "type": "latency", - "sub_types": [ - {"name": "avg", "priority": 2} - ] - } - ] - } - }, - "passes": { - "conversion": { - "type": "OnnxConversion", - "config": { - "target_opset": 13 - } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } - }, - "quantization": { - "type": "OnnxQuantization" - }, - "perf_tuning": { - "type": "OrtPerfTuning" - } - }, - "engine": { - "search_strategy":true, - "evaluator": "common_evaluator", - "execution_providers": ["CPUExecutionProvider"], - "cache_dir": "cache", - "output_dir" : "models/roberta_workflow_cpu" - } -} diff --git a/perf_monitoring/readme.md b/perf_monitoring/readme.md deleted file mode 100644 index 
ae6a9f458b..0000000000 --- a/perf_monitoring/readme.md +++ /dev/null @@ -1,111 +0,0 @@ -# Performance Monitoring in Azure DevOps with Olive - -Contains Python scripts and Azure DevOps YAML files for performance monitoring of Olive models. The scripts and YAML files help you compare the performance of different models and ensure no regression occurs over time. - -## Contents - -### Azure DevOps YAML Files - -The YAML files define Azure DevOps pipelines for automated testing and performance monitoring. - -### -olive-perf-monitoring-template.yaml - -This YAML file defines a pipeline template for performance monitoring of the models. It uses Python 3.8, installs Olive, runs the performance monitoring script, detects and registers components, publishes test results, and cleans up. - -### -perfmonitoring-ci.yaml - -This YAML file defines a CI pipeline triggered on changes to the main branch, excluding changes only to documentation and README files. It runs the performance monitoring template for several models on both Windows and Linux environments. - -### Python Scripts -The Python scripts utils.py and test_perf_monitoring.py perform the main tasks of model performance comparison. They load model configurations, run the models, and compare the performance metrics. - -The utils.py script contains several utility functions, including functions to: - --Patch the model configuration JSON file - --Extract the best metrics from the model's performance footprint - --Compare the metrics against previous best metrics - --Assert whether performance has not regressed - --The test_perf_monitoring.py script uses pytest to set up a testing environment, run the model, and check the results. - -### -Bash and Batch Scripts -Two additional scripts, perf_monitoring.sh for Unix-like systems and perf_monitoring.bat for Windows, are used for the performance monitoring tasks. 
They perform the following tasks: - --Set environment variables based on pipeline parameters - --Activate the Python virtual environment if in a pipeline - --Install pytest - --Install the necessary Python packages for performance monitoring - --Run the performance monitoring Python script with pytest and capture the results - - -## Models - -The performance monitoring setup utilizes the following models, each configured via their respective JSON files: - -1. **BERT** (`bert_cpu_config.json`): BERT (Bidirectional Encoder Representations from Transformers) is a state-of-the-art transformer model for a wide range of NLP tasks. - -2. **RoBERTa** (`roberta_cpu_config.json`): RoBERTa is a variant of BERT that uses a different training approach for improved performance. - -3. **DeBERTa** (`microsoft-deberta_cpu_config.json`): DeBERTa (Decoding-enhanced BERT with disentangled attention) improves the BERT and RoBERTa models through a two-step disentangled attention mechanism. - -4. **CamemBERT** (`camembert_cpu_config.json`): A BERT-based model specifically trained for French language tasks. - -5. **DistilBERT** (`distilbert_cpu_config.json`): DistilBERT is a smaller, faster, cheaper, lighter version of BERT, trained using knowledge distillation. - -6. **BERTweet** (`bertweet_cpu_config.json`): BERTweet is a BERT-based model specifically fine-tuned for English Twitter sentiment analysis tasks. - -Each JSON file contains configurations for the input model, evaluators, passes, and the engine. - -The **input_model** section specifies the type of model (PyTorchModel in these cases), and the model's configuration, including the Hugging Face (hf) configuration for the model name, task, and dataset. - -The **evaluators** section defines the metrics to be used for evaluation, such as accuracy and latency. - -The **passes** section includes the type and configuration of optimization and conversion processes to be performed on the model. 
- -The **engine** section specifies the search strategy for performance tuning, the evaluator to use, the execution providers, and the directories for caching and output. - - -## USAGE (Locally and on CI) - -**On CI Pipeline** - -Set up your Python environment with Python 3.8 and the necessary packages. - -Define the necessary environment variables, pools, and connection strings in your Azure DevOps environment. - -Adjust the paths and model names in the Python scripts and YAML files to match your specific requirements. - -Manually start the pipelines, or push a change to your repository to trigger them automatically. - -Check the results in the Azure DevOps portal. - -**Running the Scripts Locally** - -After setting up your environment and familiarizing yourself with the configuration files, you can run the performance monitoring script: - -```bash -python -m pytest -v -s test_perf_monitoring_models_cpu.py -``` - -This command starts pytest, which runs all the test cases in the test_perf_monitoring_models_cpu.py script. - - -## Comparison Metrics and Process -Performance comparison takes into account the following metrics: - -**Accuracy** - Calculated as the number of correct predictions divided by the total number of predictions. - -**Latency** - Measured in seconds, calculated as the time taken to run the model on a specific test set. - -The metrics from each model run are compared with a set of reference metrics stored in a **best_metrics.json** file using the **compared_metrics** function in **utils.py**. This function calculates the percentage change in each metric relative to the stored "best" metrics. - -If the function detects a regression (increase in latency or decrease in accuracy) exceeding a predefined threshold, it raises an error, causing the Azure pipeline to fail. - -The **best_metrics.json** file is updated with the new metrics if the function does not detect a regression. 
diff --git a/perf_monitoring/requirements.txt b/perf_monitoring/requirements.txt deleted file mode 100644 index 5826720bcd..0000000000 --- a/perf_monitoring/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -datasets -evaluate -onnxruntime -scipy -scikit-learn -transformers -sentencepiece -evaluate -seqeval -emoji==0.6.0 -psutil diff --git a/perf_monitoring/test_perf_monitoring_models_cpu.py b/perf_monitoring/test_perf_monitoring_models_cpu.py deleted file mode 100644 index d803a8014c..0000000000 --- a/perf_monitoring/test_perf_monitoring_models_cpu.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -from pathlib import Path - -import pytest -from utils import extract_best_models, patch_config - - -@pytest.fixture(scope="module", autouse=True) -def setup(): - """setup any state specific to the execution of the given module.""" - cur_dir = Path(__file__).resolve().parent.parent - example_dir = cur_dir / "perf_monitoring" - os.chdir(example_dir) - yield - os.chdir(cur_dir) - - -def test_models(): - - model_name = os.environ["TEST_MODEL"] - olive_json = f"perf_models/cpu_models/{model_name}_cpu_config.json" - print(olive_json) - from olive.workflows import run as olive_run - - olive_config = patch_config(olive_json) - footprint = olive_run(olive_config) - extract_best_models(footprint, model_name) diff --git a/perf_monitoring/utils.py b/perf_monitoring/utils.py deleted file mode 100644 index 231f72940b..0000000000 --- a/perf_monitoring/utils.py +++ /dev/null @@ -1,115 +0,0 @@ -import json - - -def patch_config(config_json_path: str): - """Load the config json file and patch it with default search algorithm (exhaustive)""" - with open(config_json_path, "r") as fin: - olive_config = json.load(fin) - # set default logger severity - olive_config["engine"]["log_severity_level"] = 0 - # set clean cache - olive_config["engine"]["clean_cache"] = True - return olive_config - - -def extract_best_models(footprint, model_name): - print("Footprint: ", footprint) - footprint = 
list(footprint.values())[0] - print( - "Footprint: ", - footprint, - ) - metrics_of_interest = ["accuracy-accuracy", "accuracy-accuracy_score", "latency-avg"] - # gather the metrics from all pareto frontier nodes - all_metrics = [] - # we iterate over the nodes in the pareto frontier - for node in footprint.nodes.values(): - metrics = [] - # collecting the metrics of interest - for name in metrics_of_interest: - # (value of metric * direction of comparison) - # now higher is better for all metrics - if name in node.metrics.value: - metrics.append(node.metrics.value[name].value * node.metrics.cmp_direction[name]) - all_metrics.append(metrics) - # sort the metrics - # this sorts it - sorted_metrics = sorted(all_metrics, reverse=True) - # get best metrics - # last one is the best - best_metrics = sorted_metrics[0] - print("Best metrics: ", best_metrics) - compared_metric = compare_metrics(best_metrics, model_name) - print("Compared metrics: ", compared_metric) - compared_new_input_metric = compare_input_metrics(best_metrics, model_name) - print("Compared new input metrics: ", compared_new_input_metric) - - -def no_regression(actual, expected, rel_tol): # check for tolerance - if actual > expected: - return True - return abs(actual - expected) <= rel_tol * abs(expected) - - -def compare_metrics(best_metrics, model_name): - # open best metrics json - with open("best_metrics.json") as f: - data = json.load(f) - - if model_name in data: - model_data = data[model_name] - if len(model_data) == 0: - print("No data in best_metrics.json") - return {"accuracy": True, "latency": True, "accuracy_percentage_change": 0, "latency_percentage_change": 0} - print(model_data[0], model_data[1]) - print(best_metrics[0], best_metrics[1]) - - accuracy_percentage_change = ((best_metrics[0] - model_data[0]) / model_data[0]) * 100 - latency_percentage_change = -((best_metrics[1] - model_data[1]) / model_data[1]) * 100 - - comparison_result = { - "accuracy": no_regression(best_metrics[0], 
model_data[0], 0.09), - "latency": no_regression(best_metrics[1], model_data[1], 0.095), - "accuracy_percentage_change": accuracy_percentage_change, - "latency_percentage_change": latency_percentage_change, - } - - # Assert that both accuracy and latency are True - assert comparison_result["accuracy"], "accuracy must be True" - assert comparison_result["latency"], "latency must be True" - - else: - print(f"{model_name} not found in best_metrics.json, creating new entry...") - data[model_name] = best_metrics - comparison_result = { - "accuracy": True, - "latency": True, - "accuracy_percentage_change": 0, - "latency_percentage_change": 0, - } - - # Save the updated data back to the file - with open("best_metrics.json", "w") as f: - json.dump(data, f, indent=4) - return comparison_result - - -def compare_input_metrics(best_metrics, model_name): - with open(f"models/{model_name}_workflow_cpu/cpu-cpu_input_model_metrics.json") as f: - data = json.load(f) - if "accuracy-accuracy" in data: - accuracy = data["accuracy-accuracy"]["value"] - else: - accuracy = data["accuracy-accuracy_score"]["value"] - - latency = data["latency-avg"]["value"] * -1 - # print latency and accuracy values on next line - print("accuracy: ", accuracy, "latency: ", latency) - accuracy_percentage_change = ((best_metrics[0] - accuracy) / accuracy) * 100 - latency_percentage_change = -((best_metrics[1] - latency) / latency) * 100 - - comparison_result = { - "accuracy_percentage_change": accuracy_percentage_change, - "latency_percentage_change": latency_percentage_change, - } - return comparison_result From d9a40b3fd2a52e604960f69bc13f256ae5e84114 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 10 Oct 2023 06:28:47 +0000 Subject: [PATCH 58/67] update config --- .azure_pipelines/performance_check/configs/bert.json | 8 +------- .azure_pipelines/performance_check/configs/bert_gpu.json | 3 --- .azure_pipelines/performance_check/configs/deberta.json | 8 +------- 
.../performance_check/configs/deberta_gpu.json | 3 --- .../performance_check/configs/distilbert.json | 8 +------- .../performance_check/configs/distilbert_gpu.json | 3 --- .../performance_check/configs/roberta_large.json | 8 +------- .../performance_check/configs/roberta_large_gpu.json | 3 --- 8 files changed, 4 insertions(+), 40 deletions(-) diff --git a/.azure_pipelines/performance_check/configs/bert.json b/.azure_pipelines/performance_check/configs/bert.json index e00801760d..40452bd5a7 100644 --- a/.azure_pipelines/performance_check/configs/bert.json +++ b/.azure_pipelines/performance_check/configs/bert.json @@ -47,13 +47,7 @@ }, "transformers_optimization": { "type": "OrtTransformersOptimization", - "disable_search": true, - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } + "disable_search": true }, "quantization": { "type": "OnnxQuantization", diff --git a/.azure_pipelines/performance_check/configs/bert_gpu.json b/.azure_pipelines/performance_check/configs/bert_gpu.json index b6e23fb929..bf809bf691 100644 --- a/.azure_pipelines/performance_check/configs/bert_gpu.json +++ b/.azure_pipelines/performance_check/configs/bert_gpu.json @@ -49,9 +49,6 @@ "type": "OrtTransformersOptimization", "disable_search": true, "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, "float16": true } }, diff --git a/.azure_pipelines/performance_check/configs/deberta.json b/.azure_pipelines/performance_check/configs/deberta.json index 8bd3f94bff..624a3a9cad 100644 --- a/.azure_pipelines/performance_check/configs/deberta.json +++ b/.azure_pipelines/performance_check/configs/deberta.json @@ -53,13 +53,7 @@ }, "transformers_optimization": { "type": "OrtTransformersOptimization", - "disable_search": true, - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } + "disable_search": true }, "quantization": { "type": "OnnxQuantization", diff --git 
a/.azure_pipelines/performance_check/configs/deberta_gpu.json b/.azure_pipelines/performance_check/configs/deberta_gpu.json index 9333761689..c0a4ba8a83 100644 --- a/.azure_pipelines/performance_check/configs/deberta_gpu.json +++ b/.azure_pipelines/performance_check/configs/deberta_gpu.json @@ -55,9 +55,6 @@ "type": "OrtTransformersOptimization", "disable_search": true, "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, "float16": true } }, diff --git a/.azure_pipelines/performance_check/configs/distilbert.json b/.azure_pipelines/performance_check/configs/distilbert.json index 005a2f5c0e..4a9dbecb08 100644 --- a/.azure_pipelines/performance_check/configs/distilbert.json +++ b/.azure_pipelines/performance_check/configs/distilbert.json @@ -48,13 +48,7 @@ }, "transformers_optimization": { "type": "OrtTransformersOptimization", - "disable_search": true, - "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } + "disable_search": true }, "quantization": { "type": "OnnxQuantization", diff --git a/.azure_pipelines/performance_check/configs/distilbert_gpu.json b/.azure_pipelines/performance_check/configs/distilbert_gpu.json index 051b8cd40e..28096bdd99 100644 --- a/.azure_pipelines/performance_check/configs/distilbert_gpu.json +++ b/.azure_pipelines/performance_check/configs/distilbert_gpu.json @@ -49,9 +49,6 @@ "type": "OrtTransformersOptimization", "disable_search": true, "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, "float16": true } }, diff --git a/.azure_pipelines/performance_check/configs/roberta_large.json b/.azure_pipelines/performance_check/configs/roberta_large.json index 1b489893b0..2d2fd8fe7f 100644 --- a/.azure_pipelines/performance_check/configs/roberta_large.json +++ b/.azure_pipelines/performance_check/configs/roberta_large.json @@ -52,13 +52,7 @@ }, "transformers_optimization": { "type": "OrtTransformersOptimization", - "disable_search": true, - "config": 
{ - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, - "float16": false - } + "disable_search": true }, "quantization": { "type": "OnnxQuantization", diff --git a/.azure_pipelines/performance_check/configs/roberta_large_gpu.json b/.azure_pipelines/performance_check/configs/roberta_large_gpu.json index 6700088b0d..ef9b2f1bef 100644 --- a/.azure_pipelines/performance_check/configs/roberta_large_gpu.json +++ b/.azure_pipelines/performance_check/configs/roberta_large_gpu.json @@ -54,9 +54,6 @@ "type": "OrtTransformersOptimization", "disable_search": true, "config": { - "model_type": "bert", - "num_heads": 12, - "hidden_size": 768, "float16": true } }, From a1379cc553a9760926be1cece91fe17939713bc2 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 10 Oct 2023 08:37:22 +0000 Subject: [PATCH 59/67] Update hf model map --- olive/model/hf_mappings.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/olive/model/hf_mappings.py b/olive/model/hf_mappings.py index 71eeea8965..754c2b2420 100644 --- a/olive/model/hf_mappings.py +++ b/olive/model/hf_mappings.py @@ -72,14 +72,16 @@ # To extend following list/map from huggingface config # there is the priority order: NUM_HEADS_NAMES[0] and HIDDEN_SIZE_NAMES[0] are the first choice # which means user can override the value in config file -NUM_HEADS_NAMES = ["num_heads", "num_attention_heads", "n_head", "encoder_attention_heads"] -HIDDEN_SIZE_NAMES = ["hidden_size", "d_model", "n_embd"] +NUM_HEADS_NAMES = ["num_heads", "num_attention_heads", "n_head", "n_heads", "encoder_attention_heads"] +HIDDEN_SIZE_NAMES = ["hidden_size", "dim", "d_model", "n_embd"] MODEL_TYPE_MAPPING = { "whisper": "bart", "camembert": "bert", "deberta": "bert", "deberta-v2": "bert", + "distilbert": "bert", "gpt_neox": "gpt2", "gpt-j": "gpt2", "llama": "gpt2", + "roberta": "bert", } From b8052649a9a463bc1f758e5a2be8eeacca686b68 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 10 Oct 2023 19:30:16 +0000 Subject: [PATCH 60/67] 
update config --- .azure_pipelines/performance_check/best_metrics.json | 2 +- .azure_pipelines/performance_check/configs/deberta.json | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure_pipelines/performance_check/best_metrics.json b/.azure_pipelines/performance_check/best_metrics.json index 0fa269b795..982d7a33b4 100644 --- a/.azure_pipelines/performance_check/best_metrics.json +++ b/.azure_pipelines/performance_check/best_metrics.json @@ -3,7 +3,7 @@ "cpu": { "accuracy-accuracy": 0.88, "latency-avg": { - "8272CL": 13.96, + "8272CL": 14.5, "E5-2673": 30.0, "8171M": 20.89 } diff --git a/.azure_pipelines/performance_check/configs/deberta.json b/.azure_pipelines/performance_check/configs/deberta.json index 624a3a9cad..5ccd9167e8 100644 --- a/.azure_pipelines/performance_check/configs/deberta.json +++ b/.azure_pipelines/performance_check/configs/deberta.json @@ -58,6 +58,7 @@ "quantization": { "type": "OnnxQuantization", "config": { + "quant_mode": "dynamic", "quant_preprocess": true, "data_config": "__input_model_data_config__" } From ed9a6f3f3e868fec619273f7c1c2c42bc6f815eb Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 10 Oct 2023 19:38:12 +0000 Subject: [PATCH 61/67] remove unused files --- .../olive-perf-monitoring-template.yaml | 58 ------------------- .azure_pipelines/perfmonitoring-ci.yaml | 57 ------------------ .../run_performance_check.py | 2 +- examples/bert/bert_cuda_gpu.json | 32 +--------- scripts/perf_monitoring.bat | 29 ---------- scripts/perf_monitoring.sh | 27 --------- 6 files changed, 2 insertions(+), 203 deletions(-) delete mode 100644 .azure_pipelines/job_templates/olive-perf-monitoring-template.yaml delete mode 100644 .azure_pipelines/perfmonitoring-ci.yaml delete mode 100644 scripts/perf_monitoring.bat delete mode 100644 scripts/perf_monitoring.sh diff --git a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml b/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml deleted file mode 100644 
index 7342b93ebf..0000000000 --- a/.azure_pipelines/job_templates/olive-perf-monitoring-template.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# Olive performance monitoring template on Azure DevOps - -parameters: - name: '' - pool: '' - -jobs: -- job: ${{parameters.name}}_Examples_performance_monitoring_olive - timeoutInMinutes: 300 - pool: - name: ${{ parameters.pool}} - strategy: - matrix: - ${{ insert }}: ${{ parameters.examples }} - variables: - WINDOWS: ${{ parameters.windows }} - runCodesignValidationInjection: false - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: 3.8 - displayName: Use Python 3.8 - - - script: python -m pip install .[$(device)] - displayName: Install Olive - - - task: AzureCLI@1 - inputs: - azureSubscription: $(OLIVE_RG_SERVICE_CONNECTION) - scriptLocation: 'inlineScript' - inlineScript: make perf-monitoring PIPELINE=True WINDOWS=$(WINDOWS) PERF_MONITORING_SCRIPT_NAME=perf_monitoring_models_cpu - displayName: performance monitoring - env: - OLIVEWHEELS_STORAGE_CONNECTION_STRING: $(olive-wheels-storage-connection-string) - WORKSPACE_SUBSCRIPTION_ID: $(workspace-subscription-id) - WORKSPACE_RESOURCE_GROUP: $(workspace-resource-group) - WORKSPACE_NAME: $(workspace-name) - TEST_MODEL: $(testModel) - - - - task: ComponentGovernanceComponentDetection@0 - inputs: - scanType: 'Register' - verbosity: 'Verbose' - alertWarningLevel: 'High' - displayName: Component Detection - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - testResultsFiles: '**/*TestOlive*.xml' - testRunTitle: '$(Build.BuildNumber)[$(Agent.JobName)]' - displayName: Upload pipeline run test results - - - script: make clean WINDOWS=$(WINDOWS) - condition: always() - displayName: Clean remaining artifacts diff --git a/.azure_pipelines/perfmonitoring-ci.yaml b/.azure_pipelines/perfmonitoring-ci.yaml deleted file mode 100644 index c4e8fa0102..0000000000 --- a/.azure_pipelines/perfmonitoring-ci.yaml +++ /dev/null @@ -1,57 +0,0 @@ -trigger: - batch: true - 
branches: - include: - - main - paths: - exclude: - - docs/* - - examples/README.md - - examples/**/README.md - - README.md - - CONTRIBUTING.md - - LICENSE -pr: none - - -variables: - ComponentDetection.Timeout: 2400 - -jobs: -- template: job_templates/olive-perf-monitoring-template.yaml - parameters: - name: Windows_CI - pool: $(OLIVE_POOL_WIN2019) - windows: True - examples: - bert: - testModel: bert - bertweet: - testModel: bertweet - camembert: - testModel: camembert - distilbert: - testModel: distilbert - microsoft-deberta: - testModel: microsoft-deberta - roberta: - testModel: roberta - -- template: job_templates/olive-perf-monitoring-template.yaml - parameters: - name: Linux_CI - pool: $(OLIVE_POOL_UBUNTU2004) - windows: False - examples: - bert: - testModel: bert - bertweet: - testModel: bertweet - camembert: - testModel: camembert - distilbert: - testModel: distilbert - microsoft-deberta: - testModel: microsoft-deberta - roberta: - testModel: roberta diff --git a/.azure_pipelines/performance_check/run_performance_check.py b/.azure_pipelines/performance_check/run_performance_check.py index a9e812a6be..a44348dde1 100644 --- a/.azure_pipelines/performance_check/run_performance_check.py +++ b/.azure_pipelines/performance_check/run_performance_check.py @@ -267,7 +267,7 @@ def regression_check(model_name, metrics, device, cpu_info): return metrics_of_interest = ["accuracy-accuracy", "latency-avg"] regression_res = {} - with open(best_metric_path) as f: + with best_metric_path.open("r") as f: best_metric_json = json.load(f) best_metrics = best_metric_json[model_name][device] for metric_name in metrics_of_interest: diff --git a/examples/bert/bert_cuda_gpu.json b/examples/bert/bert_cuda_gpu.json index 080f738e10..9e6a852cfe 100644 --- a/examples/bert/bert_cuda_gpu.json +++ b/examples/bert/bert_cuda_gpu.json @@ -16,30 +16,6 @@ } } }, - "systems": { - "aml_gpu_system": { - "type": "AzureML", - "config": { - "accelerators": ["GPU"], - "aml_compute": "gpu-cluster", - 
"aml_docker_config": { - "base_image": "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04", - "conda_file_path": "conda_gpu.yaml" - } - } - }, - "aml_arc_system": { - "type": "AzureML", - "config": { - "accelerators": ["NPU"], - "aml_compute": "olive-intel-npu", - "aml_docker_config": { - "base_image": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04", - "conda_file_path": "conda.yaml" - } - } - } - }, "evaluators": { "common_evaluator": { "metrics":[ @@ -98,15 +74,9 @@ "seed": 0 } }, - "packaging_config": { - "type": "Zipfile", - "name": "OutputModels" - }, "evaluator": "common_evaluator", "execution_providers": ["CUDAExecutionProvider"], - "host": "local_system", - "target": "local_system", "cache_dir": "cache", "output_dir" : "models/bert_cuda" } -} +} \ No newline at end of file diff --git a/scripts/perf_monitoring.bat b/scripts/perf_monitoring.bat deleted file mode 100644 index 11b008d251..0000000000 --- a/scripts/perf_monitoring.bat +++ /dev/null @@ -1,29 +0,0 @@ -REM ------------------------------------------------------------------------- -REM Copyright (c) Microsoft Corporation. All rights reserved. -REM Licensed under the MIT License. -REM -------------------------------------------------------------------------- -@echo off - -set PIPELINE=%1 -set ROOT_DIR=%2 -set PERF_MONITORING_SCRIPT_NAME=%3 - -if "%PIPELINE%"=="True" ( - call olive-venv\\Scripts\\activate.bat || goto :error -) - -rem install pytest -call python -m pip install pytest - -rem performance monitoring -call echo "performance monitoring examples" -call python -m pip install -r %ROOT_DIR%\\perf_monitoring\\requirements.txt || goto :error - -call python -m pytest -v -s --log-cli-level=WARNING --junitxml=%ROOT_DIR%\\logs\\performance-monitoring-TestOlive.xml^ - %ROOT_DIR%\\perf_monitoring\\test_%PERF_MONITORING_SCRIPT_NAME%.py || goto :error - -goto :EOF - -:error -echo Failed with error #%errorlevel%. 
-exit /b %errorlevel% diff --git a/scripts/perf_monitoring.sh b/scripts/perf_monitoring.sh deleted file mode 100644 index 9ae4e00003..0000000000 --- a/scripts/perf_monitoring.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -set -eoux pipefail - -PIPELINE=$1 -ROOT_DIR=$2 -PERF_MONITORING_SCRIPT_NAME=$3 - - -echo $PIPELINE -if [[ "$PIPELINE" == "True" ]]; then - set +x - source olive-venv/bin/activate - set -x -fi - -# install pytest -python -m pip install pytest - -# performance monitoring -echo "performance monitoring examples" -python -m pip install -r $ROOT_DIR/perf_monitoring/requirements.txt - -python -m pytest -v -s --log-cli-level=WARNING --junitxml=$ROOT_DIR/logs/performance-monitoring-TestOlive.xml $ROOT_DIR/perf_monitoring/test_$PERF_MONITORING_SCRIPT_NAME.py From 19a1e92589224753d7950f97b93e4ba056f206dd Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 10 Oct 2023 19:49:38 +0000 Subject: [PATCH 62/67] update comments --- .azure_pipelines/performance_check/run_performance_check.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.azure_pipelines/performance_check/run_performance_check.py b/.azure_pipelines/performance_check/run_performance_check.py index a44348dde1..c61f5fe94c 100644 --- a/.azure_pipelines/performance_check/run_performance_check.py +++ b/.azure_pipelines/performance_check/run_performance_check.py @@ -271,10 +271,12 @@ def regression_check(model_name, metrics, device, cpu_info): best_metric_json = json.load(f) best_metrics = best_metric_json[model_name][device] for metric_name in metrics_of_interest: - # There are 3 types of cpus in our cpu pool + # There are 4 types of cpus in our cpu pool # Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz # Intel(R) Xeon(R) CPU 
E5-2673 v4 @ 2.30GHz - # Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz、 + # Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz + # ? (8370C) + # Need to collect the best metrics for each type of cpu best_metric = best_metrics[metric_name] if device == "cpu" and metric_name == "latency-avg": if "8272CL" in cpu_info: From e81c8bf8120b8ea0c23a6750bb8139c64ef2cf06 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 10 Oct 2023 22:01:50 +0000 Subject: [PATCH 63/67] remove deberta latency goal --- .azure_pipelines/performance_check/configs/deberta.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure_pipelines/performance_check/configs/deberta.json b/.azure_pipelines/performance_check/configs/deberta.json index 5ccd9167e8..847707936a 100644 --- a/.azure_pipelines/performance_check/configs/deberta.json +++ b/.azure_pipelines/performance_check/configs/deberta.json @@ -38,7 +38,7 @@ "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 1}} + {"name": "avg", "priority": 2} ] } ] From 9ae9387a45a2fbd2fd5026026c25033571e70cf2 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Wed, 11 Oct 2023 01:17:11 +0000 Subject: [PATCH 64/67] update metric --- .../performance_check/best_metrics.json | 2 +- .../performance_check/configs/bert.json | 21 ++-------------- olive/engine/engine.py | 25 ++++++++++--------- 3 files changed, 16 insertions(+), 32 deletions(-) diff --git a/.azure_pipelines/performance_check/best_metrics.json b/.azure_pipelines/performance_check/best_metrics.json index 982d7a33b4..7fe52b4c74 100644 --- a/.azure_pipelines/performance_check/best_metrics.json +++ b/.azure_pipelines/performance_check/best_metrics.json @@ -3,7 +3,7 @@ "cpu": { "accuracy-accuracy": 0.88, "latency-avg": { - "8272CL": 14.5, + "8272CL": 15.5, "E5-2673": 30.0, "8171M": 20.89 } diff --git a/.azure_pipelines/performance_check/configs/bert.json b/.azure_pipelines/performance_check/configs/bert.json index 
40452bd5a7..dac4a3d52d 100644 --- a/.azure_pipelines/performance_check/configs/bert.json +++ b/.azure_pipelines/performance_check/configs/bert.json @@ -25,14 +25,14 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.05}} + {"name": "accuracy", "priority": 1} ] }, { "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 10}} + {"name": "avg", "priority": 2} ] } ] @@ -44,23 +44,6 @@ "config": { "target_opset": 13 } - }, - "transformers_optimization": { - "type": "OrtTransformersOptimization", - "disable_search": true - }, - "quantization": { - "type": "OnnxQuantization", - "config": { - "quant_preprocess": true, - "data_config": "__input_model_data_config__" - } - }, - "perf_tuning": { - "type": "OrtPerfTuning", - "config": { - "data_config": "__input_model_data_config__" - } } }, "engine": { diff --git a/olive/engine/engine.py b/olive/engine/engine.py index 9a0102a9de..732145e5a8 100644 --- a/olive/engine/engine.py +++ b/olive/engine/engine.py @@ -748,18 +748,19 @@ def resolve_goals( for sub_type_name, goal in sub_type_goals.items(): # TODO(trajep): make the logic cleaner resolved_goal_value = None - baseline_sub_type = baseline.get_value(metric_name, sub_type_name) - multiplier = multipliers[metric_name][sub_type_name] - if goal.type == "threshold": - resolved_goal_value = goal.value - elif goal.type == "max-degradation": - resolved_goal_value = baseline_sub_type - multiplier * goal.value - elif goal.type == "min-improvement": - resolved_goal_value = baseline_sub_type + multiplier * goal.value - elif goal.type == "percent-max-degradation": - resolved_goal_value = baseline_sub_type * (1 - multiplier * goal.value / 100) - elif goal.type == "percent-min-improvement": - resolved_goal_value = baseline_sub_type * (1 + multiplier * goal.value / 100) + if goal is not None: + 
baseline_sub_type = baseline.get_value(metric_name, sub_type_name) + multiplier = multipliers[metric_name][sub_type_name] + if goal.type == "threshold": + resolved_goal_value = goal.value + elif goal.type == "max-degradation": + resolved_goal_value = baseline_sub_type - multiplier * goal.value + elif goal.type == "min-improvement": + resolved_goal_value = baseline_sub_type + multiplier * goal.value + elif goal.type == "percent-max-degradation": + resolved_goal_value = baseline_sub_type * (1 - multiplier * goal.value / 100) + elif goal.type == "percent-min-improvement": + resolved_goal_value = baseline_sub_type * (1 + multiplier * goal.value / 100) resolved_goals[joint_metric_key(metric_name, sub_type_name)] = resolved_goal_value if len(resolved_goals) > 0: From 843384bf50aadefe00f1283432d64cccaf8e0b48 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang Date: Wed, 11 Oct 2023 13:12:27 -0700 Subject: [PATCH 65/67] update metric value --- .azure_pipelines/performance_check/best_metrics.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.azure_pipelines/performance_check/best_metrics.json b/.azure_pipelines/performance_check/best_metrics.json index 7fe52b4c74..022c7295a5 100644 --- a/.azure_pipelines/performance_check/best_metrics.json +++ b/.azure_pipelines/performance_check/best_metrics.json @@ -4,7 +4,7 @@ "accuracy-accuracy": 0.88, "latency-avg": { "8272CL": 15.5, - "E5-2673": 30.0, + "E5-2673": 44.5, "8171M": 20.89 } }, @@ -33,13 +33,13 @@ "latency-avg": { "8272CL": 4.5, "E5-2673": 11.0, - "8171M": 5.6, + "8171M": 5.9, "8370C": 4.57 } }, "gpu": { "accuracy-accuracy": 0.94, - "latency-avg": 0.831 + "latency-avg": 0.91 } }, "roberta_large": { From d3767500a6df876a5b1fa3d71af55f94280138e1 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang Date: Wed, 11 Oct 2023 13:19:01 -0700 Subject: [PATCH 66/67] update log --- .../performance_check/configs/bert.json | 29 +++++++-- .../run_performance_check.py | 5 +- Makefile | 61 ------------------- 
examples/bert/bert_cuda_gpu.json | 2 +- 4 files changed, 28 insertions(+), 69 deletions(-) delete mode 100644 Makefile diff --git a/.azure_pipelines/performance_check/configs/bert.json b/.azure_pipelines/performance_check/configs/bert.json index dac4a3d52d..4544781596 100644 --- a/.azure_pipelines/performance_check/configs/bert.json +++ b/.azure_pipelines/performance_check/configs/bert.json @@ -25,15 +25,13 @@ "type": "accuracy", "backend": "huggingface_metrics", "sub_types": [ - {"name": "accuracy", "priority": 1} - ] + {"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.05}} ] }, { "name": "latency", "type": "latency", "sub_types": [ - {"name": "avg", "priority": 2} - ] + {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 5}} ] } ] } @@ -44,6 +42,29 @@ "config": { "target_opset": 13 } + }, + "transformers_optimization": { + "type": "OrtTransformersOptimization", + "disable_search": true, + "config": { + "model_type": "bert", + "num_heads": 12, + "hidden_size": 768, + "float16": false + } + }, + "quantization": { + "type": "OnnxQuantization", + "config": { + "quant_preprocess": true, + "data_config": "__input_model_data_config__" + } + }, + "perf_tuning": { + "type": "OrtPerfTuning", + "config": { + "data_config": "__input_model_data_config__" + } } }, "engine": { diff --git a/.azure_pipelines/performance_check/run_performance_check.py b/.azure_pipelines/performance_check/run_performance_check.py index c61f5fe94c..9e217636d7 100644 --- a/.azure_pipelines/performance_check/run_performance_check.py +++ b/.azure_pipelines/performance_check/run_performance_check.py @@ -159,7 +159,6 @@ def run_with_config(tool, olive_config, metric_res): print(f"Start evaluating {tool} model") outputs = olive_run(olive_config) if tool == "olive": - print(next(iter(outputs.values())).nodes.values()) metric = str(next(iter(next(iter(outputs.values())).nodes.values())).metrics.value) else: metric = 
str(next(iter(outputs.values()))) @@ -240,12 +239,12 @@ def run_perf_comparison(cur_dir, model_name, device, model_root_path, test_num): olive_config = f"{model_name}.json" if device == "cpu" else f"{model_name}_gpu.json" olive_config_path = cur_dir / "configs" / olive_config run_with_config("olive", olive_config_path, metric_res) - print(metric_res) + print(f"All metric results {metric_res}") for model, v in metric_res.items(): for metric_name, metric_value_list in v.items(): vsum = sum(float(v) for v in metric_value_list) metric_res[model][metric_name] = round((vsum / len(metric_value_list)), 4) - print(metric_res) + print(f"Avg metric results {metric_res}") return metric_res diff --git a/Makefile b/Makefile deleted file mode 100644 index 5a15c12c2c..0000000000 --- a/Makefile +++ /dev/null @@ -1,61 +0,0 @@ -WINDOWS ?= False -PIPELINE ?= False -INSTALL_DEV_MODE ?= False -EXAMPLE_FOLDER ?= -EXAMPLE_NAME ?= -PERF_MONITORING_SCRIPT_NAME ?= -INSTALL_EXTRAS ?= -VERSION ?= -ifeq ($(WINDOWS), True) - CURRENT_DIR = "$(subst /,\\,${CURDIR})" - MKDIR_LOG_CMD = mkdir logs | exit 0 - INSTALL_OLIVE_CMD = "scripts\\install_olive.bat" - TEST_CMD = "scripts\\test.bat" - TEST_EXAMPLES_CMD = "scripts\\test_examples.bat" - PERFORMANCE_MONITORING_CMD = "scripts\\perf_monitoring.bat" - OVERWRITE_VERSION = "python scripts\\overwrite_version.py --version $(VERSION)" -else - CURRENT_DIR = ${CURDIR} - MKDIR_LOG_CMD = mkdir -p logs - INSTALL_OLIVE_CMD = bash scripts/install_olive.sh - TEST_CMD = bash scripts/test.sh - TEST_EXAMPLES_CMD = bash scripts/test_examples.sh - PERFORMANCE_MONITORING_CMD = bash scripts/perf_monitoring.sh - OVERWRITE_VERSION = python scripts/overwrite_version.py --version $(VERSION) -endif - -.PHONY: all -all: - @echo "Please specify your command. Options: install-olive, test-examples, clean." 
- -logs/: - $(MKDIR_LOG_CMD) - -.PHONY: overwrite-version -overwrite-version: - $(OVERWRITE_VERSION) - -.PHONY: install-olive -install-olive: - $(INSTALL_OLIVE_CMD) $(PIPELINE) $(INSTALL_DEV_MODE) - -.PHONY: unit_test -unit_test: - $(TEST_CMD) $(PIPELINE) $(CURRENT_DIR) unit_test - -.PHONY: integ_test -integ_test: - $(TEST_CMD) $(PIPELINE) $(CURRENT_DIR) integ_test - -.PHONY: test-examples -test-examples: logs/ -test-examples: - $(TEST_EXAMPLES_CMD) $(PIPELINE) $(CURRENT_DIR) $(EXAMPLE_FOLDER) $(EXAMPLE_NAME) - -.PHONY: clean -clean: - git clean -dfX - -.PHONY: perf-monitoring -perf-monitoring: - $(PERFORMANCE_MONITORING_CMD) $(PIPELINE) $(CURRENT_DIR) $(PERF_MONITORING_SCRIPT_NAME) diff --git a/examples/bert/bert_cuda_gpu.json b/examples/bert/bert_cuda_gpu.json index 9e6a852cfe..bdd9f25748 100644 --- a/examples/bert/bert_cuda_gpu.json +++ b/examples/bert/bert_cuda_gpu.json @@ -79,4 +79,4 @@ "cache_dir": "cache", "output_dir" : "models/bert_cuda" } -} \ No newline at end of file +} From 432d4f579271468e25770a0af906fa2e8e5eabb9 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang Date: Wed, 11 Oct 2023 15:54:13 -0700 Subject: [PATCH 67/67] update data for new cpu --- .azure_pipelines/performance_check/best_metrics.json | 9 ++++++--- .../performance_check/run_performance_check.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.azure_pipelines/performance_check/best_metrics.json b/.azure_pipelines/performance_check/best_metrics.json index 022c7295a5..36890b63be 100644 --- a/.azure_pipelines/performance_check/best_metrics.json +++ b/.azure_pipelines/performance_check/best_metrics.json @@ -5,7 +5,8 @@ "latency-avg": { "8272CL": 15.5, "E5-2673": 44.5, - "8171M": 20.89 + "8171M": 20.89, + "8370C": 18 } }, "gpu": { @@ -19,7 +20,8 @@ "latency-avg": { "8272CL": 57.74, "E5-2673": 117.00, - "8171M": 93.37 + "8171M": 93.37, + "8370C": 0 } }, "gpu": { @@ -48,7 +50,8 @@ "latency-avg": { "8272CL": 52.38, "E5-2673": 140.34, - "8171M": 69.33 + "8171M": 69.33, + "8370C": 
0 } }, "gpu": { diff --git a/.azure_pipelines/performance_check/run_performance_check.py b/.azure_pipelines/performance_check/run_performance_check.py index 9e217636d7..99439a7207 100644 --- a/.azure_pipelines/performance_check/run_performance_check.py +++ b/.azure_pipelines/performance_check/run_performance_check.py @@ -274,7 +274,7 @@ def regression_check(model_name, metrics, device, cpu_info): # Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz # Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz # Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz - # ? (8370C) + # Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz # Need to collect the best metrics for each type of cpu best_metric = best_metrics[metric_name] if device == "cpu" and metric_name == "latency-avg":