diff --git a/README.md b/README.md index 91e4d21b22..e0ea89bcc3 100644 --- a/README.md +++ b/README.md @@ -139,12 +139,12 @@ deepsparse.benchmark [-h] [-b BATCH_SIZE] [-shapes INPUT_SHAPES] ## 👩‍💻 NLP Inference Example ```python -from deepsparse.transformers import pipeline +from deepsparse import Pipeline # SparseZoo model stub or path to ONNX file model_path = "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni" -qa_pipeline = pipeline( +qa_pipeline = Pipeline.create( task="question-answering", model_path=model_path, ) diff --git a/setup.py b/setup.py index 94d8cfc70c..8237ae3a29 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ _deps = [ "numpy>=1.16.3", "onnx>=1.5.0,<=1.10.1", + "pydantic>=1.8.2", "requests>=2.0.0", "tqdm>=4.0.0", "protobuf>=3.12.2", @@ -74,13 +75,22 @@ "uvicorn>=0.15.0", "fastapi>=0.70.0", "starlette>=0.16.0", - "pydantic>=1.8.2", "requests>=2.26.0", ] _onnxruntime_deps = [ "onnxruntime>=1.7.0", ] +_ic_integration_deps = [ + "click<8.1", + "opencv-python", +] + +_yolo_integration_deps = [ + "torchvision>=0.3.0,<=0.10.1", + "opencv-python", +] + class OverrideInstall(install): """ @@ -173,12 +183,15 @@ def _setup_extras() -> Dict: "dev": _dev_deps, "server": _server_deps, "onnxruntime": _onnxruntime_deps, + "image_classification": _ic_integration_deps, + "yolo": _yolo_integration_deps, } def _setup_entry_points() -> Dict: data_api_entrypoint = "deepsparse.transformers.pipelines_cli:cli" eval_downstream = "deepsparse.transformers.eval_downstream:main" + return { "console_scripts": [ f"deepsparse.transformers.run_inference={data_api_entrypoint}", @@ -187,6 +200,7 @@ def _setup_entry_points() -> Dict: "deepsparse.check_hardware=deepsparse.cpu:print_hardware_capability", "deepsparse.benchmark=deepsparse.benchmark.benchmark_model:main", "deepsparse.server=deepsparse.server.main:start_server", + "deepsparse.object_detection.annotate=deepsparse.yolo.annotate:main", ] } diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index 3d3113b74b..d9c28dc591 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -31,6 +31,7 @@ cpu_vnni_compatible, ) from .engine import * +from .pipeline import * from .version import __version__, is_release diff --git a/src/deepsparse/image_classification/__init__.py b/src/deepsparse/image_classification/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/src/deepsparse/image_classification/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/deepsparse/image_classification/constants.py b/src/deepsparse/image_classification/constants.py new file mode 100644 index 0000000000..d035e44513 --- /dev/null +++ b/src/deepsparse/image_classification/constants.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IMAGENET_RGB_MEANS = [0.485, 0.456, 0.406] +IMAGENET_RGB_STDS = [0.229, 0.224, 0.225] diff --git a/src/deepsparse/image_classification/pipelines.py b/src/deepsparse/image_classification/pipelines.py new file mode 100644 index 0000000000..e085937728 --- /dev/null +++ b/src/deepsparse/image_classification/pipelines.py @@ -0,0 +1,197 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Image classification pipeline +""" +import json +from typing import Dict, List, Optional, Tuple, Type, Union + +import numpy +import onnx + +from deepsparse.image_classification.constants import ( + IMAGENET_RGB_MEANS, + IMAGENET_RGB_STDS, +) +from deepsparse.image_classification.schemas import ( + ImageClassificationInput, + ImageClassificationOutput, +) +from deepsparse.pipeline import Pipeline +from deepsparse.utils import model_to_path + + +try: + import cv2 + + cv2_error = None +except ModuleNotFoundError as cv2_import_error: + cv2 = None + cv2_error = cv2_import_error + + +@Pipeline.register( + task="image_classification", + default_model_path=( + "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/" + "imagenet/pruned85_quant-none-vnni" + ), +) +class ImageClassificationPipeline(Pipeline): + """ + Image classification pipeline for DeepSparse + + :param model_path: path on local system or SparseZoo stub to load the model from + :param engine_type: inference engine to use. Currently supported values include + 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' + :param batch_size: static batch size to use for inference. Default is 1 + :param num_cores: number of CPU cores to allocate for inference engine. None + specifies all available cores. Default is None + :param scheduler: (deepsparse only) kind of scheduler to execute with. + Pass None for the default + :param input_shapes: list of shapes to set ONNX the inputs to. Pass None + to use model as-is. Default is None + :param alias: optional name to give this pipeline instance, useful when + inferencing with multiple models. Default is None + :param class_names: Optional dict, or json file of class names to use for + mapping class ids to class labels. 
Default is None + """ + + def __init__( + self, + *, + class_names: Union[None, str, Dict[str, str]] = None, + **kwargs, + ): + super().__init__(**kwargs) + + if isinstance(class_names, str) and class_names.endswith(".json"): + self._class_names = json.load(open(class_names)) + elif isinstance(class_names, dict): + self._class_names = class_names + else: + self._class_names = None + + self._image_size = self._infer_image_size() + + @property + def class_names(self) -> Optional[Dict[str, str]]: + """ + :return: Optional dict, or json file of class names to use for + mapping class ids to class labels + """ + return self._class_names + + @property + def input_schema(self) -> Type[ImageClassificationInput]: + """ + :return: pydantic model class that inputs to this pipeline must comply to + """ + return ImageClassificationInput + + @property + def output_schema(self) -> Type[ImageClassificationOutput]: + """ + :return: pydantic model class that outputs of this pipeline must comply to + """ + return ImageClassificationOutput + + def setup_onnx_file_path(self) -> str: + """ + Performs any setup to unwrap and process the given `model_path` and other + class properties into an inference ready onnx file to be compiled by the + engine of the pipeline + + :return: file path to the ONNX file for the engine to compile + """ + + return model_to_path(self.model_path) + + def process_inputs(self, inputs: ImageClassificationInput) -> List[numpy.ndarray]: + """ + Pre-Process the Inputs for DeepSparse Engine + + :param inputs: input model + :return: list of preprocessed numpy arrays + """ + + if isinstance(inputs.images, numpy.ndarray): + image_batch = inputs.images + else: + + image_batch = [] + + if isinstance(inputs.images, str): + inputs.images = [inputs.images] + + for image in inputs.images: + if cv2 is None: + raise RuntimeError( + "cv2 is required to load image inputs from file " + f"Unable to import: {cv2_error}" + ) + img = cv2.imread(image) if isinstance(image, str) else image + + img = cv2.resize(img, dsize=self._image_size) + img = img[:, :, ::-1].transpose(2, 0, 1) + image_batch.append(img) + + image_batch = numpy.stack(image_batch, axis=0) + + original_dtype = image_batch.dtype + image_batch = numpy.ascontiguousarray(image_batch, dtype=numpy.float32) + + if original_dtype == numpy.uint8: + + image_batch /= 255 + + # normalize entire batch + image_batch -= numpy.asarray(IMAGENET_RGB_MEANS).reshape((-1, 3, 1, 1)) + image_batch /= numpy.asarray(IMAGENET_RGB_STDS).reshape((-1, 3, 1, 1)) + + return [image_batch] + + def process_engine_outputs( + self, + engine_outputs: List[numpy.ndarray], + ) -> ImageClassificationOutput: + """ + :param engine_outputs: list of numpy arrays that are the output of the engine + forward pass + :return: outputs of engine post-processed into an object in the `output_schema` + format of this pipeline + """ + labels = numpy.argmax(engine_outputs[0], axis=1).tolist() + + if self.class_names is not None: + labels = [self.class_names[str(class_id)] for class_id in labels] + + return self.output_schema( + scores=numpy.max(engine_outputs[0], axis=1).tolist(), + labels=labels, + ) + + def _infer_image_size(self) -> Tuple[int, ...]: + """ + Infer and return the expected shape of the input tensor + + :return: The expected shape of the input tensor from onnx graph + """ + onnx_model = onnx.load(self.onnx_file_path) + input_tensor = onnx_model.graph.input[0] + return ( + input_tensor.type.tensor_type.shape.dim[2].dim_value, + input_tensor.type.tensor_type.shape.dim[3].dim_value, + ) 
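
For orientation, the `image_classification` task registered above can be exercised the same way as the README's NLP example. The following is a minimal usage sketch based on the schemas in this diff; the image filename is a hypothetical placeholder, and `class_names` may be omitted to get raw class ids back:

```python
from deepsparse import Pipeline

# uses the default SparseZoo stub registered for the task above;
# any local ONNX file or other stub may be passed via model_path
ic_pipeline = Pipeline.create(
    task="image_classification",
    class_names=None,  # or a dict / path to a JSON file mapping class ids to labels
)

# accepts a single file path, a list of file paths, or a numpy array of images
prediction = ic_pipeline(images=["sample_image.jpg"])  # hypothetical image file
print(prediction.labels, prediction.scores)
```
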
diff --git a/src/deepsparse/image_classification/schemas.py b/src/deepsparse/image_classification/schemas.py new file mode 100644 index 0000000000..5a92b90e3b --- /dev/null +++ b/src/deepsparse/image_classification/schemas.py @@ -0,0 +1,42 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Input/Output Schemas for Image Classification. +""" + +from typing import List, Union + +import numpy +from pydantic import BaseModel + + +class ImageClassificationInput(BaseModel): + """ + Input model for image classification + """ + + images: Union[str, numpy.ndarray, List[str]] + + class Config: + arbitrary_types_allowed = True + + +class ImageClassificationOutput(BaseModel): + """ + Output model for image classification + """ + + labels: List[Union[int, str]] + scores: List[float] diff --git a/src/deepsparse/image_classification/validation_script.py b/src/deepsparse/image_classification/validation_script.py new file mode 100644 index 0000000000..e176b4072c --- /dev/null +++ b/src/deepsparse/image_classification/validation_script.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Usage: validation_script.py [OPTIONS] + + Validation Script for Image Classification Models + +Options: + --dataset-path, --dataset_path DIRECTORY + Path to the validation dataset [required] + --model-path, --model_path TEXT + Path/SparseZoo stub for the Image + Classification model to be evaluated. + Defaults to resnet50 trained on + Imagenette [default: zoo:cv/classification/ + resnet_v1-50/pytorch/sparseml/imagenette/ + base-none] + --batch-size, --batch_size INTEGER + Test batch size, must divide the dataset + evenly, else the last batch will be dropped + [default: 1] + --help Show this message and exit. + +######### +EXAMPLES +######### + +########## +Example command for validating pruned resnet50 on imagenette dataset: +python validation_script.py \ + --dataset-path /path/to/imagenette/ + +""" +from tqdm import tqdm + +from deepsparse.pipeline import Pipeline +from torch.utils.data import DataLoader +from torchvision import transforms + + +try: + import torchvision + +except ModuleNotFoundError as torchvision_error: # noqa: F841 + print( + "Torchvision not installed. 
Please install it using the command:"
+        " pip install torchvision>=0.3.0,<=0.10.1"
+    )
+    exit(1)
+
+import click
+
+
+resnet50_imagenette_base = (
+    "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenette/base-none"
+)
+
+
+@click.command()
+@click.option(
+    "--dataset-path",
+    "--dataset_path",
+    required=True,
+    type=click.Path(dir_okay=True, file_okay=False),
+    help="Path to the validation dataset",
+)
+@click.option(
+    "--model-path",
+    "--model_path",
+    type=str,
+    default=resnet50_imagenette_base,
+    help="Path/SparseZoo stub for the Image Classification model to be "
+    "evaluated. Defaults to dense (vanilla) resnet50 trained on Imagenette",
+    show_default=True,
+)
+@click.option(
+    "--batch-size",
+    "--batch_size",
+    type=int,
+    default=1,
+    show_default=True,
+    help="Test batch size, must divide the dataset evenly, else last "
+    "batch will be dropped",
+)
+@click.option(
+    "--image-size",
+    "--image_size",
+    type=int,
+    default=224,
+    show_default=True,
+    help="Size (pixels) to which input images are resized before inference",
+)
+def main(dataset_path: str, model_path: str, batch_size: int, image_size: int):
+    """
+    Validation Script for Image Classification Models
+    """
+
+    dataset = torchvision.datasets.ImageFolder(
+        root=dataset_path,
+        transform=transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Resize(size=(image_size, image_size)),
+            ]
+        ),
+    )
+
+    data_loader = DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        drop_last=True,
+    )
+
+    pipeline = Pipeline.create(
+        task="image_classification",
+        model_path=model_path,
+        batch_size=batch_size,
+    )
+    correct = total = 0
+    progress_bar = tqdm(data_loader)
+
+    for batch in progress_bar:
+        batch, actual_labels = batch
+        batch = batch.numpy()
+        outs = pipeline(images=batch)
+        predicted_labels = outs.labels
+
+        for actual, predicted in zip(actual_labels, predicted_labels):
+            total += 1
+            if isinstance(predicted, str):
+                predicted = int(predicted)
+            if actual.item() == predicted:
+                correct += 1
+
+        if total > 0:
+            progress_bar.set_postfix(
+                {"Running Accuracy": f"{correct * 100 / total:.2f}%"}
+            )
+
+    # prevent division by zero
+    if total == 0:
+        epsilon = 1e-5
+        total += epsilon
+
+    print(f"Accuracy: {correct * 100 / total:.2f} %")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py
new file mode 100644
index 0000000000..5ab6b9ec63
--- /dev/null
+++ b/src/deepsparse/pipeline.py
@@ -0,0 +1,546 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +""" +Classes and registry for end to end inference pipelines that wrap an underlying +inference engine and include pre/postprocessing +""" + + +import os +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import numpy +from pydantic import BaseModel, Field + +from deepsparse import Engine, Scheduler +from deepsparse.benchmark import ORTEngine +from deepsparse.tasks import SupportedTasks + + +__all__ = [ + "DEEPSPARSE_ENGINE", + "ORT_ENGINE", + "SUPPORTED_PIPELINE_ENGINES", + "Pipeline", + "PipelineConfig", +] + + +DEEPSPARSE_ENGINE = "deepsparse" +ORT_ENGINE = "onnxruntime" + +SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] + + +_REGISTERED_PIPELINES = {} + + +class Pipeline(ABC): + """ + Generic Pipeline abstract class meant to wrap inference engine objects to include + data pre/post-processing. Inputs and outputs of pipelines should be serialized + as pydantic Models. + + Pipelines should not be instantiated by their constructors, but rather the + `Pipeline.create()` method. The task name given to `create` will be used to + load the appropriate pipeline. When creating a Pipeline, the pipeline should + inherit from `Pipeline` and implement the `setup_onnx_file_path`, `process_inputs`, + `process_engine_outputs`, `input_schema`, and `output_schema` abstract methods. + + Finally, the class definition should be decorated by the `Pipeline.register` + function. This defines the task name and task aliases for the pipeline and + ensures that it will be accessible by `Pipeline.create`. The implemented + `Pipeline` subclass must be imported at runtime to be accessible. + + Pipeline lifecycle: + - On instantiation + * `onnx_file_path` <- `setup_onnx_file_path` + * `engine` <- `_initialize_engine` + + - on __call__: + * `parsed_inputs: input_schema` <- `parse_inputs(*args, **kwargs)` + * `pre_processed_inputs` <- `process_inputs(parsed_inputs)` + * `engine_outputs` <- `engine(pre_processed_inputs)` + * `outputs: output_schema` <- `process_engine_outputs(engine_outputs)` + + Example use of register: + ```python + @Pipeline.register( + task="example_task", + task_aliases=["example_alias_1", "example_alias_2"], + ) + class PipelineImplementation(Pipeline): + # implementation of Pipeline abstract methods here + ``` + + Example use of pipeline: + ```python + example_pipeline = Pipeline.create( + task="example_task", + model_path="model.onnx", + ) + pipeline_outputs = example_pipeline(pipeline_inputs) + ``` + + :param model_path: path on local system or SparseZoo stub to load the model from + :param engine_type: inference engine to use. Currently supported values include + 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' + :param batch_size: static batch size to use for inference. Default is 1 + :param num_cores: number of CPU cores to allocate for inference engine. None + specifies all available cores. Default is None + :param scheduler: (deepsparse only) kind of scheduler to execute with. + Pass None for the default + :param input_shapes: list of shapes to set ONNX the inputs to. Pass None + to use model as-is. Default is None + :param alias: optional name to give this pipeline instance, useful when + inferencing with multiple models. 
Default is None + """ + + def __init__( + self, + model_path: str, + engine_type: str = DEEPSPARSE_ENGINE, + batch_size: int = 1, + num_cores: int = None, + scheduler: Scheduler = None, + input_shapes: List[List[int]] = None, + alias: Optional[str] = None, + ): + self._model_path_orig = model_path + self._model_path = model_path + self._engine_type = engine_type + self._alias = alias + + self._engine_args = dict( + batch_size=batch_size, + num_cores=num_cores, + input_shapes=input_shapes, + ) + if engine_type.lower() == DEEPSPARSE_ENGINE: + self._engine_args["scheduler"] = scheduler + + self.onnx_file_path = self.setup_onnx_file_path() + self.engine = self._initialize_engine() + + def __call__(self, *args, **kwargs) -> BaseModel: + # parse inputs into input_schema schema if necessary + pipeline_inputs = self.parse_inputs(*args, **kwargs) + if not isinstance(pipeline_inputs, self.input_schema): + raise RuntimeError( + f"Unable to parse {self.__class__} inputs into a " + f"{self.input_schema} object. Inputs parsed to {type(pipeline_inputs)}" + ) + + # run pipeline + engine_inputs: List[numpy.ndarray] = self.process_inputs(pipeline_inputs) + + if isinstance(engine_inputs, tuple): + engine_inputs, postprocess_kwargs = engine_inputs + else: + postprocess_kwargs = {} + + engine_outputs: List[numpy.ndarray] = self.engine(engine_inputs) + pipeline_outputs = self.process_engine_outputs( + engine_outputs, **postprocess_kwargs + ) + + # validate outputs format + if not isinstance(pipeline_outputs, self.output_schema): + raise ValueError( + f"Outputs of {self.__class__} must be instances of " + f"{self.output_schema} found output of type {type(pipeline_outputs)}" + ) + + return pipeline_outputs + + @staticmethod + def create( + task: str, + model_path: str = None, + engine_type: str = DEEPSPARSE_ENGINE, + batch_size: int = 1, + num_cores: int = None, + scheduler: Scheduler = None, + input_shapes: List[List[int]] = None, + alias: Optional[str] = None, + **kwargs, + ) -> "Pipeline": + """ + :param task: name of task to create a pipeline for + :param model_path: path on local system or SparseZoo stub to load the model + from. Some tasks may have a default model path + :param engine_type: inference engine to use. Currently supported values + include 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' + :param batch_size: static batch size to use for inference. Default is 1 + :param num_cores: number of CPU cores to allocate for inference engine. None + specifies all available cores. Default is None + :param scheduler: (deepsparse only) kind of scheduler to execute with. + Pass None for the default + :param input_shapes: list of shapes to set ONNX the inputs to. Pass None + to use model as-is. Default is None + :param alias: optional name to give this pipeline instance, useful when + inferencing with multiple models. Default is None + :param kwargs: extra task specific kwargs to be passed to task Pipeline + implementation + :return: pipeline object initialized for the given task + """ + task = task.lower().replace("-", "_") + + # extra step to register pipelines for a given task domain + # for cases where imports should only happen once a user specifies + # that domain is to be used. (ie deepsparse.transformers will auto + # install extra packages so should only import and register once a + # transformers task is specified) + SupportedTasks.check_register_task(task) + + if task not in _REGISTERED_PIPELINES: + raise ValueError( + f"Unknown Pipeline task {task}. 
Pipeline tasks must be "
+                "declared with the Pipeline.register decorator. Currently "
+                f"registered pipelines: {list(_REGISTERED_PIPELINES.keys())}"
+            )
+
+        pipeline_constructor = _REGISTERED_PIPELINES[task]
+
+        if (
+            model_path is None
+            and hasattr(pipeline_constructor, "default_model_path")
+            and pipeline_constructor.default_model_path
+        ):
+            model_path = pipeline_constructor.default_model_path
+
+        if model_path is None:
+            raise ValueError(
+                f"No model_path provided for pipeline {pipeline_constructor}. Must "
+                "provide a model path for pipelines that do not have a default defined"
+            )
+
+        return pipeline_constructor(
+            model_path=model_path,
+            engine_type=engine_type,
+            batch_size=batch_size,
+            num_cores=num_cores,
+            scheduler=scheduler,
+            input_shapes=input_shapes,
+            alias=alias,
+            **kwargs,
+        )
+
+    @classmethod
+    def register(
+        cls,
+        task: str,
+        task_aliases: Optional[List[str]] = None,
+        default_model_path: Optional[str] = None,
+    ):
+        """
+        Pipeline implementer class decorator that registers the pipeline
+        task name and its aliases as valid tasks that can be used to load
+        the pipeline through `Pipeline.create()`.
+
+        Multiple pipelines may not have the same task name. An error will
+        be raised if two different pipelines attempt to register the same task name.
+
+        :param task: main task name of this pipeline
+        :param task_aliases: list of extra task names that may be used to reference
+            this pipeline. Default is None
+        :param default_model_path: path (i.e. zoo stub) to use as default for this
+            task if None is provided
+        """
+        task_names = [task]
+        if task_aliases:
+            task_names.extend(task_aliases)
+
+        def _register_task(task_name, pipeline_class):
+            if task_name in _REGISTERED_PIPELINES and (
+                pipeline_class is not _REGISTERED_PIPELINES[task_name]
+            ):
+                raise RuntimeError(
+                    f"task {task_name} already registered by Pipeline.register. "
+                    f"attempting to register pipeline: {pipeline_class}, but "
+                    f"pipeline: {_REGISTERED_PIPELINES[task_name]}, already registered"
+                )
+            _REGISTERED_PIPELINES[task_name] = pipeline_class
+
+        def _register_pipeline_tasks_decorator(pipeline_class: Pipeline):
+            if not issubclass(pipeline_class, cls):
+                raise RuntimeError(
+                    f"Attempting to register pipeline {pipeline_class}. 
" + f"Registered pipelines must inherit from {cls}" + ) + for task_name in task_names: + _register_task(task_name, pipeline_class) + + # set task and task_aliases as class level property + pipeline_class.task = task + pipeline_class.task_aliases = task_aliases + pipeline_class.default_model_path = default_model_path + + return pipeline_class + + return _register_pipeline_tasks_decorator + + @classmethod + def from_config(cls, config: Union["PipelineConfig", str, Path]) -> "Pipeline": + """ + :param config: PipelineConfig object, filepath to a json serialized + PipelineConfig, or raw string of a json serialized PipelineConfig + :return: loaded Pipeline object from the config + """ + if isinstance(config, Path) or ( + isinstance(config, str) and os.path.exists(config) + ): + if isinstance(config, str): + config = Path(config) + config = PipelineConfig.parse_file(config) + if isinstance(config, str): + config = PipelineConfig.parse_raw(config) + + return cls.create( + task=config.task, + model_path=config.model_path, + engine_type=config.engine_type, + batch_size=config.batch_size, + num_cores=config.num_cores, + scheduler=config.scheduler, + input_shapes=config.input_shapes, + alias=config.alias, + **config.kwargs, + ) + + @abstractmethod + def setup_onnx_file_path(self) -> str: + """ + Performs any setup to unwrap and process the given `model_path` and other + class properties into an inference ready onnx file to be compiled by the + engine of the pipeline + + :return: file path to the ONNX file for the engine to compile + """ + raise NotImplementedError() + + @abstractmethod + def process_inputs( + self, + inputs: BaseModel, + ) -> Union[List[numpy.ndarray], Tuple[List[numpy.ndarray], Dict[str, Any]]]: + """ + :param inputs: inputs to the pipeline. Must be the type of the `input_schema` + of this pipeline + :return: inputs of this model processed into a list of numpy arrays that + can be directly passed into the forward pass of the pipeline engine. 
Can + also include a tuple with engine inputs and special key word arguments + to pass to process_engine_outputs to facilitate information from the raw + inputs to postprocessing that may not be included in the engine inputs + """ + raise NotImplementedError() + + @abstractmethod + def process_engine_outputs( + self, + engine_outputs: List[numpy.ndarray], + **kwargs, + ) -> BaseModel: + """ + :param engine_outputs: list of numpy arrays that are the output of the engine + forward pass + :return: outputs of engine post-processed into an object in the `output_schema` + format of this pipeline + """ + raise NotImplementedError() + + @property + @abstractmethod + def input_schema(self) -> Type[BaseModel]: + """ + :return: pydantic model class that inputs to this pipeline must comply to + """ + raise NotImplementedError() + + @property + @abstractmethod + def output_schema(self) -> Type[BaseModel]: + """ + :return: pydantic model class that outputs of this pipeline must comply to + """ + raise NotImplementedError() + + @property + def alias(self) -> str: + """ + :return: optional name to give this pipeline instance, useful when + inferencing with multiple models + """ + return self._alias + + @property + def model_path_orig(self) -> str: + """ + :return: value originally passed to the `model_path` argument to initialize + this Pipeline + """ + return self._model_path_orig + + @property + def model_path(self) -> str: + """ + :return: path on local system to the onnx file of this model or directory + containing a model.onnx file along with supporting files + """ + return self._model_path + + @property + def engine_args(self) -> Dict[str, Any]: + """ + :return: arguments besides onnx filepath used to instantiate engine + """ + return self._engine_args + + @property + def engine_type(self) -> str: + """ + :return: type of inference engine used for model forward pass + """ + return self._engine_type + + def to_config(self) -> "PipelineConfig": + """ + :return: PipelineConfig that can be used to reload this object + """ + + if not hasattr(self, "task"): + raise RuntimeError( + f"{self.__class__} instance has no attribute task. Pipeline objects " + "must have a task to be serialized to a config. Pipeline objects " + "must be declared with the Pipeline.register object to be assigned a " + "task" + ) + + # parse any additional properties as kwargs + kwargs = {} + for attr_name, attr in self.__class__.__dict__.items(): + if isinstance(attr, property) and attr_name not in dir(PipelineConfig): + kwargs[attr_name] = getattr(self, attr_name) + + return PipelineConfig( + task=self.task, + model_path=self.model_path_orig, + engine_type=self.engine_type, + batch_size=self.batch_size, + num_cores=self.num_cores, + scheduler=self.scheduler, + input_shapes=self.input_shapes, + alias=self.alias, + kwargs=kwargs, + ) + + def parse_inputs(self, *args, **kwargs) -> BaseModel: + """ + :param args: ordered arguments to pipeline, only an input_schema object + is supported as an arg for this function + :param kwargs: keyword arguments to pipeline + :return: pipeline arguments parsed into the given `input_schema` + schema if necessary. If an instance of the `input_schema` is provided + it will be returned + """ + # passed input_schema schema directly + if len(args) == 1 and isinstance(args[0], self.input_schema) and not kwargs: + return args[0] + + if args: + raise ValueError( + f"pipeline {self.__class__} only supports either only a " + f"{self.input_schema} object. or keyword arguments to be construct " + f"one. 
Found {len(args)} args and {len(kwargs)} kwargs" + ) + + return self.input_schema(**kwargs) + + def _initialize_engine(self) -> Union[Engine, ORTEngine]: + engine_type = self.engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + return Engine(self.onnx_file_path, **self._engine_args) + elif engine_type == ORT_ENGINE: + return ORTEngine(self.onnx_file_path, **self._engine_args) + else: + raise ValueError( + f"Unknown engine_type {self.engine_type}. Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) + + +class PipelineConfig(BaseModel): + """ + Configuration for creating a Pipeline object + + Can be used to create a Pipeline from a config object or file with + Pipeline.from_config(), or used as a building block for other configs + such as for deepsparse.server + """ + + task: str = Field( + description="name of task to create a pipeline for", + ) + model_path: str = Field( + description="path on local system or SparseZoo stub to load the model from", + ) + engine_type: str = Field( + default=DEEPSPARSE_ENGINE, + description=( + "inference engine to use. Currently supported values include " + "'deepsparse' and 'onnxruntime'. Default is 'deepsparse'" + ), + ) + batch_size: int = Field( + default=1, + description=("static batch size to use for inference. Default is 1"), + ) + num_cores: int = Field( + default=None, + description=( + "number of CPU cores to allocate for inference engine. None" + "specifies all available cores. Default is None" + ), + ) + scheduler: str = Field( + default="async", + description=( + "(deepsparse only) kind of scheduler to execute with. Defaults to async" + ), + ) + input_shapes: List[List[int]] = Field( + default=None, + description=( + "list of shapes to set ONNX the inputs to. Pass None to use model as-is. " + "Default is None" + ), + ) + alias: str = Field( + default=None, + description=( + "optional name to give this pipeline instance, useful when inferencing " + "with multiple models. Default is None" + ), + ) + kwargs: Dict[str, Any] = Field( + default={}, + description=( + "Additional arguments for inference with the model that will be passed " + "into the pipeline as kwargs" + ), + ) diff --git a/src/deepsparse/server/config.py b/src/deepsparse/server/config.py index 7f9ac9bd59..0d0be42ec0 100644 --- a/src/deepsparse/server/config.py +++ b/src/deepsparse/server/config.py @@ -19,18 +19,18 @@ import json import os from functools import lru_cache -from typing import Any, Dict, List +from typing import List import yaml from pydantic import BaseModel, Field +from deepsparse import PipelineConfig from deepsparse.cpu import cpu_architecture __all__ = [ "ENV_DEEPSPARSE_SERVER_CONFIG", "ENV_SINGLE_PREFIX", - "ServeModelConfig", "ServerConfig", ] @@ -39,75 +39,15 @@ ENV_SINGLE_PREFIX = "DEEPSPARSE_SINGLE_MODEL:" -class ServeModelConfig(BaseModel): - """ - Configuration for serving a model for a given task in the DeepSparse server - """ - - task: str = Field( - description=( - "The task the model_path is serving. For example, one of: " - "question_answering, text_classification, token_classification." - ), - ) - model_path: str = Field( - description=( - "The path to a model.onnx file, " - "a model folder containing the model.onnx and supporting files, " - "or a SparseZoo model stub." - ), - ) - batch_size: int = Field( - default=1, - description=( - "The batch size to instantiate the model with and use for serving" - ), - ) - alias: str = Field( - default=None, - description=( - "Alias name for model pipeline to be served. 
A convenience route of " - "/predict/alias will be added to the server if present. " - ), - ) - kwargs: Dict[str, Any] = Field( - default={}, - description=( - "Additional arguments for inference with the model that will be passed " - "into the pipeline as kwargs" - ), - ) - engine: str = Field( - default="deepsparse", - description=( - "The engine to use for serving the models such as deepsparse or onnxruntime" - ), - ) - num_cores: int = Field( - default=None, - description=( - "The number of physical cores to restrict the DeepSparse Engine to. " - "Defaults to all cores." - ), - ) - scheduler: str = Field( - default="async", - description=( - "The scheduler to use with the DeepSparse Engine such as sync or async. " - "Defaults to async" - ), - ) - - class ServerConfig(BaseModel): """ A configuration for serving models in the DeepSparse inference server """ - models: List[ServeModelConfig] = Field( + models: List[PipelineConfig] = Field( default=[], description=( - "The models to serve in the server defined by the additional arguments" + "The models to serve in the server defined by PipelineConfig objects" ), ) workers: str = Field( @@ -148,7 +88,7 @@ def server_config_from_env(env_key: str = ENV_DEEPSPARSE_SERVER_CONFIG): config_dict = json.loads(config_file.replace(ENV_SINGLE_PREFIX, "")) config = ServerConfig() config.models.append( - ServeModelConfig( + PipelineConfig( task=config_dict["task"], model_path=config_dict["model_path"], batch_size=config_dict["batch_size"], @@ -158,7 +98,7 @@ def server_config_from_env(env_key: str = ENV_DEEPSPARSE_SERVER_CONFIG): with open(config_file) as file: config_dict = yaml.safe_load(file.read()) config_dict["models"] = ( - [ServeModelConfig(**model) for model in config_dict["models"]] + [PipelineConfig(**model) for model in config_dict["models"]] if "models" in config_dict else [] ) diff --git a/src/deepsparse/server/main.py b/src/deepsparse/server/main.py index e8efead286..dc31f6427f 100644 --- a/src/deepsparse/server/main.py +++ b/src/deepsparse/server/main.py @@ -84,6 +84,7 @@ import click +from deepsparse import Pipeline from deepsparse.log import set_logging_level from deepsparse.server.asynchronous import execute_async, initialize_aysnc from deepsparse.server.config import ( @@ -91,7 +92,6 @@ server_config_from_env, server_config_to_env, ) -from deepsparse.server.pipelines import load_pipelines_definitions from deepsparse.server.utils import serializable_response from deepsparse.version import version @@ -130,7 +130,11 @@ def _home(): def _add_pipeline_route( - app, pipeline_def, num_models: int, defined_tasks: set, integration: str + app, + pipeline: Pipeline, + num_models: int, + defined_tasks: set, + integration: str, ): path = "/predict" @@ -142,26 +146,27 @@ def _add_pipeline_route( ) # required path name for Sagemaker path = "/invocations" - elif pipeline_def.config.alias: - path = f"/predict/{pipeline_def.config.alias}" + elif pipeline.alias: + path = f"/predict/{pipeline.alias}" elif num_models > 1: - if pipeline_def.config.task in defined_tasks: + if pipeline.task in defined_tasks: raise ValueError( - f"Multiple tasks defined for {pipeline_def.config.task} and no alias " - f"given for {pipeline_def.config}. " + f"Multiple tasks defined for {pipeline.task} and no alias " + f"given for pipeline with model {pipeline.model_path_orig}. 
" "Either define an alias or supply a single model for the task" ) - path = f"/predict/{pipeline_def.config.task}" - defined_tasks.add(pipeline_def.config.task) + path = f"/predict/{pipeline.task}" + defined_tasks.add(pipeline.task) @app.post( path, - response_model=pipeline_def.response_model, + response_model=pipeline.output_schema, tags=["prediction"], ) - async def _predict_func(request: pipeline_def.request_model): + async def _predict_func(request: pipeline.input_schema): results = await execute_async( - pipeline_def.pipeline, **vars(request), **pipeline_def.kwargs + pipeline, + request, ) return serializable_response(results) @@ -183,15 +188,12 @@ def server_app_factory(): _LOGGER.debug("loaded server config %s", config) _add_general_routes(app, config) - pipeline_defs = load_pipelines_definitions(config) - _LOGGER.debug("loaded pipeline definitions from config %s", pipeline_defs) + pipelines = [Pipeline.from_config(model_config) for model_config in config.models] + _LOGGER.debug("loaded pipeline definitions from config %s", pipelines) num_tasks = len(config.models) defined_tasks = set() - - for pipeline_def in pipeline_defs: - _add_pipeline_route( - app, pipeline_def, num_tasks, defined_tasks, config.integration - ) + for pipeline in pipelines: + _add_pipeline_route(app, pipeline, num_tasks, defined_tasks, config.integration) return app diff --git a/src/deepsparse/server/pipelines.py b/src/deepsparse/server/pipelines.py deleted file mode 100644 index ef07c68ca2..0000000000 --- a/src/deepsparse/server/pipelines.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Pipelines that run preprocessing, postprocessing, and model inference -within the DeepSparse model server. -""" - -from typing import Any, Dict, List - -from pydantic import BaseModel, Field - -from deepsparse.server.config import ServeModelConfig, ServerConfig -from deepsparse.tasks import SupportedTasks - - -__all__ = ["PipelineDefinition", "load_pipelines_definitions"] - - -class PipelineDefinition(BaseModel): - """ - A definition of a pipeline to be served by the model server. - Used to create a prediction route on construction of the server app. - """ - - pipeline: Any = Field(description="the callable pipeline to invoke on each request") - request_model: Any = Field( - description="the pydantic model to validate the request body with" - ) - response_model: Any = Field( - description="the pydantic model to validate the response payload with" - ) - kwargs: Dict[str, Any] = Field( - description="any additional kwargs that should be passed into the pipeline" - ) - config: ServeModelConfig = Field( - description="the config for the model the pipeline is serving" - ) - - -def load_pipelines_definitions(config: ServerConfig) -> List[PipelineDefinition]: - """ - Load the pipeline definitions to use for creating prediction routes from - the given server configuration. 
- - :param config: the configuration to load pipeline definitions for - :return: the loaded pipeline definitions to use for serving inference requests - """ - defs = [] - - for model_config in config.models: - if SupportedTasks.is_nlp(model_config.task): - # dynamically import so we don't install dependencies when unneeded - from deepsparse.transformers.server import create_pipeline_definitions - - ( - pipeline, - request_model, - response_model, - kwargs, - ) = create_pipeline_definitions(model_config) - else: - raise ValueError( - f"unsupported task given of {model_config.task} " - f"for serve model config {model_config}" - ) - - defs.append( - PipelineDefinition( - pipeline=pipeline, - request_model=request_model, - response_model=response_model, - kwargs=kwargs, - config=model_config, - ) - ) - - return defs diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py index 6ffaad7ec3..690de5276e 100644 --- a/src/deepsparse/tasks.py +++ b/src/deepsparse/tasks.py @@ -78,6 +78,32 @@ class SupportedTasks: token_classification=AliasedTask("token_classification", ["ner"]), ) + image_classification = namedtuple("image_classification", ["image_classification"])( + image_classification=AliasedTask( + "image_classification", + ["image_classification"], + ), + ) + + yolo = namedtuple("yolo", ["yolo"])( + yolo=AliasedTask("yolo", ["yolo"]), + ) + + @classmethod + def check_register_task(cls, task: str): + if cls.is_nlp(task): + # trigger transformers pipelines to register with Pipeline.register + import deepsparse.transformers.pipelines # noqa: F401 + + elif cls.is_image_classification(task): + # trigger image classification pipelines to + # register with Pipeline.register + import deepsparse.image_classification.pipelines # noqa: F401 + + elif cls.is_yolo(task): + # trigger yolo pipelines to register with Pipeline.register + import deepsparse.yolo.pipelines # noqa: F401 + @classmethod def is_nlp(cls, task: str) -> bool: """ @@ -90,3 +116,21 @@ def is_nlp(cls, task: str) -> bool: or cls.nlp.text_classification.matches(task) or cls.nlp.token_classification.matches(task) ) + + @classmethod + def is_image_classification(cls, task: str) -> bool: + """ + :param task: the name of the task to check whether it is an image + classification task + :return: True if it is an image classification task, False otherwise + """ + return cls.image_classification.image_classification.matches(task) + + @classmethod + def is_yolo(cls, task: str) -> bool: + """ + :param task: the name of the task to check whether it is an image + segmentation task using YOLO + :return: True if it is an segmentation task using YOLO, False otherwise + """ + return cls.yolo.yolo.matches(task) diff --git a/src/deepsparse/transformers/__init__.py b/src/deepsparse/transformers/__init__.py index 89c7eb68ef..1264aa316d 100644 --- a/src/deepsparse/transformers/__init__.py +++ b/src/deepsparse/transformers/__init__.py @@ -120,4 +120,3 @@ def _check_transformers_install(): from .helpers import * from .loaders import * from .pipelines import * -from .server import * diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index b434dec625..8f9e9c5d49 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -58,7 +58,7 @@ from tqdm.auto import tqdm -from deepsparse.transformers import pipeline +from deepsparse import Pipeline from datasets import load_dataset, load_metric # isort: skip @@ -79,14 +79,14 @@ def squad_eval(args): 
squad_metrics = load_metric("squad") # load QA pipeline - question_answer = pipeline( + question_answer = Pipeline.create( task="question-answering", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - max_length=args.max_sequence_length, + sequence_length=args.max_sequence_length, ) - print(f"Engine info: {question_answer.model}") + print(f"Engine info: {question_answer.engine}") for idx, sample in enumerate(tqdm(squad)): pred = question_answer( @@ -96,7 +96,7 @@ def squad_eval(args): ) squad_metrics.add_batch( - predictions=[{"prediction_text": pred["answer"], "id": sample["id"]}], + predictions=[{"prediction_text": pred.answer, "id": sample["id"]}], references=[{"answers": sample["answers"], "id": sample["id"]}], ) @@ -114,21 +114,21 @@ def mnli_eval(args): mnli_metrics = load_metric("glue", "mnli") # load pipeline - text_classify = pipeline( + text_classify = Pipeline.create( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - max_length=args.max_sequence_length, + sequence_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.model}") + print(f"Engine info: {text_classify.engine}") label_map = {"entailment": 0, "neutral": 1, "contradiction": 2} for idx, sample in enumerate(tqdm(mnli_matched)): pred = text_classify([[sample["premise"], sample["hypothesis"]]]) mnli_metrics.add_batch( - predictions=[label_map.get(pred[0]["label"])], + predictions=[label_map.get(pred.labels[0])], references=[sample["label"]], ) @@ -154,14 +154,14 @@ def qqp_eval(args): qqp_metrics = load_metric("glue", "qqp") # load pipeline - text_classify = pipeline( + text_classify = Pipeline.create( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - max_length=args.max_sequence_length, + sequence_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.model}") + print(f"Engine info: {text_classify.engine}") label_map = {"not_duplicate": 0, "duplicate": 1} @@ -169,7 +169,7 @@ def qqp_eval(args): pred = text_classify([[sample["question1"], sample["question2"]]]) qqp_metrics.add_batch( - predictions=[label_map.get(pred[0]["label"])], + predictions=[label_map.get(pred.labels[0])], references=[sample["label"]], ) @@ -185,14 +185,14 @@ def sst2_eval(args): sst2_metrics = load_metric("glue", "sst2") # load pipeline - text_classify = pipeline( + text_classify = Pipeline.create( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - max_length=args.max_sequence_length, + sequence_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.model}") + print(f"Engine info: {text_classify.engine}") label_map = {"negative": 0, "positive": 1} @@ -202,7 +202,7 @@ def sst2_eval(args): ) sst2_metrics.add_batch( - predictions=[label_map.get(pred[0]["label"])], + predictions=[label_map.get(pred.labels[0])], references=[sample["label"]], ) diff --git a/src/deepsparse/transformers/pipelines.py b/src/deepsparse/transformers/pipelines.py deleted file mode 100644 index 7725a0e2c2..0000000000 --- a/src/deepsparse/transformers/pipelines.py +++ /dev/null @@ -1,1414 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Adaptation of transformers.pipelines and onnx_transformers.pipelines - -adapted from: -https://github.com/huggingface/transformers/blob/master/src/transformers/pipelines/base.py -https://github.com/patil-suraj/onnx_transformers/blob/master/onnx_transformers/pipelines.py - -""" - -import json -from abc import ABC, abstractmethod -from dataclasses import dataclass -from itertools import chain -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union - -import numpy as np -from transformers.configuration_utils import PretrainedConfig -from transformers.data import ( - SquadExample, - SquadFeatures, - squad_convert_examples_to_features, -) -from transformers.file_utils import ExplicitEnum -from transformers.models.auto import AutoConfig, AutoTokenizer -from transformers.tokenization_utils import PreTrainedTokenizer -from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy -from transformers.utils import logging - -from deepsparse import Engine, compile_model, cpu -from deepsparse.transformers.helpers import ( - fix_numpy_types, - get_onnx_path_and_configs, - overwrite_transformer_onnx_model_inputs, -) -from deepsparse.transformers.loaders import get_batch_loader - - -try: - import onnxruntime - - ort_import_error = None -except Exception as ort_import_err: - onnxruntime = None - ort_import_error = ort_import_err - -__all__ = [ - "ArgumentHandler", - "Pipeline", - "TextClassificationPipeline", - "TokenClassificationPipeline", - "QuestionAnsweringPipeline", - "pipeline", - "overwrite_transformer_onnx_model_inputs", - "SUPPORTED_ENGINES", - "SUPPORTED_TASKS", -] - -logger = logging.get_logger(__name__) if logging else None - - -class ArgumentHandler(ABC): - """ - Base interface for handling arguments for each Pipeline. - """ - - @abstractmethod - def __call__(self, *args, **kwargs): - raise NotImplementedError() - - -class DefaultArgumentHandler(ArgumentHandler): - """ - Default argument parser handling parameters for each Pipeline`. - """ - - @staticmethod - def handle_kwargs(kwargs: Dict) -> List: - """ - :param kwargs: key word arguments for a pipeline - :return: list of the processed key word arguments - """ - if len(kwargs) == 1: - output = list(kwargs.values()) - else: - output = list(chain(kwargs.values())) - - return DefaultArgumentHandler.handle_args(output) - - @staticmethod - def handle_args(args: Sequence[Any]) -> List[str]: - """ - :param args: sequence of arguments to a pipeline - :return: list of formatted, processed arguments - """ - - # Only one argument, let's do case by case - if len(args) == 1: - if isinstance(args[0], str): - return [args[0]] - elif not isinstance(args[0], list): - return list(args) - else: - return args[0] - - # Multiple arguments (x1, x2, ...) - elif len(args) > 1: - if all([isinstance(arg, str) for arg in args]): - return list(args) - - # If not instance of list, then it should be an instance of iterable - elif isinstance(args, Iterable): - return list(chain.from_iterable(chain(args))) - else: - raise ValueError( - f"Invalid input type {type(args)}. 
Pipeline supports " - "Union[str, Iterable[str]]" - ) - else: - return [] - - def __call__(self, *args, **kwargs): - if len(kwargs) > 0 and len(args) > 0: - raise ValueError("Pipeline cannot handle mixed args and kwargs") - - if len(kwargs) > 0: - return DefaultArgumentHandler.handle_kwargs(kwargs) - else: - return DefaultArgumentHandler.handle_args(args) - - -class _ScikitCompat(ABC): - """ - Interface layer for the Scikit and Keras compatibility. - """ - - @abstractmethod - def transform(self, X): - raise NotImplementedError() - - @abstractmethod - def predict(self, X): - raise NotImplementedError() - - -class Pipeline(_ScikitCompat): - """ - The Pipeline class is the class from which all pipelines inherit. - Refer to this class for methods shared across different pipelines. - This base Pipeline class provides support for multiple inference engine backends. - - Base class implementing pipelined operations. - Pipeline workflow is defined as a sequence of the following operations: - - Input -> Tokenization -> Model Inference -> - Post-Processing (task dependent) -> Output - - Pipeline supports running with the DeepSparse engine or onnxruntime. - - :param model: loaded inference engine to run the model with, can be a - deepsparse Engine or onnxruntime InferenceSession - :param tokenizer: tokenizer to be used for preprocessing - :param config: transformers model config for this model - :param engine_type: name of inference engine that is used. Options are - deepsparse and onnxruntime - :param max_length: maximum sequence length to set for model inputs by default. - default value is 128 - :param input_names: list of input names to the neural network - :param args_parser: Reference to the object in charge of parsing supplied - pipeline parameters. A default is provided if None - :param binary_output: if True, stores outputs as pickled binaries to avoid - storing large amount of textual data. Default is False - """ - - default_input_names = None - - def __init__( - self, - model: Union[Engine, "onnxruntime.InferenceSession"], - tokenizer: PreTrainedTokenizer, - config: PretrainedConfig, - engine_type: str, - max_length: int = 128, - input_names: Optional[List[str]] = None, - args_parser: ArgumentHandler = None, - binary_output: bool = False, - ): - - self.model = model - self.tokenizer = tokenizer - self.config = config - self.engine_type = engine_type - self.max_length = max_length - self.input_names = input_names - self.binary_output = binary_output - self._args_parser = args_parser or DefaultArgumentHandler() - self._framework = ( - "np" if self.engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE] else "pt" - ) - - def transform(self, X): - """ - Scikit / Keras interface to transformers' pipelines. - This method will forward to __call__(). - """ - return self(X=X) - - def predict(self, X): - """ - Scikit / Keras interface to transformers' pipelines. - This method will forward to __call__(). 
- """ - return self(X=X) - - def _parse_and_tokenize( - self, *args, padding=True, add_special_tokens=True, **kwargs - ): - # Parse arguments - inputs = self._args_parser(*args, **kwargs) - inputs = self.tokenizer( - inputs, - add_special_tokens=add_special_tokens, - return_tensors=self._framework, - padding=PaddingStrategy.MAX_LENGTH.value, - truncation=TruncationStrategy.LONGEST_FIRST.value, - ) - - return inputs - - def __call__(self, *args, **kwargs): - inputs = self._parse_and_tokenize(*args, **kwargs) - return self._forward(inputs) - - def _forward(self, inputs): - if not all(name in inputs for name in self.input_names): - raise ValueError( - f"pipeline expected arrays with names {self.input_names}, received " - f"inputs: {list(inputs.keys())}" - ) - - if self.engine_type == ORT_ENGINE: - inputs = {k: v for k, v in inputs.items() if k in self.input_names} - return self.model.run(None, inputs) - elif self.engine_type == DEEPSPARSE_ENGINE: - return self.model.run([inputs[name] for name in self.input_names]) - # TODO: torch - # with self.device_placement(): - # with torch.no_grad(): - # inputs = self.ensure_tensor_on_device(**inputs) - # predictions = self.model(**inputs)[0].cpu() - # if return_tensors: - # return predictions - # else: - # return predictions.numpy() - - -class TokenClassificationArgumentHandler(ArgumentHandler): - """ - Handles arguments for token classification. - """ - - def __call__(self, inputs: Union[str, List[str]], **kwargs): - - if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: - inputs = list(inputs) - batch_size = len(inputs) - elif isinstance(inputs, str): - inputs = [inputs] - batch_size = 1 - else: - raise ValueError("At least one input is required.") - - offset_mapping = kwargs.get("offset_mapping") - if offset_mapping: - if isinstance(offset_mapping, list) and isinstance( - offset_mapping[0], tuple - ): - offset_mapping = [offset_mapping] - if len(offset_mapping) != batch_size: - raise ValueError( - "offset_mapping should have the same batch size as the input" - ) - return inputs, offset_mapping - - -class QuestionAnsweringArgumentHandler(ArgumentHandler): - """ - QuestionAnsweringPipeline requires the user to provide multiple arguments - (i.e. 
question & context) to be mapped - to internal `transformers.SquadExample` - - QuestionAnsweringArgumentHandler manages all the possible to create a - `transformers.SquadExample` from the command-line supplied arguments - """ - - def __call__(self, *args, **kwargs): - # Position args, handling is sensibly the same as X and data, - # so forwarding to avoid duplicating - if args is not None and len(args) > 0: - if len(args) == 1: - kwargs["X"] = args[0] - else: - kwargs["X"] = list(args) - - # Generic compatibility with sklearn and Keras - # Batched data - if "X" in kwargs or "data" in kwargs: - inputs = kwargs["X"] if "X" in kwargs else kwargs["data"] - - if isinstance(inputs, dict): - inputs = [inputs] - else: - # Copy to avoid overriding arguments - inputs = [i for i in inputs] - - for i, item in enumerate(inputs): - if isinstance(item, dict): - if any(k not in item for k in ["question", "context"]): - raise KeyError( - "You need to provide a dictionary with keys " - "{question:..., context:...}" - ) - - inputs[i] = QuestionAnsweringPipeline.create_sample(**item) - - elif not isinstance(item, SquadExample): - arg_name = "X" if "X" in kwargs else "data" - raise ValueError( - f"{arg_name} argument needs to be of type " - "(list[SquadExample | dict], SquadExample, dict)" - ) - - # Tabular input - elif "question" in kwargs and "context" in kwargs: - if isinstance(kwargs["question"], str): - kwargs["question"] = [kwargs["question"]] - - if isinstance(kwargs["context"], str): - kwargs["context"] = [kwargs["context"]] - - inputs = [ - QuestionAnsweringPipeline.create_sample(q, c) - for q, c in zip(kwargs["question"], kwargs["context"]) - ] - else: - raise ValueError(f"Unknown arguments {kwargs}") - - if not isinstance(inputs, list): - inputs = [inputs] - - return inputs - - -class TextClassificationPipeline(Pipeline): - """ - Text classification pipeline using any `ModelForSequenceClassification`. - - This text classification pipeline can currently be loaded from `pipeline()` - using the following task identifier: `"text-classification"`. - - The models that this pipeline can use are models that have been fine-tuned on - a text classification task. - - :param return_all_scores: set True to return all model scores. Default False - """ - - def __init__(self, return_all_scores: bool = False, **kwargs): - super().__init__(**kwargs) - - self.return_all_scores = return_all_scores - - def __call__(self, *args, **kwargs): - """ - Classify the text(s) given as inputs. - - :param args: One or several texts (or one list of prompts) to classify - :param args: kwargs for inner call function - :return: A list or a list of list of dicts: Each result comes as list of dicts - with the following keys: - - `label` -- The label predicted. - - `score` -- The corresponding probability. 
- If ``self.return_all_scores=True``, one dictionary is returned per label - """ - outputs = super().__call__(*args, **kwargs) - - if isinstance(outputs, list) and outputs: - outputs = outputs[0] - - if self.config.num_labels == 1: - scores = 1.0 / (1.0 + np.exp(-outputs)) - else: - scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) - if self.return_all_scores: - return [ - [ - {"label": self.config.id2label[i], "score": score.item()} - for i, score in enumerate(item) - ] - for item in scores - ] - else: - return [ - { - "label": self.config.id2label[item.argmax()], - "score": item.max().item(), - } - for item in scores - ] - - -class AggregationStrategy(ExplicitEnum): - """ - All the valid aggregation strategies for TokenClassificationPipeline - """ - - NONE = "none" - SIMPLE = "simple" - FIRST = "first" - AVERAGE = "average" - MAX = "max" - - -class TokenClassificationPipeline(Pipeline): - """ - Named Entity Recognition pipeline using any `ModelForTokenClassification`. - - This token classification pipeline can currently be loaded from `pipeline()` - using the following task identifier: `"token-classification"`. - - The models that this pipeline can use are models that have been fine-tuned on - a token classification task. - - :param args_parser: argument parser to use default is - TokenClassificationArgumentHandler - :param aggregation_strategy: AggregationStrategy Enum object to determine - the pipeline aggregation strategy. Default is AggregationStrategy.NONE - :param ignore_labels: list of labels to ignore. Default is `["O"]` - """ - - default_input_names = "sequences" - - def __init__( - self, - args_parser: ArgumentHandler = None, - aggregation_strategy: AggregationStrategy = AggregationStrategy.NONE, - ignore_labels: List[str] = False, - **kwargs, - ): - super().__init__( - args_parser=args_parser or TokenClassificationArgumentHandler(), - **kwargs, - ) - - self.ignore_labels = ignore_labels or ["O"] - - if isinstance(aggregation_strategy, str): - aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()] - - if ( - aggregation_strategy - in { - AggregationStrategy.FIRST, - AggregationStrategy.MAX, - AggregationStrategy.AVERAGE, - } - and not self.tokenizer.is_fast - ): - raise ValueError( - "Slow tokenizers cannot handle subwords. Please set the " - '`aggregation_strategy` option to `"simple"` or use a fast tokenizer.' - ) - - self.aggregation_strategy = aggregation_strategy - - def __call__(self, inputs: Union[str, List[str]], **kwargs): - """ - Classify each token of the text(s) given as inputs. - - - :param inputs: One or several texts (or one list of texts) for token - classification - :return: A list or a list of list of :obj:`dict`: Each result comes as a list - of dictionaries (one for each token in the corresponding input, or each - entity if this pipeline was instantiated with an aggregation_strategy) - with the following keys: - - `word` -- The token/word classified. - - `score` -- The corresponding probability for `entity`. - - `entity` -- The entity predicted for that token/word (it is named - `entity_group` when `aggregation_strategy` is not `"none"`. - - `index` -- The index of the corresponding token in the sentence. - - `start` -- index of the start of the corresponding entity in the sentence - Only exists if the offsets are available within the tokenizer - - `end` -- The index of the end of the corresponding entity in the sentence. 
- Only exists if the offsets are available within the tokenizer - """ - - _inputs, offset_mappings = self._args_parser(inputs, **kwargs) - - answers = [] - - tokens = self.tokenizer( - _inputs, - return_tensors=self._framework, - truncation=TruncationStrategy.LONGEST_FIRST.value, - padding=PaddingStrategy.MAX_LENGTH.value, - return_special_tokens_mask=True, - return_offsets_mapping=self.tokenizer.is_fast, - ) - - if self.tokenizer.is_fast: - offset_mapping = tokens.pop("offset_mapping") - elif not offset_mappings: - offset_mapping = [None] * len(_inputs) - - special_tokens_mask = tokens.pop("special_tokens_mask") - - # Forward - _forward_pass = self._forward(tokens) - for entities_index, current_entities in enumerate(_forward_pass[0]): - input_ids = tokens["input_ids"][entities_index] - - scores = np.exp(current_entities) / np.exp(current_entities).sum( - -1, keepdims=True - ) - pre_entities = self.gather_pre_entities( - _inputs[entities_index], - input_ids, - scores, - offset_mapping[entities_index], - special_tokens_mask[entities_index], - ) - grouped_entities = self.aggregate(pre_entities, self.aggregation_strategy) - # Filter anything that is in self.ignore_labels - current_entities = [ - entity - for entity in grouped_entities - if entity.get("entity", None) not in self.ignore_labels - and entity.get("entity_group", None) not in self.ignore_labels - ] - answers.append(current_entities) - - if len(answers) == 1: - return answers[0] - return answers - - def gather_pre_entities( - self, - sentence: str, - input_ids: np.ndarray, - scores: np.ndarray, - offset_mapping: Optional[List[Tuple[int, int]]], - special_tokens_mask: np.ndarray, - ) -> List[dict]: - pre_entities = [] - for idx, token_scores in enumerate(scores): - # Filter special_tokens, they should only occur - # at the sentence boundaries since we're not encoding pairs of - # sentences so we don't have to keep track of those. 
- if special_tokens_mask[idx]: - continue - - word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) - if offset_mapping is not None: - start_ind, end_ind = offset_mapping[idx] - word_ref = sentence[start_ind:end_ind] - is_subword = len(word_ref) != len(word) - - if int(input_ids[idx]) == self.tokenizer.unk_token_id: - word = word_ref - is_subword = False - else: - start_ind = None - end_ind = None - is_subword = False - - pre_entity = { - "word": word, - "scores": token_scores, - "start": start_ind, - "end": end_ind, - "index": idx, - "is_subword": is_subword, - } - pre_entities.append(pre_entity) - return pre_entities - - def aggregate( - self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy - ) -> List[dict]: - if aggregation_strategy in { - AggregationStrategy.NONE, - AggregationStrategy.SIMPLE, - }: - entities = [] - for pre_entity in pre_entities: - entity_idx = pre_entity["scores"].argmax() - score = pre_entity["scores"][entity_idx] - entity = { - "entity": self.config.id2label[entity_idx], - "score": score, - "index": pre_entity["index"], - "word": pre_entity["word"], - "start": pre_entity["start"], - "end": pre_entity["end"], - } - entities.append(entity) - else: - entities = self.aggregate_words(pre_entities, aggregation_strategy) - - if aggregation_strategy == AggregationStrategy.NONE: - return entities - return self.group_entities(entities) - - def aggregate_word( - self, entities: List[dict], aggregation_strategy: AggregationStrategy - ) -> dict: - word = self.tokenizer.convert_tokens_to_string( - [entity["word"] for entity in entities] - ) - if aggregation_strategy == AggregationStrategy.FIRST: - scores = entities[0]["scores"] - idx = scores.argmax() - score = scores[idx] - entity = self.config.id2label[idx] - elif aggregation_strategy == AggregationStrategy.MAX: - max_entity = max(entities, key=lambda entity: entity["scores"].max()) - scores = max_entity["scores"] - idx = scores.argmax() - score = scores[idx] - entity = self.config.id2label[idx] - elif aggregation_strategy == AggregationStrategy.AVERAGE: - scores = np.stack([entity["scores"] for entity in entities]) - average_scores = np.nanmean(scores, axis=0) - entity_idx = average_scores.argmax() - entity = self.config.id2label[entity_idx] - score = average_scores[entity_idx] - else: - raise ValueError("Invalid aggregation_strategy") - new_entity = { - "entity": entity, - "score": score, - "word": word, - "start": entities[0]["start"], - "end": entities[-1]["end"], - } - return new_entity - - def aggregate_words( - self, entities: List[dict], aggregation_strategy: AggregationStrategy - ) -> List[dict]: - assert aggregation_strategy not in { - AggregationStrategy.NONE, - AggregationStrategy.SIMPLE, - }, "NONE and SIMPLE strategies are invalid" - - word_entities = [] - word_group = None - for entity in entities: - if word_group is None: - word_group = [entity] - elif entity["is_subword"]: - word_group.append(entity) - else: - word_entities.append( - self.aggregate_word(word_group, aggregation_strategy) - ) - word_group = [entity] - # Last item - word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) - return word_entities - - def group_sub_entities(self, entities: List[dict]) -> dict: - # Get the first entity in the entity group - entity = entities[0]["entity"].split("-")[-1] - scores = np.nanmean([entity["score"] for entity in entities]) - tokens = [entity["word"] for entity in entities] - - entity_group = { - "entity_group": entity, - "score": np.mean(scores), - "word": 
self.tokenizer.convert_tokens_to_string(tokens), - "start": entities[0]["start"], - "end": entities[-1]["end"], - } - return entity_group - - def get_tag(self, entity_name: str) -> Tuple[str, str]: - if entity_name.startswith("B-"): - bi = "B" - tag = entity_name[2:] - elif entity_name.startswith("I-"): - bi = "I" - tag = entity_name[2:] - else: - # It's not in B-, I- format - bi = "B" - tag = entity_name - return bi, tag - - def group_entities(self, entities: List[dict]) -> List[dict]: - - entity_groups = [] - entity_group_disagg = [] - - for entity in entities: - if not entity_group_disagg: - entity_group_disagg.append(entity) - continue - - # If the current entity is similar and adjacent to the previous entity, - # append it to the disaggregated entity group - # The split is meant to account for the "B" and "I" prefixes - # Shouldn't merge if both entities are B-type - bi, tag = self.get_tag(entity["entity"]) - last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"]) - - if tag == last_tag and bi != "B": - # Modify subword type to be previous_type - entity_group_disagg.append(entity) - else: - # If the current entity is different from the previous entity - # aggregate the disaggregated entity group - entity_groups.append(self.group_sub_entities(entity_group_disagg)) - entity_group_disagg = [entity] - if entity_group_disagg: - # it's the last entity, add it to the entity groups - entity_groups.append(self.group_sub_entities(entity_group_disagg)) - - return entity_groups - - -class QuestionAnsweringPipeline(Pipeline): - """ - Question Answering pipeline using any `ModelForQuestionAnswering` - - This question answering pipeline can currently be loaded from `pipeline()` - using the following task identifier: `"question-answering"`. - - The models that this pipeline can use are models that have been fine-tuned on - a question answering task. - - :param model: loaded inference engine to run the model with, can be a - deepsparse Engine or onnxruntime InferenceSession - :param tokenizer: tokenizer to be used for preprocessing - :param config: transformers model config for this model - :param engine_type: name of inference engine that is used. Options are - deepsparse and onnxruntime - :param input_names: list of input names to the neural network - :param args_parser: Reference to the object in charge of parsing supplied - pipeline parameters. A default is provided if None - :param binary_output: if True, stores outputs as pickled binaries to avoid - storing large amount of textual data. 
Default is False - """ - - default_input_names = "question,context" - - def __init__( - self, - model: Union[Engine, "onnxruntime.InferenceSession"], - tokenizer: PreTrainedTokenizer, - engine_type: str, - input_names: Optional[List[str]] = None, - **kwargs, - ): - super().__init__( - model=model, - tokenizer=tokenizer, - engine_type=engine_type, - args_parser=QuestionAnsweringArgumentHandler(), - input_names=input_names, - **kwargs, - ) - - @staticmethod - def create_sample( - question: Union[str, List[str]], context: Union[str, List[str]] - ) -> Union[SquadExample, List[SquadExample]]: - """ - :param question: single question or list of question strings - :param context: single context or list of context strings - :return: processed SquadExample object(s) for each question/context pair given - """ - if isinstance(question, list): - return [ - SquadExample(None, q, c, None, None, None) - for q, c in zip(question, context) - ] - else: - return SquadExample(None, question, context, None, None, None) - - def __call__(self, *args, **kwargs): - """ - Answer the question(s) given as inputs by using the context(s). - Multiple arguments can be used to pass the context, question data - - :param args: SquadExample or list of them containing the question and context - :param X: SquadExample or list of them containing the question and context - :param data: SquadExample or list of them containing the question and context - :param question: single question or list of question strings - :param context: single context or list of context strings - :param topk: the number of answers to return. Will be chosen by - order of likelihood) - :param doc_stride: if the context is too long to fit with the question for the - model, it will be split in several chunks with some overlap. This argument - controls the size of that overlap - :param max_answer_len: maximum length of predicted answers (e.g., only - answers with a shorter length are considered) - :param max_seq_len: maximum length of the total sentence (context + question) - after tokenization. The context will be split in several chunks - (using the doc_stride) if needed - :param max_question_len: maximum length of the question after tokenization. - It will be truncated if needed - :param handle_impossible_answer: whether or not we accept impossible as an - answer - :param num_spans: maximum number of span to use as input from a long - context. Default is to stride the entire context string - :param preprocessed_inputs: if provided, preprocessing will be skipped in favor - of these inputs. 
Expected format is the output of self.preprocess; a tuple - of (examples, features_list) - :return: dict or list of dictionaries, each containing the following keys: - `"score"` - The probability associated to the answer - `"start"` - The start index of the answer - `"end"` - The end index of the answer - `"answer"` - The answer to the question - """ - # Set defaults values - kwargs.setdefault("topk", 1) - kwargs.setdefault("max_answer_len", 15) - kwargs.setdefault("handle_impossible_answer", False) - kwargs.setdefault("preprocessed_inputs", None) # (examples, features_list) - - if kwargs["topk"] < 1: - raise ValueError(f"topk parameter should be >= 1 (got {kwargs['topk']})") - - if kwargs["max_answer_len"] < 1: - raise ValueError( - "max_answer_len parameter should be >= 1 " - f"(got {kwargs['max_answer_len']})" - ) - - # run pre-processing if not provided - examples, features_list = kwargs["preprocessed_inputs"] or self.preprocess( - *args, **kwargs - ) - - # forward pass and post-processing - all_answers = [] - for features, example in zip(features_list, examples): - model_input_names = self.tokenizer.model_input_names + ["input_ids"] - fw_args = { - k: [feature.__dict__[k] for feature in features] - for k in model_input_names - } - - # Manage tensor allocation on correct device - fw_args = {k: np.array(v) for (k, v) in fw_args.items()} - start, end = self._forward(fw_args)[:2] - - # TODO: torch - # fw_args = {k: torch.tensor(v, device=self.device) - # for (k, v) in fw_args.items()} - # start, end = self.model(**fw_args)[:2] - # start, end = start.cpu().numpy(), end.cpu().numpy() - - min_null_score = 1000000 # large and positive - answers = [] - for (feature, start_, end_) in zip(features, start, end): - # Ensure padded tokens & question tokens cannot belong - undesired_tokens = ( - np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask - ) - - # Generate mask - undesired_tokens_mask = undesired_tokens == 0.0 - - # Make sure non-context indexes cannot contribute to the softmax - start_ = np.where(undesired_tokens_mask, -10000.0, start_) - end_ = np.where(undesired_tokens_mask, -10000.0, end_) - - # Normalize logits and spans to retrieve the answer - start_ = np.exp( - start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)) - ) - end_ = np.exp( - end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)) - ) - - if kwargs["handle_impossible_answer"]: - min_null_score = min(min_null_score, (start_[0] * end_[0]).item()) - - # Mask CLS - start_[0] = end_[0] = 0.0 - - starts, ends, scores = self.decode( - start_, end_, kwargs["topk"], kwargs["max_answer_len"] - ) - - if not self.tokenizer.is_fast: - char_to_word = np.array(example.char_to_word_offset) - answers += [ - { - "score": score.item(), - "start": np.where( - char_to_word == feature.token_to_orig_map[s] - )[0][0].item(), - "end": np.where( - char_to_word == feature.token_to_orig_map[e] - )[0][-1].item(), - "answer": " ".join( - example.doc_tokens[ - feature.token_to_orig_map[ - s - ] : feature.token_to_orig_map[e] - + 1 - ] - ), - } - for s, e, score in zip(starts, ends, scores) - ] - else: - question_first = bool(self.tokenizer.padding_side == "right") - - # Sometimes the max probability token is in the middle of a word so: - # we start by finding the right word containing the token with - # `token_to_word` then we convert this word in a character span - answers += [ - { - "score": score.item(), - "start": feature.encoding.word_to_chars( - feature.encoding.token_to_word(s), - sequence_index=1 if question_first else 
0, - )[0], - "end": feature.encoding.word_to_chars( - feature.encoding.token_to_word(e), - sequence_index=1 if question_first else 0, - )[1], - "answer": example.context_text[ - feature.encoding.word_to_chars( - feature.encoding.token_to_word(s), - sequence_index=1 if question_first else 0, - )[0] : feature.encoding.word_to_chars( - feature.encoding.token_to_word(e), - sequence_index=1 if question_first else 0, - )[ - 1 - ] - ], - } - for s, e, score in zip(starts, ends, scores) - ] - - if kwargs["handle_impossible_answer"]: - answers.append( - {"score": min_null_score, "start": 0, "end": 0, "answer": ""} - ) - - answers = sorted(answers, key=lambda x: x["score"], reverse=True)[ - : kwargs["topk"] - ] - all_answers += answers - - if len(all_answers) == 1: - return all_answers[0] - return all_answers - - def preprocess(self, *args, **kwargs) -> Tuple[Any, Any]: - """ - preprocess the given QA model inputs using squad_convert_examples_to_features - - :param args: SquadExample or list of them containing the question and context - :param X: SquadExample or list of them containing the question and context - :param data: SquadExample or list of them containing the question and context - :param question: single question or list of question strings - :param context: single context or list of context strings - :param doc_stride: if the context is too long to fit with the question for the - model, it will be split in several chunks with some overlap. This argument - controls the size of that overlap - :param max_seq_len: maximum length of the total sentence (context + question) - after tokenization. The context will be split in several chunks - (using the doc_stride) if needed - :param max_question_len: maximum length of the question after tokenization. - It will be truncated if needed - :param num_spans: maximum number of spans to use as input from a long - context. Default is to stride the entire context string - :return: tuple of SquadExample inputs and preprocessed features list - """ - kwargs.setdefault("doc_stride", 128) - kwargs.setdefault("max_seq_len", self.max_length) - kwargs.setdefault("max_question_len", 64) - kwargs.setdefault("num_spans", None) - - # Convert inputs to features - examples = self._args_parser(*args, **kwargs) - if not self.tokenizer.is_fast: - features_list = [ - squad_convert_examples_to_features( - examples=[example], - tokenizer=self.tokenizer, - max_seq_length=kwargs["max_seq_len"], - doc_stride=kwargs["doc_stride"], - max_query_length=kwargs["max_question_len"], - padding_strategy=PaddingStrategy.MAX_LENGTH.value, - is_training=False, - tqdm_enabled=False, - ) - for example in examples - ] - else: - features_list = self._encode_features_fast(examples, **kwargs) - - if kwargs["num_spans"]: - features_list = [ - features[: kwargs["num_spans"]] for features in features_list - ] - - return examples, features_list - - def decode( - self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int - ) -> Tuple: - """ - :param start: Individual start probabilities for each token - :param end: Individual end probabilities for each token - :param topk: Indicates how many possible answer span(s) to extract from the - model output - :param max_answer_len: Maximum size of the answer to extract from the model - output - :return: probabilities for each span to be the actual answer. 
Will filter out - unwanted and impossible cases - """ - # Ensure we have batch axis - if start.ndim == 1: - start = start[None] - - if end.ndim == 1: - end = end[None] - - # Compute the score of each tuple(start, end) to be the real answer - outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) - - # Remove candidate with end < start and end - start > max_answer_len - candidates = np.tril(np.triu(outer), max_answer_len - 1) - - # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) - scores_flat = candidates.flatten() - if topk == 1: - idx_sort = [np.argmax(scores_flat)] - elif len(scores_flat) < topk: - idx_sort = np.argsort(-scores_flat) - else: - idx = np.argpartition(-scores_flat, topk)[0:topk] - idx_sort = idx[np.argsort(-scores_flat[idx])] - - start, end = np.unravel_index(idx_sort, candidates.shape)[1:] - return start, end, candidates[0, start, end] - - def span_to_answer( - self, text: str, start: int, end: int - ) -> Dict[str, Union[str, int]]: - """ - When decoding from token probabilities, this method maps token indexes to - actual word in the initial context. - - :param text: The actual context to extract the answer from - :param start: The answer starting token index - :param end: The answer end token index - :return: Dictionary containing the start, end, and answer - """ - words = [] - token_idx = char_start_idx = char_end_idx = chars_idx = 0 - - for i, word in enumerate(text.split(" ")): - token = self.tokenizer.tokenize(word) - - # Append words if they are in the span - if start <= token_idx <= end: - if token_idx == start: - char_start_idx = chars_idx - - if token_idx == end: - char_end_idx = chars_idx + len(word) - - words += [word] - - # Stop if we went over the end of the answer - if token_idx > end: - break - - # Append the subtokenization length to the running index - token_idx += len(token) - chars_idx += len(word) + 1 - - # Join text with spaces - return { - "answer": " ".join(words), - "start": max(0, char_start_idx), - "end": min(len(text), char_end_idx), - } - - def _encode_features_fast(self, examples: Any, **kwargs) -> List[SquadFeatures]: - features_list = [] - for example in examples: - # Define the side we want to truncate / pad and the text/pair sorting - question_first = bool(self.tokenizer.padding_side == "right") - - encoded_inputs = self.tokenizer( - text=example.question_text if question_first else example.context_text, - text_pair=( - example.context_text if question_first else example.question_text - ), - padding=PaddingStrategy.MAX_LENGTH.value, - truncation="only_second" if question_first else "only_first", - max_length=kwargs["max_seq_len"], - stride=kwargs["doc_stride"], - return_tensors="np", - return_token_type_ids=True, - return_overflowing_tokens=True, - return_offsets_mapping=True, - return_special_tokens_mask=True, - ) - - total_spans = len(encoded_inputs["input_ids"]) - - # p_mask: mask with 1 for token than cannot be in the answer - # We put 0 on the tokens from the context and 1 everywhere else - p_mask = np.asarray( - [ - [ - tok != 1 if question_first else 0 - for tok in encoded_inputs.sequence_ids(span_id) - ] - for span_id in range(total_spans) - ] - ) - - # keep the cls_token unmasked - if self.tokenizer.cls_token_id is not None: - cls_index = np.nonzero( - encoded_inputs["input_ids"] == self.tokenizer.cls_token_id - ) - p_mask[cls_index] = 0 - - features = [] - for span_idx in range(total_spans): - features.append( - SquadFeatures( - input_ids=encoded_inputs["input_ids"][span_idx], - 
attention_mask=encoded_inputs["attention_mask"][span_idx], - token_type_ids=encoded_inputs["token_type_ids"][span_idx], - p_mask=p_mask[span_idx].tolist(), - encoding=encoded_inputs[span_idx], - # the following values are unused for fast tokenizers - cls_index=None, - token_to_orig_map={}, - example_index=0, - unique_id=0, - paragraph_len=0, - token_is_max_context=0, - tokens=[], - start_position=0, - end_position=0, - is_impossible=False, - qas_id=None, - ) - ) - features_list.append(features) - return features_list - - -@dataclass -class TaskInfo: - """ - Information about an NLP task - - :param pipeline_constructor: reference to constructor for the given pipeline task - :param default model name: the transformers canonical name for the default model - :param base_stub: sparsezoo stub path for the base model for this task - :param default_pruned_stub: sparsezoo stub path for the default pruned model - for this task - :param default_quant_stub: sparsezoo stub path for the default quantized model - for this task - """ - - pipeline_constructor: Callable[[Any], Pipeline] - default_model_name: str - base_stub: Optional[str] = None - default_pruned_stub: Optional[str] = None - default_quant_stub: Optional[str] = None - - -# Register all the supported tasks here -SUPPORTED_TASKS = { - "ner": TaskInfo( - pipeline_constructor=TokenClassificationPipeline, - default_model_name="bert-base-uncased", - ), - "question-answering": TaskInfo( - pipeline_constructor=QuestionAnsweringPipeline, - default_model_name="bert-base-uncased", - base_stub=( - "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none" - ), - default_pruned_stub=( - "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/" - "pruned-aggressive_98" - ), - ), - "sentiment-analysis": TaskInfo( - pipeline_constructor=TextClassificationPipeline, - default_model_name="bert-base-uncased", - ), - "text-classification": TaskInfo( - pipeline_constructor=TextClassificationPipeline, - default_model_name="bert-base-uncased", - ), - "token-classification": TaskInfo( - pipeline_constructor=TokenClassificationPipeline, - default_model_name="bert-base-uncased", - ), -} - -DEEPSPARSE_ENGINE = "deepsparse" -ORT_ENGINE = "onnxruntime" - -SUPPORTED_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] - - -def pipeline( - task: str, - model_name: Optional[str] = None, - model_path: Optional[str] = None, - engine_type: str = DEEPSPARSE_ENGINE, - config: Optional[Union[str, PretrainedConfig]] = None, - tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, - max_length: int = 128, - num_cores: Optional[int] = None, - scheduler: Optional[str] = None, - batch_size: Optional[int] = 1, - **kwargs, -) -> Pipeline: - """ - Utility factory method to build a Pipeline - - :param task: name of the task to define which pipeline to create. Currently - supported task - "question-answering" - :param model_name: canonical name of the hugging face model this model is based on - :param model_path: path to model directory containing `model.onnx`, `config.json`, - and `tokenizer.json` files, ONNX model file, or SparseZoo stub - :param engine_type: inference engine name to use. Supported options are 'deepsparse' - and 'onnxruntime' - :param config: huggingface model config, if none provided, default will be used - which will be from the model name or sparsezoo stub if given for model path - :param tokenizer: huggingface tokenizer, if none provided, default will be used - :param max_length: maximum sequence length of model inputs. 
default is 128 - :param num_cores: number of CPU cores to run engine with. Default is the maximum - available - :param scheduler: The scheduler to use for the engine. Can be None, single or multi. - :param batch_size: The batch_size to use for the pipeline. Defaults to 1 - Note: `question-answering` pipeline only supports a batch_size of 1. - :param kwargs: additional key word arguments for task specific pipeline constructor - :return: Pipeline object for the given taks and model - """ - - # Retrieve the task - if task not in SUPPORTED_TASKS: - raise KeyError( - f"Unknown task {task}, available tasks are {list(SUPPORTED_TASKS.keys())}" - ) - if engine_type not in SUPPORTED_ENGINES: - raise ValueError( - f"Unsupported engine {engine_type}, supported engines " - f"are {SUPPORTED_ENGINES}" - ) - if task == "question-answering" and batch_size != 1: - raise ValueError( - f"{task} pipeline only supports batch_size 1. " - f"Supplied batch_size = {batch_size}" - ) - task_info = SUPPORTED_TASKS[task] - - model_path = model_path or _get_default_model_path(task_info) - model_name = model_name or task_info.default_model_name - - onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs(model_path) - - # default the tokenizer and config to file in model directory or given model name - config = config or config_path or model_name - tokenizer = tokenizer or tokenizer_path or model_name - - # create model - model, input_names = _create_model( - onnx_path, - engine_type, - num_cores, - max_length, - scheduler=scheduler, - batch_size=batch_size, - ) - - # Instantiate tokenizer if needed - if isinstance(tokenizer, (str, tuple)): - if isinstance(tokenizer, tuple): - # For tuple we have (tokenizer name, {kwargs}) - tokenizer_kwargs = tokenizer[1] - tokenizer_kwargs["model_max_length"] = max_length - tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1]) - else: - tokenizer = AutoTokenizer.from_pretrained( - tokenizer, model_max_length=max_length - ) - - # Instantiate config if needed - if config is not None and isinstance(config, str): - config = AutoConfig.from_pretrained(config, finetuning_task=task) - - return task_info.pipeline_constructor( - model=model, - tokenizer=tokenizer, - config=config, - engine_type=engine_type, - max_length=max_length, - input_names=input_names, - **kwargs, - ) - - -def _get_default_model_path(task_info: TaskInfo) -> str: - if cpu.cpu_vnni_compatible() and task_info.default_quant_stub: - return task_info.default_quant_stub - return task_info.default_pruned_stub or task_info.base_stub - - -def _create_model( - model_path: str, - engine_type: str, - num_cores: Optional[int], - max_length: int = 128, - scheduler: Optional[str] = None, - batch_size: int = 1, -) -> Tuple[Union[Engine, "onnxruntime.InferenceSession"], List[str]]: - onnx_path, input_names, _ = overwrite_transformer_onnx_model_inputs( - model_path, max_length=max_length - ) - - if engine_type == DEEPSPARSE_ENGINE: - model = compile_model( - onnx_path, - batch_size=batch_size, - num_cores=num_cores, - scheduler=scheduler, - ) - elif engine_type == ORT_ENGINE: - _validate_ort_import() - sess_options = onnxruntime.SessionOptions() - if num_cores is not None: - sess_options.intra_op_num_threads = num_cores - sess_options.log_severity_level = 3 - sess_options.graph_optimization_level = ( - onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - ) - - model = onnxruntime.InferenceSession(onnx_path, sess_options=sess_options) - - return model, input_names - - -def _validate_ort_import(): - if 
ort_import_error is not None: - raise ImportError( - "An exception occurred when importing onxxruntime. Please verify that " - "onnxruntime is installed in order to use the onnxruntime inference " - f"engine. \n\nException info: {ort_import_error}" - ) - - -def process_dataset( - pipeline_object: Callable, - data_path: str, - batch_size: int, - task: str, - output_path: str, -) -> None: - """ - :param pipeline_object: An instantiated pipeline Callable object - :param data_path: Path to input file, supports csv, json and text files - :param batch_size: batch_size to use for inference - :param task: The task pipeline is instantiated for - :param output_path: Path to a json file to output inference results to - """ - batch_loader = get_batch_loader( - data_file=data_path, - batch_size=batch_size, - task=task, - ) - # Wraps pipeline object to make numpy types serializable - pipeline_object = fix_numpy_types(pipeline_object) - with open(output_path, "a") as output_file: - for batch in batch_loader: - batch_output = pipeline_object(**batch) - json.dump(batch_output, output_file) - output_file.write("\n") diff --git a/src/deepsparse/transformers/pipelines/__init__.py b/src/deepsparse/transformers/pipelines/__init__.py new file mode 100644 index 0000000000..9986181a2a --- /dev/null +++ b/src/deepsparse/transformers/pipelines/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa + +from .pipeline import * +from .question_answering import * +from .text_classification import * +from .token_classification import * diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py new file mode 100644 index 0000000000..2fdcd27236 --- /dev/null +++ b/src/deepsparse/transformers/pipelines/pipeline.py @@ -0,0 +1,219 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
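# --- Illustrative usage sketch (editor's aside, not lines from this patch) ---
# The new `deepsparse.transformers.pipelines` package above re-exports the task
# pipelines added below, so transformers tasks are expected to be built through
# the shared `Pipeline.create` registry instead of the old
# `deepsparse.transformers.pipeline` factory. The task name and default
# SparseZoo stub fallback are registered later in this patch; how `create`
# resolves the default model and maps keyword args onto the input schema is
# assumed from the base class description, not shown verbatim here.
from deepsparse import Pipeline

# no model_path given: assumes Pipeline.create falls back to the task's
# registered default_model_path (the pruned-quantized SQuAD BERT stub below)
qa = Pipeline.create(task="question_answering", sequence_length=128)

# keyword args are assumed to be parsed into QuestionAnsweringInput; the result
# should be a QuestionAnsweringOutput with `answer`, `score`, `start`, and `end`
prediction = qa(question="Who wrote the report?", context="The report was written by Ada.")
print(prediction.answer, prediction.score)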
+ +""" +Base Pipeline class for transformers inference pipeline +""" + + +import warnings +from typing import Any, List, Mapping, Optional + +import numpy +from transformers.models.auto import AutoConfig, AutoTokenizer + +from deepsparse import Pipeline +from deepsparse.transformers.helpers import ( + get_onnx_path_and_configs, + overwrite_transformer_onnx_model_inputs, +) + + +__all__ = [ + "TransformersPipeline", + "pipeline", +] + + +class TransformersPipeline(Pipeline): + """ + Base deepsparse.Pipeline class for transformers model loading. This class handles + the parsing of deepsparse-transformers files and model inputs, supporting loading + from sparsezoo, a directory containing a model.onnx, tokenizer, and model config, + or just an ONNX file with the ability to load a tokenizer and model config from + a default huggingface-transformers model. + + Note, when implementing child tasks in deepsparse.transformers.pipelines, + in addition to registering task names with Pipeline.register, task names should + be added to the supported nlp tasks in deepsparse.tasks so they can be properly + imported at runtime. + + :param model_path: sparsezoo stub to a transformers model, an ONNX file, or + (preferred) a directory containing a model.onnx, tokenizer config, and model + config. If no tokenizer and/or model config(s) are found, then they will be + loaded from huggingface transformers using the `default_model_name` key + :param engine_type: inference engine to use. Currently supported values include + 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' + :param batch_size: static batch size to use for inference. Default is 1 + :param num_cores: number of CPU cores to allocate for inference engine. None + specifies all available cores. Default is None + :param scheduler: (deepsparse only) kind of scheduler to execute with. + Pass None for the default + :param input_shapes: list of shapes to set ONNX the inputs to. Pass None + to use model as-is. Default is None + :param alias: optional name to give this pipeline instance, useful when + inferencing with multiple models. Default is None + :param sequence_length: static sequence length to use for inference + :param default_model_name: huggingface transformers model name to use to + load a tokenizer and model config when none are provided in the `model_path`. + Default is 'bert-base-uncased' + """ + + def __init__( + self, + *, + sequence_length: int = 128, + default_model_name: str = "bert-base-uncased", + **kwargs, + ): + + self._sequence_length = sequence_length + self._default_model_name = default_model_name + + self.config = None + self.tokenizer = None + self.onnx_input_names = None + + self._temp_model_directory = None + + super().__init__(**kwargs) + + @property + def sequence_length(self) -> int: + """ + :return: static sequence length to use for inference + """ + return self._sequence_length + + @property + def default_model_name(self) -> str: + """ + :return: huggingface transformers model name to use to + load a tokenizer and model config when none are provided in the + `model_path` + """ + return self._default_model_name + + def setup_onnx_file_path(self) -> str: + """ + Parses ONNX, tokenizer, and config file paths from the given `model_path`. + Supports sparsezoo stubs. 
If a tokenizer and/or config file are not found, + they will be defaulted to the default_model_name in the transformers repo + + :return: file path to the processed ONNX file for the engine to compile + """ + onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( + self.model_path + ) + + # default config + tokenizer if necessary + config_path = config_path or self.default_model_name + tokenizer_path = tokenizer_path or self.default_model_name + + self.config = AutoConfig.from_pretrained( + config_path, finetuning_task=self.task if hasattr(self, "task") else None + ) + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path, model_max_length=self.sequence_length + ) + + # overwrite onnx graph to given required input shape + ( + onnx_path, + self.onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + onnx_path, max_length=self.sequence_length + ) + + return onnx_path + + def tokens_to_engine_input( + self, tokens: Mapping[Any, numpy.ndarray] + ) -> List[numpy.ndarray]: + """ + :param tokens: outputs of the pipeline tokenizer + :return: list of numpy arrays in expected order for model input + """ + if not all(name in tokens for name in self.onnx_input_names): + raise ValueError( + f"pipeline expected arrays with names {self.onnx_input_names}, " + f"received inputs: {list(tokens.keys())}" + ) + + return [tokens[name] for name in self.onnx_input_names] + + +def pipeline( + task: str, + model_name: Optional[str] = None, + model_path: Optional[str] = None, + engine_type: str = "deepsparse", + config: Optional[str] = None, + tokenizer: Optional[str] = None, + max_length: int = 128, + num_cores: Optional[int] = None, + scheduler: Optional[str] = None, + batch_size: Optional[int] = 1, + **kwargs, +) -> Pipeline: + """ + [DEPRECATED] - deepsparse.transformers.pipeline is deprecated. To create DeepSparse + pipelines for transformers tasks, use deepsparse.Pipeline.create(task, ...) + + Utility factory method to build a Pipeline + + :param task: name of the task to define which pipeline to create. Currently + supported task - "question-answering" + :param model_name: canonical name of the hugging face model this model is based on + :param model_path: path to model directory containing `model.onnx`, `config.json`, + and `tokenizer.json` files, ONNX model file, or SparseZoo stub + :param engine_type: inference engine name to use. Options are 'deepsparse' + and 'onnxruntime'. Default is 'deepsparse' + :param config: huggingface model config, if none provided, default will be used + which will be from the model name or sparsezoo stub if given for model path + :param tokenizer: huggingface tokenizer, if none provided, default will be used + :param max_length: maximum sequence length of model inputs. default is 128 + :param num_cores: number of CPU cores to run engine with. Default is the maximum + available + :param scheduler: The scheduler to use for the engine. Can be None, single or multi + :param batch_size: The batch_size to use for the pipeline. Defaults to 1 + Note: `question-answering` pipeline only supports a batch_size of 1.
+ :param kwargs: additional keyword arguments for task specific pipeline constructor + :return: Pipeline object for the given task and model + """ + warnings.warn( + "[DEPRECATED] - deepsparse.transformers.pipeline is deprecated. To create " + "DeepSparse pipelines for transformers tasks, use deepsparse.Pipeline.create()" + ) + + if config is not None or tokenizer is not None: + raise ValueError( + "Directly passing in a config or tokenizer to DeepSparse transformers " + "pipelines is no longer supported. config and tokenizer objects should " + "be specified by including config.json and tokenizer.json files in the " + "model directory respectively" + ) + + return Pipeline.create( + task=task, + model_path=model_path, + engine_type=engine_type, + batch_size=batch_size, + num_cores=num_cores, + scheduler=scheduler, + sequence_length=max_length, + default_model_name=model_name, + **kwargs, + ) diff --git a/src/deepsparse/transformers/pipelines/question_answering.py b/src/deepsparse/transformers/pipelines/question_answering.py new file mode 100644 index 0000000000..ba57117dad --- /dev/null +++ b/src/deepsparse/transformers/pipelines/question_answering.py @@ -0,0 +1,409 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# postprocessing adapted from huggingface/transformers + +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Pipeline implementation and pydantic models for question answering transformers +tasks +""" + + +from typing import Any, Dict, List, Tuple, Type + +import numpy +from pydantic import BaseModel, Field +from transformers.data import ( + SquadExample, + SquadFeatures, + squad_convert_examples_to_features, +) +from transformers.tokenization_utils_base import PaddingStrategy + +from deepsparse import Pipeline +from deepsparse.transformers.pipelines import TransformersPipeline + + +__all__ = [ + "QuestionAnsweringInput", + "QuestionAnsweringOutput", + "QuestionAnsweringPipeline", +] + + +class QuestionAnsweringInput(BaseModel): + """ + Schema for inputs to question_answering pipelines + """ + + question: str = Field(description="String question to be answered") + context: str = Field(description="String representing context for answer") + + +class QuestionAnsweringOutput(BaseModel): + """ + Schema for question_answering pipeline output.
Values are in batch order + """ + + score: float = Field(description="confidence score for prediction") + answer: str = Field(description="predicted answer") + start: int = Field(description="start index of the answer") + end: int = Field(description="end index of the answer") + + +@Pipeline.register( + task="question_answering", + task_aliases=["qa"], + default_model_path=( + "zoo:nlp/question_answering/bert-base/pytorch/huggingface/" + "squad/12layer_pruned80_quant-none-vnni" + ), +) +class QuestionAnsweringPipeline(TransformersPipeline): + """ + transformers question_answering pipeline + + example instantiation: + ```python + question_answering = Pipeline.create( + task="question_answering", + model_path="question_answering_model_dir/", + ) + ``` + + :param model_path: sparsezoo stub to a transformers model, an ONNX file, or + (preferred) a directory containing a model.onnx, tokenizer config, and model + config. If no tokenizer and/or model config(s) are found, then they will be + loaded from huggingface transformers using the `default_model_name` key + :param engine_type: inference engine to use. Currently supported values include + 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' + :param batch_size: static batch size to use for inference. Default is 1 + :param num_cores: number of CPU cores to allocate for inference engine. None + specifies all available cores. Default is None + :param scheduler: (deepsparse only) kind of scheduler to execute with. + Pass None for the default + :param input_shapes: list of shapes to set ONNX the inputs to. Pass None + to use model as-is. Default is None + :param alias: optional name to give this pipeline instance, useful when + inferencing with multiple models. Default is None + :param sequence_length: sequence length to compile model and tokenizer for. + Default is 128 + :param default_model_name: huggingface transformers model name to use to + load a tokenizer and model config when none are provided in the `model_path`. + Default is 'bert-base-uncased' + :param doc_stride: if the context is too long to fit with the question for the + model, it will be split in several chunks with some overlap. This argument + controls the size of that overlap. Currently, only reading the first span + is supported (everything after doc_stride will be truncated). Default + is 128 + :param max_question_len: maximum length of the question after tokenization. + It will be truncated if needed. Default is 64 + :param max_answer_len: maximum length of answer after decoding. Default is 15 + """ + + def __init__( + self, + *, + doc_stride: int = 128, + max_question_length: int = 64, + max_answer_length: int = 15, + **kwargs, + ): + + if kwargs.get("batch_size") and kwargs["batch_size"] > 1: + raise ValueError( + f"{self.__class__.__name__} currently only supports batch size 1, " + f"batch size set to {kwargs['batch_size']}" + ) + + self._doc_stride = doc_stride + self._max_question_length = max_question_length + self._max_answer_length = max_answer_length + + super().__init__(**kwargs) + + @property + def doc_stride(self) -> int: + """ + :return: if the context is too long to fit with the question for the + model, it will be split in several chunks with some overlap. This argument + controls the size of that overlap. 
Currently, only reading the first span + is supported (everything after doc_stride will be truncated) + """ + return self._doc_stride + + @property + def max_answer_length(self) -> int: + """ + :return: maximum length of answer after decoding + """ + return self._max_answer_length + + @property + def max_question_length(self) -> int: + """ + :return: maximum length of the question after tokenization. + It will be truncated if needed + """ + return self._max_question_length + + @property + def input_schema(self) -> Type[BaseModel]: + """ + :return: pydantic model class that inputs to this pipeline must comply to + """ + return QuestionAnsweringInput + + @property + def output_schema(self) -> Type[BaseModel]: + """ + :return: pydantic model class that outputs of this pipeline must comply to + """ + return QuestionAnsweringOutput + + def process_inputs( + self, + inputs: QuestionAnsweringInput, + ) -> Tuple[List[numpy.ndarray], Dict[str, Any]]: + """ + :param inputs: inputs to the pipeline. Must be the type of the + QuestionAnsweringInput + :return: inputs of this model processed into a list of numpy arrays that + can be directly passed into the forward pass of the pipeline engine and + dictionary of parsed features and original extracted example + """ + squad_example = SquadExample( + None, inputs.question, inputs.context, None, None, None + ) + features = self._tokenize(squad_example) + tokens = features.__dict__ + + engine_inputs = self.tokens_to_engine_input(tokens) + # add batch dimension, assuming batch size 1 + engine_inputs = [numpy.expand_dims(inp, axis=0) for inp in engine_inputs] + + return engine_inputs, dict( + features=features, + example=squad_example, + ) + + def process_engine_outputs( + self, engine_outputs: List[numpy.ndarray], **kwargs + ) -> BaseModel: + """ + :param engine_outputs: list of numpy arrays that are the output of the engine + forward pass + :return: outputs of engine post-processed into an object in the `output_schema` + format of this pipeline + """ + features = kwargs["features"] + example = kwargs["example"] + start_vals, end_vals = engine_outputs[:2] + + # assuming batch size 0 + start = start_vals[0] + end = end_vals[0] + + # Ensure padded tokens & question tokens cannot belong + undesired_tokens = ( + numpy.abs(numpy.array(features.p_mask) - 1) & features.attention_mask + ) + + # Generate mask + undesired_tokens_mask = undesired_tokens == 0.0 + + # Make sure non-context indexes cannot contribute to the softmax + start = numpy.where(undesired_tokens_mask, -10000.0, start) + end = numpy.where(undesired_tokens_mask, -10000.0, end) + + # Normalize logits and spans to retrieve the answer + start = numpy.exp( + start - numpy.log(numpy.sum(numpy.exp(start), axis=-1, keepdims=True)) + ) + end = numpy.exp( + end - numpy.log(numpy.sum(numpy.exp(end), axis=-1, keepdims=True)) + ) + + # Mask CLS + start[0] = 0.0 + end[0] = 0.0 + + ans_start, ans_end, scores = self._decode(start, end) + # assuming one stride, so grab first idx + ans_start = ans_start[0] + ans_end = ans_end[0] + score = scores[0] + + # decode start, end idx into text + if not self.tokenizer.is_fast: + char_to_word = numpy.array(example.char_to_word_offset) + return self.output_schema( + score=score.item(), + start=numpy.where( + char_to_word == features.token_to_orig_map[ans_start] + )[0][0].item(), + end=numpy.where(char_to_word == features.token_to_orig_map[ans_end])[0][ + -1 + ].item(), + answer=" ".join( + example.doc_tokens[ + features.token_to_orig_map[ + ans_start + ] : 
features.token_to_orig_map[ans_end] + + 1 + ] + ), + ) + else: + question_first = bool(self.tokenizer.padding_side == "right") + + # Sometimes the max probability token is in the middle of a word so: + # we start by finding the right word containing the token with + # `token_to_word` then we convert this word in a character span + return self.output_schema( + score=score.item(), + start=features.encoding.word_to_chars( + features.encoding.token_to_word(ans_start), + sequence_index=1 if question_first else 0, + )[0], + end=features.encoding.word_to_chars( + features.encoding.token_to_word(ans_end), + sequence_index=1 if question_first else 0, + )[1], + answer=example.context_text[ + features.encoding.word_to_chars( + features.encoding.token_to_word(ans_start), + sequence_index=1 if question_first else 0, + )[0] : features.encoding.word_to_chars( + features.encoding.token_to_word(ans_end), + sequence_index=1 if question_first else 0, + )[ + 1 + ] + ], + ) + + def _tokenize(self, example: SquadExample): + if not self.tokenizer.is_fast: + features = squad_convert_examples_to_features( + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=self.sequence_length, + doc_stride=self.doc_stride, + max_query_length=self.max_question_length, + padding_strategy=PaddingStrategy.MAX_LENGTH.value, + is_training=False, + tqdm_enabled=False, + ) + # only 1 span supported so taking only the first element of features + # to add support for num_spans switch to features = features[:num_spans] + # not included for now due to static batch requirements in production + features = features[0] + else: + question_first = bool(self.tokenizer.padding_side == "right") + encoded_inputs = self.tokenizer( + text=example.question_text if question_first else example.context_text, + text_pair=( + example.context_text if question_first else example.question_text + ), + padding=PaddingStrategy.MAX_LENGTH.value, + truncation="only_second" if question_first else "only_first", + max_length=self.sequence_length, + stride=self.doc_stride, + return_tensors="np", + return_token_type_ids=True, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + ) + + # only 1 span supported so taking only the first element of features + # to add support for num_spans switch hardcoded 0 idx lookups to loop + # over values in num_spans + + # p_mask: mask with 1 for token that cannot be in the answer + # We put 0 on the tokens from the context and 1 everywhere else + p_mask = numpy.asarray( + [ + [ + tok != 1 if question_first else 0 + for tok in encoded_inputs.sequence_ids(0) + ] + ] + ) + + # keep the cls_token unmasked + if self.tokenizer.cls_token_id is not None: + cls_index = numpy.nonzero( + encoded_inputs["input_ids"][0] == self.tokenizer.cls_token_id + ) + p_mask[cls_index] = 0 + + features = SquadFeatures( + input_ids=encoded_inputs["input_ids"][0], + attention_mask=encoded_inputs["attention_mask"][0], + token_type_ids=encoded_inputs["token_type_ids"][0], + p_mask=p_mask[0].tolist(), + encoding=encoded_inputs[0], + # the following values are unused for fast tokenizers + cls_index=None, + token_to_orig_map={}, + example_index=0, + unique_id=0, + paragraph_len=0, + token_is_max_context=0, + tokens=[], + start_position=0, + end_position=0, + is_impossible=False, + qas_id=None, + ) + + return features + + def _decode(self, start: numpy.ndarray, end: numpy.ndarray) -> Tuple: + # Ensure we have batch axis + if start.ndim == 1: + start = start[None] + + if end.ndim == 1: + end = end[None] + +
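        # Worked illustration (editor's aside, not part of this patch), ignoring
        # the leading batch axis and assuming max_answer_length=2 with 3 tokens,
        # start=[0.1, 0.6, 0.3] and end=[0.2, 0.1, 0.7]:
        #   outer[i, j] = start[i] * end[j]
        #   numpy.triu(outer) zeroes entries with j < i, removing spans whose end
        #   comes before their start
        #   numpy.tril(..., k=max_answer_length - 1) zeroes entries with
        #   j - i > 1, removing spans longer than max_answer_length
        # leaving [[0.02, 0.01, 0.0], [0.0, 0.06, 0.42], [0.0, 0.0, 0.21]]; the
        # argmax 0.42 at (1, 2) selects tokens 1..2 as the best answer span.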
# Compute the score of each tuple(start, end) to be the real answer + outer = numpy.matmul(numpy.expand_dims(start, -1), numpy.expand_dims(end, 1)) + + # Remove candidate with end < start and end - start > max_answer_len + candidates = numpy.tril(numpy.triu(outer), self.max_answer_length - 1) + + # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) + scores_flat = candidates.flatten() + # only returning best result, use argsort for topk support + idx_sort = [numpy.argmax(scores_flat)] + + start, end = numpy.unravel_index(idx_sort, candidates.shape)[1:] + return start, end, candidates[0, start, end] diff --git a/src/deepsparse/transformers/pipelines/text_classification.py b/src/deepsparse/transformers/pipelines/text_classification.py new file mode 100644 index 0000000000..0df9ba2b59 --- /dev/null +++ b/src/deepsparse/transformers/pipelines/text_classification.py @@ -0,0 +1,221 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# postprocessing adapted from huggingface/transformers + +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Pipeline implementation and pydantic models for text classification transformers +tasks +""" + + +from typing import List, Type, Union + +import numpy +from pydantic import BaseModel, Field +from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy + +from deepsparse import Pipeline +from deepsparse.transformers.pipelines import TransformersPipeline + + +__all__ = [ + "TextClassificationInput", + "TextClassificationOutput", + "TextClassificationPipeline", +] + + +class TextClassificationInput(BaseModel): + """ + Schema for inputs to text_classification pipelines + """ + + sequences: Union[List[List[str]], List[str], str] = Field( + description="A string or List of strings representing input to" + "text_classification task" + ) + + +class TextClassificationOutput(BaseModel): + """ + Schema for text_classification pipeline output. 
Values are in batch order + """ + + labels: List[str] = Field(description="The predicted labels in batch order") + scores: List[float] = Field( + description="The corresponding probability for each label in the batch" + ) + + +@Pipeline.register( + task="text_classification", + task_aliases=["glue", "sentiment_analysis"], + default_model_path=( + "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/" + "sst2/12layer_pruned80_quant-none-vnni" + ), +) +class TextClassificationPipeline(TransformersPipeline): + """ + transformers text classification pipeline + + example instantiation: + ```python + text_classifier = Pipeline.create( + task="text_classification", + model_path="text_classification_model_dir/", + batch_size=BATCH_SIZE, + ) + ``` + + example batch size 1, single text inputs (ie sentiment analysis): + ```python + sentiment = text_classifier("the food tastes great") + sentiment = text_classifier(["the food tastes great"]) + sentiment = text_classifier([["the food tastes great"]]) + ``` + + example batch size 1, multi text input (ie QQP like tasks): + ```python + prediction = text_classifier([["how is the food?", "what is the food?"]]) + ``` + + example batch size n, single text inputs: + ```python + sentiments = text_classifier(["the food tastes great", "the food tastes bad"]) + sentiments = text_classifier([["the food tastes great"], ["the food tastes bad"]]) + ``` + + :param model_path: sparsezoo stub to a transformers model, an ONNX file, or + (preferred) a directory containing a model.onnx, tokenizer config, and model + config. If no tokenizer and/or model config(s) are found, then they will be + loaded from huggingface transformers using the `default_model_name` key + :param engine_type: inference engine to use. Currently supported values include + 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' + :param batch_size: static batch size to use for inference. Default is 1 + :param num_cores: number of CPU cores to allocate for inference engine. None + specifies all available cores. Default is None + :param scheduler: (deepsparse only) kind of scheduler to execute with. + Pass None for the default + :param input_shapes: list of shapes to set ONNX the inputs to. Pass None + to use model as-is. Default is None + :param alias: optional name to give this pipeline instance, useful when + inferencing with multiple models. Default is None + :param sequence_length: sequence length to compile model and tokenizer for. + Default is 128 + :param default_model_name: huggingface transformers model name to use to + load a tokenizer and model config when none are provided in the `model_path`. + Default is 'bert-base-uncased' + """ + + @property + def input_schema(self) -> Type[BaseModel]: + """ + :return: pydantic model class that inputs to this pipeline must comply to + """ + return TextClassificationInput + + @property + def output_schema(self) -> Type[BaseModel]: + """ + :return: pydantic model class that outputs of this pipeline must comply to + """ + return TextClassificationOutput + + def parse_inputs(self, *args, **kwargs) -> BaseModel: + """ + :param args: ordered arguments to pipeline, only an input_schema object + is supported as an arg for this function + :param kwargs: keyword arguments to pipeline + :return: pipeline arguments parsed into the given `input_schema` + schema if necessary. If an instance of the `input_schema` is provided + it will be returned + """ + if args and kwargs: + raise ValueError( + f"{self.__class__} only support args OR kwargs. 
Found " + f" {len(args)} args and {len(kwargs)} kwargs" + ) + + if args: + if len(args) == 1: + # passed input_schema schema directly + if isinstance(args[0], self.input_schema): + return args[0] + return self.input_schema(sequences=args[0]) + else: + return self.input_schema(sequences=args) + + return self.input_schema(**kwargs) + + def process_inputs(self, inputs: TextClassificationInput) -> List[numpy.ndarray]: + """ + :param inputs: inputs to the pipeline. Must be the type of the + TextClassificationInput + :return: inputs of this model processed into a list of numpy arrays that + can be directly passed into the forward pass of the pipeline engine + """ + tokens = self.tokenizer( + inputs.sequences, + add_special_tokens=True, + return_tensors="np", + padding=PaddingStrategy.MAX_LENGTH.value, + truncation=TruncationStrategy.LONGEST_FIRST.value, + ) + return self.tokens_to_engine_input(tokens) + + def process_engine_outputs(self, engine_outputs: List[numpy.ndarray]) -> BaseModel: + """ + :param engine_outputs: list of numpy arrays that are the output of the engine + forward pass + :return: outputs of engine post-processed into an object in the `output_schema` + format of this pipeline + """ + outputs = engine_outputs + if isinstance(outputs, list): + outputs = outputs[0] + + scores = ( + 1.0 / (1.0 + numpy.exp(-outputs)) + if self.config.num_labels == 1 + else numpy.exp(outputs) / numpy.exp(outputs).sum(-1, keepdims=True) + ) + + labels = [] + label_scores = [] + + for score in scores: + labels.append(self.config.id2label[score.argmax()]) + label_scores.append(score.max().item()) + + return self.output_schema( + labels=labels, + scores=label_scores, + ) diff --git a/src/deepsparse/transformers/pipelines/token_classification.py b/src/deepsparse/transformers/pipelines/token_classification.py new file mode 100644 index 0000000000..6485df668e --- /dev/null +++ b/src/deepsparse/transformers/pipelines/token_classification.py @@ -0,0 +1,499 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# postprocessing adapted from huggingface/transformers + +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +""" +Pipeline implementation and pydantic models for token classification transformers +tasks +""" +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import numpy +from pydantic import BaseModel, Field +from transformers.file_utils import ExplicitEnum +from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy + +from deepsparse import Pipeline +from deepsparse.transformers.pipelines import TransformersPipeline + + +__all__ = [ + "AggregationStrategy", + "TokenClassificationInput", + "TokenClassificationResult", + "TokenClassificationOutput", + "TokenClassificationPipeline", +] + + +class AggregationStrategy(ExplicitEnum): + """ + Valid aggregation strategies for postprocessing in the TokenClassificationPipeline + """ + + NONE = "none" + SIMPLE = "simple" + FIRST = "first" + AVERAGE = "average" + MAX = "max" + + +class TokenClassificationInput(BaseModel): + """ + Schema for inputs to token_classification pipelines + """ + + inputs: Union[List[str], str] = Field( + description=( + "A string or List of batch of strings representing input(s) to" + "a token_classification task" + ) + ) + + +class TokenClassificationResult(BaseModel): + """ + Schema for a classification of a single token + """ + + entity: str = Field(description="entity predicted for that token/word") + score: float = Field(description="The corresponding probability for `entity`") + index: int = Field(description="index of the corresponding token in the sentence") + word: str = Field(description="token/word classified") + start: Optional[int] = Field( + description=( + "index of the start of the corresponding entity in the sentence. " + "Only exists if the offsets are available within the tokenizer" + ) + ) + end: Optional[int] = Field( + description=( + "index of the end of the corresponding entity in the sentence. " + "Only exists if the offsets are available within the tokenizer" + ) + ) + is_grouped: bool = Field( + default=False, + description="True if this result is part of an entity group", + ) + + +class TokenClassificationOutput(BaseModel): + """ + Schema for results of TokenClassificationPipeline inference. Classifications of each + token stored in a list of lists of batch[sentence[token]] + """ + + predictions: List[List[TokenClassificationResult]] = Field( + description=( + "list of list of results of token classification pipeline. Outer list " + "has one item for each sequence in the batch. Inner list has one " + "TokenClassificationResult item per token in the given sequence" + ) + ) + + +@Pipeline.register( + task="token_classification", + task_aliases=["ner"], + default_model_path=( + "zoo:nlp/token_classification/bert-base/pytorch/huggingface/" + "conll2003/12layer_pruned80_quant-none-vnni" + ), +) +class TokenClassificationPipeline(TransformersPipeline): + """ + transformers token classification pipeline + + example instantiation: + ```python + token_classifier = Pipeline.create( + task="token_classification", + model_path="token_classification_model_dir/", + batch_size=BATCH_SIZE, + ) + ``` + + :param model_path: sparsezoo stub to a transformers model, an ONNX file, or + (preferred) a directory containing a model.onnx, tokenizer config, and model + config. If no tokenizer and/or model config(s) are found, then they will be + loaded from huggingface transformers using the `default_model_name` key + :param engine_type: inference engine to use. Currently supported values include + 'deepsparse' and 'onnxruntime'. 
Default is 'deepsparse' + :param batch_size: static batch size to use for inference. Default is 1 + :param num_cores: number of CPU cores to allocate for inference engine. None + specifies all available cores. Default is None + :param scheduler: (deepsparse only) kind of scheduler to execute with. + Pass None for the default + :param input_shapes: list of shapes to set ONNX the inputs to. Pass None + to use model as-is. Default is None + :param alias: optional name to give this pipeline instance, useful when + inferencing with multiple models. Default is None + :param sequence_length: sequence length to compile model and tokenizer for. + Default is 128 + :param default_model_name: huggingface transformers model name to use to + load a tokenizer and model config when none are provided in the `model_path`. + Default is 'bert-base-uncased' + :param aggregation_strategy: how to aggregate tokens in postprocessing. Options + include 'none', 'simple', 'first', 'average', and 'max'. Default is None + :param ignore_labels: list of label names to ignore in output. Default is + ['0'] which ignores the default known class label + """ + + def __init__( + self, + *, + aggregation_strategy: AggregationStrategy = AggregationStrategy.NONE, + ignore_labels: List[str] = None, + **kwargs, + ): + + if isinstance(aggregation_strategy, str): + aggregation_strategy = aggregation_strategy.strip().lower() + self._aggregation_strategy = AggregationStrategy(aggregation_strategy) + self._ignore_labels = ["0"] if ignore_labels is None else ignore_labels + + super().__init__(**kwargs) + + @property + def aggregation_strategy(self) -> str: + """ + :return: how to aggregate tokens in postprocessing. Options + include 'none', 'simple', 'first', 'average', and 'max' + """ + return self._aggregation_strategy.value + + @property + def ignore_labels(self) -> List[str]: + """ + :return: list of label names to ignore in output. Default is + ['0'] which ignores the default known class label + """ + return self._ignore_labels + + @property + def input_schema(self) -> Type[BaseModel]: + """ + :return: pydantic model class that inputs to this pipeline must comply to + """ + return TokenClassificationInput + + @property + def output_schema(self) -> Type[BaseModel]: + """ + :return: pydantic model class that outputs of this pipeline must comply to + """ + return TokenClassificationOutput + + def parse_inputs(self, *args, **kwargs) -> BaseModel: + """ + :param args: ordered arguments to pipeline, only an input_schema object + is supported as an arg for this function + :param kwargs: keyword arguments to pipeline + :return: pipeline arguments parsed into the given `input_schema` + schema if necessary. If an instance of the `input_schema` is provided + it will be returned + """ + if args and kwargs: + raise ValueError( + f"{self.__class__} only support args OR kwargs. Found " + f" {len(args)} args and {len(kwargs)} kwargs" + ) + + if args: + if len(args) == 1: + # passed input_schema schema directly + if isinstance(args[0], self.input_schema): + return args[0] + return self.input_schema(inputs=args[0]) + else: + return self.input_schema(inputs=args) + + return self.input_schema(**kwargs) + + def process_inputs( + self, + inputs: TokenClassificationInput, + ) -> Tuple[List[numpy.ndarray], Dict[str, Any]]: + """ + :param inputs: inputs to the pipeline. 
Must be the type of the + TokenClassificationInput + :return: inputs of this model processed into a list of numpy arrays that + can be directly passed into the forward pass of the pipeline engine + and dictionary containing offset mappings and special tokens mask to + be used during postprocessing + """ + tokens = self.tokenizer( + inputs.inputs, + return_tensors="np", + truncation=TruncationStrategy.LONGEST_FIRST.value, + padding=PaddingStrategy.MAX_LENGTH.value, + return_special_tokens_mask=True, + return_offsets_mapping=self.tokenizer.is_fast, + ) + + offset_mapping = ( + tokens.pop("offset_mapping") + if self.tokenizer.is_fast + else [None] * len(inputs.inputs) + ) + special_tokens_mask = tokens.pop("special_tokens_mask") + postprocessing_kwargs = dict( + inputs=inputs, + tokens=tokens, + offset_mapping=offset_mapping, + special_tokens_mask=special_tokens_mask, + ) + + return self.tokens_to_engine_input(tokens), postprocessing_kwargs + + def process_engine_outputs( + self, + engine_outputs: List[numpy.ndarray], + **kwargs, + ) -> BaseModel: + """ + :param engine_outputs: list of numpy arrays that are the output of the engine + forward pass + :return: outputs of engine post-processed into an object in the `output_schema` + format of this pipeline + """ + inputs = kwargs["inputs"] + tokens = kwargs["tokens"] + offset_mapping = kwargs["offset_mapping"] + special_tokens_mask = kwargs["special_tokens_mask"] + + predictions = [] # type: List[List[TokenClassificationResult]] + + for entities_index, current_entities in enumerate(engine_outputs[0]): + input_ids = tokens["input_ids"][entities_index] + + scores = numpy.exp(current_entities) / numpy.exp(current_entities).sum( + -1, keepdims=True + ) + pre_entities = self._gather_pre_entities( + inputs.inputs[entities_index], + input_ids, + scores, + offset_mapping[entities_index], + special_tokens_mask[entities_index], + ) + grouped_entities = self._aggregate(pre_entities) + # Filter anything that is in self.ignore_labels + current_results = [] # type: List[TokenClassificationResult] + for entity in grouped_entities: + if entity.get("entity") in self.ignore_labels or ( + entity.get("entity_group") in self.ignore_labels + ): + continue + if entity.get("entity_group"): + entity["entity"] = entity["entity_group"] + entity["is_grouped"] = True + del entity["entity_group"] + current_results.append(TokenClassificationResult(**entity)) + predictions.append(current_results) + + return self.output_schema(predictions=predictions) + + # utilities below adapted from transformers + + def _gather_pre_entities( + self, + sentence: str, + input_ids: numpy.ndarray, + scores: numpy.ndarray, + offset_mapping: Optional[List[Tuple[int, int]]], + special_tokens_mask: numpy.ndarray, + ) -> List[dict]: + pre_entities = [] + for idx, token_scores in enumerate(scores): + # Filter special_tokens, they should only occur + # at the sentence boundaries since we're not encoding pairs of + # sentences so we don't have to keep track of those. 
+ if special_tokens_mask[idx]: + continue + + word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) + if offset_mapping is not None: + start_ind, end_ind = offset_mapping[idx] + word_ref = sentence[start_ind:end_ind] + is_subword = len(word_ref) != len(word) + + if int(input_ids[idx]) == self.tokenizer.unk_token_id: + word = word_ref + is_subword = False + else: + start_ind = None + end_ind = None + is_subword = False + + pre_entity = { + "word": word, + "scores": token_scores, + "start": start_ind, + "end": end_ind, + "index": idx, + "is_subword": is_subword, + } + pre_entities.append(pre_entity) + return pre_entities + + def _aggregate(self, pre_entities: List[dict]) -> List[dict]: + if self._aggregation_strategy in { + AggregationStrategy.NONE, + AggregationStrategy.SIMPLE, + }: + entities = [] + for pre_entity in pre_entities: + entity_idx = pre_entity["scores"].argmax() + score = pre_entity["scores"][entity_idx] + entity = { + "entity": self.config.id2label[entity_idx], + "score": score, + "index": pre_entity["index"], + "word": pre_entity["word"], + "start": pre_entity["start"], + "end": pre_entity["end"], + } + entities.append(entity) + else: + entities = self._aggregate_words(pre_entities) + + if self._aggregation_strategy == AggregationStrategy.NONE: + return entities + return self._group_entities(entities) + + def _aggregate_word(self, entities: List[dict]) -> dict: + word = self.tokenizer.convert_tokens_to_string( + [entity["word"] for entity in entities] + ) + if self._aggregation_strategy == AggregationStrategy.FIRST: + scores = entities[0]["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.config.id2label[idx] + elif self._aggregation_strategy == AggregationStrategy.MAX: + max_entity = max(entities, key=lambda entity: entity["scores"].max()) + scores = max_entity["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.config.id2label[idx] + elif self._aggregation_strategy == AggregationStrategy.AVERAGE: + scores = numpy.stack([entity["scores"] for entity in entities]) + average_scores = numpy.nanmean(scores, axis=0) + entity_idx = average_scores.argmax() + entity = self.config.id2label[entity_idx] + score = average_scores[entity_idx] + else: + raise ValueError( + f"Invalid aggregation_strategy: {self._aggregation_strategy}" + ) + new_entity = { + "entity": entity, + "score": score, + "word": word, + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return new_entity + + def _aggregate_words(self, entities: List[dict]) -> List[dict]: + word_entities = [] + word_group = None + for entity in entities: + if word_group is None: + word_group = [entity] + elif entity["is_subword"]: + word_group.append(entity) + else: + word_entities.append(self._aggregate_word(word_group)) + word_group = [entity] + # Last item + word_entities.append(self._aggregate_word(word_group)) + return word_entities + + def _group_sub_entities(self, entities: List[dict]) -> dict: + # Get the first entity in the entity group + entity = entities[0]["entity"].split("-")[-1] + scores = numpy.nanmean([entity["score"] for entity in entities]) + tokens = [entity["word"] for entity in entities] + + entity_group = { + "entity_group": entity, + "score": numpy.mean(scores), + "word": self.tokenizer.convert_tokens_to_string(tokens), + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return entity_group + + def _get_tag(self, entity_name: str) -> Tuple[str, str]: + if entity_name.startswith("B-"): + bi = "B" + tag = entity_name[2:] + elif 
entity_name.startswith("I-"): + bi = "I" + tag = entity_name[2:] + else: + # It's not in B-, I- format + bi = "B" + tag = entity_name + return bi, tag + + def _group_entities(self, entities: List[dict]) -> List[dict]: + + entity_groups = [] + entity_group_disagg = [] + + for entity in entities: + if not entity_group_disagg: + entity_group_disagg.append(entity) + continue + + # If the current entity is similar and adjacent to the previous entity, + # append it to the disaggregated entity group + # The split is meant to account for the "B" and "I" prefixes + # Shouldn't merge if both entities are B-type + bi, tag = self._get_tag(entity["entity"]) + last_bi, last_tag = self._get_tag(entity_group_disagg[-1]["entity"]) + + if tag == last_tag and bi != "B": + # Modify subword type to be previous_type + entity_group_disagg.append(entity) + else: + # If the current entity is different from the previous entity + # aggregate the disaggregated entity group + entity_groups.append(self._group_sub_entities(entity_group_disagg)) + entity_group_disagg = [entity] + if entity_group_disagg: + # it's the last entity, add it to the entity groups + entity_groups.append(self._group_sub_entities(entity_group_disagg)) + + return entity_groups diff --git a/src/deepsparse/transformers/server.py b/src/deepsparse/transformers/server.py deleted file mode 100644 index 59035dba80..0000000000 --- a/src/deepsparse/transformers/server.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
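The `_get_tag` and `_group_entities` helpers above implement the standard B-/I- grouping rule: a token joins the open entity group only when it shares the previous token's tag and is not itself a `B-` span start. A self-contained sketch of just that rule (the real pipeline additionally averages scores and re-joins word pieces through the tokenizer):

```python
from typing import List, Tuple


def get_tag(entity_name: str) -> Tuple[str, str]:
    # mirrors TokenClassificationPipeline._get_tag
    if entity_name.startswith("B-"):
        return "B", entity_name[2:]
    if entity_name.startswith("I-"):
        return "I", entity_name[2:]
    return "B", entity_name  # labels without a prefix start a new group


def group_labels(labels: List[str]) -> List[List[str]]:
    # simplified view of _group_entities operating on label strings only
    groups: List[List[str]] = []
    for label in labels:
        bi, tag = get_tag(label)
        if groups and tag == get_tag(groups[-1][-1])[1] and bi != "B":
            groups[-1].append(label)
        else:
            groups.append([label])
    return groups


print(group_labels(["B-PER", "I-PER", "O", "B-LOC", "I-LOC"]))
# [['B-PER', 'I-PER'], ['O'], ['B-LOC', 'I-LOC']]
```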
- -""" -Specs, schemas, and pipelines for use when serving transformers models -""" - -from typing import Any, Dict, List, Optional, Tuple, Union - -from deepsparse.tasks import SupportedTasks -from deepsparse.transformers.pipelines import Pipeline, pipeline - - -try: - from deepsparse.server.config import ServeModelConfig - - deepsparse_server_err = None -except Exception as _err: - deepsparse_server_err = _err - ServeModelConfig = object - -try: - from pydantic import BaseModel, Field - - pydantic_import_err = None -except Exception as _err: - pydantic_import_err = _err - BaseModel = object - Field = dict - - -__all__ = [ - "create_pipeline_definitions", - "QuestionAnsweringRequest", - "QuestionAnsweringResponse", - "TextClassificationRequest", - "TextClassificationResponse", - "TokenClassificationRequest", - "TokenClassificationResponse", -] - - -def create_pipeline_definitions( - model_config: ServeModelConfig, -) -> Tuple[Pipeline, Any, Any, Dict]: - """ - Create a pipeline definition and the supporting files for a given model config - to use for serving in the DeepSparse inference server - - :param model_config: the server model config describing the model and params - :return: a tuple containing (the pipeline to use for inference, - the expected request body, the expected response body, - any additional keyword args for use with the server) - """ - if deepsparse_server_err: - raise deepsparse_server_err - - if pydantic_import_err: - raise pydantic_import_err - - if SupportedTasks.nlp.question_answering.matches(model_config.task): - request_model = QuestionAnsweringRequest - response_model = Union[ - List[QuestionAnsweringResponse], - QuestionAnsweringResponse, - ] - kwargs = {} - elif SupportedTasks.nlp.text_classification.matches(model_config.task): - request_model = TextClassificationRequest - response_model = Union[ - List[TextClassificationResponse], List[List[TextClassificationResponse]] - ] - kwargs = {} - elif SupportedTasks.nlp.token_classification.matches(model_config.task): - request_model = TokenClassificationRequest - response_model = Union[ - List[TokenClassificationResponse], List[List[TokenClassificationResponse]] - ] - kwargs = {} - else: - raise ValueError( - f"unrecognized task given of {model_config.task} for config {model_config}" - ) - - pipeline_instance: Pipeline = pipeline( - task=model_config.task.lower().replace("_", "-"), - model_path=model_config.model_path, - engine_type=model_config.engine, - num_cores=model_config.num_cores, - scheduler=model_config.scheduler, - batch_size=model_config.batch_size, - **model_config.kwargs, - ) - - return pipeline_instance, request_model, response_model, kwargs - - -class QuestionAnsweringRequest(BaseModel): - """ - The request model for Question Answering Task - """ - - question: Union[List[str], str] = Field( - description="Either a string or a List of string questions to answer" - ) - context: Union[List[str], str] = Field( - description="Either a string or List of strings representing the context " - "for each question" - ) - - -class TokenClassificationRequest(BaseModel): - """ - Schema for TokenClassificationPipeline Request - """ - - inputs: Union[List[str], str] = Field( - description="A string or List of strings representing input to" - "TokenClassificationPipeline task" - ) - - -class TextClassificationRequest(BaseModel): - """ - Schema for TextClassificationPipeline Request - """ - - sequences: Union[List[str], str] = Field( - description="A string or List of strings representing input to" - 
"TextClassificationPipeline task" - ) - - -class QuestionAnsweringResponse(BaseModel): - """ - Schema for a result from Question Answering Task - """ - - score: float = Field(description="confidence score for prediction") - start: int = Field(description="The start index of the answer") - end: int = Field(description="The end index of the answer") - answer: str = Field(description="The predicted answer") - - -class TokenClassificationResponse(BaseModel): - """ - Schema for TokenClassificationPipeline Response - """ - - entity: str = Field( - description="The entity predicted for that token/word (it is named" - "`entity_group` when `aggregation_strategy` is not `none`." - ) - score: float = Field(description="The corresponding probability for `entity`.") - index: int = Field( - description="The index of the corresponding token in the sentence." - ) - word: str = Field(description="The token/word classified.") - start: Optional[int] = Field( - description="The index of the start of the corresponding entity in the " - "sentence. Only exists if the offsets are available within the tokenizer" - ) - end: Optional[int] = Field( - description="The index of the end of the corresponding entity in the sentence. " - "Only exists if the offsets are available within the tokenizer" - ) - - -class TextClassificationResponse(BaseModel): - """ - Schema for TextClassificationPipeline Response - """ - - label: str = Field(description="The label predicted.") - score: float = Field(description="The corresponding probability.") diff --git a/src/deepsparse/yolo/__init__.py b/src/deepsparse/yolo/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/src/deepsparse/yolo/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/deepsparse/yolo/annotate.py b/src/deepsparse/yolo/annotate.py new file mode 100644 index 0000000000..72f7770934 --- /dev/null +++ b/src/deepsparse/yolo/annotate.py @@ -0,0 +1,232 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Usage: deepsparse.object_detection.annotate [OPTIONS] + + Annotation Script for YOLO with DeepSparse + +Options: + --model_filepath, --model-filepath TEXT + Path/SparseZoo stub to the model file to be + used for annotation [default: zoo:cv/detect + ion/yolov5-s/pytorch/ultralytics/coco/pruned + -aggressive_96] + --source TEXT File path to image or directory of .jpg + files, a .mp4 video, or an integer (i.e. 0) + for webcam [required] + --engine [deepsparse|onnxruntime|torch] + Inference engine backend to run on. Choices + are 'deepsparse', 'onnxruntime', and + 'torch'. Default is 'deepsparse' + --image_shape, --image_shape INTEGER... + Image shape to use for inference, must be + two integers [default: 640, 640] + --num_cores, --num-cores INTEGER + The number of physical cores to run the + annotations with, defaults to using all + physical cores available on the system. For + DeepSparse benchmarks, this value is the + number of cores per socket + --save_dir, --save-dir DIRECTORY + The path to the directory for saving results + [default: annotation-results] + --name TEXT Name of directory in save-dir to write + results to. defaults to + {engine}-annotations-{run_number} + --target_fps, --target-fps FLOAT + Target FPS when writing video files. Frames + will be dropped to closely match target FPS. + --source must be a video file and if target- + fps is greater than the source video fps + then it will be ignored + --no_save, --no-save Set flag when source is from webcam to not + save results.Not supported for non-webcam + sources [default: False] + --help Show this message and exit. + +####### +Examples: + +1) deepsparse.object_detection.annotate --source PATH/TO/IMAGE.jpg +2) deepsparse.object_detection.annotate --source PATH/TO/VIDEO.mp4 +3) deepsparse.object_detection.annotate --source 0 +4) deepsparse.object_detection.annotate --source PATH/TO/IMAGE_DIR +""" +import logging +from typing import Optional + +import click + +import cv2 +from deepsparse.pipeline import Pipeline +from deepsparse.yolo import utils +from deepsparse.yolo.utils.cli_helpers import create_dir_callback + + +yolo_v5_default_stub = ( + "zoo:cv/detection/yolov5-s/pytorch/ultralytics/coco/" "pruned-aggressive_96" +) + +DEEPSPARSE_ENGINE = "deepsparse" +ORT_ENGINE = "onnxruntime" +TORCH_ENGINE = "torch" + +_LOGGER = logging.getLogger(__name__) + + +@click.command() +@click.option( + "--model_filepath", + "--model-filepath", + type=str, + default=yolo_v5_default_stub, + help="Path/SparseZoo stub to the model file to be used for annotation", + show_default=True, +) +@click.option( + "--source", + type=str, + required=True, + help="File path to image or directory of .jpg files, a .mp4 video, " + "or an integer (i.e. 0) for webcam", +) +@click.option( + "--engine", + type=click.Choice([DEEPSPARSE_ENGINE, ORT_ENGINE, TORCH_ENGINE]), + default=DEEPSPARSE_ENGINE, + help="Inference engine backend to run on. Choices are 'deepsparse', " + "'onnxruntime', and 'torch'. Default is 'deepsparse'", +) +@click.option( + "--image_shape", + "--image_shape", + type=int, + nargs=2, + default=(640, 640), + help="Image shape to use for inference, must be two integers", + show_default=True, +) +@click.option( + "--num_cores", + "--num-cores", + type=int, + default=None, + help="The number of physical cores to run the annotations with, " + "defaults to using all physical cores available on the system." 
+ " For DeepSparse benchmarks, this value is the number of cores " + "per socket", + show_default=True, +) +@click.option( + "--save_dir", + "--save-dir", + type=click.Path(dir_okay=True, file_okay=False), + default="annotation-results", + callback=create_dir_callback, + help="The path to the directory for saving results", + show_default=True, +) +@click.option( + "--name", + type=str, + default=None, + help="Name of directory in save-dir to write results to. defaults to " + "{engine}-annotations-{run_number}", +) +@click.option( + "--target_fps", + "--target-fps", + type=float, + default=None, + help="Target FPS when writing video files. Frames will be dropped to " + "closely match target FPS. --source must be a video file and if " + "target-fps is greater than the source video fps then it " + "will be ignored", + show_default=True, +) +@click.option( + "--no_save", + "--no-save", + is_flag=True, + help="Set flag when source is from webcam to not save results." + "Not supported for non-webcam sources", + show_default=True, +) +def main( + model_filepath: str, + source: str, + engine: str, + image_shape: tuple, + num_cores: Optional[int], + save_dir: str, + name: Optional[str], + target_fps: Optional[float], + no_save: bool, +) -> None: + """ + Annotation Script for YOLO with DeepSparse + """ + save_dir = utils.get_annotations_save_dir( + initial_save_dir=save_dir, + tag=name, + engine=engine, + ) + + loader, saver, is_video = utils.get_yolo_loader_and_saver( + path=source, + save_dir=save_dir, + image_shape=image_shape, + target_fps=target_fps, + no_save=no_save, + ) + + is_webcam = source.isnumeric() + yolo_pipeline = Pipeline.create( + task="yolo", + model_path=model_filepath, + class_names="coco", + engine_type=engine, + num_cores=num_cores, + ) + + for iteration, (input_image, source_image) in enumerate(loader): + + # annotate + annotated_images = utils.annotate( + pipeline=yolo_pipeline, + image_batch=input_image, + target_fps=target_fps, + calc_fps=is_video, + original_images=[source_image], + ) + + for annotated_image in annotated_images: + # display + if is_webcam: + cv2.imshow("annotated", annotated_image) + cv2.waitKey(1) + + # save + if saver: + saver.save_frame(annotated_image) + + if saver: + saver.close() + + _LOGGER.info(f"Results saved to {save_dir}") + + +if __name__ == "__main__": + main() diff --git a/src/deepsparse/yolo/pipelines.py b/src/deepsparse/yolo/pipelines.py new file mode 100644 index 0000000000..2398313c31 --- /dev/null +++ b/src/deepsparse/yolo/pipelines.py @@ -0,0 +1,248 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
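Beyond the CLI above, the same flow can be driven programmatically: `Pipeline.create(task="yolo", ...)` compiles the model and each call returns per-image boxes, scores, and labels. A minimal sketch using the annotation script's default SparseZoo stub and a placeholder image path:

```python
from deepsparse import Pipeline

yolo_pipeline = Pipeline.create(
    task="yolo",
    model_path=(
        "zoo:cv/detection/yolov5-s/pytorch/ultralytics/coco/pruned-aggressive_96"
    ),
    class_names="coco",
)

# PATH/TO/IMAGE.jpg is a placeholder; a list of loaded numpy images also works
outputs = yolo_pipeline(images=["PATH/TO/IMAGE.jpg"])

for image_output in outputs:
    # boxes are (x1, y1, x2, y2) after NMS; labels are COCO class names
    print(image_output.boxes, image_output.scores, image_output.labels)
```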
+ +import json +from typing import Dict, List, Optional, Tuple, Type, Union + +import numpy +import onnx + +from deepsparse.pipeline import Pipeline +from deepsparse.utils import model_to_path +from deepsparse.yolo.schemas import YOLOInput, YOLOOutput +from deepsparse.yolo.utils import COCO_CLASSES, YoloPostprocessor, postprocess_nms + + +try: + import cv2 + + cv2_error = None +except ModuleNotFoundError as cv2_import_error: + cv2 = None + cv2_error = cv2_import_error + + +@Pipeline.register( + task="yolo", + default_model_path=( + "zoo:cv/detection/yolov5-l/pytorch/ultralytics/coco/pruned_quant-aggressive_95" + ), +) +class YOLOPipeline(Pipeline): + """ + Image Segmentation YOLO pipeline for DeepSparse + + :param model_path: path on local system or SparseZoo stub to load the model from + :param engine_type: inference engine to use. Currently supported values include + 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' + :param batch_size: static batch size to use for inference. Default is 1 + :param num_cores: number of CPU cores to allocate for inference engine. None + specifies all available cores. Default is None + :param scheduler: (deepsparse only) kind of scheduler to execute with. + Pass None for the default + :param input_shapes: list of shapes to set ONNX the inputs to. Pass None + to use model as-is. Default is None + :param alias: optional name to give this pipeline instance, useful when + inferencing with multiple models. Default is None + :param class_names: Optional string identifier, dict, or json file of + class names to use for mapping class ids to class labels. Default is + `coco` + """ + + def __init__( + self, + *, + class_names: Optional[Union[str, Dict[str, str]]] = "coco", + model_config: Optional[str] = None, + **kwargs, + ): + super().__init__( + **kwargs, + ) + + if isinstance(class_names, str): + if class_names.endswith(".json"): + class_names = json.load(open(class_names)) + elif class_names == "coco": + class_names = COCO_CLASSES + else: + raise ValueError(f"Unknown class_names: {class_names}") + + if isinstance(class_names, dict): + self._class_names = class_names + elif isinstance(class_names, list): + self._class_names = { + str(index): class_name for index, class_name in enumerate(class_names) + } + else: + raise ValueError( + "class_names must be a str identifier, dict, json file, or " + f"list of class names got {type(class_names)}" + ) + + onnx_model = onnx.load(self.onnx_file_path) + self.has_postprocessing = self.model_has_postprocessing( + loaded_onnx_model=onnx_model, + ) + self.input_shape = self._infer_image_shape(onnx_model=onnx_model) + self.is_quantized = self.model_is_quantized(onnx_model=onnx_model) + self.postprocessor = ( + None + if self.has_postprocessing + else YoloPostprocessor( + image_size=self.input_shape, + cfg=model_config, + ) + ) + self._model_config = model_config + + @property + def model_config(self) -> str: + return self._model_config + + @property + def class_names(self) -> Optional[Dict[str, str]]: + return self._class_names + + @property + def input_schema(self) -> Type[YOLOInput]: + """ + :return: pydantic model class that inputs to this pipeline must comply to + """ + return YOLOInput + + @property + def output_schema(self) -> Type[YOLOOutput]: + """ + :return: pydantic model class that outputs of this pipeline must comply to + """ + return YOLOOutput + + def setup_onnx_file_path(self) -> str: + """ + Performs any setup to unwrap and process the given `model_path` and other + class properties into an inference ready 
onnx file to be compiled by the + engine of the pipeline + + :return: file path to the ONNX file for the engine to compile + """ + return model_to_path(self.model_path) + + def process_inputs(self, inputs: YOLOInput) -> List[numpy.ndarray]: + """ + :param inputs: inputs to the pipeline. Must be the type of the `input_schema` + of this pipeline + :return: inputs of this model processed into a list of numpy arrays that + can be directly passed into the forward pass of the pipeline engine + """ + image_batch = [] + + if isinstance(inputs.images, str): + inputs.images = [inputs.images] + + for image in inputs.images: + if isinstance(image, str): + image = cv2.imread(image) + image = cv2.resize(image, dsize=self.input_shape) + image = image[:, :, ::-1].transpose(2, 0, 1) + + image_batch.append(image) + + image_batch = numpy.stack(image_batch, axis=0) + image_batch = numpy.ascontiguousarray( + image_batch, + dtype=numpy.int8 if self.is_quantized else numpy.float32, + ) + image_batch /= 255 + + return [image_batch] + + def process_engine_outputs( + self, + engine_outputs: List[numpy.ndarray], + ) -> YOLOOutput: + """ + :param engine_outputs: list of numpy arrays that are the output of the engine + forward pass + :return: outputs of engine post-processed into an object in the `output_schema` + format of this pipeline + """ + + # post-processing + if self.postprocessor: + batch_output = self.postprocessor.pre_nms_postprocess(engine_outputs) + else: + batch_output = engine_outputs[ + 0 + ] # post-processed values stored in first output + + # NMS + batch_output = postprocess_nms(batch_output) + + batch_predictions, batch_boxes, batch_scores, batch_labels = [], [], [], [] + + for image_output in batch_output: + batch_predictions.append(image_output.tolist()) + batch_boxes.append(image_output[:, 0:4].tolist()) + batch_scores.append(image_output[:, 4].tolist()) + batch_labels.append( + [ + self.class_names[str(class_ids)] + for class_ids in image_output[:, 5].astype(int) + ] + ) + + return YOLOOutput( + predictions=batch_predictions, + boxes=batch_boxes, + scores=batch_scores, + labels=batch_labels, + ) + + def _infer_image_shape(self, onnx_model) -> Tuple[int, ...]: + """ + Infer and return the expected shape of the input tensor + + :return: The expected shape of the input tensor from onnx graph + """ + input_tensor = onnx_model.graph.input[0] + return ( + input_tensor.type.tensor_type.shape.dim[2].dim_value, + input_tensor.type.tensor_type.shape.dim[3].dim_value, + ) + + def model_has_postprocessing(self, loaded_onnx_model) -> bool: + """ + :return: True if loaded_onnx_model has postprocessing, False otherwise + """ + # get number of dimensions in each output + outputs_num_dims = [ + len(output.type.tensor_type.shape.dim) + for output in loaded_onnx_model.graph.output + ] + + # assume if only one output, then it is post-processed + if len(outputs_num_dims) == 1: + return True + + return all(num_dims > outputs_num_dims[0] for num_dims in outputs_num_dims[1:]) + + def model_is_quantized(self, onnx_model) -> bool: + """ + :return: True if loaded_onnx_model is quantized, False otherwise + """ + return ( + onnx_model.graph.input[0].type.tensor_type.elem_type + == onnx.TensorProto.UINT8 + ) diff --git a/src/deepsparse/yolo/schemas.py b/src/deepsparse/yolo/schemas.py new file mode 100644 index 0000000000..f60357dfb5 --- /dev/null +++ b/src/deepsparse/yolo/schemas.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Input/Output Schemas for Image Segmentation with YOLO +""" +from collections import namedtuple +from typing import List, Union + +import numpy +from pydantic import BaseModel + + +__all__ = [ + "YOLOOutput", + "YOLOInput", +] + +_YOLOImageOutput = namedtuple( + "_YOLOImageOutput", ["predictions", "boxes", "scores", "labels"] +) + + +class YOLOInput(BaseModel): + """ + Input model for image classification + """ + + images: Union[str, List[numpy.ndarray], List[str]] + + class Config: + arbitrary_types_allowed = True + + +class YOLOOutput(BaseModel): + """ + Output model for image classification + """ + + predictions: List[List[List[float]]] + boxes: List[List[List[float]]] + scores: List[List[float]] + labels: List[List[str]] + + def __getitem__(self, index): + if index >= len(self.predictions): + raise IndexError("Index out of range") + + return _YOLOImageOutput( + self.predictions[index], + self.boxes[index], + self.scores[index], + self.labels[index], + ) + + def __iter__(self): + for index in range(len(self.predictions)): + yield self[index] diff --git a/src/deepsparse/yolo/utils/__init__.py b/src/deepsparse/yolo/utils/__init__.py new file mode 100644 index 0000000000..5344738df6 --- /dev/null +++ b/src/deepsparse/yolo/utils/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa + +from .coco_classes import * +from .utils import * diff --git a/src/deepsparse/yolo/utils/cli_helpers.py b/src/deepsparse/yolo/utils/cli_helpers.py new file mode 100644 index 0000000000..ccd366236f --- /dev/null +++ b/src/deepsparse/yolo/utils/cli_helpers.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
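The `YOLOOutput` schema defined above keeps batch-order lists and exposes per-image access: `__getitem__` and `__iter__` return a `_YOLOImageOutput` namedtuple so results can be unpacked one image at a time. A small sketch with toy values for a single detection:

```python
from deepsparse.yolo.schemas import YOLOOutput

# toy output: one image in the batch with a single detection
output = YOLOOutput(
    predictions=[[[0.0, 0.0, 10.0, 10.0, 0.9, 0.0]]],
    boxes=[[[0.0, 0.0, 10.0, 10.0]]],
    scores=[[0.9]],
    labels=[["person"]],
)

first_image = output[0]  # _YOLOImageOutput(predictions, boxes, scores, labels)
print(first_image.boxes, first_image.labels)

for image_output in output:  # one namedtuple per image in the batch
    print(image_output.scores)
```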
+ +import os +from typing import Optional, Union + + +def parse_device( + ctx, + params, + value: Optional[Union[str, int]], +) -> Optional[Union[str, int]]: + """ + :param ctx: The click context + :param params: The click params + :param value: The device value to parse + :return: The correct inferred device + """ + try: + return int(value) + except (ValueError, TypeError): + return value + + +def create_dir_callback(ctx, params, value: str): + """ + Create and return directory if it doesn't exist. + + :param ctx: The click context + :param params: The click params + :param value: The value to create the directory from + :returns: The directory path + """ + os.makedirs(value, exist_ok=True) + return value diff --git a/src/deepsparse/yolo/utils/coco_classes.py b/src/deepsparse/yolo/utils/coco_classes.py new file mode 100644 index 0000000000..5e67829d8f --- /dev/null +++ b/src/deepsparse/yolo/utils/coco_classes.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +COCO_CLASSES = [ + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +] diff --git a/src/deepsparse/yolo/utils/utils.py b/src/deepsparse/yolo/utils/utils.py new file mode 100644 index 0000000000..0e14aad9fe --- /dev/null +++ b/src/deepsparse/yolo/utils/utils.py @@ -0,0 +1,795 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
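`COCO_CLASSES` above is the 80-entry label list that `YOLOPipeline` turns into its id-to-name mapping when `class_names="coco"`; integer class ids coming out of the network index into it. A short sketch of that mapping:

```python
from deepsparse.yolo.utils import COCO_CLASSES

# same construction YOLOPipeline uses: string class ids -> readable names
class_names = {str(index): name for index, name in enumerate(COCO_CLASSES)}

print(len(COCO_CLASSES))   # 80
print(class_names["0"])    # person
print(class_names["16"])   # dog
```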
+ +""" +Helpers and Utilities for YOLO +""" +import functools +import glob +import itertools +import logging +import os +import random +import shutil +import time +from pathlib import Path +from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union + +import numpy +import onnx +import yaml + +import torch +import torchvision +from sparsezoo.utils import create_dirs + + +try: + import cv2 + + cv2_error = None +except ModuleNotFoundError as cv2_import_error: + cv2 = None + cv2_error = cv2_import_error + +_YOLO_CLASS_COLORS = list(itertools.product([0, 255, 128, 64, 192], repeat=3)) +_YOLO_CLASS_COLORS.remove((255, 255, 255)) # remove white from possible colors +_LOGGER = logging.getLogger(__name__) + +# Default YOLO anchor grids +_YOLO_DEFAULT_ANCHORS = [ + torch.Tensor([[10, 13], [16, 30], [33, 23]]), + torch.Tensor([[30, 61], [62, 45], [59, 119]]), + torch.Tensor([[116, 90], [156, 198], [373, 326]]), +] +_YOLO_DEFAULT_ANCHOR_GRIDS = [ + t.clone().view(1, -1, 1, 1, 2) for t in _YOLO_DEFAULT_ANCHORS +] + + +@functools.lru_cache(maxsize=None) +def _get_color(label): + # cache color lookups + return random.choice(_YOLO_CLASS_COLORS) + + +class YoloPostprocessor: + """ + Class for performing post-processing of YOLO model predictions + + :param image_size: size of input image to model. used to calculate stride based on + output shapes + """ + + def __init__( + self, image_size: Tuple[int, int] = (640, 640), cfg: Optional[str] = None + ): + self._image_size = image_size + self._anchor_grids = ( + self._load_cfg_anchor_grid(cfg) if cfg else _YOLO_DEFAULT_ANCHOR_GRIDS + ) + self._grids = {} # Dict[Tuple[int], torch.Tensor] + + def pre_nms_postprocess(self, outputs: List[numpy.ndarray]) -> torch.Tensor: + """ + :param outputs: raw outputs of a YOLO model before anchor grid processing + :return: post-processed model outputs without NMS. 
+ """ + # postprocess and transform raw outputs into single torch tensor + processed_outputs = [] + for idx, pred in enumerate(outputs): + pred = torch.from_numpy(pred) + pred = pred.sigmoid() + + # get grid and stride + grid_shape = pred.shape[2:4] + grid = self._get_grid(grid_shape) + stride = self._image_size[0] / grid_shape[0] + + # decode xywh box values + pred[..., 0:2] = (pred[..., 0:2] * 2.0 - 0.5 + grid) * stride + pred[..., 2:4] = (pred[..., 2:4] * 2) ** 2 * self._anchor_grids[idx] + # flatten anchor and grid dimensions -> + # (bs, num_predictions, num_classes + 5) + processed_outputs.append(pred.view(pred.size(0), -1, pred.size(-1))) + return torch.cat(processed_outputs, 1) + + def _get_grid(self, grid_shape: Tuple[int, int]) -> torch.Tensor: + if grid_shape not in self._grids: + # adapted from yolov5.yolo.Detect._make_grid + coords_y, coords_x = torch.meshgrid( + [torch.arange(grid_shape[0]), torch.arange(grid_shape[1])] + ) + grid = torch.stack((coords_x, coords_y), 2) + self._grids[grid_shape] = grid.view( + 1, 1, grid_shape[0], grid_shape[1], 2 + ).float() + return self._grids[grid_shape] + + @staticmethod + def _load_cfg_anchor_grid(cfg: str) -> List[torch.Tensor]: + with open(cfg) as f: + anchors = yaml.safe_load(f)["anchors"] + + def _split_to_coords(coords_list): + return [ + [coords_list[idx], coords_list[idx + 1]] + for idx in range(0, len(coords_list), 2) + ] + + anchors = [torch.Tensor(_split_to_coords(coords)) for coords in anchors] + return [t.clone().view(1, -1, 1, 1, 2) for t in anchors] + + +def postprocess_nms(outputs: Union[torch.Tensor, numpy.ndarray]) -> List[numpy.ndarray]: + """ + :param outputs: Tensor of post-processed model outputs + :return: List of numpy arrays of NMS predictions for each image in the batch + """ + # run nms in PyTorch, only post-process first output + if isinstance(outputs, numpy.ndarray): + outputs = torch.from_numpy(outputs) + nms_outputs = _non_max_suppression(outputs) + return [output.cpu().numpy() for output in nms_outputs] + + +def _non_max_suppression( + prediction, + conf_thres=0.25, + iou_thres=0.45, + classes=None, + agnostic=False, + multi_label=False, + labels=(), +): + # Ported from ultralytics/yolov5 + + nc = prediction.shape[2] - 5 # number of classes + xc = prediction[..., 4] > conf_thres # candidates + + # Checks + assert 0 <= conf_thres <= 1, ( + f"Invalid Confidence threshold {conf_thres}, " + "valid values are between 0.0 and 1.0" + ) + assert ( + 0 <= iou_thres <= 1 + ), f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0" + + # Settings + _, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height + max_det = 300 # maximum number of detections per image + max_nms = 30000 # maximum number of boxes into torchvision.ops.nms() + time_limit = 10.0 # seconds to quit after + redundant = True # require redundant detections + multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img) + merge = False # use merge-NMS + + t = time.time() + output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0] + for xi, x in enumerate(prediction): # image index, image inference + # Apply constraints + # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 + x = x[xc[xi]] # confidence + + # Cat apriori labels if autolabelling + if labels and len(labels[xi]): + label_ = labels[xi] + v = torch.zeros((len(label_), nc + 5), device=x.device) + v[:, :4] = label_[:, 1:5] # box + v[:, 4] = 1.0 # conf + v[range(len(label_)), label_[:, 0].long() + 5] = 1.0 # cls + x = torch.cat((x, 
+
+        # If none remain process next image
+        if not x.shape[0]:
+            continue
+
+        # Compute conf
+        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
+
+        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
+        box = _xywh2xyxy(x[:, :4])
+
+        # Detections matrix nx6 (xyxy, conf, cls)
+        if multi_label:
+            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
+            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
+        else:  # best class only
+            conf, j = x[:, 5:].max(1, keepdim=True)
+            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
+
+        # Filter by class
+        if classes is not None:
+            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+
+        # Apply finite constraint
+        # if not torch.isfinite(x).all():
+        #     x = x[torch.isfinite(x).all(1)]
+
+        # Check shape
+        n = x.shape[0]  # number of boxes
+        if not n:  # no boxes
+            continue
+        elif n > max_nms:  # excess boxes
+            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence
+
+        # Batched NMS
+        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
+        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
+        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
+        if i.shape[0] > max_det:  # limit detections
+            i = i[:max_det]
+        if merge and (1 < n < 3e3):  # Merge NMS (boxes merged using weighted mean)
+            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
+            iou = _box_iou(boxes[i], boxes) > iou_thres  # iou matrix
+            weights = iou * scores[None]  # box weights
+            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(
+                1, keepdim=True
+            )  # merged boxes
+            if redundant:
+                i = i[iou.sum(1) > 1]  # require redundancy
+
+        output[xi] = x[i]
+        if (time.time() - t) > time_limit:
+            print(f"WARNING: NMS time limit {time_limit}s exceeded")
+            break  # time limit exceeded
+
+    return output
+
+
+def _xywh2xyxy(
+    x: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+    # ported from ultralytics/yolov5
+    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2]
+    # where xy1=top-left, xy2=bottom-right
+    y = x.clone() if isinstance(x, torch.Tensor) else numpy.copy(x)
+    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
+    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
+    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
+    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
+    return y
+
+
+def _box_iou(box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor:
+    # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
+    """
+    Return intersection-over-union (Jaccard index) of boxes.
+    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+    Arguments:
+        box1 (Tensor[N, 4])
+        box2 (Tensor[M, 4])
+    Returns:
+        iou (Tensor[N, M]): the NxM matrix containing the pairwise
+            IoU values for every element in boxes1 and boxes2
+    """
+
+    def box_area(box):
+        # box = 4xn
+        return (box[2] - box[0]) * (box[3] - box[1])
+
+    area1 = box_area(box1.T)
+    area2 = box_area(box2.T)
+
+    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
+    inter = (
+        (
+            torch.min(box1[:, None, 2:], box2[:, 2:])
+            - torch.max(box1[:, None, :2], box2[:, :2])
+        )
+        .clamp(0)
+        .prod(2)
+    )
+    return inter / (
+        area1[:, None] + area2 - inter
+    )  # iou = inter / (area1 + area2 - inter)
+
+
+def yolo_onnx_has_postprocessing(model_path: str) -> bool:
+    """
+    :param model_path: file path to YOLO ONNX model
+    :return: True if YOLO post-processing (pre-NMS) is included in the ONNX graph,
+        assumed to be the case when the first output of the model has fewer
+        dimensions than the other outputs, as the grid dimensions have been flattened
+    """
+    model = onnx.load(model_path)
+
+    # get number of dimensions in each output
+    outputs_num_dims = [
+        len(output.type.tensor_type.shape.dim) for output in model.graph.output
+    ]
+
+    # assume if only one output, then it is post-processed
+    if len(outputs_num_dims) == 1:
+        return True
+
+    return all(num_dims > outputs_num_dims[0] for num_dims in outputs_num_dims[1:])
+
+
+def annotate(
+    pipeline: "YOLOPipeline",  # noqa: F821
+    image_batch: Union[List[numpy.ndarray], List[str]],
+    target_fps: Optional[float] = None,
+    calc_fps: bool = False,
+    original_images: Optional[Union[List[numpy.ndarray], numpy.ndarray]] = None,
+) -> List[numpy.ndarray]:
+    """
+    Annotate and return image_batch with bounding boxes and labels
+
+    :param pipeline: A YOLOPipeline object
+    :param image_batch: A list of image file paths, or a batch of numpy images
+    :param target_fps: If not None, this FPS value is annotated on the images
+        instead of a measured value
+    :param calc_fps: If True and target_fps is None, the FPS is measured from the
+        pipeline run and annotated on the images
+    :param original_images: images from image_batch before any processing
+    :return: A list of annotated images
+    """
+
+    if not isinstance(image_batch, list):
+        image_batch = [image_batch]
+
+    if not original_images:
+        original_images = image_batch
+
+    batch_size = len(image_batch)
+    if image_batch and isinstance(image_batch[0], str):
+        original_images = [cv2.imread(image) for image in image_batch]
+
+    if target_fps is None and calc_fps:
+        start = time.time()
+
+    pipeline_outputs = pipeline(images=image_batch)
+
+    if target_fps is None and calc_fps:
+        target_fps = float(batch_size) / (time.time() - start)
+
+    annotated_images = []
+    for index, image_output in enumerate(pipeline_outputs):
+        image = original_images[index]
+        result = _annotate_image(
+            img=image,
+            boxes=image_output.boxes,
+            labels=image_output.labels,
+            scores=image_output.scores,
+            model_input_size=pipeline.input_shape,
+            images_per_sec=target_fps,
+        )
+        annotated_images.append(result)
+
+    return annotated_images
+
+
+def _annotate_image(
+    img: numpy.ndarray,
+    boxes: List[List[float]],
+    scores: List[float],
+    labels: List[str],
+    score_threshold: float = 0.35,
+    model_input_size: Optional[Tuple[int, int]] = None,
+    images_per_sec: Optional[float] = None,
+) -> numpy.ndarray:
+    """
+    Draws bounding boxes on predictions of a detection model
+
+    :param img: Original image to annotate (no pre-processing needed)
+    :param boxes: List of bounding boxes (x1, y1, x2, y2)
+    :param scores: List of scores for each bounding box
+    :param labels: List of labels for each bounding box
+    :param score_threshold: minimum score a detection should have to be annotated
+        on the image. Default is 0.35
+    :param model_input_size: 2-tuple of expected input size for the given model to
+        be used for bounding box scaling with original image. Scaling will not
+        be applied if model_input_size is None. Default is None
+    :param images_per_sec: optional frames per second value to annotate the corner
+        of the image with
+    :return: the original image annotated with the given bounding boxes
+    """
+    img_res = numpy.copy(img)
+
+    scale_y = img.shape[0] / (1.0 * model_input_size[0]) if model_input_size else 1.0
+    scale_x = img.shape[1] / (1.0 * model_input_size[1]) if model_input_size else 1.0
+
+    for idx in range(len(boxes)):
+        label = labels[idx]
+        if scores[idx] > score_threshold:
+            annotation_text = f"{label}: {scores[idx]:.0%}"
+
+            # bounding box points
+            left = boxes[idx][0] * scale_x
+            top = boxes[idx][1] * scale_y
+            right = boxes[idx][2] * scale_x
+            bottom = boxes[idx][3] * scale_y
+
+            # calculate text size
+            (text_width, text_height), text_baseline = cv2.getTextSize(
+                annotation_text,
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.9,  # font scale
+                2,  # thickness
+            )
+            text_height += text_baseline
+
+            # make solid background for annotation text
+            cv2.rectangle(
+                img_res,
+                (int(left), int(top) - 33),
+                (int(left) + text_width, int(top) - 28 + text_height),
+                _get_color(label),
+                thickness=-1,  # filled solid
+            )
+
+            # add white annotation text
+            cv2.putText(
+                img_res,
+                annotation_text,
+                (int(left), int(top) - 10),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.9,  # font scale
+                (255, 255, 255),  # white text
+                2,  # thickness
+                cv2.LINE_AA,
+            )
+
+            # draw bounding box
+            cv2.rectangle(
+                img_res,
+                (int(left), int(top)),
+                (int(right), int(bottom)),
+                _get_color(label),
+                thickness=2,
+            )
+
+    if images_per_sec is not None:
+        cv2.putText(
+            img_res,
+            f"images_per_sec: {int(images_per_sec)}",
+            (50, 50),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            2.0,  # font scale
+            (245, 46, 6),  # color
+            2,  # thickness
+            cv2.LINE_AA,
+        )
+    return img_res
+
+
+def get_yolo_loader_and_saver(
+    path: str,
+    save_dir: str,
+    image_shape: Tuple[int, int] = (640, 640),
+    target_fps: Optional[float] = None,
+    no_save: bool = False,
+) -> Union[Iterable, Any, bool]:
+    """
+    :param path: file path to image or directory of .jpg files, a .mp4 video,
+        or an integer (i.e. 0) for web-cam
+    :param save_dir: path of directory to save to
+    :param image_shape: size of input images to the model
+    :param target_fps: fps to save potential video at
+    :param no_save: set True if not saving results of processing
+    :return: image loader iterable and result saver object for
+        images, video, or web-cam based on the path given, and a boolean value
+        that is True if the returned objects load videos
+    """
+    # video
+    if path.endswith(".mp4"):
+        loader = YoloVideoLoader(path, image_shape)
+        saver = VideoSaver(
+            save_dir,
+            loader.original_fps,
+            loader.original_frame_size,
+            target_fps,
+        )
+        return loader, saver, True
+    # webcam
+    if path.isnumeric():
+        loader = YoloWebcamLoader(int(path), image_shape)
+        saver = (
+            VideoSaver(save_dir, 30, loader.original_frame_size, None)
+            if not no_save
+            else None
+        )
+        return loader, saver, True
+    # image file(s)
+    return YoloImageLoader(path, image_shape), ImagesSaver(save_dir), False
+
+
+class YoloImageLoader:
+    """
+    Class for pre-processing and iterating over images to be used as input for YOLO
+    models
+
+    :param path: Filepath to single image file or directory of image files to load,
+        glob paths also valid
+    :param image_size: size of input images to the model
+    """
+
+    def __init__(self, path: str, image_size: Tuple[int, int] = (640, 640)):
+        self._path = path
+        self._image_size = image_size
+
+        if os.path.isdir(path):
+            self._image_file_paths = [
+                os.path.join(path, file_name) for file_name in os.listdir(path)
+            ]
+        elif "*" in path:
+            self._image_file_paths = glob.glob(path)
+        elif os.path.isfile(path):
+            # single file
+            self._image_file_paths = [path]
+        else:
+            raise ValueError(f"{path} is not a file, glob, or directory")
+
+    def __iter__(self) -> Iterator[Tuple[numpy.ndarray, numpy.ndarray]]:
+        for image_path in self._image_file_paths:
+            yield load_image(image_path, image_size=self._image_size)
+
+
+class YoloVideoLoader:
+    """
+    Class for pre-processing and iterating over video frames to be used as input for
+    YOLO models
+
+    :param path: Filepath to single video file
+    :param image_size: size of input images to the model
+    """
+
+    def __init__(self, path: str, image_size: Tuple[int, int] = (640, 640)):
+        self._path = path
+        self._image_size = image_size
+        self._vid = cv2.VideoCapture(self._path)
+        self._total_frames = int(self._vid.get(cv2.CAP_PROP_FRAME_COUNT))
+        self._fps = self._vid.get(cv2.CAP_PROP_FPS)
+
+    def __iter__(self) -> Iterator[Tuple[numpy.ndarray, numpy.ndarray]]:
+        for _ in range(self._total_frames):
+            loaded, frame = self._vid.read()
+            if not loaded:
+                break
+            yield load_image(frame, image_size=self._image_size)
+        self._vid.release()
+
+    @property
+    def original_fps(self) -> float:
+        """
+        :return: the frames per second of the video this object reads
+        """
+        return self._fps
+
+    @property
+    def original_frame_size(self) -> Tuple[int, int]:
+        """
+        :return: the original size of frames in the video this object reads
+        """
+        return (
+            int(self._vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
+            int(self._vid.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+        )
+
+    @property
+    def total_frames(self) -> int:
+        """
+        :return: the total number of frames this object may load from the video
+        """
+        return self._total_frames
+
+
+class YoloWebcamLoader:
+    """
+    Class for pre-processing and iterating over webcam frames to be used as input for
+    YOLO models.
+
+    Adapted from: https://github.com/ultralytics/yolov5/blob/master/utils/datasets.py
+
+    :param camera: Webcam index
+    :param image_size: size of input images to the model
+    """
+
+    def __init__(self, camera: int, image_size: Tuple[int, int] = (640, 640)):
+
+        self._camera = camera
+        self._image_size = image_size
+        self._stream = cv2.VideoCapture(self._camera)
+        self._stream.set(cv2.CAP_PROP_BUFFERSIZE, 3)
+
+    def __iter__(self) -> Iterator[Tuple[numpy.ndarray, numpy.ndarray]]:
+        while True:
+            if cv2.waitKey(1) == ord("q"):  # q to quit
+                self._stream.release()
+                cv2.destroyAllWindows()
+                break
+            loaded, frame = self._stream.read()
+
+            assert loaded, f"Could not load image from webcam {self._camera}"
+
+            frame = cv2.flip(frame, 1)  # flip left-right
+            yield load_image(frame, image_size=self._image_size)
+
+    @property
+    def original_frame_size(self) -> Tuple[int, int]:
+        """
+        :return: the original size of frames in the stream this object reads
+        """
+        return (
+            int(self._stream.get(cv2.CAP_PROP_FRAME_WIDTH)),
+            int(self._stream.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+        )
+
+
+class ImagesSaver:
+    """
+    Base class for saving YOLO model outputs. Saves each image as an individual file in
+    the given directory
+
+    :param save_dir: path to directory to write to
+    """
+
+    def __init__(self, save_dir: str):
+        self._save_dir = save_dir
+        self._idx = 0
+
+        create_dirs(save_dir)
+
+    def save_frame(self, image: numpy.ndarray):
+        """
+        :param image: numpy array of image to save
+        """
+        output_path = os.path.join(self._save_dir, f"result-{self._idx}.jpg")
+        cv2.imwrite(output_path, image)
+        self._idx += 1
+
+    def close(self):
+        """
+        perform any clean-up tasks
+        """
+        pass
+
+
+class VideoSaver(ImagesSaver):
+    """
+    Class for saving YOLO model outputs as a video file
+
+    :param save_dir: path to directory to write to
+    :param original_fps: frames per second to save video with
+    :param output_frame_size: size of frames to write
+    :param target_fps: fps target for output video. If present, video
+        will be written with a certain number of the original frames
+        evenly dropped to match the target FPS.
+    """
+
+    def __init__(
+        self,
+        save_dir: str,
+        original_fps: float,
+        output_frame_size: Tuple[int, int],
+        target_fps: Optional[float] = None,
+    ):
+        super().__init__(save_dir)
+
+        self._output_frame_size = output_frame_size
+        self._original_fps = original_fps
+
+        if target_fps is not None and target_fps >= original_fps:
+            print(
+                f"target_fps {target_fps} is greater than source_fps "
+                f"{original_fps}. target_fps file will not be created"
+            )
+        self._target_fps = target_fps
+
+        self._file_path = os.path.join(self._save_dir, "results.mp4")
+        self._writer = cv2.VideoWriter(
+            self._file_path,
+            cv2.VideoWriter_fourcc(*"mp4v"),
+            original_fps,
+            self._output_frame_size,
+        )
+        self._n_frames = 0
+
+    def save_frame(self, image: numpy.ndarray):
+        """
+        :param image: numpy array of image to save
+        """
+        self._writer.write(image)
+        self._n_frames += 1
+
+    def close(self):
+        """
+        perform any clean-up tasks
+        """
+        self._writer.release()
+        if self._target_fps is not None and self._target_fps < self._original_fps:
+            self._write_target_fps_video()
+
+    def _write_target_fps_video(self):
+        assert self._target_fps is not None
+        num_frames_to_keep = int(
+            self._n_frames * (self._target_fps / self._original_fps)
+        )
+        # adjust target fps so we can keep the same video duration
+        adjusted_target_fps = num_frames_to_keep * (self._original_fps / self._n_frames)
+
+        # select num_frames_to_keep evenly spaced frame idxs
+        frame_idxs_to_keep = set(
+            numpy.round(numpy.linspace(0, self._n_frames, num_frames_to_keep))
+            .astype(int)
+            .tolist()
+        )
+
+        # create new video writer for adjusted video
+        vid_path = os.path.join(
+            self._save_dir, f"_results-{adjusted_target_fps:.2f}fps.mp4"
+        )
+        fps_writer = cv2.VideoWriter(
+            vid_path,
+            cv2.VideoWriter_fourcc(*"mp4v"),
+            adjusted_target_fps,
+            self._output_frame_size,
+        )
+
+        # read from original video and write to FPS adjusted video
+        saved_vid = cv2.VideoCapture(self._file_path)
+        for idx in range(self._n_frames):
+            _, frame = saved_vid.read()
+            if idx in frame_idxs_to_keep:
+                fps_writer.write(frame)
+
+        saved_vid.release()
+        fps_writer.release()
+        shutil.move(vid_path, self._file_path)  # overwrite original file
+
+
+def load_image(
+    img: Union[str, numpy.ndarray], image_size: Tuple[int, int] = (640, 640)
+) -> Tuple[numpy.ndarray, numpy.ndarray]:
+    """
+    :param img: file path to image or raw image array
+    :param image_size: target shape for image
+    :return: Image loaded into numpy and reshaped to the given shape and the original
+        image
+    """
+    img = cv2.imread(img) if isinstance(img, str) else img
+    img_resized = cv2.resize(img, image_size)
+    img_transposed = img_resized[:, :, ::-1].transpose(2, 0, 1)
+
+    return img_transposed, img
+
+
+def get_annotations_save_dir(
+    initial_save_dir: str,
+    tag: Optional[str] = None,
+    engine: Optional[str] = None,
+) -> str:
+    """
+    Returns the directory to save annotations to. If directory exists and is
+    non-empty, a number is appended to the end of the directory name.
+
+    :param initial_save_dir: Initial directory to save annotations to
+    :param tag: A tag under which to save the annotations inside `save_dir`
+    :param engine: Used to generate a unique tag if it is not provided.
+    :return: A new unique dir path to save annotations to
+    """
+    name = tag or f"{engine}-annotations"
+    initial_save_dir = os.path.join(initial_save_dir, name)
+    counter = 0
+    new_save_dir = initial_save_dir
+    while Path(new_save_dir).exists() and any(Path(new_save_dir).iterdir()):
+        counter += 1
+        new_save_dir = os.path.join(initial_save_dir, f"{name}-{counter:03d}")
+
+    _LOGGER.info(f"Results will be saved to {new_save_dir}")
+    Path(new_save_dir).mkdir(parents=True, exist_ok=True)
+    return new_save_dir
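
To make the intent of these helpers concrete, here is a minimal usage sketch (not part of the diff) showing how the loader, saver, and `annotate` utilities are meant to compose. The module path `deepsparse.yolo.utils`, the `"yolo"` task name, and the model stub below are assumptions for illustration; adjust them to whatever this PR actually registers.

```python
# Hypothetical sketch only: module path, task name, and model stub are assumptions.
from deepsparse import Pipeline
from deepsparse.yolo.utils import (  # assumed location of the helpers in this diff
    annotate,
    get_annotations_save_dir,
    get_yolo_loader_and_saver,
)

# placeholder stub; substitute a real SparseZoo stub or local ONNX path
yolo_pipeline = Pipeline.create(task="yolo", model_path="zoo:...")

save_dir = get_annotations_save_dir("annotation-results", engine="deepsparse")
loader, saver, is_video = get_yolo_loader_and_saver(
    path="sample_images/", save_dir=save_dir, image_shape=(640, 640)
)

for model_input, original_image in loader:
    # annotate() runs the pipeline and draws boxes scaled back onto the original image
    annotated = annotate(
        yolo_pipeline,
        image_batch=[model_input],
        original_images=[original_image],
        calc_fps=True,
    )
    saver.save_frame(annotated[0])

saver.close()
```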