diff --git a/README.md b/README.md index e0ea89bcc3..91e4d21b22 100644 --- a/README.md +++ b/README.md @@ -139,12 +139,12 @@ deepsparse.benchmark [-h] [-b BATCH_SIZE] [-shapes INPUT_SHAPES] ## 👩‍💻 NLP Inference Example ```python -from deepsparse import Pipeline +from deepsparse.transformers import pipeline # SparseZoo model stub or path to ONNX file model_path = "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni" -qa_pipeline = Pipeline.create( +qa_pipeline = pipeline( task="question-answering", model_path=model_path, ) diff --git a/setup.py b/setup.py index c6299ca473..6cd3ec7861 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,6 @@ _deps = [ "numpy>=1.16.3", "onnx>=1.5.0,<=1.10.1", - "pydantic>=1.8.2", "requests>=2.0.0", "tqdm>=4.0.0", "protobuf>=3.12.2", @@ -81,16 +80,6 @@ "onnxruntime>=1.7.0", ] -_ic_integration_deps = [ - "click<8.1", - "opencv-python", -] - -_yolo_integration_deps = [ - "torchvision>=0.3.0,<=0.10.1", - "opencv-python", -] - class OverrideInstall(install): """ @@ -183,15 +172,12 @@ def _setup_extras() -> Dict: "dev": _dev_deps, "server": _server_deps, "onnxruntime": _onnxruntime_deps, - "image_classification": _ic_integration_deps, - "yolo": _yolo_integration_deps, } def _setup_entry_points() -> Dict: data_api_entrypoint = "deepsparse.transformers.pipelines_cli:cli" eval_downstream = "deepsparse.transformers.eval_downstream:main" - return { "console_scripts": [ f"deepsparse.transformers.run_inference={data_api_entrypoint}", @@ -200,7 +186,6 @@ def _setup_entry_points() -> Dict: "deepsparse.check_hardware=deepsparse.cpu:print_hardware_capability", "deepsparse.benchmark=deepsparse.benchmark.benchmark_model:main", "deepsparse.server=deepsparse.server.main:start_server", - "deepsparse.object_detection.annotate=deepsparse.yolo.annotate:main", ] } diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index d9c28dc591..3d3113b74b 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -31,7 +31,6 @@ cpu_vnni_compatible, ) from .engine import * -from .pipeline import * from .version import __version__, is_release diff --git a/src/deepsparse/image_classification/__init__.py b/src/deepsparse/image_classification/__init__.py deleted file mode 100644 index 0c44f887a4..0000000000 --- a/src/deepsparse/image_classification/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/src/deepsparse/image_classification/constants.py b/src/deepsparse/image_classification/constants.py deleted file mode 100644 index d035e44513..0000000000 --- a/src/deepsparse/image_classification/constants.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -IMAGENET_RGB_MEANS = [0.485, 0.456, 0.406] -IMAGENET_RGB_STDS = [0.229, 0.224, 0.225] diff --git a/src/deepsparse/image_classification/pipelines.py b/src/deepsparse/image_classification/pipelines.py deleted file mode 100644 index e085937728..0000000000 --- a/src/deepsparse/image_classification/pipelines.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Image classification pipeline -""" -import json -from typing import Dict, List, Optional, Tuple, Type, Union - -import numpy -import onnx - -from deepsparse.image_classification.constants import ( - IMAGENET_RGB_MEANS, - IMAGENET_RGB_STDS, -) -from deepsparse.image_classification.schemas import ( - ImageClassificationInput, - ImageClassificationOutput, -) -from deepsparse.pipeline import Pipeline -from deepsparse.utils import model_to_path - - -try: - import cv2 - - cv2_error = None -except ModuleNotFoundError as cv2_import_error: - cv2 = None - cv2_error = cv2_import_error - - -@Pipeline.register( - task="image_classification", - default_model_path=( - "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/" - "imagenet/pruned85_quant-none-vnni" - ), -) -class ImageClassificationPipeline(Pipeline): - """ - Image classification pipeline for DeepSparse - - :param model_path: path on local system or SparseZoo stub to load the model from - :param engine_type: inference engine to use. Currently supported values include - 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' - :param batch_size: static batch size to use for inference. Default is 1 - :param num_cores: number of CPU cores to allocate for inference engine. None - specifies all available cores. Default is None - :param scheduler: (deepsparse only) kind of scheduler to execute with. - Pass None for the default - :param input_shapes: list of shapes to set ONNX the inputs to. Pass None - to use model as-is. Default is None - :param alias: optional name to give this pipeline instance, useful when - inferencing with multiple models. Default is None - :param class_names: Optional dict, or json file of class names to use for - mapping class ids to class labels. 
Default is None - """ - - def __init__( - self, - *, - class_names: Union[None, str, Dict[str, str]] = None, - **kwargs, - ): - super().__init__(**kwargs) - - if isinstance(class_names, str) and class_names.endswith(".json"): - self._class_names = json.load(open(class_names)) - elif isinstance(class_names, dict): - self._class_names = class_names - else: - self._class_names = None - - self._image_size = self._infer_image_size() - - @property - def class_names(self) -> Optional[Dict[str, str]]: - """ - :return: Optional dict, or json file of class names to use for - mapping class ids to class labels - """ - return self._class_names - - @property - def input_schema(self) -> Type[ImageClassificationInput]: - """ - :return: pydantic model class that inputs to this pipeline must comply to - """ - return ImageClassificationInput - - @property - def output_schema(self) -> Type[ImageClassificationOutput]: - """ - :return: pydantic model class that outputs of this pipeline must comply to - """ - return ImageClassificationOutput - - def setup_onnx_file_path(self) -> str: - """ - Performs any setup to unwrap and process the given `model_path` and other - class properties into an inference ready onnx file to be compiled by the - engine of the pipeline - - :return: file path to the ONNX file for the engine to compile - """ - - return model_to_path(self.model_path) - - def process_inputs(self, inputs: ImageClassificationInput) -> List[numpy.ndarray]: - """ - Pre-Process the Inputs for DeepSparse Engine - - :param inputs: input model - :return: list of preprocessed numpy arrays - """ - - if isinstance(inputs.images, numpy.ndarray): - image_batch = inputs.images - else: - - image_batch = [] - - if isinstance(inputs.images, str): - inputs.images = [inputs.images] - - for image in inputs.images: - if cv2 is None: - raise RuntimeError( - "cv2 is required to load image inputs from file " - f"Unable to import: {cv2_error}" - ) - img = cv2.imread(image) if isinstance(image, str) else image - - img = cv2.resize(img, dsize=self._image_size) - img = img[:, :, ::-1].transpose(2, 0, 1) - image_batch.append(img) - - image_batch = numpy.stack(image_batch, axis=0) - - original_dtype = image_batch.dtype - image_batch = numpy.ascontiguousarray(image_batch, dtype=numpy.float32) - - if original_dtype == numpy.uint8: - - image_batch /= 255 - - # normalize entire batch - image_batch -= numpy.asarray(IMAGENET_RGB_MEANS).reshape((-1, 3, 1, 1)) - image_batch /= numpy.asarray(IMAGENET_RGB_STDS).reshape((-1, 3, 1, 1)) - - return [image_batch] - - def process_engine_outputs( - self, - engine_outputs: List[numpy.ndarray], - ) -> ImageClassificationOutput: - """ - :param engine_outputs: list of numpy arrays that are the output of the engine - forward pass - :return: outputs of engine post-processed into an object in the `output_schema` - format of this pipeline - """ - labels = numpy.argmax(engine_outputs[0], axis=1).tolist() - - if self.class_names is not None: - labels = [self.class_names[str(class_id)] for class_id in labels] - - return self.output_schema( - scores=numpy.max(engine_outputs[0], axis=1).tolist(), - labels=labels, - ) - - def _infer_image_size(self) -> Tuple[int, ...]: - """ - Infer and return the expected shape of the input tensor - - :return: The expected shape of the input tensor from onnx graph - """ - onnx_model = onnx.load(self.onnx_file_path) - input_tensor = onnx_model.graph.input[0] - return ( - input_tensor.type.tensor_type.shape.dim[2].dim_value, - input_tensor.type.tensor_type.shape.dim[3].dim_value, - ) 
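> Editor's note: with `ImageClassificationPipeline` and its `constants.py` removed above, callers that still need image-classification inference must handle preprocessing themselves before invoking the engine directly. The sketch below is illustrative only and reconstructed from the deleted code; it assumes `opencv-python` is installed (no longer pulled in by this package), hard-codes a 224x224 input size instead of inferring it from the ONNX graph, and uses `model.onnx` / `cat.jpg` as placeholder paths.

```python
# Minimal, standalone preprocessing sketch mirroring what the removed
# ImageClassificationPipeline did: load an image with OpenCV, resize,
# convert BGR->RGB and HWC->CHW, scale to [0, 1], then normalize with the
# ImageNet means/stds from the deleted constants.py.
import cv2  # assumed available; no longer an optional dependency of this package
import numpy

IMAGENET_RGB_MEANS = [0.485, 0.456, 0.406]
IMAGENET_RGB_STDS = [0.229, 0.224, 0.225]


def preprocess(image_path: str, image_size=(224, 224)) -> numpy.ndarray:
    img = cv2.imread(image_path)               # HWC, BGR, uint8
    img = cv2.resize(img, dsize=image_size)
    img = img[:, :, ::-1].transpose(2, 0, 1)   # BGR -> RGB, HWC -> CHW
    batch = numpy.ascontiguousarray(img[None], dtype=numpy.float32)
    batch /= 255  # cv2.imread returns uint8, so rescale to [0, 1]
    batch -= numpy.asarray(IMAGENET_RGB_MEANS).reshape((-1, 3, 1, 1))
    batch /= numpy.asarray(IMAGENET_RGB_STDS).reshape((-1, 3, 1, 1))
    return batch


# Example usage with the engine compiled directly (placeholder paths):
# from deepsparse import compile_model
# engine = compile_model("model.onnx", batch_size=1)
# scores = engine.run([preprocess("cat.jpg")])
```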
diff --git a/src/deepsparse/image_classification/schemas.py b/src/deepsparse/image_classification/schemas.py deleted file mode 100644 index 5a92b90e3b..0000000000 --- a/src/deepsparse/image_classification/schemas.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Input/Output Schemas for Image Classification. -""" - -from typing import List, Union - -import numpy -from pydantic import BaseModel - - -class ImageClassificationInput(BaseModel): - """ - Input model for image classification - """ - - images: Union[str, numpy.ndarray, List[str]] - - class Config: - arbitrary_types_allowed = True - - -class ImageClassificationOutput(BaseModel): - """ - Output model for image classification - """ - - labels: List[Union[int, str]] - scores: List[float] diff --git a/src/deepsparse/image_classification/validation_script.py b/src/deepsparse/image_classification/validation_script.py deleted file mode 100644 index e176b4072c..0000000000 --- a/src/deepsparse/image_classification/validation_script.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Usage: validation_script.py [OPTIONS] - - Validation Script for Image Classification Models - -Options: - --dataset-path, --dataset_path DIRECTORY - Path to the validation dataset [required] - --model-path, --model_path TEXT - Path/SparseZoo stub for the Image - Classification model to be evaluated. - Defaults to resnet50 trained on - Imagenette [default: zoo:cv/classification/ - resnet_v1-50/pytorch/sparseml/imagenette/ - base-none] - --batch-size, --batch_size INTEGER - Test batch size, must divide the dataset - evenly, else the last batch will be dropped - [default: 1] - --help Show this message and exit. - -######### -EXAMPLES -######### - -########## -Example command for validating pruned resnet50 on imagenette dataset: -python validation_script.py \ - --dataset-path /path/to/imagenette/ - -""" -from tqdm import tqdm - -from deepsparse.pipeline import Pipeline -from torch.utils.data import DataLoader -from torchvision import transforms - - -try: - import torchvision - -except ModuleNotFoundError as torchvision_error: # noqa: F841 - print( - "Torchvision not installed. 
Please install it using the command:" - "pip install torchvision>=0.3.0,<=0.10.1" - ) - exit(1) - -import click - - -resnet50_imagenet_pruned = ( - "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenette/base-none" -) - - -@click.command() -@click.option( - "--dataset-path", - "--dataset_path", - required=True, - type=click.Path(dir_okay=True, file_okay=False), - help="Path to the validation dataset", -) -@click.option( - "--model-path", - "--model_path", - type=str, - default=resnet50_imagenet_pruned, - help="Path/SparseZoo stub for the Image Classification model to be " - "evaluated. Defaults to dense (vanilla) resnet50 trained on Imagenette", - show_default=True, -) -@click.option( - "--batch-size", - "--batch_size", - type=int, - default=1, - show_default=True, - help="Test batch size, must divide the dataset evenly, else last " - "batch will be dropped", -) -@click.option( - "--image-size", - "--image_size", - type=int, - default=224, - show_default=True, - help="Test batch size, must divide the dataset evenly, else last " - "batch will be dropped", -) -def main(dataset_path: str, model_path: str, batch_size: int, image_size: int): - """ - Validation Script for Image Classification Models - """ - - dataset = torchvision.datasets.ImageFolder( - root=dataset_path, - transform=transforms.Compose( - [ - transforms.ToTensor(), - transforms.Resize(size=(image_size, image_size)), - ] - ), - ) - - data_loader = DataLoader( - dataset=dataset, - batch_size=batch_size, - drop_last=True, - ) - - pipeline = Pipeline.create( - task="image_classification", - model_path=model_path, - batch_size=batch_size, - ) - correct = total = 0 - progress_bar = tqdm(data_loader) - - for batch in progress_bar: - batch, actual_labels = batch - batch = batch.numpy() - outs = pipeline(images=batch) - predicted_labels = outs.labels - - for actual, predicted in zip(actual_labels, predicted_labels): - total += 1 - if isinstance(predicted, str): - predicted = int(predicted) - if actual.item() == predicted: - correct += 1 - - if total > 0: - progress_bar.set_postfix( - {"Running Accuracy": f"{correct * 100 / total:.2f}%"} - ) - - # prevent division by zero - if total == 0: - epsilon = 1e-5 - total += epsilon - - print(f"Accuracy: {correct * 100 / total:.2f} %") - - -if __name__ == "__main__": - main() diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py deleted file mode 100644 index 5ab6b9ec63..0000000000 --- a/src/deepsparse/pipeline.py +++ /dev/null @@ -1,546 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Classes and registry for end to end inference pipelines that wrap an underlying -inference engine and include pre/postprocessing -""" - - -import os -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Type, Union - -import numpy -from pydantic import BaseModel, Field - -from deepsparse import Engine, Scheduler -from deepsparse.benchmark import ORTEngine -from deepsparse.tasks import SupportedTasks - - -__all__ = [ - "DEEPSPARSE_ENGINE", - "ORT_ENGINE", - "SUPPORTED_PIPELINE_ENGINES", - "Pipeline", - "PipelineConfig", -] - - -DEEPSPARSE_ENGINE = "deepsparse" -ORT_ENGINE = "onnxruntime" - -SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] - - -_REGISTERED_PIPELINES = {} - - -class Pipeline(ABC): - """ - Generic Pipeline abstract class meant to wrap inference engine objects to include - data pre/post-processing. Inputs and outputs of pipelines should be serialized - as pydantic Models. - - Pipelines should not be instantiated by their constructors, but rather the - `Pipeline.create()` method. The task name given to `create` will be used to - load the appropriate pipeline. When creating a Pipeline, the pipeline should - inherit from `Pipeline` and implement the `setup_onnx_file_path`, `process_inputs`, - `process_engine_outputs`, `input_schema`, and `output_schema` abstract methods. - - Finally, the class definition should be decorated by the `Pipeline.register` - function. This defines the task name and task aliases for the pipeline and - ensures that it will be accessible by `Pipeline.create`. The implemented - `Pipeline` subclass must be imported at runtime to be accessible. - - Pipeline lifecycle: - - On instantiation - * `onnx_file_path` <- `setup_onnx_file_path` - * `engine` <- `_initialize_engine` - - - on __call__: - * `parsed_inputs: input_schema` <- `parse_inputs(*args, **kwargs)` - * `pre_processed_inputs` <- `process_inputs(parsed_inputs)` - * `engine_outputs` <- `engine(pre_processed_inputs)` - * `outputs: output_schema` <- `process_engine_outputs(engine_outputs)` - - Example use of register: - ```python - @Pipeline.register( - task="example_task", - task_aliases=["example_alias_1", "example_alias_2"], - ) - class PipelineImplementation(Pipeline): - # implementation of Pipeline abstract methods here - ``` - - Example use of pipeline: - ```python - example_pipeline = Pipeline.create( - task="example_task", - model_path="model.onnx", - ) - pipeline_outputs = example_pipeline(pipeline_inputs) - ``` - - :param model_path: path on local system or SparseZoo stub to load the model from - :param engine_type: inference engine to use. Currently supported values include - 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' - :param batch_size: static batch size to use for inference. Default is 1 - :param num_cores: number of CPU cores to allocate for inference engine. None - specifies all available cores. Default is None - :param scheduler: (deepsparse only) kind of scheduler to execute with. - Pass None for the default - :param input_shapes: list of shapes to set ONNX the inputs to. Pass None - to use model as-is. Default is None - :param alias: optional name to give this pipeline instance, useful when - inferencing with multiple models. 
Default is None - """ - - def __init__( - self, - model_path: str, - engine_type: str = DEEPSPARSE_ENGINE, - batch_size: int = 1, - num_cores: int = None, - scheduler: Scheduler = None, - input_shapes: List[List[int]] = None, - alias: Optional[str] = None, - ): - self._model_path_orig = model_path - self._model_path = model_path - self._engine_type = engine_type - self._alias = alias - - self._engine_args = dict( - batch_size=batch_size, - num_cores=num_cores, - input_shapes=input_shapes, - ) - if engine_type.lower() == DEEPSPARSE_ENGINE: - self._engine_args["scheduler"] = scheduler - - self.onnx_file_path = self.setup_onnx_file_path() - self.engine = self._initialize_engine() - - def __call__(self, *args, **kwargs) -> BaseModel: - # parse inputs into input_schema schema if necessary - pipeline_inputs = self.parse_inputs(*args, **kwargs) - if not isinstance(pipeline_inputs, self.input_schema): - raise RuntimeError( - f"Unable to parse {self.__class__} inputs into a " - f"{self.input_schema} object. Inputs parsed to {type(pipeline_inputs)}" - ) - - # run pipeline - engine_inputs: List[numpy.ndarray] = self.process_inputs(pipeline_inputs) - - if isinstance(engine_inputs, tuple): - engine_inputs, postprocess_kwargs = engine_inputs - else: - postprocess_kwargs = {} - - engine_outputs: List[numpy.ndarray] = self.engine(engine_inputs) - pipeline_outputs = self.process_engine_outputs( - engine_outputs, **postprocess_kwargs - ) - - # validate outputs format - if not isinstance(pipeline_outputs, self.output_schema): - raise ValueError( - f"Outputs of {self.__class__} must be instances of " - f"{self.output_schema} found output of type {type(pipeline_outputs)}" - ) - - return pipeline_outputs - - @staticmethod - def create( - task: str, - model_path: str = None, - engine_type: str = DEEPSPARSE_ENGINE, - batch_size: int = 1, - num_cores: int = None, - scheduler: Scheduler = None, - input_shapes: List[List[int]] = None, - alias: Optional[str] = None, - **kwargs, - ) -> "Pipeline": - """ - :param task: name of task to create a pipeline for - :param model_path: path on local system or SparseZoo stub to load the model - from. Some tasks may have a default model path - :param engine_type: inference engine to use. Currently supported values - include 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' - :param batch_size: static batch size to use for inference. Default is 1 - :param num_cores: number of CPU cores to allocate for inference engine. None - specifies all available cores. Default is None - :param scheduler: (deepsparse only) kind of scheduler to execute with. - Pass None for the default - :param input_shapes: list of shapes to set ONNX the inputs to. Pass None - to use model as-is. Default is None - :param alias: optional name to give this pipeline instance, useful when - inferencing with multiple models. Default is None - :param kwargs: extra task specific kwargs to be passed to task Pipeline - implementation - :return: pipeline object initialized for the given task - """ - task = task.lower().replace("-", "_") - - # extra step to register pipelines for a given task domain - # for cases where imports should only happen once a user specifies - # that domain is to be used. (ie deepsparse.transformers will auto - # install extra packages so should only import and register once a - # transformers task is specified) - SupportedTasks.check_register_task(task) - - if task not in _REGISTERED_PIPELINES: - raise ValueError( - f"Unknown Pipeline task {task}. 
Pipeline tasks should be " - "must be declared with the Pipeline.register decorator. Currently " - f"registered pipelines: {list(_REGISTERED_PIPELINES.keys())}" - ) - - pipeline_constructor = _REGISTERED_PIPELINES[task] - - if ( - model_path is None - and hasattr(pipeline_constructor, "default_model_path") - and pipeline_constructor.default_model_path - ): - model_path = pipeline_constructor.default_model_path - - if model_path is None: - raise ValueError( - f"No model_path provided for pipeline {pipeline_constructor}. Must " - "provide a model path for pipelines that do not have a default defined" - ) - - return pipeline_constructor( - model_path=model_path, - engine_type=engine_type, - batch_size=batch_size, - num_cores=num_cores, - scheduler=scheduler, - input_shapes=input_shapes, - alias=alias, - **kwargs, - ) - - @classmethod - def register( - cls, - task: str, - task_aliases: Optional[List[str]] = None, - default_model_path: Optional[str] = None, - ): - """ - Pipeline implementer class decorator that registers the pipeline - task name and its aliases as valid tasks that can be used to load - the pipeline through `Pipeline.create()`. - - Multiple pipelines may not have the same task name. An error will - be raised if two different pipelines attempt to register the same task name - - :param task: main task name of this pipeline - :param task_aliases: list of extra task names that may be used to reference - this pipeline. Default is None - :param default_model_path: path (ie zoo stub) to use as default for this - task if None is provided - """ - task_names = [task] - if task_aliases: - task_names.extend(task_aliases) - - def _register_task(task_name, pipeline_class): - if task_name in _REGISTERED_PIPELINES and ( - pipeline_class is not _REGISTERED_PIPELINES[task_name] - ): - raise RuntimeError( - f"task {task_name} already registered by Pipeline.register. " - f"attempting to register pipeline: {pipeline_class}, but" - f"pipeline: {_REGISTERED_PIPELINES[task_name]}, already registered" - ) - _REGISTERED_PIPELINES[task_name] = pipeline_class - - def _register_pipeline_tasks_decorator(pipeline_class: Pipeline): - if not issubclass(pipeline_class, cls): - raise RuntimeError( - f"Attempting to register pipeline pipeline_class. 
" - f"Registered pipelines must inherit from {cls}" - ) - for task_name in task_names: - _register_task(task_name, pipeline_class) - - # set task and task_aliases as class level property - pipeline_class.task = task - pipeline_class.task_aliases = task_aliases - pipeline_class.default_model_path = default_model_path - - return pipeline_class - - return _register_pipeline_tasks_decorator - - @classmethod - def from_config(cls, config: Union["PipelineConfig", str, Path]) -> "Pipeline": - """ - :param config: PipelineConfig object, filepath to a json serialized - PipelineConfig, or raw string of a json serialized PipelineConfig - :return: loaded Pipeline object from the config - """ - if isinstance(config, Path) or ( - isinstance(config, str) and os.path.exists(config) - ): - if isinstance(config, str): - config = Path(config) - config = PipelineConfig.parse_file(config) - if isinstance(config, str): - config = PipelineConfig.parse_raw(config) - - return cls.create( - task=config.task, - model_path=config.model_path, - engine_type=config.engine_type, - batch_size=config.batch_size, - num_cores=config.num_cores, - scheduler=config.scheduler, - input_shapes=config.input_shapes, - alias=config.alias, - **config.kwargs, - ) - - @abstractmethod - def setup_onnx_file_path(self) -> str: - """ - Performs any setup to unwrap and process the given `model_path` and other - class properties into an inference ready onnx file to be compiled by the - engine of the pipeline - - :return: file path to the ONNX file for the engine to compile - """ - raise NotImplementedError() - - @abstractmethod - def process_inputs( - self, - inputs: BaseModel, - ) -> Union[List[numpy.ndarray], Tuple[List[numpy.ndarray], Dict[str, Any]]]: - """ - :param inputs: inputs to the pipeline. Must be the type of the `input_schema` - of this pipeline - :return: inputs of this model processed into a list of numpy arrays that - can be directly passed into the forward pass of the pipeline engine. 
Can - also include a tuple with engine inputs and special key word arguments - to pass to process_engine_outputs to facilitate information from the raw - inputs to postprocessing that may not be included in the engine inputs - """ - raise NotImplementedError() - - @abstractmethod - def process_engine_outputs( - self, - engine_outputs: List[numpy.ndarray], - **kwargs, - ) -> BaseModel: - """ - :param engine_outputs: list of numpy arrays that are the output of the engine - forward pass - :return: outputs of engine post-processed into an object in the `output_schema` - format of this pipeline - """ - raise NotImplementedError() - - @property - @abstractmethod - def input_schema(self) -> Type[BaseModel]: - """ - :return: pydantic model class that inputs to this pipeline must comply to - """ - raise NotImplementedError() - - @property - @abstractmethod - def output_schema(self) -> Type[BaseModel]: - """ - :return: pydantic model class that outputs of this pipeline must comply to - """ - raise NotImplementedError() - - @property - def alias(self) -> str: - """ - :return: optional name to give this pipeline instance, useful when - inferencing with multiple models - """ - return self._alias - - @property - def model_path_orig(self) -> str: - """ - :return: value originally passed to the `model_path` argument to initialize - this Pipeline - """ - return self._model_path_orig - - @property - def model_path(self) -> str: - """ - :return: path on local system to the onnx file of this model or directory - containing a model.onnx file along with supporting files - """ - return self._model_path - - @property - def engine_args(self) -> Dict[str, Any]: - """ - :return: arguments besides onnx filepath used to instantiate engine - """ - return self._engine_args - - @property - def engine_type(self) -> str: - """ - :return: type of inference engine used for model forward pass - """ - return self._engine_type - - def to_config(self) -> "PipelineConfig": - """ - :return: PipelineConfig that can be used to reload this object - """ - - if not hasattr(self, "task"): - raise RuntimeError( - f"{self.__class__} instance has no attribute task. Pipeline objects " - "must have a task to be serialized to a config. Pipeline objects " - "must be declared with the Pipeline.register object to be assigned a " - "task" - ) - - # parse any additional properties as kwargs - kwargs = {} - for attr_name, attr in self.__class__.__dict__.items(): - if isinstance(attr, property) and attr_name not in dir(PipelineConfig): - kwargs[attr_name] = getattr(self, attr_name) - - return PipelineConfig( - task=self.task, - model_path=self.model_path_orig, - engine_type=self.engine_type, - batch_size=self.batch_size, - num_cores=self.num_cores, - scheduler=self.scheduler, - input_shapes=self.input_shapes, - alias=self.alias, - kwargs=kwargs, - ) - - def parse_inputs(self, *args, **kwargs) -> BaseModel: - """ - :param args: ordered arguments to pipeline, only an input_schema object - is supported as an arg for this function - :param kwargs: keyword arguments to pipeline - :return: pipeline arguments parsed into the given `input_schema` - schema if necessary. If an instance of the `input_schema` is provided - it will be returned - """ - # passed input_schema schema directly - if len(args) == 1 and isinstance(args[0], self.input_schema) and not kwargs: - return args[0] - - if args: - raise ValueError( - f"pipeline {self.__class__} only supports either only a " - f"{self.input_schema} object. or keyword arguments to be construct " - f"one. 
Found {len(args)} args and {len(kwargs)} kwargs" - ) - - return self.input_schema(**kwargs) - - def _initialize_engine(self) -> Union[Engine, ORTEngine]: - engine_type = self.engine_type.lower() - - if engine_type == DEEPSPARSE_ENGINE: - return Engine(self.onnx_file_path, **self._engine_args) - elif engine_type == ORT_ENGINE: - return ORTEngine(self.onnx_file_path, **self._engine_args) - else: - raise ValueError( - f"Unknown engine_type {self.engine_type}. Supported values include: " - f"{SUPPORTED_PIPELINE_ENGINES}" - ) - - -class PipelineConfig(BaseModel): - """ - Configuration for creating a Pipeline object - - Can be used to create a Pipeline from a config object or file with - Pipeline.from_config(), or used as a building block for other configs - such as for deepsparse.server - """ - - task: str = Field( - description="name of task to create a pipeline for", - ) - model_path: str = Field( - description="path on local system or SparseZoo stub to load the model from", - ) - engine_type: str = Field( - default=DEEPSPARSE_ENGINE, - description=( - "inference engine to use. Currently supported values include " - "'deepsparse' and 'onnxruntime'. Default is 'deepsparse'" - ), - ) - batch_size: int = Field( - default=1, - description=("static batch size to use for inference. Default is 1"), - ) - num_cores: int = Field( - default=None, - description=( - "number of CPU cores to allocate for inference engine. None" - "specifies all available cores. Default is None" - ), - ) - scheduler: str = Field( - default="async", - description=( - "(deepsparse only) kind of scheduler to execute with. Defaults to async" - ), - ) - input_shapes: List[List[int]] = Field( - default=None, - description=( - "list of shapes to set ONNX the inputs to. Pass None to use model as-is. " - "Default is None" - ), - ) - alias: str = Field( - default=None, - description=( - "optional name to give this pipeline instance, useful when inferencing " - "with multiple models. Default is None" - ), - ) - kwargs: Dict[str, Any] = Field( - default={}, - description=( - "Additional arguments for inference with the model that will be passed " - "into the pipeline as kwargs" - ), - ) diff --git a/src/deepsparse/server/config.py b/src/deepsparse/server/config.py index 0d0be42ec0..7f9ac9bd59 100644 --- a/src/deepsparse/server/config.py +++ b/src/deepsparse/server/config.py @@ -19,18 +19,18 @@ import json import os from functools import lru_cache -from typing import List +from typing import Any, Dict, List import yaml from pydantic import BaseModel, Field -from deepsparse import PipelineConfig from deepsparse.cpu import cpu_architecture __all__ = [ "ENV_DEEPSPARSE_SERVER_CONFIG", "ENV_SINGLE_PREFIX", + "ServeModelConfig", "ServerConfig", ] @@ -39,15 +39,75 @@ ENV_SINGLE_PREFIX = "DEEPSPARSE_SINGLE_MODEL:" +class ServeModelConfig(BaseModel): + """ + Configuration for serving a model for a given task in the DeepSparse server + """ + + task: str = Field( + description=( + "The task the model_path is serving. For example, one of: " + "question_answering, text_classification, token_classification." + ), + ) + model_path: str = Field( + description=( + "The path to a model.onnx file, " + "a model folder containing the model.onnx and supporting files, " + "or a SparseZoo model stub." + ), + ) + batch_size: int = Field( + default=1, + description=( + "The batch size to instantiate the model with and use for serving" + ), + ) + alias: str = Field( + default=None, + description=( + "Alias name for model pipeline to be served. 
A convenience route of " + "/predict/alias will be added to the server if present. " + ), + ) + kwargs: Dict[str, Any] = Field( + default={}, + description=( + "Additional arguments for inference with the model that will be passed " + "into the pipeline as kwargs" + ), + ) + engine: str = Field( + default="deepsparse", + description=( + "The engine to use for serving the models such as deepsparse or onnxruntime" + ), + ) + num_cores: int = Field( + default=None, + description=( + "The number of physical cores to restrict the DeepSparse Engine to. " + "Defaults to all cores." + ), + ) + scheduler: str = Field( + default="async", + description=( + "The scheduler to use with the DeepSparse Engine such as sync or async. " + "Defaults to async" + ), + ) + + class ServerConfig(BaseModel): """ A configuration for serving models in the DeepSparse inference server """ - models: List[PipelineConfig] = Field( + models: List[ServeModelConfig] = Field( default=[], description=( - "The models to serve in the server defined by PipelineConfig objects" + "The models to serve in the server defined by the additional arguments" ), ) workers: str = Field( @@ -88,7 +148,7 @@ def server_config_from_env(env_key: str = ENV_DEEPSPARSE_SERVER_CONFIG): config_dict = json.loads(config_file.replace(ENV_SINGLE_PREFIX, "")) config = ServerConfig() config.models.append( - PipelineConfig( + ServeModelConfig( task=config_dict["task"], model_path=config_dict["model_path"], batch_size=config_dict["batch_size"], @@ -98,7 +158,7 @@ def server_config_from_env(env_key: str = ENV_DEEPSPARSE_SERVER_CONFIG): with open(config_file) as file: config_dict = yaml.safe_load(file.read()) config_dict["models"] = ( - [PipelineConfig(**model) for model in config_dict["models"]] + [ServeModelConfig(**model) for model in config_dict["models"]] if "models" in config_dict else [] ) diff --git a/src/deepsparse/server/main.py b/src/deepsparse/server/main.py index dc31f6427f..e8efead286 100644 --- a/src/deepsparse/server/main.py +++ b/src/deepsparse/server/main.py @@ -84,7 +84,6 @@ import click -from deepsparse import Pipeline from deepsparse.log import set_logging_level from deepsparse.server.asynchronous import execute_async, initialize_aysnc from deepsparse.server.config import ( @@ -92,6 +91,7 @@ server_config_from_env, server_config_to_env, ) +from deepsparse.server.pipelines import load_pipelines_definitions from deepsparse.server.utils import serializable_response from deepsparse.version import version @@ -130,11 +130,7 @@ def _home(): def _add_pipeline_route( - app, - pipeline: Pipeline, - num_models: int, - defined_tasks: set, - integration: str, + app, pipeline_def, num_models: int, defined_tasks: set, integration: str ): path = "/predict" @@ -146,27 +142,26 @@ def _add_pipeline_route( ) # required path name for Sagemaker path = "/invocations" - elif pipeline.alias: - path = f"/predict/{pipeline.alias}" + elif pipeline_def.config.alias: + path = f"/predict/{pipeline_def.config.alias}" elif num_models > 1: - if pipeline.task in defined_tasks: + if pipeline_def.config.task in defined_tasks: raise ValueError( - f"Multiple tasks defined for {pipeline.task} and no alias " - f"given for pipeline with model {pipeline.model_path_orig}. " + f"Multiple tasks defined for {pipeline_def.config.task} and no alias " + f"given for {pipeline_def.config}. 
" "Either define an alias or supply a single model for the task" ) - path = f"/predict/{pipeline.task}" - defined_tasks.add(pipeline.task) + path = f"/predict/{pipeline_def.config.task}" + defined_tasks.add(pipeline_def.config.task) @app.post( path, - response_model=pipeline.output_schema, + response_model=pipeline_def.response_model, tags=["prediction"], ) - async def _predict_func(request: pipeline.input_schema): + async def _predict_func(request: pipeline_def.request_model): results = await execute_async( - pipeline, - request, + pipeline_def.pipeline, **vars(request), **pipeline_def.kwargs ) return serializable_response(results) @@ -188,12 +183,15 @@ def server_app_factory(): _LOGGER.debug("loaded server config %s", config) _add_general_routes(app, config) - pipelines = [Pipeline.from_config(model_config) for model_config in config.models] - _LOGGER.debug("loaded pipeline definitions from config %s", pipelines) + pipeline_defs = load_pipelines_definitions(config) + _LOGGER.debug("loaded pipeline definitions from config %s", pipeline_defs) num_tasks = len(config.models) defined_tasks = set() - for pipeline in pipelines: - _add_pipeline_route(app, pipeline, num_tasks, defined_tasks, config.integration) + + for pipeline_def in pipeline_defs: + _add_pipeline_route( + app, pipeline_def, num_tasks, defined_tasks, config.integration + ) return app diff --git a/src/deepsparse/server/pipelines.py b/src/deepsparse/server/pipelines.py new file mode 100644 index 0000000000..ef07c68ca2 --- /dev/null +++ b/src/deepsparse/server/pipelines.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Pipelines that run preprocessing, postprocessing, and model inference +within the DeepSparse model server. +""" + +from typing import Any, Dict, List + +from pydantic import BaseModel, Field + +from deepsparse.server.config import ServeModelConfig, ServerConfig +from deepsparse.tasks import SupportedTasks + + +__all__ = ["PipelineDefinition", "load_pipelines_definitions"] + + +class PipelineDefinition(BaseModel): + """ + A definition of a pipeline to be served by the model server. + Used to create a prediction route on construction of the server app. + """ + + pipeline: Any = Field(description="the callable pipeline to invoke on each request") + request_model: Any = Field( + description="the pydantic model to validate the request body with" + ) + response_model: Any = Field( + description="the pydantic model to validate the response payload with" + ) + kwargs: Dict[str, Any] = Field( + description="any additional kwargs that should be passed into the pipeline" + ) + config: ServeModelConfig = Field( + description="the config for the model the pipeline is serving" + ) + + +def load_pipelines_definitions(config: ServerConfig) -> List[PipelineDefinition]: + """ + Load the pipeline definitions to use for creating prediction routes from + the given server configuration. 
+ + :param config: the configuration to load pipeline definitions for + :return: the loaded pipeline definitions to use for serving inference requests + """ + defs = [] + + for model_config in config.models: + if SupportedTasks.is_nlp(model_config.task): + # dynamically import so we don't install dependencies when unneeded + from deepsparse.transformers.server import create_pipeline_definitions + + ( + pipeline, + request_model, + response_model, + kwargs, + ) = create_pipeline_definitions(model_config) + else: + raise ValueError( + f"unsupported task given of {model_config.task} " + f"for serve model config {model_config}" + ) + + defs.append( + PipelineDefinition( + pipeline=pipeline, + request_model=request_model, + response_model=response_model, + kwargs=kwargs, + config=model_config, + ) + ) + + return defs diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py index 690de5276e..6ffaad7ec3 100644 --- a/src/deepsparse/tasks.py +++ b/src/deepsparse/tasks.py @@ -78,32 +78,6 @@ class SupportedTasks: token_classification=AliasedTask("token_classification", ["ner"]), ) - image_classification = namedtuple("image_classification", ["image_classification"])( - image_classification=AliasedTask( - "image_classification", - ["image_classification"], - ), - ) - - yolo = namedtuple("yolo", ["yolo"])( - yolo=AliasedTask("yolo", ["yolo"]), - ) - - @classmethod - def check_register_task(cls, task: str): - if cls.is_nlp(task): - # trigger transformers pipelines to register with Pipeline.register - import deepsparse.transformers.pipelines # noqa: F401 - - elif cls.is_image_classification(task): - # trigger image classification pipelines to - # register with Pipeline.register - import deepsparse.image_classification.pipelines # noqa: F401 - - elif cls.is_yolo(task): - # trigger yolo pipelines to register with Pipeline.register - import deepsparse.yolo.pipelines # noqa: F401 - @classmethod def is_nlp(cls, task: str) -> bool: """ @@ -116,21 +90,3 @@ def is_nlp(cls, task: str) -> bool: or cls.nlp.text_classification.matches(task) or cls.nlp.token_classification.matches(task) ) - - @classmethod - def is_image_classification(cls, task: str) -> bool: - """ - :param task: the name of the task to check whether it is an image - classification task - :return: True if it is an image classification task, False otherwise - """ - return cls.image_classification.image_classification.matches(task) - - @classmethod - def is_yolo(cls, task: str) -> bool: - """ - :param task: the name of the task to check whether it is an image - segmentation task using YOLO - :return: True if it is an segmentation task using YOLO, False otherwise - """ - return cls.yolo.yolo.matches(task) diff --git a/src/deepsparse/transformers/__init__.py b/src/deepsparse/transformers/__init__.py index 1264aa316d..89c7eb68ef 100644 --- a/src/deepsparse/transformers/__init__.py +++ b/src/deepsparse/transformers/__init__.py @@ -120,3 +120,4 @@ def _check_transformers_install(): from .helpers import * from .loaders import * from .pipelines import * +from .server import * diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index c01a649fbf..b434dec625 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -58,7 +58,7 @@ from tqdm.auto import tqdm -from deepsparse import Pipeline +from deepsparse.transformers import pipeline from datasets import load_dataset, load_metric # isort: skip @@ -79,14 +79,14 @@ def squad_eval(args): 
squad_metrics = load_metric("squad") # load QA pipeline - question_answer = Pipeline.create( + question_answer = pipeline( task="question-answering", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - sequence_length=args.max_sequence_length, + max_length=args.max_sequence_length, ) - print(f"Engine info: {question_answer.engine}") + print(f"Engine info: {question_answer.model}") for idx, sample in enumerate(tqdm(squad)): pred = question_answer( @@ -96,7 +96,7 @@ def squad_eval(args): ) squad_metrics.add_batch( - predictions=[{"prediction_text": pred.answer, "id": sample["id"]}], + predictions=[{"prediction_text": pred["answer"], "id": sample["id"]}], references=[{"answers": sample["answers"], "id": sample["id"]}], ) @@ -114,23 +114,21 @@ def mnli_eval(args): mnli_metrics = load_metric("glue", "mnli") # load pipeline - text_classify = Pipeline.create( + text_classify = pipeline( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - sequence_length=args.max_sequence_length, + max_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.engine}") - - label_map = {"entailment": 0, "neutral": 1, "contradiction": 2} + print(f"Engine info: {text_classify.model}") label_map = {"entailment": 0, "neutral": 1, "contradiction": 2} for idx, sample in enumerate(tqdm(mnli_matched)): pred = text_classify([[sample["premise"], sample["hypothesis"]]]) mnli_metrics.add_batch( - predictions=[label_map.get(pred.labels[0])], + predictions=[label_map.get(pred[0]["label"])], references=[sample["label"]], ) @@ -156,16 +154,14 @@ def qqp_eval(args): qqp_metrics = load_metric("glue", "qqp") # load pipeline - text_classify = Pipeline.create( + text_classify = pipeline( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - sequence_length=args.max_sequence_length, + max_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.engine}") - - label_map = {"not_duplicate": 0, "duplicate": 1} + print(f"Engine info: {text_classify.model}") label_map = {"not_duplicate": 0, "duplicate": 1} @@ -173,7 +169,7 @@ def qqp_eval(args): pred = text_classify([[sample["question1"], sample["question2"]]]) qqp_metrics.add_batch( - predictions=[label_map.get(pred.labels[0])], + predictions=[label_map.get(pred[0]["label"])], references=[sample["label"]], ) @@ -189,16 +185,14 @@ def sst2_eval(args): sst2_metrics = load_metric("glue", "sst2") # load pipeline - text_classify = Pipeline.create( + text_classify = pipeline( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - sequence_length=args.max_sequence_length, + max_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.engine}") - - label_map = {"negative": 0, "positive": 1} + print(f"Engine info: {text_classify.model}") label_map = {"negative": 0, "positive": 1} @@ -208,7 +202,7 @@ def sst2_eval(args): ) sst2_metrics.add_batch( - predictions=[label_map.get(pred.labels[0])], + predictions=[label_map.get(pred[0]["label"])], references=[sample["label"]], ) diff --git a/src/deepsparse/transformers/pipelines.py b/src/deepsparse/transformers/pipelines.py new file mode 100644 index 0000000000..7725a0e2c2 --- /dev/null +++ b/src/deepsparse/transformers/pipelines.py @@ -0,0 +1,1414 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Adaptation of transformers.pipelines and onnx_transformers.pipelines + +adapted from: +https://github.com/huggingface/transformers/blob/master/src/transformers/pipelines/base.py +https://github.com/patil-suraj/onnx_transformers/blob/master/onnx_transformers/pipelines.py + +""" + +import json +from abc import ABC, abstractmethod +from dataclasses import dataclass +from itertools import chain +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union + +import numpy as np +from transformers.configuration_utils import PretrainedConfig +from transformers.data import ( + SquadExample, + SquadFeatures, + squad_convert_examples_to_features, +) +from transformers.file_utils import ExplicitEnum +from transformers.models.auto import AutoConfig, AutoTokenizer +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy +from transformers.utils import logging + +from deepsparse import Engine, compile_model, cpu +from deepsparse.transformers.helpers import ( + fix_numpy_types, + get_onnx_path_and_configs, + overwrite_transformer_onnx_model_inputs, +) +from deepsparse.transformers.loaders import get_batch_loader + + +try: + import onnxruntime + + ort_import_error = None +except Exception as ort_import_err: + onnxruntime = None + ort_import_error = ort_import_err + +__all__ = [ + "ArgumentHandler", + "Pipeline", + "TextClassificationPipeline", + "TokenClassificationPipeline", + "QuestionAnsweringPipeline", + "pipeline", + "overwrite_transformer_onnx_model_inputs", + "SUPPORTED_ENGINES", + "SUPPORTED_TASKS", +] + +logger = logging.get_logger(__name__) if logging else None + + +class ArgumentHandler(ABC): + """ + Base interface for handling arguments for each Pipeline. + """ + + @abstractmethod + def __call__(self, *args, **kwargs): + raise NotImplementedError() + + +class DefaultArgumentHandler(ArgumentHandler): + """ + Default argument parser handling parameters for each Pipeline`. + """ + + @staticmethod + def handle_kwargs(kwargs: Dict) -> List: + """ + :param kwargs: key word arguments for a pipeline + :return: list of the processed key word arguments + """ + if len(kwargs) == 1: + output = list(kwargs.values()) + else: + output = list(chain(kwargs.values())) + + return DefaultArgumentHandler.handle_args(output) + + @staticmethod + def handle_args(args: Sequence[Any]) -> List[str]: + """ + :param args: sequence of arguments to a pipeline + :return: list of formatted, processed arguments + """ + + # Only one argument, let's do case by case + if len(args) == 1: + if isinstance(args[0], str): + return [args[0]] + elif not isinstance(args[0], list): + return list(args) + else: + return args[0] + + # Multiple arguments (x1, x2, ...) 
+ elif len(args) > 1: + if all([isinstance(arg, str) for arg in args]): + return list(args) + + # If not instance of list, then it should be an instance of iterable + elif isinstance(args, Iterable): + return list(chain.from_iterable(chain(args))) + else: + raise ValueError( + f"Invalid input type {type(args)}. Pipeline supports " + "Union[str, Iterable[str]]" + ) + else: + return [] + + def __call__(self, *args, **kwargs): + if len(kwargs) > 0 and len(args) > 0: + raise ValueError("Pipeline cannot handle mixed args and kwargs") + + if len(kwargs) > 0: + return DefaultArgumentHandler.handle_kwargs(kwargs) + else: + return DefaultArgumentHandler.handle_args(args) + + +class _ScikitCompat(ABC): + """ + Interface layer for the Scikit and Keras compatibility. + """ + + @abstractmethod + def transform(self, X): + raise NotImplementedError() + + @abstractmethod + def predict(self, X): + raise NotImplementedError() + + +class Pipeline(_ScikitCompat): + """ + The Pipeline class is the class from which all pipelines inherit. + Refer to this class for methods shared across different pipelines. + This base Pipeline class provides support for multiple inference engine backends. + + Base class implementing pipelined operations. + Pipeline workflow is defined as a sequence of the following operations: + + Input -> Tokenization -> Model Inference -> + Post-Processing (task dependent) -> Output + + Pipeline supports running with the DeepSparse engine or onnxruntime. + + :param model: loaded inference engine to run the model with, can be a + deepsparse Engine or onnxruntime InferenceSession + :param tokenizer: tokenizer to be used for preprocessing + :param config: transformers model config for this model + :param engine_type: name of inference engine that is used. Options are + deepsparse and onnxruntime + :param max_length: maximum sequence length to set for model inputs by default. + default value is 128 + :param input_names: list of input names to the neural network + :param args_parser: Reference to the object in charge of parsing supplied + pipeline parameters. A default is provided if None + :param binary_output: if True, stores outputs as pickled binaries to avoid + storing large amount of textual data. Default is False + """ + + default_input_names = None + + def __init__( + self, + model: Union[Engine, "onnxruntime.InferenceSession"], + tokenizer: PreTrainedTokenizer, + config: PretrainedConfig, + engine_type: str, + max_length: int = 128, + input_names: Optional[List[str]] = None, + args_parser: ArgumentHandler = None, + binary_output: bool = False, + ): + + self.model = model + self.tokenizer = tokenizer + self.config = config + self.engine_type = engine_type + self.max_length = max_length + self.input_names = input_names + self.binary_output = binary_output + self._args_parser = args_parser or DefaultArgumentHandler() + self._framework = ( + "np" if self.engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE] else "pt" + ) + + def transform(self, X): + """ + Scikit / Keras interface to transformers' pipelines. + This method will forward to __call__(). + """ + return self(X=X) + + def predict(self, X): + """ + Scikit / Keras interface to transformers' pipelines. + This method will forward to __call__(). 
+ """ + return self(X=X) + + def _parse_and_tokenize( + self, *args, padding=True, add_special_tokens=True, **kwargs + ): + # Parse arguments + inputs = self._args_parser(*args, **kwargs) + inputs = self.tokenizer( + inputs, + add_special_tokens=add_special_tokens, + return_tensors=self._framework, + padding=PaddingStrategy.MAX_LENGTH.value, + truncation=TruncationStrategy.LONGEST_FIRST.value, + ) + + return inputs + + def __call__(self, *args, **kwargs): + inputs = self._parse_and_tokenize(*args, **kwargs) + return self._forward(inputs) + + def _forward(self, inputs): + if not all(name in inputs for name in self.input_names): + raise ValueError( + f"pipeline expected arrays with names {self.input_names}, received " + f"inputs: {list(inputs.keys())}" + ) + + if self.engine_type == ORT_ENGINE: + inputs = {k: v for k, v in inputs.items() if k in self.input_names} + return self.model.run(None, inputs) + elif self.engine_type == DEEPSPARSE_ENGINE: + return self.model.run([inputs[name] for name in self.input_names]) + # TODO: torch + # with self.device_placement(): + # with torch.no_grad(): + # inputs = self.ensure_tensor_on_device(**inputs) + # predictions = self.model(**inputs)[0].cpu() + # if return_tensors: + # return predictions + # else: + # return predictions.numpy() + + +class TokenClassificationArgumentHandler(ArgumentHandler): + """ + Handles arguments for token classification. + """ + + def __call__(self, inputs: Union[str, List[str]], **kwargs): + + if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: + inputs = list(inputs) + batch_size = len(inputs) + elif isinstance(inputs, str): + inputs = [inputs] + batch_size = 1 + else: + raise ValueError("At least one input is required.") + + offset_mapping = kwargs.get("offset_mapping") + if offset_mapping: + if isinstance(offset_mapping, list) and isinstance( + offset_mapping[0], tuple + ): + offset_mapping = [offset_mapping] + if len(offset_mapping) != batch_size: + raise ValueError( + "offset_mapping should have the same batch size as the input" + ) + return inputs, offset_mapping + + +class QuestionAnsweringArgumentHandler(ArgumentHandler): + """ + QuestionAnsweringPipeline requires the user to provide multiple arguments + (i.e. 
question & context) to be mapped + to internal `transformers.SquadExample` + + QuestionAnsweringArgumentHandler manages all the possible to create a + `transformers.SquadExample` from the command-line supplied arguments + """ + + def __call__(self, *args, **kwargs): + # Position args, handling is sensibly the same as X and data, + # so forwarding to avoid duplicating + if args is not None and len(args) > 0: + if len(args) == 1: + kwargs["X"] = args[0] + else: + kwargs["X"] = list(args) + + # Generic compatibility with sklearn and Keras + # Batched data + if "X" in kwargs or "data" in kwargs: + inputs = kwargs["X"] if "X" in kwargs else kwargs["data"] + + if isinstance(inputs, dict): + inputs = [inputs] + else: + # Copy to avoid overriding arguments + inputs = [i for i in inputs] + + for i, item in enumerate(inputs): + if isinstance(item, dict): + if any(k not in item for k in ["question", "context"]): + raise KeyError( + "You need to provide a dictionary with keys " + "{question:..., context:...}" + ) + + inputs[i] = QuestionAnsweringPipeline.create_sample(**item) + + elif not isinstance(item, SquadExample): + arg_name = "X" if "X" in kwargs else "data" + raise ValueError( + f"{arg_name} argument needs to be of type " + "(list[SquadExample | dict], SquadExample, dict)" + ) + + # Tabular input + elif "question" in kwargs and "context" in kwargs: + if isinstance(kwargs["question"], str): + kwargs["question"] = [kwargs["question"]] + + if isinstance(kwargs["context"], str): + kwargs["context"] = [kwargs["context"]] + + inputs = [ + QuestionAnsweringPipeline.create_sample(q, c) + for q, c in zip(kwargs["question"], kwargs["context"]) + ] + else: + raise ValueError(f"Unknown arguments {kwargs}") + + if not isinstance(inputs, list): + inputs = [inputs] + + return inputs + + +class TextClassificationPipeline(Pipeline): + """ + Text classification pipeline using any `ModelForSequenceClassification`. + + This text classification pipeline can currently be loaded from `pipeline()` + using the following task identifier: `"text-classification"`. + + The models that this pipeline can use are models that have been fine-tuned on + a text classification task. + + :param return_all_scores: set True to return all model scores. Default False + """ + + def __init__(self, return_all_scores: bool = False, **kwargs): + super().__init__(**kwargs) + + self.return_all_scores = return_all_scores + + def __call__(self, *args, **kwargs): + """ + Classify the text(s) given as inputs. + + :param args: One or several texts (or one list of prompts) to classify + :param args: kwargs for inner call function + :return: A list or a list of list of dicts: Each result comes as list of dicts + with the following keys: + - `label` -- The label predicted. + - `score` -- The corresponding probability. 
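A minimal sketch of the input forms the question-answering argument handler above normalizes into `transformers.SquadExample` objects (hypothetical strings, import path assumed):

```python
# Illustrative sketch only; import path assumed.
from deepsparse.transformers.pipelines import QuestionAnsweringArgumentHandler

handler = QuestionAnsweringArgumentHandler()

# keyword form: question/context strings are zipped into SquadExample objects
examples = handler(
    question="Who wrote the report?",
    context="The report was written by Jane.",
)

# dict form: a single {"question": ..., "context": ...} dict, or a list of them
examples = handler(
    {"question": "Who wrote the report?", "context": "The report was written by Jane."}
)

# either way, a list of transformers.SquadExample instances is returned
```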
+ If ``self.return_all_scores=True``, one dictionary is returned per label + """ + outputs = super().__call__(*args, **kwargs) + + if isinstance(outputs, list) and outputs: + outputs = outputs[0] + + if self.config.num_labels == 1: + scores = 1.0 / (1.0 + np.exp(-outputs)) + else: + scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) + if self.return_all_scores: + return [ + [ + {"label": self.config.id2label[i], "score": score.item()} + for i, score in enumerate(item) + ] + for item in scores + ] + else: + return [ + { + "label": self.config.id2label[item.argmax()], + "score": item.max().item(), + } + for item in scores + ] + + +class AggregationStrategy(ExplicitEnum): + """ + All the valid aggregation strategies for TokenClassificationPipeline + """ + + NONE = "none" + SIMPLE = "simple" + FIRST = "first" + AVERAGE = "average" + MAX = "max" + + +class TokenClassificationPipeline(Pipeline): + """ + Named Entity Recognition pipeline using any `ModelForTokenClassification`. + + This token classification pipeline can currently be loaded from `pipeline()` + using the following task identifier: `"token-classification"`. + + The models that this pipeline can use are models that have been fine-tuned on + a token classification task. + + :param args_parser: argument parser to use default is + TokenClassificationArgumentHandler + :param aggregation_strategy: AggregationStrategy Enum object to determine + the pipeline aggregation strategy. Default is AggregationStrategy.NONE + :param ignore_labels: list of labels to ignore. Default is `["O"]` + """ + + default_input_names = "sequences" + + def __init__( + self, + args_parser: ArgumentHandler = None, + aggregation_strategy: AggregationStrategy = AggregationStrategy.NONE, + ignore_labels: List[str] = False, + **kwargs, + ): + super().__init__( + args_parser=args_parser or TokenClassificationArgumentHandler(), + **kwargs, + ) + + self.ignore_labels = ignore_labels or ["O"] + + if isinstance(aggregation_strategy, str): + aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()] + + if ( + aggregation_strategy + in { + AggregationStrategy.FIRST, + AggregationStrategy.MAX, + AggregationStrategy.AVERAGE, + } + and not self.tokenizer.is_fast + ): + raise ValueError( + "Slow tokenizers cannot handle subwords. Please set the " + '`aggregation_strategy` option to `"simple"` or use a fast tokenizer.' + ) + + self.aggregation_strategy = aggregation_strategy + + def __call__(self, inputs: Union[str, List[str]], **kwargs): + """ + Classify each token of the text(s) given as inputs. + + + :param inputs: One or several texts (or one list of texts) for token + classification + :return: A list or a list of list of :obj:`dict`: Each result comes as a list + of dictionaries (one for each token in the corresponding input, or each + entity if this pipeline was instantiated with an aggregation_strategy) + with the following keys: + - `word` -- The token/word classified. + - `score` -- The corresponding probability for `entity`. + - `entity` -- The entity predicted for that token/word (it is named + `entity_group` when `aggregation_strategy` is not `"none"`. + - `index` -- The index of the corresponding token in the sentence. + - `start` -- index of the start of the corresponding entity in the sentence + Only exists if the offsets are available within the tokenizer + - `end` -- The index of the end of the corresponding entity in the sentence. 
+ Only exists if the offsets are available within the tokenizer + """ + + _inputs, offset_mappings = self._args_parser(inputs, **kwargs) + + answers = [] + + tokens = self.tokenizer( + _inputs, + return_tensors=self._framework, + truncation=TruncationStrategy.LONGEST_FIRST.value, + padding=PaddingStrategy.MAX_LENGTH.value, + return_special_tokens_mask=True, + return_offsets_mapping=self.tokenizer.is_fast, + ) + + if self.tokenizer.is_fast: + offset_mapping = tokens.pop("offset_mapping") + elif not offset_mappings: + offset_mapping = [None] * len(_inputs) + + special_tokens_mask = tokens.pop("special_tokens_mask") + + # Forward + _forward_pass = self._forward(tokens) + for entities_index, current_entities in enumerate(_forward_pass[0]): + input_ids = tokens["input_ids"][entities_index] + + scores = np.exp(current_entities) / np.exp(current_entities).sum( + -1, keepdims=True + ) + pre_entities = self.gather_pre_entities( + _inputs[entities_index], + input_ids, + scores, + offset_mapping[entities_index], + special_tokens_mask[entities_index], + ) + grouped_entities = self.aggregate(pre_entities, self.aggregation_strategy) + # Filter anything that is in self.ignore_labels + current_entities = [ + entity + for entity in grouped_entities + if entity.get("entity", None) not in self.ignore_labels + and entity.get("entity_group", None) not in self.ignore_labels + ] + answers.append(current_entities) + + if len(answers) == 1: + return answers[0] + return answers + + def gather_pre_entities( + self, + sentence: str, + input_ids: np.ndarray, + scores: np.ndarray, + offset_mapping: Optional[List[Tuple[int, int]]], + special_tokens_mask: np.ndarray, + ) -> List[dict]: + pre_entities = [] + for idx, token_scores in enumerate(scores): + # Filter special_tokens, they should only occur + # at the sentence boundaries since we're not encoding pairs of + # sentences so we don't have to keep track of those. 
+ if special_tokens_mask[idx]: + continue + + word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) + if offset_mapping is not None: + start_ind, end_ind = offset_mapping[idx] + word_ref = sentence[start_ind:end_ind] + is_subword = len(word_ref) != len(word) + + if int(input_ids[idx]) == self.tokenizer.unk_token_id: + word = word_ref + is_subword = False + else: + start_ind = None + end_ind = None + is_subword = False + + pre_entity = { + "word": word, + "scores": token_scores, + "start": start_ind, + "end": end_ind, + "index": idx, + "is_subword": is_subword, + } + pre_entities.append(pre_entity) + return pre_entities + + def aggregate( + self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy + ) -> List[dict]: + if aggregation_strategy in { + AggregationStrategy.NONE, + AggregationStrategy.SIMPLE, + }: + entities = [] + for pre_entity in pre_entities: + entity_idx = pre_entity["scores"].argmax() + score = pre_entity["scores"][entity_idx] + entity = { + "entity": self.config.id2label[entity_idx], + "score": score, + "index": pre_entity["index"], + "word": pre_entity["word"], + "start": pre_entity["start"], + "end": pre_entity["end"], + } + entities.append(entity) + else: + entities = self.aggregate_words(pre_entities, aggregation_strategy) + + if aggregation_strategy == AggregationStrategy.NONE: + return entities + return self.group_entities(entities) + + def aggregate_word( + self, entities: List[dict], aggregation_strategy: AggregationStrategy + ) -> dict: + word = self.tokenizer.convert_tokens_to_string( + [entity["word"] for entity in entities] + ) + if aggregation_strategy == AggregationStrategy.FIRST: + scores = entities[0]["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.config.id2label[idx] + elif aggregation_strategy == AggregationStrategy.MAX: + max_entity = max(entities, key=lambda entity: entity["scores"].max()) + scores = max_entity["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.config.id2label[idx] + elif aggregation_strategy == AggregationStrategy.AVERAGE: + scores = np.stack([entity["scores"] for entity in entities]) + average_scores = np.nanmean(scores, axis=0) + entity_idx = average_scores.argmax() + entity = self.config.id2label[entity_idx] + score = average_scores[entity_idx] + else: + raise ValueError("Invalid aggregation_strategy") + new_entity = { + "entity": entity, + "score": score, + "word": word, + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return new_entity + + def aggregate_words( + self, entities: List[dict], aggregation_strategy: AggregationStrategy + ) -> List[dict]: + assert aggregation_strategy not in { + AggregationStrategy.NONE, + AggregationStrategy.SIMPLE, + }, "NONE and SIMPLE strategies are invalid" + + word_entities = [] + word_group = None + for entity in entities: + if word_group is None: + word_group = [entity] + elif entity["is_subword"]: + word_group.append(entity) + else: + word_entities.append( + self.aggregate_word(word_group, aggregation_strategy) + ) + word_group = [entity] + # Last item + word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) + return word_entities + + def group_sub_entities(self, entities: List[dict]) -> dict: + # Get the first entity in the entity group + entity = entities[0]["entity"].split("-")[-1] + scores = np.nanmean([entity["score"] for entity in entities]) + tokens = [entity["word"] for entity in entities] + + entity_group = { + "entity_group": entity, + "score": np.mean(scores), + "word": 
self.tokenizer.convert_tokens_to_string(tokens), + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return entity_group + + def get_tag(self, entity_name: str) -> Tuple[str, str]: + if entity_name.startswith("B-"): + bi = "B" + tag = entity_name[2:] + elif entity_name.startswith("I-"): + bi = "I" + tag = entity_name[2:] + else: + # It's not in B-, I- format + bi = "B" + tag = entity_name + return bi, tag + + def group_entities(self, entities: List[dict]) -> List[dict]: + + entity_groups = [] + entity_group_disagg = [] + + for entity in entities: + if not entity_group_disagg: + entity_group_disagg.append(entity) + continue + + # If the current entity is similar and adjacent to the previous entity, + # append it to the disaggregated entity group + # The split is meant to account for the "B" and "I" prefixes + # Shouldn't merge if both entities are B-type + bi, tag = self.get_tag(entity["entity"]) + last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"]) + + if tag == last_tag and bi != "B": + # Modify subword type to be previous_type + entity_group_disagg.append(entity) + else: + # If the current entity is different from the previous entity + # aggregate the disaggregated entity group + entity_groups.append(self.group_sub_entities(entity_group_disagg)) + entity_group_disagg = [entity] + if entity_group_disagg: + # it's the last entity, add it to the entity groups + entity_groups.append(self.group_sub_entities(entity_group_disagg)) + + return entity_groups + + +class QuestionAnsweringPipeline(Pipeline): + """ + Question Answering pipeline using any `ModelForQuestionAnswering` + + This question answering pipeline can currently be loaded from `pipeline()` + using the following task identifier: `"question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on + a question answering task. + + :param model: loaded inference engine to run the model with, can be a + deepsparse Engine or onnxruntime InferenceSession + :param tokenizer: tokenizer to be used for preprocessing + :param config: transformers model config for this model + :param engine_type: name of inference engine that is used. Options are + deepsparse and onnxruntime + :param input_names: list of input names to the neural network + :param args_parser: Reference to the object in charge of parsing supplied + pipeline parameters. A default is provided if None + :param binary_output: if True, stores outputs as pickled binaries to avoid + storing large amount of textual data. 
Default is False + """ + + default_input_names = "question,context" + + def __init__( + self, + model: Union[Engine, "onnxruntime.InferenceSession"], + tokenizer: PreTrainedTokenizer, + engine_type: str, + input_names: Optional[List[str]] = None, + **kwargs, + ): + super().__init__( + model=model, + tokenizer=tokenizer, + engine_type=engine_type, + args_parser=QuestionAnsweringArgumentHandler(), + input_names=input_names, + **kwargs, + ) + + @staticmethod + def create_sample( + question: Union[str, List[str]], context: Union[str, List[str]] + ) -> Union[SquadExample, List[SquadExample]]: + """ + :param question: single question or list of question strings + :param context: single context or list of context strings + :return: processed SquadExample object(s) for each question/context pair given + """ + if isinstance(question, list): + return [ + SquadExample(None, q, c, None, None, None) + for q, c in zip(question, context) + ] + else: + return SquadExample(None, question, context, None, None, None) + + def __call__(self, *args, **kwargs): + """ + Answer the question(s) given as inputs by using the context(s). + Multiple arguments can be used to pass the context, question data + + :param args: SquadExample or list of them containing the question and context + :param X: SquadExample or list of them containing the question and context + :param data: SquadExample or list of them containing the question and context + :param question: single question or list of question strings + :param context: single context or list of context strings + :param topk: the number of answers to return. Will be chosen by + order of likelihood) + :param doc_stride: if the context is too long to fit with the question for the + model, it will be split in several chunks with some overlap. This argument + controls the size of that overlap + :param max_answer_len: maximum length of predicted answers (e.g., only + answers with a shorter length are considered) + :param max_seq_len: maximum length of the total sentence (context + question) + after tokenization. The context will be split in several chunks + (using the doc_stride) if needed + :param max_question_len: maximum length of the question after tokenization. + It will be truncated if needed + :param handle_impossible_answer: whether or not we accept impossible as an + answer + :param num_spans: maximum number of span to use as input from a long + context. Default is to stride the entire context string + :param preprocessed_inputs: if provided, preprocessing will be skipped in favor + of these inputs. 
Expected format is the output of self.preprocess; a tuple + of (examples, features_list) + :return: dict or list of dictionaries, each containing the following keys: + `"score"` - The probability associated to the answer + `"start"` - The start index of the answer + `"end"` - The end index of the answer + `"answer"` - The answer to the question + """ + # Set defaults values + kwargs.setdefault("topk", 1) + kwargs.setdefault("max_answer_len", 15) + kwargs.setdefault("handle_impossible_answer", False) + kwargs.setdefault("preprocessed_inputs", None) # (examples, features_list) + + if kwargs["topk"] < 1: + raise ValueError(f"topk parameter should be >= 1 (got {kwargs['topk']})") + + if kwargs["max_answer_len"] < 1: + raise ValueError( + "max_answer_len parameter should be >= 1 " + f"(got {kwargs['max_answer_len']})" + ) + + # run pre-processing if not provided + examples, features_list = kwargs["preprocessed_inputs"] or self.preprocess( + *args, **kwargs + ) + + # forward pass and post-processing + all_answers = [] + for features, example in zip(features_list, examples): + model_input_names = self.tokenizer.model_input_names + ["input_ids"] + fw_args = { + k: [feature.__dict__[k] for feature in features] + for k in model_input_names + } + + # Manage tensor allocation on correct device + fw_args = {k: np.array(v) for (k, v) in fw_args.items()} + start, end = self._forward(fw_args)[:2] + + # TODO: torch + # fw_args = {k: torch.tensor(v, device=self.device) + # for (k, v) in fw_args.items()} + # start, end = self.model(**fw_args)[:2] + # start, end = start.cpu().numpy(), end.cpu().numpy() + + min_null_score = 1000000 # large and positive + answers = [] + for (feature, start_, end_) in zip(features, start, end): + # Ensure padded tokens & question tokens cannot belong + undesired_tokens = ( + np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask + ) + + # Generate mask + undesired_tokens_mask = undesired_tokens == 0.0 + + # Make sure non-context indexes cannot contribute to the softmax + start_ = np.where(undesired_tokens_mask, -10000.0, start_) + end_ = np.where(undesired_tokens_mask, -10000.0, end_) + + # Normalize logits and spans to retrieve the answer + start_ = np.exp( + start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)) + ) + end_ = np.exp( + end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)) + ) + + if kwargs["handle_impossible_answer"]: + min_null_score = min(min_null_score, (start_[0] * end_[0]).item()) + + # Mask CLS + start_[0] = end_[0] = 0.0 + + starts, ends, scores = self.decode( + start_, end_, kwargs["topk"], kwargs["max_answer_len"] + ) + + if not self.tokenizer.is_fast: + char_to_word = np.array(example.char_to_word_offset) + answers += [ + { + "score": score.item(), + "start": np.where( + char_to_word == feature.token_to_orig_map[s] + )[0][0].item(), + "end": np.where( + char_to_word == feature.token_to_orig_map[e] + )[0][-1].item(), + "answer": " ".join( + example.doc_tokens[ + feature.token_to_orig_map[ + s + ] : feature.token_to_orig_map[e] + + 1 + ] + ), + } + for s, e, score in zip(starts, ends, scores) + ] + else: + question_first = bool(self.tokenizer.padding_side == "right") + + # Sometimes the max probability token is in the middle of a word so: + # we start by finding the right word containing the token with + # `token_to_word` then we convert this word in a character span + answers += [ + { + "score": score.item(), + "start": feature.encoding.word_to_chars( + feature.encoding.token_to_word(s), + sequence_index=1 if question_first else 
0, + )[0], + "end": feature.encoding.word_to_chars( + feature.encoding.token_to_word(e), + sequence_index=1 if question_first else 0, + )[1], + "answer": example.context_text[ + feature.encoding.word_to_chars( + feature.encoding.token_to_word(s), + sequence_index=1 if question_first else 0, + )[0] : feature.encoding.word_to_chars( + feature.encoding.token_to_word(e), + sequence_index=1 if question_first else 0, + )[ + 1 + ] + ], + } + for s, e, score in zip(starts, ends, scores) + ] + + if kwargs["handle_impossible_answer"]: + answers.append( + {"score": min_null_score, "start": 0, "end": 0, "answer": ""} + ) + + answers = sorted(answers, key=lambda x: x["score"], reverse=True)[ + : kwargs["topk"] + ] + all_answers += answers + + if len(all_answers) == 1: + return all_answers[0] + return all_answers + + def preprocess(self, *args, **kwargs) -> Tuple[Any, Any]: + """ + preprocess the given QA model inputs using squad_convert_examples_to_features + + :param args: SquadExample or list of them containing the question and context + :param X: SquadExample or list of them containing the question and context + :param data: SquadExample or list of them containing the question and context + :param question: single question or list of question strings + :param context: single context or list of context strings + :param doc_stride: if the context is too long to fit with the question for the + model, it will be split in several chunks with some overlap. This argument + controls the size of that overlap + :param max_seq_len: maximum length of the total sentence (context + question) + after tokenization. The context will be split in several chunks + (using the doc_stride) if needed + :param max_question_len: maximum length of the question after tokenization. + It will be truncated if needed + :param num_spans: maximum number of spans to use as input from a long + context. Default is to stride the entire context string + :return: tuple of SquadExample inputs and preprocessed features list + """ + kwargs.setdefault("doc_stride", 128) + kwargs.setdefault("max_seq_len", self.max_length) + kwargs.setdefault("max_question_len", 64) + kwargs.setdefault("num_spans", None) + + # Convert inputs to features + examples = self._args_parser(*args, **kwargs) + if not self.tokenizer.is_fast: + features_list = [ + squad_convert_examples_to_features( + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=kwargs["max_seq_len"], + doc_stride=kwargs["doc_stride"], + max_query_length=kwargs["max_question_len"], + padding_strategy=PaddingStrategy.MAX_LENGTH.value, + is_training=False, + tqdm_enabled=False, + ) + for example in examples + ] + else: + features_list = self._encode_features_fast(examples, **kwargs) + + if kwargs["num_spans"]: + features_list = [ + features[: kwargs["num_spans"]] for features in features_list + ] + + return examples, features_list + + def decode( + self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int + ) -> Tuple: + """ + :param start: Individual start probabilities for each token + :param end: Individual end probabilities for each token + :param topk: Indicates how many possible answer span(s) to extract from the + model output + :param max_answer_len: Maximum size of the answer to extract from the model + output + :return: probabilities for each span to be the actual answer. 
Will filter out + unwanted and impossible cases + """ + # Ensure we have batch axis + if start.ndim == 1: + start = start[None] + + if end.ndim == 1: + end = end[None] + + # Compute the score of each tuple(start, end) to be the real answer + outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) + + # Remove candidate with end < start and end - start > max_answer_len + candidates = np.tril(np.triu(outer), max_answer_len - 1) + + # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) + scores_flat = candidates.flatten() + if topk == 1: + idx_sort = [np.argmax(scores_flat)] + elif len(scores_flat) < topk: + idx_sort = np.argsort(-scores_flat) + else: + idx = np.argpartition(-scores_flat, topk)[0:topk] + idx_sort = idx[np.argsort(-scores_flat[idx])] + + start, end = np.unravel_index(idx_sort, candidates.shape)[1:] + return start, end, candidates[0, start, end] + + def span_to_answer( + self, text: str, start: int, end: int + ) -> Dict[str, Union[str, int]]: + """ + When decoding from token probabilities, this method maps token indexes to + actual word in the initial context. + + :param text: The actual context to extract the answer from + :param start: The answer starting token index + :param end: The answer end token index + :return: Dictionary containing the start, end, and answer + """ + words = [] + token_idx = char_start_idx = char_end_idx = chars_idx = 0 + + for i, word in enumerate(text.split(" ")): + token = self.tokenizer.tokenize(word) + + # Append words if they are in the span + if start <= token_idx <= end: + if token_idx == start: + char_start_idx = chars_idx + + if token_idx == end: + char_end_idx = chars_idx + len(word) + + words += [word] + + # Stop if we went over the end of the answer + if token_idx > end: + break + + # Append the subtokenization length to the running index + token_idx += len(token) + chars_idx += len(word) + 1 + + # Join text with spaces + return { + "answer": " ".join(words), + "start": max(0, char_start_idx), + "end": min(len(text), char_end_idx), + } + + def _encode_features_fast(self, examples: Any, **kwargs) -> List[SquadFeatures]: + features_list = [] + for example in examples: + # Define the side we want to truncate / pad and the text/pair sorting + question_first = bool(self.tokenizer.padding_side == "right") + + encoded_inputs = self.tokenizer( + text=example.question_text if question_first else example.context_text, + text_pair=( + example.context_text if question_first else example.question_text + ), + padding=PaddingStrategy.MAX_LENGTH.value, + truncation="only_second" if question_first else "only_first", + max_length=kwargs["max_seq_len"], + stride=kwargs["doc_stride"], + return_tensors="np", + return_token_type_ids=True, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + ) + + total_spans = len(encoded_inputs["input_ids"]) + + # p_mask: mask with 1 for token than cannot be in the answer + # We put 0 on the tokens from the context and 1 everywhere else + p_mask = np.asarray( + [ + [ + tok != 1 if question_first else 0 + for tok in encoded_inputs.sequence_ids(span_id) + ] + for span_id in range(total_spans) + ] + ) + + # keep the cls_token unmasked + if self.tokenizer.cls_token_id is not None: + cls_index = np.nonzero( + encoded_inputs["input_ids"] == self.tokenizer.cls_token_id + ) + p_mask[cls_index] = 0 + + features = [] + for span_idx in range(total_spans): + features.append( + SquadFeatures( + input_ids=encoded_inputs["input_ids"][span_idx], + 
attention_mask=encoded_inputs["attention_mask"][span_idx], + token_type_ids=encoded_inputs["token_type_ids"][span_idx], + p_mask=p_mask[span_idx].tolist(), + encoding=encoded_inputs[span_idx], + # the following values are unused for fast tokenizers + cls_index=None, + token_to_orig_map={}, + example_index=0, + unique_id=0, + paragraph_len=0, + token_is_max_context=0, + tokens=[], + start_position=0, + end_position=0, + is_impossible=False, + qas_id=None, + ) + ) + features_list.append(features) + return features_list + + +@dataclass +class TaskInfo: + """ + Information about an NLP task + + :param pipeline_constructor: reference to constructor for the given pipeline task + :param default model name: the transformers canonical name for the default model + :param base_stub: sparsezoo stub path for the base model for this task + :param default_pruned_stub: sparsezoo stub path for the default pruned model + for this task + :param default_quant_stub: sparsezoo stub path for the default quantized model + for this task + """ + + pipeline_constructor: Callable[[Any], Pipeline] + default_model_name: str + base_stub: Optional[str] = None + default_pruned_stub: Optional[str] = None + default_quant_stub: Optional[str] = None + + +# Register all the supported tasks here +SUPPORTED_TASKS = { + "ner": TaskInfo( + pipeline_constructor=TokenClassificationPipeline, + default_model_name="bert-base-uncased", + ), + "question-answering": TaskInfo( + pipeline_constructor=QuestionAnsweringPipeline, + default_model_name="bert-base-uncased", + base_stub=( + "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none" + ), + default_pruned_stub=( + "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/" + "pruned-aggressive_98" + ), + ), + "sentiment-analysis": TaskInfo( + pipeline_constructor=TextClassificationPipeline, + default_model_name="bert-base-uncased", + ), + "text-classification": TaskInfo( + pipeline_constructor=TextClassificationPipeline, + default_model_name="bert-base-uncased", + ), + "token-classification": TaskInfo( + pipeline_constructor=TokenClassificationPipeline, + default_model_name="bert-base-uncased", + ), +} + +DEEPSPARSE_ENGINE = "deepsparse" +ORT_ENGINE = "onnxruntime" + +SUPPORTED_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] + + +def pipeline( + task: str, + model_name: Optional[str] = None, + model_path: Optional[str] = None, + engine_type: str = DEEPSPARSE_ENGINE, + config: Optional[Union[str, PretrainedConfig]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + max_length: int = 128, + num_cores: Optional[int] = None, + scheduler: Optional[str] = None, + batch_size: Optional[int] = 1, + **kwargs, +) -> Pipeline: + """ + Utility factory method to build a Pipeline + + :param task: name of the task to define which pipeline to create. Currently + supported task - "question-answering" + :param model_name: canonical name of the hugging face model this model is based on + :param model_path: path to model directory containing `model.onnx`, `config.json`, + and `tokenizer.json` files, ONNX model file, or SparseZoo stub + :param engine_type: inference engine name to use. Supported options are 'deepsparse' + and 'onnxruntime' + :param config: huggingface model config, if none provided, default will be used + which will be from the model name or sparsezoo stub if given for model path + :param tokenizer: huggingface tokenizer, if none provided, default will be used + :param max_length: maximum sequence length of model inputs. 
default is 128 + :param num_cores: number of CPU cores to run engine with. Default is the maximum + available + :param scheduler: The scheduler to use for the engine. Can be None, single or multi. + :param batch_size: The batch_size to use for the pipeline. Defaults to 1 + Note: `question-answering` pipeline only supports a batch_size of 1. + :param kwargs: additional key word arguments for task specific pipeline constructor + :return: Pipeline object for the given taks and model + """ + + # Retrieve the task + if task not in SUPPORTED_TASKS: + raise KeyError( + f"Unknown task {task}, available tasks are {list(SUPPORTED_TASKS.keys())}" + ) + if engine_type not in SUPPORTED_ENGINES: + raise ValueError( + f"Unsupported engine {engine_type}, supported engines " + f"are {SUPPORTED_ENGINES}" + ) + if task == "question-answering" and batch_size != 1: + raise ValueError( + f"{task} pipeline only supports batch_size 1. " + f"Supplied batch_size = {batch_size}" + ) + task_info = SUPPORTED_TASKS[task] + + model_path = model_path or _get_default_model_path(task_info) + model_name = model_name or task_info.default_model_name + + onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs(model_path) + + # default the tokenizer and config to file in model directory or given model name + config = config or config_path or model_name + tokenizer = tokenizer or tokenizer_path or model_name + + # create model + model, input_names = _create_model( + onnx_path, + engine_type, + num_cores, + max_length, + scheduler=scheduler, + batch_size=batch_size, + ) + + # Instantiate tokenizer if needed + if isinstance(tokenizer, (str, tuple)): + if isinstance(tokenizer, tuple): + # For tuple we have (tokenizer name, {kwargs}) + tokenizer_kwargs = tokenizer[1] + tokenizer_kwargs["model_max_length"] = max_length + tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1]) + else: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer, model_max_length=max_length + ) + + # Instantiate config if needed + if config is not None and isinstance(config, str): + config = AutoConfig.from_pretrained(config, finetuning_task=task) + + return task_info.pipeline_constructor( + model=model, + tokenizer=tokenizer, + config=config, + engine_type=engine_type, + max_length=max_length, + input_names=input_names, + **kwargs, + ) + + +def _get_default_model_path(task_info: TaskInfo) -> str: + if cpu.cpu_vnni_compatible() and task_info.default_quant_stub: + return task_info.default_quant_stub + return task_info.default_pruned_stub or task_info.base_stub + + +def _create_model( + model_path: str, + engine_type: str, + num_cores: Optional[int], + max_length: int = 128, + scheduler: Optional[str] = None, + batch_size: int = 1, +) -> Tuple[Union[Engine, "onnxruntime.InferenceSession"], List[str]]: + onnx_path, input_names, _ = overwrite_transformer_onnx_model_inputs( + model_path, max_length=max_length + ) + + if engine_type == DEEPSPARSE_ENGINE: + model = compile_model( + onnx_path, + batch_size=batch_size, + num_cores=num_cores, + scheduler=scheduler, + ) + elif engine_type == ORT_ENGINE: + _validate_ort_import() + sess_options = onnxruntime.SessionOptions() + if num_cores is not None: + sess_options.intra_op_num_threads = num_cores + sess_options.log_severity_level = 3 + sess_options.graph_optimization_level = ( + onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + ) + + model = onnxruntime.InferenceSession(onnx_path, sess_options=sess_options) + + return model, input_names + + +def _validate_ort_import(): + if 
ort_import_error is not None: + raise ImportError( + "An exception occurred when importing onxxruntime. Please verify that " + "onnxruntime is installed in order to use the onnxruntime inference " + f"engine. \n\nException info: {ort_import_error}" + ) + + +def process_dataset( + pipeline_object: Callable, + data_path: str, + batch_size: int, + task: str, + output_path: str, +) -> None: + """ + :param pipeline_object: An instantiated pipeline Callable object + :param data_path: Path to input file, supports csv, json and text files + :param batch_size: batch_size to use for inference + :param task: The task pipeline is instantiated for + :param output_path: Path to a json file to output inference results to + """ + batch_loader = get_batch_loader( + data_file=data_path, + batch_size=batch_size, + task=task, + ) + # Wraps pipeline object to make numpy types serializable + pipeline_object = fix_numpy_types(pipeline_object) + with open(output_path, "a") as output_file: + for batch in batch_loader: + batch_output = pipeline_object(**batch) + json.dump(batch_output, output_file) + output_file.write("\n") diff --git a/src/deepsparse/transformers/pipelines/__init__.py b/src/deepsparse/transformers/pipelines/__init__.py deleted file mode 100644 index 9986181a2a..0000000000 --- a/src/deepsparse/transformers/pipelines/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa - -from .pipeline import * -from .question_answering import * -from .text_classification import * -from .token_classification import * diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py deleted file mode 100644 index 2fdcd27236..0000000000 --- a/src/deepsparse/transformers/pipelines/pipeline.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
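When the `onnxruntime` engine is selected, `_create_model` above configures an `InferenceSession` with the options shown in the diff; a standalone sketch of that setup, where the model path and thread count are placeholders rather than values from this change:

```python
# Standalone sketch of the onnxruntime session setup mirrored by _create_model;
# "model.onnx" and the thread count are placeholders.
import onnxruntime

sess_options = onnxruntime.SessionOptions()
sess_options.intra_op_num_threads = 4  # analogous to the num_cores argument
sess_options.log_severity_level = 3    # only surface errors
sess_options.graph_optimization_level = (
    onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
)

session = onnxruntime.InferenceSession("model.onnx", sess_options=sess_options)
# session.run(None, feed_dict) then takes a dict keyed by the model's input names
```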
- -""" -Base Pipeline class for transformers inference pipeline -""" - - -import warnings -from typing import Any, List, Mapping, Optional - -import numpy -from transformers.models.auto import AutoConfig, AutoTokenizer - -from deepsparse import Pipeline -from deepsparse.transformers.helpers import ( - get_onnx_path_and_configs, - overwrite_transformer_onnx_model_inputs, -) - - -__all__ = [ - "TransformersPipeline", - "pipeline", -] - - -class TransformersPipeline(Pipeline): - """ - Base deepsparse.Pipeline class for transformers model loading. This class handles - the parsing of deepsparse-transformers files and model inputs, supporting loading - from sparsezoo, a directory containing a model.onnx, tokenizer, and model config, - or just an ONNX file with the ability to load a tokenizer and model config from - a default huggingface-transformers model. - - Note, when implementing child tasks in deepsparse.transformers.pipelines, - in addition to registering task names with Pipeline.register, task names should - be added to the supported nlp tasks in deepsparse.tasks so they can be properly - imported at runtime. - - :param model_path: sparsezoo stub to a transformers model, an ONNX file, or - (preferred) a directory containing a model.onnx, tokenizer config, and model - config. If no tokenizer and/or model config(s) are found, then they will be - loaded from huggingface transformers using the `default_model_name` key - :param engine_type: inference engine to use. Currently supported values include - 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' - :param batch_size: static batch size to use for inference. Default is 1 - :param num_cores: number of CPU cores to allocate for inference engine. None - specifies all available cores. Default is None - :param scheduler: (deepsparse only) kind of scheduler to execute with. - Pass None for the default - :param input_shapes: list of shapes to set ONNX the inputs to. Pass None - to use model as-is. Default is None - :param alias: optional name to give this pipeline instance, useful when - inferencing with multiple models. Default is None - :param sequence_length: static sequence length to use for inference - :param default_model_name: huggingface transformers model name to use to - load a tokenizer and model config when none are provided in the `model_path`. - Default is 'bert-base-uncased' - """ - - def __init__( - self, - *, - sequence_length: int = 128, - default_model_name: str = "bert-base-uncased", - **kwargs, - ): - - self._sequence_length = sequence_length - self._default_model_name = default_model_name - - self.config = None - self.tokenizer = None - self.onnx_input_names = None - - self._temp_model_directory = None - - super().__init__(**kwargs) - - @property - def sequence_length(self) -> int: - """ - :return: static sequence length to use for inference - """ - return self._sequence_length - - @property - def default_model_name(self) -> str: - """ - :return: huggingface transformers model name to use to - load a tokenizer and model config when none are provided in the - `model_path` - """ - return self._default_model_name - - def setup_onnx_file_path(self) -> str: - """ - Parses ONNX, tokenizer, and config file paths from the given `model_path`. - Supports sparsezoo stubs. 
If a tokenizer and/or config file are not found, - they will be defaulted to the default_model_name in the transformers repo - - :return: file path to the processed ONNX file for the engine to compile - """ - onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( - self.model_path - ) - - # default config + tokenizer if necessary - config_path = config_path or self.default_model_name - tokenizer_path = tokenizer_path or self.default_model_name - - self.config = AutoConfig.from_pretrained( - config_path, finetuning_task=self.task if hasattr(self, "task") else None - ) - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_path, model_max_length=self.sequence_length - ) - - # overwrite onnx graph to given required input shape - ( - onnx_path, - self.onnx_input_names, - self._temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - onnx_path, max_length=self.sequence_length - ) - - return onnx_path - - def tokens_to_engine_input( - self, tokens: Mapping[Any, numpy.ndarray] - ) -> List[numpy.ndarray]: - """ - :param tokens: outputs of the pipeline tokenizer - :return: list of numpy arrays in expected order for model input - """ - if not all(name in tokens for name in self.onnx_input_names): - raise ValueError( - f"pipeline expected arrays with names {self.onnx_input_names}, " - f"received inputs: {list(tokens.keys())}" - ) - - return [tokens[name] for name in self.onnx_input_names] - - -def pipeline( - task: str, - model_name: Optional[str] = None, - model_path: Optional[str] = None, - engine_type: str = "deepsparse", - config: Optional[str] = None, - tokenizer: Optional[str] = None, - max_length: int = 128, - num_cores: Optional[int] = None, - scheduler: Optional[str] = None, - batch_size: Optional[int] = 1, - **kwargs, -) -> Pipeline: - """ - [DEPRECATED] - deepsparse.transformers.pipeline is deprecated to craete DeepSparse - pipelines for tranformers tasks use deepsparse.Pipeline.create(task, ...) - - Utility factory method to build a Pipeline - - :param task: name of the task to define which pipeline to create. Currently - supported task - "question-answering" - :param model_name: canonical name of the hugging face model this model is based on - :param model_path: path to model directory containing `model.onnx`, `config.json`, - and `tokenizer.json` files, ONNX model file, or SparseZoo stub - :param engine_type: inference engine name to use. Options are 'deepsparse' - and 'onnxruntime'. Default is 'deepsparse' - :param config: huggingface model config, if none provided, default will be used - which will be from the model name or sparsezoo stub if given for model path - :param tokenizer: huggingface tokenizer, if none provided, default will be used - :param max_length: maximum sequence length of model inputs. default is 128 - :param num_cores: number of CPU cores to run engine with. Default is the maximum - available - :param scheduler: The scheduler to use for the engine. Can be None, single or multi - :param batch_size: The batch_size to use for the pipeline. Defaults to 1 - Note: `question-answering` pipeline only supports a batch_size of 1. 
- :param kwargs: additional key word arguments for task specific pipeline constructor - :return: Pipeline object for the given taks and model - """ - warnings.warn( - "[DEPRECATED] - deepsparse.transformers.pipeline is deprecated to craete " - "DeepSparse pipelines for tranformers tasks use deepsparse.Pipeline.create()" - ) - - if config is not None or tokenizer is not None: - raise ValueError( - "Directly passing in a config or tokenizer to DeepSparse transformers " - "pipelines is no longer supported. config and tokenizer objects should " - "be specified by including config.json and tokenizer.json files in the " - "model directory respectively" - ) - - return Pipeline.create( - task=task, - model_path=model_path, - engine_type=engine_type, - batch_size=batch_size, - num_cores=num_cores, - scheduler=scheduler, - sequence_length=max_length, - default_model_name=model_name, - **kwargs, - ) diff --git a/src/deepsparse/transformers/pipelines/question_answering.py b/src/deepsparse/transformers/pipelines/question_answering.py deleted file mode 100644 index ba57117dad..0000000000 --- a/src/deepsparse/transformers/pipelines/question_answering.py +++ /dev/null @@ -1,409 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# postprocessing adapted from huggingface/transformers - -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Pipeline implementation and pydantic models for question answering transformers -tasks -""" - - -from typing import Any, Dict, List, Tuple, Type - -import numpy -from pydantic import BaseModel, Field -from transformers.data import ( - SquadExample, - SquadFeatures, - squad_convert_examples_to_features, -) -from transformers.tokenization_utils_base import PaddingStrategy - -from deepsparse import Pipeline -from deepsparse.transformers.pipelines import TransformersPipeline - - -__all__ = [ - "QuestionAnsweringInput", - "QuestionAnsweringOutput", - "QuestionAnsweringPipeline", -] - - -class QuestionAnsweringInput(BaseModel): - """ - Schema for inputs to question_answering pipelines - """ - - question: str = Field(description="String question to be answered") - context: str = Field(description="String representing context for answer") - - -class QuestionAnsweringOutput(BaseModel): - """ - Schema for question_answering pipeline output. 
Values are in batch order - """ - - score: float = Field(description="confidence score for prediction") - answer: str = Field(description="predicted answer") - start: int = Field(description="start index of the answer") - end: int = Field(description="end index of the answer") - - -@Pipeline.register( - task="question_answering", - task_aliases=["qa"], - default_model_path=( - "zoo:nlp/question_answering/bert-base/pytorch/huggingface/" - "squad/12layer_pruned80_quant-none-vnni" - ), -) -class QuestionAnsweringPipeline(TransformersPipeline): - """ - transformers question_answering pipeline - - example instantiation: - ```python - question_answering = Pipeline.create( - task="question_answering", - model_path="question_answering_model_dir/", - ) - ``` - - :param model_path: sparsezoo stub to a transformers model, an ONNX file, or - (preferred) a directory containing a model.onnx, tokenizer config, and model - config. If no tokenizer and/or model config(s) are found, then they will be - loaded from huggingface transformers using the `default_model_name` key - :param engine_type: inference engine to use. Currently supported values include - 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' - :param batch_size: static batch size to use for inference. Default is 1 - :param num_cores: number of CPU cores to allocate for inference engine. None - specifies all available cores. Default is None - :param scheduler: (deepsparse only) kind of scheduler to execute with. - Pass None for the default - :param input_shapes: list of shapes to set ONNX the inputs to. Pass None - to use model as-is. Default is None - :param alias: optional name to give this pipeline instance, useful when - inferencing with multiple models. Default is None - :param sequence_length: sequence length to compile model and tokenizer for. - Default is 128 - :param default_model_name: huggingface transformers model name to use to - load a tokenizer and model config when none are provided in the `model_path`. - Default is 'bert-base-uncased' - :param doc_stride: if the context is too long to fit with the question for the - model, it will be split in several chunks with some overlap. This argument - controls the size of that overlap. Currently, only reading the first span - is supported (everything after doc_stride will be truncated). Default - is 128 - :param max_question_len: maximum length of the question after tokenization. - It will be truncated if needed. Default is 64 - :param max_answer_len: maximum length of answer after decoding. Default is 15 - """ - - def __init__( - self, - *, - doc_stride: int = 128, - max_question_length: int = 64, - max_answer_length: int = 15, - **kwargs, - ): - - if kwargs.get("batch_size") and kwargs["batch_size"] > 1: - raise ValueError( - f"{self.__class__.__name__} currently only supports batch size 1, " - f"batch size set to {kwargs['batch_size']}" - ) - - self._doc_stride = doc_stride - self._max_question_length = max_question_length - self._max_answer_length = max_answer_length - - super().__init__(**kwargs) - - @property - def doc_stride(self) -> int: - """ - :return: if the context is too long to fit with the question for the - model, it will be split in several chunks with some overlap. This argument - controls the size of that overlap. 
Currently, only reading the first span - is supported (everything after doc_stride will be truncated) - """ - return self._doc_stride - - @property - def max_answer_length(self) -> int: - """ - :return: maximum length of answer after decoding - """ - return self._max_answer_length - - @property - def max_question_length(self) -> int: - """ - :return: maximum length of the question after tokenization. - It will be truncated if needed - """ - return self._max_question_length - - @property - def input_schema(self) -> Type[BaseModel]: - """ - :return: pydantic model class that inputs to this pipeline must comply to - """ - return QuestionAnsweringInput - - @property - def output_schema(self) -> Type[BaseModel]: - """ - :return: pydantic model class that outputs of this pipeline must comply to - """ - return QuestionAnsweringOutput - - def process_inputs( - self, - inputs: QuestionAnsweringInput, - ) -> Tuple[List[numpy.ndarray], Dict[str, Any]]: - """ - :param inputs: inputs to the pipeline. Must be the type of the - QuestionAnsweringInput - :return: inputs of this model processed into a list of numpy arrays that - can be directly passed into the forward pass of the pipeline engine and - dictionary of parsed features and original extracted example - """ - squad_example = SquadExample( - None, inputs.question, inputs.context, None, None, None - ) - features = self._tokenize(squad_example) - tokens = features.__dict__ - - engine_inputs = self.tokens_to_engine_input(tokens) - # add batch dimension, assuming batch size 1 - engine_inputs = [numpy.expand_dims(inp, axis=0) for inp in engine_inputs] - - return engine_inputs, dict( - features=features, - example=squad_example, - ) - - def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs - ) -> BaseModel: - """ - :param engine_outputs: list of numpy arrays that are the output of the engine - forward pass - :return: outputs of engine post-processed into an object in the `output_schema` - format of this pipeline - """ - features = kwargs["features"] - example = kwargs["example"] - start_vals, end_vals = engine_outputs[:2] - - # assuming batch size 0 - start = start_vals[0] - end = end_vals[0] - - # Ensure padded tokens & question tokens cannot belong - undesired_tokens = ( - numpy.abs(numpy.array(features.p_mask) - 1) & features.attention_mask - ) - - # Generate mask - undesired_tokens_mask = undesired_tokens == 0.0 - - # Make sure non-context indexes cannot contribute to the softmax - start = numpy.where(undesired_tokens_mask, -10000.0, start) - end = numpy.where(undesired_tokens_mask, -10000.0, end) - - # Normalize logits and spans to retrieve the answer - start = numpy.exp( - start - numpy.log(numpy.sum(numpy.exp(start), axis=-1, keepdims=True)) - ) - end = numpy.exp( - end - numpy.log(numpy.sum(numpy.exp(end), axis=-1, keepdims=True)) - ) - - # Mask CLS - start[0] = 0.0 - end[0] = 0.0 - - ans_start, ans_end, scores = self._decode(start, end) - # assuming one stride, so grab first idx - ans_start = ans_start[0] - ans_end = ans_end[0] - score = scores[0] - - # decode start, end idx into text - if not self.tokenizer.is_fast: - char_to_word = numpy.array(example.char_to_word_offset) - return self.output_schema( - score=score.item(), - start=numpy.where( - char_to_word == features.token_to_orig_map[ans_start] - )[0][0].item(), - end=numpy.where(char_to_word == features.token_to_orig_map[ans_end])[0][ - -1 - ].item(), - answer=" ".join( - example.doc_tokens[ - features.token_to_orig_map[ - ans_start - ] : 
features.token_to_orig_map[ans_end] - + 1 - ] - ), - ) - else: - question_first = bool(self.tokenizer.padding_side == "right") - - # Sometimes the max probability token is in the middle of a word so: - # we start by finding the right word containing the token with - # `token_to_word` then we convert this word in a character span - return self.output_schema( - score=score.item(), - start=features.encoding.word_to_chars( - features.encoding.token_to_word(ans_start), - sequence_index=1 if question_first else 0, - )[0], - end=features.encoding.word_to_chars( - features.encoding.token_to_word(ans_end), - sequence_index=1 if question_first else 0, - )[1], - answer=example.context_text[ - features.encoding.word_to_chars( - features.encoding.token_to_word(ans_start), - sequence_index=1 if question_first else 0, - )[0] : features.encoding.word_to_chars( - features.encoding.token_to_word(ans_end), - sequence_index=1 if question_first else 0, - )[ - 1 - ] - ], - ) - - def _tokenize(self, example: SquadExample): - if not self.tokenizer.is_fast: - features = squad_convert_examples_to_features( - examples=[example], - tokenizer=self.tokenizer, - max_set_length=self.sequence_length, - doc_stride=self.doc_stride, - max_query_length=self.max_question_length, - padding_strategy=PaddingStrategy.MAX_LENGTH.value, - is_training=False, - tqdm_enabled=False, - ) - # only 1 span supported so taking only the first element of features - # to add support for num_spans switch to features = features[:num_spans] - # not included for now due to static batch requirements in production - features = features[0] - else: - question_first = bool(self.tokenizer.padding_side == "right") - encoded_inputs = self.tokenizer( - text=example.question_text if question_first else example.context_text, - text_pair=( - example.context_text if question_first else example.question_text - ), - padding=PaddingStrategy.MAX_LENGTH.value, - truncation="only_second" if question_first else "only_first", - max_length=self.sequence_length, - stride=self.doc_stride, - return_tensors="np", - return_token_type_ids=True, - return_overflowing_tokens=True, - return_offsets_mapping=True, - return_special_tokens_mask=True, - ) - - # only 1 span supported so taking only the first element of features - # to add support for num_spans switch hardcoded 0 idx lookups to loop - # over values in num_spans - - # p_mask: mask with 1 for token than cannot be in the answer - # We put 0 on the tokens from the context and 1 everywhere else - p_mask = numpy.asarray( - [ - [ - tok != 1 if question_first else 0 - for tok in encoded_inputs.sequence_ids(0) - ] - ] - ) - - # keep the cls_token unmasked - if self.tokenizer.cls_token_id is not None: - cls_index = numpy.nonzero( - encoded_inputs["input_ids"][0] == self.tokenizer.cls_token_id - ) - p_mask[cls_index] = 0 - - features = SquadFeatures( - input_ids=encoded_inputs["input_ids"][0], - attention_mask=encoded_inputs["attention_mask"][0], - token_type_ids=encoded_inputs["token_type_ids"][0], - p_mask=p_mask[0].tolist(), - encoding=encoded_inputs[0], - # the following values are unused for fast tokenizers - cls_index=None, - token_to_orig_map={}, - example_index=0, - unique_id=0, - paragraph_len=0, - token_is_max_context=0, - tokens=[], - start_position=0, - end_position=0, - is_impossible=False, - qas_id=None, - ) - - return features - - def _decode(self, start: numpy.ndarray, end: numpy.ndarray) -> Tuple: - # Ensure we have batch axis - if start.ndim == 1: - start = start[None] - - if end.ndim == 1: - end = end[None] - - 
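The decoding step that follows scores candidate answer spans from the start/end probability vectors; a toy numpy sketch of the same idea, with made-up numbers:

```python
# Toy sketch of QA span scoring: score(i, j) = start[i] * end[j], keeping only
# spans with i <= j and span length <= max_answer_len, then taking the best one.
import numpy as np

start = np.array([[0.1, 0.7, 0.1, 0.1]])  # P(token i starts the answer)
end = np.array([[0.1, 0.1, 0.6, 0.2]])    # P(token j ends the answer)
max_answer_len = 2

outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
candidates = np.tril(np.triu(outer), max_answer_len - 1)

best = np.argmax(candidates.flatten())
ans_start, ans_end = np.unravel_index([best], candidates.shape)[1:]
print(ans_start, ans_end, candidates[0, ans_start, ans_end])
# best span covers tokens 1..2 with score 0.7 * 0.6 = 0.42
```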
# Compute the score of each tuple(start, end) to be the real answer - outer = numpy.matmul(numpy.expand_dims(start, -1), numpy.expand_dims(end, 1)) - - # Remove candidate with end < start and end - start > max_answer_len - candidates = numpy.tril(numpy.triu(outer), self.max_answer_length - 1) - - # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) - scores_flat = candidates.flatten() - # only returning best result, use argsort for topk support - idx_sort = [numpy.argmax(scores_flat)] - - start, end = numpy.unravel_index(idx_sort, candidates.shape)[1:] - return start, end, candidates[0, start, end] diff --git a/src/deepsparse/transformers/pipelines/text_classification.py b/src/deepsparse/transformers/pipelines/text_classification.py deleted file mode 100644 index 0df9ba2b59..0000000000 --- a/src/deepsparse/transformers/pipelines/text_classification.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# postprocessing adapted from huggingface/transformers - -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -Pipeline implementation and pydantic models for text classification transformers -tasks -""" - - -from typing import List, Type, Union - -import numpy -from pydantic import BaseModel, Field -from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy - -from deepsparse import Pipeline -from deepsparse.transformers.pipelines import TransformersPipeline - - -__all__ = [ - "TextClassificationInput", - "TextClassificationOutput", - "TextClassificationPipeline", -] - - -class TextClassificationInput(BaseModel): - """ - Schema for inputs to text_classification pipelines - """ - - sequences: Union[List[List[str]], List[str], str] = Field( - description="A string or List of strings representing input to" - "text_classification task" - ) - - -class TextClassificationOutput(BaseModel): - """ - Schema for text_classification pipeline output. 
Values are in batch order - """ - - labels: List[str] = Field(description="The predicted labels in batch order") - scores: List[float] = Field( - description="The corresponding probability for each label in the batch" - ) - - -@Pipeline.register( - task="text_classification", - task_aliases=["glue", "sentiment_analysis"], - default_model_path=( - "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/" - "sst2/12layer_pruned80_quant-none-vnni" - ), -) -class TextClassificationPipeline(TransformersPipeline): - """ - transformers text classification pipeline - - example instantiation: - ```python - text_classifier = Pipeline.create( - task="text_classification", - model_path="text_classification_model_dir/", - batch_size=BATCH_SIZE, - ) - ``` - - example batch size 1, single text inputs (ie sentiment analysis): - ```python - sentiment = text_classifier("the food tastes great") - sentiment = text_classifier(["the food tastes great"]) - sentiment = text_classifier([["the food tastes great"]]) - ``` - - example batch size 1, multi text input (ie QQP like tasks): - ```python - prediction = text_classifier([["how is the food?", "what is the food?"]]) - ``` - - example batch size n, single text inputs: - ```python - sentiments = text_classifier(["the food tastes great", "the food tastes bad"]) - sentiments = text_classifier([["the food tastes great"], ["the food tastes bad"]]) - ``` - - :param model_path: sparsezoo stub to a transformers model, an ONNX file, or - (preferred) a directory containing a model.onnx, tokenizer config, and model - config. If no tokenizer and/or model config(s) are found, then they will be - loaded from huggingface transformers using the `default_model_name` key - :param engine_type: inference engine to use. Currently supported values include - 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' - :param batch_size: static batch size to use for inference. Default is 1 - :param num_cores: number of CPU cores to allocate for inference engine. None - specifies all available cores. Default is None - :param scheduler: (deepsparse only) kind of scheduler to execute with. - Pass None for the default - :param input_shapes: list of shapes to set ONNX the inputs to. Pass None - to use model as-is. Default is None - :param alias: optional name to give this pipeline instance, useful when - inferencing with multiple models. Default is None - :param sequence_length: sequence length to compile model and tokenizer for. - Default is 128 - :param default_model_name: huggingface transformers model name to use to - load a tokenizer and model config when none are provided in the `model_path`. - Default is 'bert-base-uncased' - """ - - @property - def input_schema(self) -> Type[BaseModel]: - """ - :return: pydantic model class that inputs to this pipeline must comply to - """ - return TextClassificationInput - - @property - def output_schema(self) -> Type[BaseModel]: - """ - :return: pydantic model class that outputs of this pipeline must comply to - """ - return TextClassificationOutput - - def parse_inputs(self, *args, **kwargs) -> BaseModel: - """ - :param args: ordered arguments to pipeline, only an input_schema object - is supported as an arg for this function - :param kwargs: keyword arguments to pipeline - :return: pipeline arguments parsed into the given `input_schema` - schema if necessary. If an instance of the `input_schema` is provided - it will be returned - """ - if args and kwargs: - raise ValueError( - f"{self.__class__} only support args OR kwargs. 
Found " - f" {len(args)} args and {len(kwargs)} kwargs" - ) - - if args: - if len(args) == 1: - # passed input_schema schema directly - if isinstance(args[0], self.input_schema): - return args[0] - return self.input_schema(sequences=args[0]) - else: - return self.input_schema(sequences=args) - - return self.input_schema(**kwargs) - - def process_inputs(self, inputs: TextClassificationInput) -> List[numpy.ndarray]: - """ - :param inputs: inputs to the pipeline. Must be the type of the - TextClassificationInput - :return: inputs of this model processed into a list of numpy arrays that - can be directly passed into the forward pass of the pipeline engine - """ - tokens = self.tokenizer( - inputs.sequences, - add_special_tokens=True, - return_tensors="np", - padding=PaddingStrategy.MAX_LENGTH.value, - truncation=TruncationStrategy.LONGEST_FIRST.value, - ) - return self.tokens_to_engine_input(tokens) - - def process_engine_outputs(self, engine_outputs: List[numpy.ndarray]) -> BaseModel: - """ - :param engine_outputs: list of numpy arrays that are the output of the engine - forward pass - :return: outputs of engine post-processed into an object in the `output_schema` - format of this pipeline - """ - outputs = engine_outputs - if isinstance(outputs, list): - outputs = outputs[0] - - scores = ( - 1.0 / (1.0 + numpy.exp(-outputs)) - if self.config.num_labels == 1 - else numpy.exp(outputs) / numpy.exp(outputs).sum(-1, keepdims=True) - ) - - labels = [] - label_scores = [] - - for score in scores: - labels.append(self.config.id2label[score.argmax()]) - label_scores.append(score.max().item()) - - return self.output_schema( - labels=labels, - scores=label_scores, - ) diff --git a/src/deepsparse/transformers/pipelines/token_classification.py b/src/deepsparse/transformers/pipelines/token_classification.py deleted file mode 100644 index 6485df668e..0000000000 --- a/src/deepsparse/transformers/pipelines/token_classification.py +++ /dev/null @@ -1,499 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# postprocessing adapted from huggingface/transformers - -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
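The `process_engine_outputs` method above turns raw logits into labels and confidences: a sigmoid when the model has a single output label, otherwise a softmax over the last axis, followed by an argmax into `config.id2label`. A small numpy sketch of the softmax branch, with made-up logits and an assumed label mapping:

```python
import numpy

# hypothetical raw logits for a batch of two sequences and two labels
logits = numpy.array([[2.0, -1.0], [-0.5, 1.5]])
id2label = {0: "negative", 1: "positive"}  # assumed mapping, normally read from the model config

# softmax over the last axis, as in process_engine_outputs above
scores = numpy.exp(logits) / numpy.exp(logits).sum(-1, keepdims=True)

labels = [id2label[row.argmax()] for row in scores]
confidences = [row.max().item() for row in scores]
print(labels, confidences)  # ['negative', 'positive'] with confidences ~0.95 and ~0.88
```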
- - -""" -Pipeline implementation and pydantic models for token classification transformers -tasks -""" -from typing import Any, Dict, List, Optional, Tuple, Type, Union - -import numpy -from pydantic import BaseModel, Field -from transformers.file_utils import ExplicitEnum -from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy - -from deepsparse import Pipeline -from deepsparse.transformers.pipelines import TransformersPipeline - - -__all__ = [ - "AggregationStrategy", - "TokenClassificationInput", - "TokenClassificationResult", - "TokenClassificationOutput", - "TokenClassificationPipeline", -] - - -class AggregationStrategy(ExplicitEnum): - """ - Valid aggregation strategies for postprocessing in the TokenClassificationPipeline - """ - - NONE = "none" - SIMPLE = "simple" - FIRST = "first" - AVERAGE = "average" - MAX = "max" - - -class TokenClassificationInput(BaseModel): - """ - Schema for inputs to token_classification pipelines - """ - - inputs: Union[List[str], str] = Field( - description=( - "A string or List of batch of strings representing input(s) to" - "a token_classification task" - ) - ) - - -class TokenClassificationResult(BaseModel): - """ - Schema for a classification of a single token - """ - - entity: str = Field(description="entity predicted for that token/word") - score: float = Field(description="The corresponding probability for `entity`") - index: int = Field(description="index of the corresponding token in the sentence") - word: str = Field(description="token/word classified") - start: Optional[int] = Field( - description=( - "index of the start of the corresponding entity in the sentence. " - "Only exists if the offsets are available within the tokenizer" - ) - ) - end: Optional[int] = Field( - description=( - "index of the end of the corresponding entity in the sentence. " - "Only exists if the offsets are available within the tokenizer" - ) - ) - is_grouped: bool = Field( - default=False, - description="True if this result is part of an entity group", - ) - - -class TokenClassificationOutput(BaseModel): - """ - Schema for results of TokenClassificationPipeline inference. Classifications of each - token stored in a list of lists of batch[sentence[token]] - """ - - predictions: List[List[TokenClassificationResult]] = Field( - description=( - "list of list of results of token classification pipeline. Outer list " - "has one item for each sequence in the batch. Inner list has one " - "TokenClassificationResult item per token in the given sequence" - ) - ) - - -@Pipeline.register( - task="token_classification", - task_aliases=["ner"], - default_model_path=( - "zoo:nlp/token_classification/bert-base/pytorch/huggingface/" - "conll2003/12layer_pruned80_quant-none-vnni" - ), -) -class TokenClassificationPipeline(TransformersPipeline): - """ - transformers token classification pipeline - - example instantiation: - ```python - token_classifier = Pipeline.create( - task="token_classification", - model_path="token_classification_model_dir/", - batch_size=BATCH_SIZE, - ) - ``` - - :param model_path: sparsezoo stub to a transformers model, an ONNX file, or - (preferred) a directory containing a model.onnx, tokenizer config, and model - config. If no tokenizer and/or model config(s) are found, then they will be - loaded from huggingface transformers using the `default_model_name` key - :param engine_type: inference engine to use. Currently supported values include - 'deepsparse' and 'onnxruntime'. 
Default is 'deepsparse' - :param batch_size: static batch size to use for inference. Default is 1 - :param num_cores: number of CPU cores to allocate for inference engine. None - specifies all available cores. Default is None - :param scheduler: (deepsparse only) kind of scheduler to execute with. - Pass None for the default - :param input_shapes: list of shapes to set ONNX the inputs to. Pass None - to use model as-is. Default is None - :param alias: optional name to give this pipeline instance, useful when - inferencing with multiple models. Default is None - :param sequence_length: sequence length to compile model and tokenizer for. - Default is 128 - :param default_model_name: huggingface transformers model name to use to - load a tokenizer and model config when none are provided in the `model_path`. - Default is 'bert-base-uncased' - :param aggregation_strategy: how to aggregate tokens in postprocessing. Options - include 'none', 'simple', 'first', 'average', and 'max'. Default is None - :param ignore_labels: list of label names to ignore in output. Default is - ['0'] which ignores the default known class label - """ - - def __init__( - self, - *, - aggregation_strategy: AggregationStrategy = AggregationStrategy.NONE, - ignore_labels: List[str] = None, - **kwargs, - ): - - if isinstance(aggregation_strategy, str): - aggregation_strategy = aggregation_strategy.strip().lower() - self._aggregation_strategy = AggregationStrategy(aggregation_strategy) - self._ignore_labels = ["0"] if ignore_labels is None else ignore_labels - - super().__init__(**kwargs) - - @property - def aggregation_strategy(self) -> str: - """ - :return: how to aggregate tokens in postprocessing. Options - include 'none', 'simple', 'first', 'average', and 'max' - """ - return self._aggregation_strategy.value - - @property - def ignore_labels(self) -> List[str]: - """ - :return: list of label names to ignore in output. Default is - ['0'] which ignores the default known class label - """ - return self._ignore_labels - - @property - def input_schema(self) -> Type[BaseModel]: - """ - :return: pydantic model class that inputs to this pipeline must comply to - """ - return TokenClassificationInput - - @property - def output_schema(self) -> Type[BaseModel]: - """ - :return: pydantic model class that outputs of this pipeline must comply to - """ - return TokenClassificationOutput - - def parse_inputs(self, *args, **kwargs) -> BaseModel: - """ - :param args: ordered arguments to pipeline, only an input_schema object - is supported as an arg for this function - :param kwargs: keyword arguments to pipeline - :return: pipeline arguments parsed into the given `input_schema` - schema if necessary. If an instance of the `input_schema` is provided - it will be returned - """ - if args and kwargs: - raise ValueError( - f"{self.__class__} only support args OR kwargs. Found " - f" {len(args)} args and {len(kwargs)} kwargs" - ) - - if args: - if len(args) == 1: - # passed input_schema schema directly - if isinstance(args[0], self.input_schema): - return args[0] - return self.input_schema(inputs=args[0]) - else: - return self.input_schema(inputs=args) - - return self.input_schema(**kwargs) - - def process_inputs( - self, - inputs: TokenClassificationInput, - ) -> Tuple[List[numpy.ndarray], Dict[str, Any]]: - """ - :param inputs: inputs to the pipeline. 
Must be the type of the - TokenClassificationInput - :return: inputs of this model processed into a list of numpy arrays that - can be directly passed into the forward pass of the pipeline engine - and dictionary containing offset mappings and special tokens mask to - be used during postprocessing - """ - tokens = self.tokenizer( - inputs.inputs, - return_tensors="np", - truncation=TruncationStrategy.LONGEST_FIRST.value, - padding=PaddingStrategy.MAX_LENGTH.value, - return_special_tokens_mask=True, - return_offsets_mapping=self.tokenizer.is_fast, - ) - - offset_mapping = ( - tokens.pop("offset_mapping") - if self.tokenizer.is_fast - else [None] * len(inputs.inputs) - ) - special_tokens_mask = tokens.pop("special_tokens_mask") - postprocessing_kwargs = dict( - inputs=inputs, - tokens=tokens, - offset_mapping=offset_mapping, - special_tokens_mask=special_tokens_mask, - ) - - return self.tokens_to_engine_input(tokens), postprocessing_kwargs - - def process_engine_outputs( - self, - engine_outputs: List[numpy.ndarray], - **kwargs, - ) -> BaseModel: - """ - :param engine_outputs: list of numpy arrays that are the output of the engine - forward pass - :return: outputs of engine post-processed into an object in the `output_schema` - format of this pipeline - """ - inputs = kwargs["inputs"] - tokens = kwargs["tokens"] - offset_mapping = kwargs["offset_mapping"] - special_tokens_mask = kwargs["special_tokens_mask"] - - predictions = [] # type: List[List[TokenClassificationResult]] - - for entities_index, current_entities in enumerate(engine_outputs[0]): - input_ids = tokens["input_ids"][entities_index] - - scores = numpy.exp(current_entities) / numpy.exp(current_entities).sum( - -1, keepdims=True - ) - pre_entities = self._gather_pre_entities( - inputs.inputs[entities_index], - input_ids, - scores, - offset_mapping[entities_index], - special_tokens_mask[entities_index], - ) - grouped_entities = self._aggregate(pre_entities) - # Filter anything that is in self.ignore_labels - current_results = [] # type: List[TokenClassificationResult] - for entity in grouped_entities: - if entity.get("entity") in self.ignore_labels or ( - entity.get("entity_group") in self.ignore_labels - ): - continue - if entity.get("entity_group"): - entity["entity"] = entity["entity_group"] - entity["is_grouped"] = True - del entity["entity_group"] - current_results.append(TokenClassificationResult(**entity)) - predictions.append(current_results) - - return self.output_schema(predictions=predictions) - - # utilities below adapted from transformers - - def _gather_pre_entities( - self, - sentence: str, - input_ids: numpy.ndarray, - scores: numpy.ndarray, - offset_mapping: Optional[List[Tuple[int, int]]], - special_tokens_mask: numpy.ndarray, - ) -> List[dict]: - pre_entities = [] - for idx, token_scores in enumerate(scores): - # Filter special_tokens, they should only occur - # at the sentence boundaries since we're not encoding pairs of - # sentences so we don't have to keep track of those. 
- if special_tokens_mask[idx]: - continue - - word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) - if offset_mapping is not None: - start_ind, end_ind = offset_mapping[idx] - word_ref = sentence[start_ind:end_ind] - is_subword = len(word_ref) != len(word) - - if int(input_ids[idx]) == self.tokenizer.unk_token_id: - word = word_ref - is_subword = False - else: - start_ind = None - end_ind = None - is_subword = False - - pre_entity = { - "word": word, - "scores": token_scores, - "start": start_ind, - "end": end_ind, - "index": idx, - "is_subword": is_subword, - } - pre_entities.append(pre_entity) - return pre_entities - - def _aggregate(self, pre_entities: List[dict]) -> List[dict]: - if self._aggregation_strategy in { - AggregationStrategy.NONE, - AggregationStrategy.SIMPLE, - }: - entities = [] - for pre_entity in pre_entities: - entity_idx = pre_entity["scores"].argmax() - score = pre_entity["scores"][entity_idx] - entity = { - "entity": self.config.id2label[entity_idx], - "score": score, - "index": pre_entity["index"], - "word": pre_entity["word"], - "start": pre_entity["start"], - "end": pre_entity["end"], - } - entities.append(entity) - else: - entities = self._aggregate_words(pre_entities) - - if self._aggregation_strategy == AggregationStrategy.NONE: - return entities - return self._group_entities(entities) - - def _aggregate_word(self, entities: List[dict]) -> dict: - word = self.tokenizer.convert_tokens_to_string( - [entity["word"] for entity in entities] - ) - if self._aggregation_strategy == AggregationStrategy.FIRST: - scores = entities[0]["scores"] - idx = scores.argmax() - score = scores[idx] - entity = self.config.id2label[idx] - elif self._aggregation_strategy == AggregationStrategy.MAX: - max_entity = max(entities, key=lambda entity: entity["scores"].max()) - scores = max_entity["scores"] - idx = scores.argmax() - score = scores[idx] - entity = self.config.id2label[idx] - elif self._aggregation_strategy == AggregationStrategy.AVERAGE: - scores = numpy.stack([entity["scores"] for entity in entities]) - average_scores = numpy.nanmean(scores, axis=0) - entity_idx = average_scores.argmax() - entity = self.config.id2label[entity_idx] - score = average_scores[entity_idx] - else: - raise ValueError( - f"Invalid aggregation_strategy: {self._aggregation_strategy}" - ) - new_entity = { - "entity": entity, - "score": score, - "word": word, - "start": entities[0]["start"], - "end": entities[-1]["end"], - } - return new_entity - - def _aggregate_words(self, entities: List[dict]) -> List[dict]: - word_entities = [] - word_group = None - for entity in entities: - if word_group is None: - word_group = [entity] - elif entity["is_subword"]: - word_group.append(entity) - else: - word_entities.append(self._aggregate_word(word_group)) - word_group = [entity] - # Last item - word_entities.append(self._aggregate_word(word_group)) - return word_entities - - def _group_sub_entities(self, entities: List[dict]) -> dict: - # Get the first entity in the entity group - entity = entities[0]["entity"].split("-")[-1] - scores = numpy.nanmean([entity["score"] for entity in entities]) - tokens = [entity["word"] for entity in entities] - - entity_group = { - "entity_group": entity, - "score": numpy.mean(scores), - "word": self.tokenizer.convert_tokens_to_string(tokens), - "start": entities[0]["start"], - "end": entities[-1]["end"], - } - return entity_group - - def _get_tag(self, entity_name: str) -> Tuple[str, str]: - if entity_name.startswith("B-"): - bi = "B" - tag = entity_name[2:] - elif 
entity_name.startswith("I-"): - bi = "I" - tag = entity_name[2:] - else: - # It's not in B-, I- format - bi = "B" - tag = entity_name - return bi, tag - - def _group_entities(self, entities: List[dict]) -> List[dict]: - - entity_groups = [] - entity_group_disagg = [] - - for entity in entities: - if not entity_group_disagg: - entity_group_disagg.append(entity) - continue - - # If the current entity is similar and adjacent to the previous entity, - # append it to the disaggregated entity group - # The split is meant to account for the "B" and "I" prefixes - # Shouldn't merge if both entities are B-type - bi, tag = self._get_tag(entity["entity"]) - last_bi, last_tag = self._get_tag(entity_group_disagg[-1]["entity"]) - - if tag == last_tag and bi != "B": - # Modify subword type to be previous_type - entity_group_disagg.append(entity) - else: - # If the current entity is different from the previous entity - # aggregate the disaggregated entity group - entity_groups.append(self._group_sub_entities(entity_group_disagg)) - entity_group_disagg = [entity] - if entity_group_disagg: - # it's the last entity, add it to the entity groups - entity_groups.append(self._group_sub_entities(entity_group_disagg)) - - return entity_groups diff --git a/src/deepsparse/transformers/server.py b/src/deepsparse/transformers/server.py new file mode 100644 index 0000000000..59035dba80 --- /dev/null +++ b/src/deepsparse/transformers/server.py @@ -0,0 +1,186 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
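The entity-grouping helpers removed above (`_get_tag` and `_group_entities`) follow standard BIO handling: a `B-` prefix opens a new entity group, an `I-` prefix with the same tag extends the current group, and bare labels are treated as beginnings. Below is a tiny standalone sketch of that rule over plain label strings; the removed code applies the same logic to full entity dicts.

```python
def get_tag(entity_name):
    # mirrors the removed _get_tag helper
    if entity_name.startswith("B-"):
        return "B", entity_name[2:]
    if entity_name.startswith("I-"):
        return "I", entity_name[2:]
    return "B", entity_name  # not in B-/I- format: treat as a beginning

tags = ["B-PER", "I-PER", "B-LOC"]
groups, current = [], [tags[0]]
for name in tags[1:]:
    bi, tag = get_tag(name)
    _, last_tag = get_tag(current[-1])
    if tag == last_tag and bi != "B":
        current.append(name)      # same entity, non-B prefix: extend the group
    else:
        groups.append(current)    # otherwise close the group and start a new one
        current = [name]
groups.append(current)
print(groups)  # [['B-PER', 'I-PER'], ['B-LOC']]
```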
+ +""" +Specs, schemas, and pipelines for use when serving transformers models +""" + +from typing import Any, Dict, List, Optional, Tuple, Union + +from deepsparse.tasks import SupportedTasks +from deepsparse.transformers.pipelines import Pipeline, pipeline + + +try: + from deepsparse.server.config import ServeModelConfig + + deepsparse_server_err = None +except Exception as _err: + deepsparse_server_err = _err + ServeModelConfig = object + +try: + from pydantic import BaseModel, Field + + pydantic_import_err = None +except Exception as _err: + pydantic_import_err = _err + BaseModel = object + Field = dict + + +__all__ = [ + "create_pipeline_definitions", + "QuestionAnsweringRequest", + "QuestionAnsweringResponse", + "TextClassificationRequest", + "TextClassificationResponse", + "TokenClassificationRequest", + "TokenClassificationResponse", +] + + +def create_pipeline_definitions( + model_config: ServeModelConfig, +) -> Tuple[Pipeline, Any, Any, Dict]: + """ + Create a pipeline definition and the supporting files for a given model config + to use for serving in the DeepSparse inference server + + :param model_config: the server model config describing the model and params + :return: a tuple containing (the pipeline to use for inference, + the expected request body, the expected response body, + any additional keyword args for use with the server) + """ + if deepsparse_server_err: + raise deepsparse_server_err + + if pydantic_import_err: + raise pydantic_import_err + + if SupportedTasks.nlp.question_answering.matches(model_config.task): + request_model = QuestionAnsweringRequest + response_model = Union[ + List[QuestionAnsweringResponse], + QuestionAnsweringResponse, + ] + kwargs = {} + elif SupportedTasks.nlp.text_classification.matches(model_config.task): + request_model = TextClassificationRequest + response_model = Union[ + List[TextClassificationResponse], List[List[TextClassificationResponse]] + ] + kwargs = {} + elif SupportedTasks.nlp.token_classification.matches(model_config.task): + request_model = TokenClassificationRequest + response_model = Union[ + List[TokenClassificationResponse], List[List[TokenClassificationResponse]] + ] + kwargs = {} + else: + raise ValueError( + f"unrecognized task given of {model_config.task} for config {model_config}" + ) + + pipeline_instance: Pipeline = pipeline( + task=model_config.task.lower().replace("_", "-"), + model_path=model_config.model_path, + engine_type=model_config.engine, + num_cores=model_config.num_cores, + scheduler=model_config.scheduler, + batch_size=model_config.batch_size, + **model_config.kwargs, + ) + + return pipeline_instance, request_model, response_model, kwargs + + +class QuestionAnsweringRequest(BaseModel): + """ + The request model for Question Answering Task + """ + + question: Union[List[str], str] = Field( + description="Either a string or a List of string questions to answer" + ) + context: Union[List[str], str] = Field( + description="Either a string or List of strings representing the context " + "for each question" + ) + + +class TokenClassificationRequest(BaseModel): + """ + Schema for TokenClassificationPipeline Request + """ + + inputs: Union[List[str], str] = Field( + description="A string or List of strings representing input to" + "TokenClassificationPipeline task" + ) + + +class TextClassificationRequest(BaseModel): + """ + Schema for TextClassificationPipeline Request + """ + + sequences: Union[List[str], str] = Field( + description="A string or List of strings representing input to" + 
"TextClassificationPipeline task" + ) + + +class QuestionAnsweringResponse(BaseModel): + """ + Schema for a result from Question Answering Task + """ + + score: float = Field(description="confidence score for prediction") + start: int = Field(description="The start index of the answer") + end: int = Field(description="The end index of the answer") + answer: str = Field(description="The predicted answer") + + +class TokenClassificationResponse(BaseModel): + """ + Schema for TokenClassificationPipeline Response + """ + + entity: str = Field( + description="The entity predicted for that token/word (it is named" + "`entity_group` when `aggregation_strategy` is not `none`." + ) + score: float = Field(description="The corresponding probability for `entity`.") + index: int = Field( + description="The index of the corresponding token in the sentence." + ) + word: str = Field(description="The token/word classified.") + start: Optional[int] = Field( + description="The index of the start of the corresponding entity in the " + "sentence. Only exists if the offsets are available within the tokenizer" + ) + end: Optional[int] = Field( + description="The index of the end of the corresponding entity in the sentence. " + "Only exists if the offsets are available within the tokenizer" + ) + + +class TextClassificationResponse(BaseModel): + """ + Schema for TextClassificationPipeline Response + """ + + label: str = Field(description="The label predicted.") + score: float = Field(description="The corresponding probability.") diff --git a/src/deepsparse/yolo/__init__.py b/src/deepsparse/yolo/__init__.py deleted file mode 100644 index 0c44f887a4..0000000000 --- a/src/deepsparse/yolo/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/src/deepsparse/yolo/annotate.py b/src/deepsparse/yolo/annotate.py deleted file mode 100644 index 72f7770934..0000000000 --- a/src/deepsparse/yolo/annotate.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Usage: deepsparse.object_detection.annotate [OPTIONS] - - Annotation Script for YOLO with DeepSparse - -Options: - --model_filepath, --model-filepath TEXT - Path/SparseZoo stub to the model file to be - used for annotation [default: zoo:cv/detect - ion/yolov5-s/pytorch/ultralytics/coco/pruned - -aggressive_96] - --source TEXT File path to image or directory of .jpg - files, a .mp4 video, or an integer (i.e. 0) - for webcam [required] - --engine [deepsparse|onnxruntime|torch] - Inference engine backend to run on. Choices - are 'deepsparse', 'onnxruntime', and - 'torch'. Default is 'deepsparse' - --image_shape, --image_shape INTEGER... - Image shape to use for inference, must be - two integers [default: 640, 640] - --num_cores, --num-cores INTEGER - The number of physical cores to run the - annotations with, defaults to using all - physical cores available on the system. For - DeepSparse benchmarks, this value is the - number of cores per socket - --save_dir, --save-dir DIRECTORY - The path to the directory for saving results - [default: annotation-results] - --name TEXT Name of directory in save-dir to write - results to. defaults to - {engine}-annotations-{run_number} - --target_fps, --target-fps FLOAT - Target FPS when writing video files. Frames - will be dropped to closely match target FPS. - --source must be a video file and if target- - fps is greater than the source video fps - then it will be ignored - --no_save, --no-save Set flag when source is from webcam to not - save results.Not supported for non-webcam - sources [default: False] - --help Show this message and exit. - -####### -Examples: - -1) deepsparse.object_detection.annotate --source PATH/TO/IMAGE.jpg -2) deepsparse.object_detection.annotate --source PATH/TO/VIDEO.mp4 -3) deepsparse.object_detection.annotate --source 0 -4) deepsparse.object_detection.annotate --source PATH/TO/IMAGE_DIR -""" -import logging -from typing import Optional - -import click - -import cv2 -from deepsparse.pipeline import Pipeline -from deepsparse.yolo import utils -from deepsparse.yolo.utils.cli_helpers import create_dir_callback - - -yolo_v5_default_stub = ( - "zoo:cv/detection/yolov5-s/pytorch/ultralytics/coco/" "pruned-aggressive_96" -) - -DEEPSPARSE_ENGINE = "deepsparse" -ORT_ENGINE = "onnxruntime" -TORCH_ENGINE = "torch" - -_LOGGER = logging.getLogger(__name__) - - -@click.command() -@click.option( - "--model_filepath", - "--model-filepath", - type=str, - default=yolo_v5_default_stub, - help="Path/SparseZoo stub to the model file to be used for annotation", - show_default=True, -) -@click.option( - "--source", - type=str, - required=True, - help="File path to image or directory of .jpg files, a .mp4 video, " - "or an integer (i.e. 0) for webcam", -) -@click.option( - "--engine", - type=click.Choice([DEEPSPARSE_ENGINE, ORT_ENGINE, TORCH_ENGINE]), - default=DEEPSPARSE_ENGINE, - help="Inference engine backend to run on. Choices are 'deepsparse', " - "'onnxruntime', and 'torch'. Default is 'deepsparse'", -) -@click.option( - "--image_shape", - "--image_shape", - type=int, - nargs=2, - default=(640, 640), - help="Image shape to use for inference, must be two integers", - show_default=True, -) -@click.option( - "--num_cores", - "--num-cores", - type=int, - default=None, - help="The number of physical cores to run the annotations with, " - "defaults to using all physical cores available on the system." 
- " For DeepSparse benchmarks, this value is the number of cores " - "per socket", - show_default=True, -) -@click.option( - "--save_dir", - "--save-dir", - type=click.Path(dir_okay=True, file_okay=False), - default="annotation-results", - callback=create_dir_callback, - help="The path to the directory for saving results", - show_default=True, -) -@click.option( - "--name", - type=str, - default=None, - help="Name of directory in save-dir to write results to. defaults to " - "{engine}-annotations-{run_number}", -) -@click.option( - "--target_fps", - "--target-fps", - type=float, - default=None, - help="Target FPS when writing video files. Frames will be dropped to " - "closely match target FPS. --source must be a video file and if " - "target-fps is greater than the source video fps then it " - "will be ignored", - show_default=True, -) -@click.option( - "--no_save", - "--no-save", - is_flag=True, - help="Set flag when source is from webcam to not save results." - "Not supported for non-webcam sources", - show_default=True, -) -def main( - model_filepath: str, - source: str, - engine: str, - image_shape: tuple, - num_cores: Optional[int], - save_dir: str, - name: Optional[str], - target_fps: Optional[float], - no_save: bool, -) -> None: - """ - Annotation Script for YOLO with DeepSparse - """ - save_dir = utils.get_annotations_save_dir( - initial_save_dir=save_dir, - tag=name, - engine=engine, - ) - - loader, saver, is_video = utils.get_yolo_loader_and_saver( - path=source, - save_dir=save_dir, - image_shape=image_shape, - target_fps=target_fps, - no_save=no_save, - ) - - is_webcam = source.isnumeric() - yolo_pipeline = Pipeline.create( - task="yolo", - model_path=model_filepath, - class_names="coco", - engine_type=engine, - num_cores=num_cores, - ) - - for iteration, (input_image, source_image) in enumerate(loader): - - # annotate - annotated_images = utils.annotate( - pipeline=yolo_pipeline, - image_batch=input_image, - target_fps=target_fps, - calc_fps=is_video, - original_images=[source_image], - ) - - for annotated_image in annotated_images: - # display - if is_webcam: - cv2.imshow("annotated", annotated_image) - cv2.waitKey(1) - - # save - if saver: - saver.save_frame(annotated_image) - - if saver: - saver.close() - - _LOGGER.info(f"Results saved to {save_dir}") - - -if __name__ == "__main__": - main() diff --git a/src/deepsparse/yolo/pipelines.py b/src/deepsparse/yolo/pipelines.py deleted file mode 100644 index 2398313c31..0000000000 --- a/src/deepsparse/yolo/pipelines.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -from typing import Dict, List, Optional, Tuple, Type, Union - -import numpy -import onnx - -from deepsparse.pipeline import Pipeline -from deepsparse.utils import model_to_path -from deepsparse.yolo.schemas import YOLOInput, YOLOOutput -from deepsparse.yolo.utils import COCO_CLASSES, YoloPostprocessor, postprocess_nms - - -try: - import cv2 - - cv2_error = None -except ModuleNotFoundError as cv2_import_error: - cv2 = None - cv2_error = cv2_import_error - - -@Pipeline.register( - task="yolo", - default_model_path=( - "zoo:cv/detection/yolov5-l/pytorch/ultralytics/coco/pruned_quant-aggressive_95" - ), -) -class YOLOPipeline(Pipeline): - """ - Image Segmentation YOLO pipeline for DeepSparse - - :param model_path: path on local system or SparseZoo stub to load the model from - :param engine_type: inference engine to use. Currently supported values include - 'deepsparse' and 'onnxruntime'. Default is 'deepsparse' - :param batch_size: static batch size to use for inference. Default is 1 - :param num_cores: number of CPU cores to allocate for inference engine. None - specifies all available cores. Default is None - :param scheduler: (deepsparse only) kind of scheduler to execute with. - Pass None for the default - :param input_shapes: list of shapes to set ONNX the inputs to. Pass None - to use model as-is. Default is None - :param alias: optional name to give this pipeline instance, useful when - inferencing with multiple models. Default is None - :param class_names: Optional string identifier, dict, or json file of - class names to use for mapping class ids to class labels. Default is - `coco` - """ - - def __init__( - self, - *, - class_names: Optional[Union[str, Dict[str, str]]] = "coco", - model_config: Optional[str] = None, - **kwargs, - ): - super().__init__( - **kwargs, - ) - - if isinstance(class_names, str): - if class_names.endswith(".json"): - class_names = json.load(open(class_names)) - elif class_names == "coco": - class_names = COCO_CLASSES - else: - raise ValueError(f"Unknown class_names: {class_names}") - - if isinstance(class_names, dict): - self._class_names = class_names - elif isinstance(class_names, list): - self._class_names = { - str(index): class_name for index, class_name in enumerate(class_names) - } - else: - raise ValueError( - "class_names must be a str identifier, dict, json file, or " - f"list of class names got {type(class_names)}" - ) - - onnx_model = onnx.load(self.onnx_file_path) - self.has_postprocessing = self.model_has_postprocessing( - loaded_onnx_model=onnx_model, - ) - self.input_shape = self._infer_image_shape(onnx_model=onnx_model) - self.is_quantized = self.model_is_quantized(onnx_model=onnx_model) - self.postprocessor = ( - None - if self.has_postprocessing - else YoloPostprocessor( - image_size=self.input_shape, - cfg=model_config, - ) - ) - self._model_config = model_config - - @property - def model_config(self) -> str: - return self._model_config - - @property - def class_names(self) -> Optional[Dict[str, str]]: - return self._class_names - - @property - def input_schema(self) -> Type[YOLOInput]: - """ - :return: pydantic model class that inputs to this pipeline must comply to - """ - return YOLOInput - - @property - def output_schema(self) -> Type[YOLOOutput]: - """ - :return: pydantic model class that outputs of this pipeline must comply to - """ - return YOLOOutput - - def setup_onnx_file_path(self) -> str: - """ - Performs any setup to unwrap and process the given `model_path` and other - class properties into an inference ready 
onnx file to be compiled by the - engine of the pipeline - - :return: file path to the ONNX file for the engine to compile - """ - return model_to_path(self.model_path) - - def process_inputs(self, inputs: YOLOInput) -> List[numpy.ndarray]: - """ - :param inputs: inputs to the pipeline. Must be the type of the `input_schema` - of this pipeline - :return: inputs of this model processed into a list of numpy arrays that - can be directly passed into the forward pass of the pipeline engine - """ - image_batch = [] - - if isinstance(inputs.images, str): - inputs.images = [inputs.images] - - for image in inputs.images: - if isinstance(image, str): - image = cv2.imread(image) - image = cv2.resize(image, dsize=self.input_shape) - image = image[:, :, ::-1].transpose(2, 0, 1) - - image_batch.append(image) - - image_batch = numpy.stack(image_batch, axis=0) - image_batch = numpy.ascontiguousarray( - image_batch, - dtype=numpy.int8 if self.is_quantized else numpy.float32, - ) - image_batch /= 255 - - return [image_batch] - - def process_engine_outputs( - self, - engine_outputs: List[numpy.ndarray], - ) -> YOLOOutput: - """ - :param engine_outputs: list of numpy arrays that are the output of the engine - forward pass - :return: outputs of engine post-processed into an object in the `output_schema` - format of this pipeline - """ - - # post-processing - if self.postprocessor: - batch_output = self.postprocessor.pre_nms_postprocess(engine_outputs) - else: - batch_output = engine_outputs[ - 0 - ] # post-processed values stored in first output - - # NMS - batch_output = postprocess_nms(batch_output) - - batch_predictions, batch_boxes, batch_scores, batch_labels = [], [], [], [] - - for image_output in batch_output: - batch_predictions.append(image_output.tolist()) - batch_boxes.append(image_output[:, 0:4].tolist()) - batch_scores.append(image_output[:, 4].tolist()) - batch_labels.append( - [ - self.class_names[str(class_ids)] - for class_ids in image_output[:, 5].astype(int) - ] - ) - - return YOLOOutput( - predictions=batch_predictions, - boxes=batch_boxes, - scores=batch_scores, - labels=batch_labels, - ) - - def _infer_image_shape(self, onnx_model) -> Tuple[int, ...]: - """ - Infer and return the expected shape of the input tensor - - :return: The expected shape of the input tensor from onnx graph - """ - input_tensor = onnx_model.graph.input[0] - return ( - input_tensor.type.tensor_type.shape.dim[2].dim_value, - input_tensor.type.tensor_type.shape.dim[3].dim_value, - ) - - def model_has_postprocessing(self, loaded_onnx_model) -> bool: - """ - :return: True if loaded_onnx_model has postprocessing, False otherwise - """ - # get number of dimensions in each output - outputs_num_dims = [ - len(output.type.tensor_type.shape.dim) - for output in loaded_onnx_model.graph.output - ] - - # assume if only one output, then it is post-processed - if len(outputs_num_dims) == 1: - return True - - return all(num_dims > outputs_num_dims[0] for num_dims in outputs_num_dims[1:]) - - def model_is_quantized(self, onnx_model) -> bool: - """ - :return: True if loaded_onnx_model is quantized, False otherwise - """ - return ( - onnx_model.graph.input[0].type.tensor_type.elem_type - == onnx.TensorProto.UINT8 - ) diff --git a/src/deepsparse/yolo/schemas.py b/src/deepsparse/yolo/schemas.py deleted file mode 100644 index f60357dfb5..0000000000 --- a/src/deepsparse/yolo/schemas.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -Input/Output Schemas for Image Segmentation with YOLO -""" -from collections import namedtuple -from typing import List, Union - -import numpy -from pydantic import BaseModel - - -__all__ = [ - "YOLOOutput", - "YOLOInput", -] - -_YOLOImageOutput = namedtuple( - "_YOLOImageOutput", ["predictions", "boxes", "scores", "labels"] -) - - -class YOLOInput(BaseModel): - """ - Input model for image classification - """ - - images: Union[str, List[numpy.ndarray], List[str]] - - class Config: - arbitrary_types_allowed = True - - -class YOLOOutput(BaseModel): - """ - Output model for image classification - """ - - predictions: List[List[List[float]]] - boxes: List[List[List[float]]] - scores: List[List[float]] - labels: List[List[str]] - - def __getitem__(self, index): - if index >= len(self.predictions): - raise IndexError("Index out of range") - - return _YOLOImageOutput( - self.predictions[index], - self.boxes[index], - self.scores[index], - self.labels[index], - ) - - def __iter__(self): - for index in range(len(self.predictions)): - yield self[index] diff --git a/src/deepsparse/yolo/utils/__init__.py b/src/deepsparse/yolo/utils/__init__.py deleted file mode 100644 index 5344738df6..0000000000 --- a/src/deepsparse/yolo/utils/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa - -from .coco_classes import * -from .utils import * diff --git a/src/deepsparse/yolo/utils/cli_helpers.py b/src/deepsparse/yolo/utils/cli_helpers.py deleted file mode 100644 index ccd366236f..0000000000 --- a/src/deepsparse/yolo/utils/cli_helpers.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
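The removed `YOLOOutput` schema above stores batch-level lists and exposes per-image access through `__getitem__`/`__iter__`, yielding one `_YOLOImageOutput` namedtuple per image. A short sketch with a hand-built, single-image output; all values are made up.

```python
from deepsparse.yolo.schemas import YOLOOutput  # module removed in this change

# hand-built single-image output; in practice process_engine_outputs builds this
output = YOLOOutput(
    predictions=[[[0.0, 0.0, 10.0, 10.0, 0.9, 0.0]]],
    boxes=[[[0.0, 0.0, 10.0, 10.0]]],
    scores=[[0.9]],
    labels=[["person"]],
)

for image_result in output:  # one _YOLOImageOutput namedtuple per image
    print(image_result.labels, image_result.scores)  # ['person'] [0.9]
```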
- -import os -from typing import Optional, Union - - -def parse_device( - ctx, - params, - value: Optional[Union[str, int]], -) -> Optional[Union[str, int]]: - """ - :param ctx: The click context - :param params: The click params - :param value: The device value to parse - :return: The correct inferred device - """ - try: - return int(value) - except (ValueError, TypeError): - return value - - -def create_dir_callback(ctx, params, value: str): - """ - Create and return directory if it doesn't exist. - - :param ctx: The click context - :param params: The click params - :param value: The value to create the directory from - :returns: The directory path - """ - os.makedirs(value, exist_ok=True) - return value diff --git a/src/deepsparse/yolo/utils/coco_classes.py b/src/deepsparse/yolo/utils/coco_classes.py deleted file mode 100644 index 5e67829d8f..0000000000 --- a/src/deepsparse/yolo/utils/coco_classes.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -COCO_CLASSES = [ - "person", - "bicycle", - "car", - "motorcycle", - "airplane", - "bus", - "train", - "truck", - "boat", - "traffic light", - "fire hydrant", - "stop sign", - "parking meter", - "bench", - "bird", - "cat", - "dog", - "horse", - "sheep", - "cow", - "elephant", - "bear", - "zebra", - "giraffe", - "backpack", - "umbrella", - "handbag", - "tie", - "suitcase", - "frisbee", - "skis", - "snowboard", - "sports ball", - "kite", - "baseball bat", - "baseball glove", - "skateboard", - "surfboard", - "tennis racket", - "bottle", - "wine glass", - "cup", - "fork", - "knife", - "spoon", - "bowl", - "banana", - "apple", - "sandwich", - "orange", - "broccoli", - "carrot", - "hot dog", - "pizza", - "donut", - "cake", - "chair", - "couch", - "potted plant", - "bed", - "dining table", - "toilet", - "tv", - "laptop", - "mouse", - "remote", - "keyboard", - "cell phone", - "microwave", - "oven", - "toaster", - "sink", - "refrigerator", - "book", - "clock", - "vase", - "scissors", - "teddy bear", - "hair drier", - "toothbrush", -] diff --git a/src/deepsparse/yolo/utils/utils.py b/src/deepsparse/yolo/utils/utils.py deleted file mode 100644 index 0e14aad9fe..0000000000 --- a/src/deepsparse/yolo/utils/utils.py +++ /dev/null @@ -1,795 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Helpers and Utilities for YOLO -""" -import functools -import glob -import itertools -import logging -import os -import random -import shutil -import time -from pathlib import Path -from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union - -import numpy -import onnx -import yaml - -import torch -import torchvision -from sparsezoo.utils import create_dirs - - -try: - import cv2 - - cv2_error = None -except ModuleNotFoundError as cv2_import_error: - cv2 = None - cv2_error = cv2_import_error - -_YOLO_CLASS_COLORS = list(itertools.product([0, 255, 128, 64, 192], repeat=3)) -_YOLO_CLASS_COLORS.remove((255, 255, 255)) # remove white from possible colors -_LOGGER = logging.getLogger(__name__) - -# Default YOLO anchor grids -_YOLO_DEFAULT_ANCHORS = [ - torch.Tensor([[10, 13], [16, 30], [33, 23]]), - torch.Tensor([[30, 61], [62, 45], [59, 119]]), - torch.Tensor([[116, 90], [156, 198], [373, 326]]), -] -_YOLO_DEFAULT_ANCHOR_GRIDS = [ - t.clone().view(1, -1, 1, 1, 2) for t in _YOLO_DEFAULT_ANCHORS -] - - -@functools.lru_cache(maxsize=None) -def _get_color(label): - # cache color lookups - return random.choice(_YOLO_CLASS_COLORS) - - -class YoloPostprocessor: - """ - Class for performing post-processing of YOLO model predictions - - :param image_size: size of input image to model. used to calculate stride based on - output shapes - """ - - def __init__( - self, image_size: Tuple[int, int] = (640, 640), cfg: Optional[str] = None - ): - self._image_size = image_size - self._anchor_grids = ( - self._load_cfg_anchor_grid(cfg) if cfg else _YOLO_DEFAULT_ANCHOR_GRIDS - ) - self._grids = {} # Dict[Tuple[int], torch.Tensor] - - def pre_nms_postprocess(self, outputs: List[numpy.ndarray]) -> torch.Tensor: - """ - :param outputs: raw outputs of a YOLO model before anchor grid processing - :return: post-processed model outputs without NMS. 
- """ - # postprocess and transform raw outputs into single torch tensor - processed_outputs = [] - for idx, pred in enumerate(outputs): - pred = torch.from_numpy(pred) - pred = pred.sigmoid() - - # get grid and stride - grid_shape = pred.shape[2:4] - grid = self._get_grid(grid_shape) - stride = self._image_size[0] / grid_shape[0] - - # decode xywh box values - pred[..., 0:2] = (pred[..., 0:2] * 2.0 - 0.5 + grid) * stride - pred[..., 2:4] = (pred[..., 2:4] * 2) ** 2 * self._anchor_grids[idx] - # flatten anchor and grid dimensions -> - # (bs, num_predictions, num_classes + 5) - processed_outputs.append(pred.view(pred.size(0), -1, pred.size(-1))) - return torch.cat(processed_outputs, 1) - - def _get_grid(self, grid_shape: Tuple[int, int]) -> torch.Tensor: - if grid_shape not in self._grids: - # adapted from yolov5.yolo.Detect._make_grid - coords_y, coords_x = torch.meshgrid( - [torch.arange(grid_shape[0]), torch.arange(grid_shape[1])] - ) - grid = torch.stack((coords_x, coords_y), 2) - self._grids[grid_shape] = grid.view( - 1, 1, grid_shape[0], grid_shape[1], 2 - ).float() - return self._grids[grid_shape] - - @staticmethod - def _load_cfg_anchor_grid(cfg: str) -> List[torch.Tensor]: - with open(cfg) as f: - anchors = yaml.safe_load(f)["anchors"] - - def _split_to_coords(coords_list): - return [ - [coords_list[idx], coords_list[idx + 1]] - for idx in range(0, len(coords_list), 2) - ] - - anchors = [torch.Tensor(_split_to_coords(coords)) for coords in anchors] - return [t.clone().view(1, -1, 1, 1, 2) for t in anchors] - - -def postprocess_nms(outputs: Union[torch.Tensor, numpy.ndarray]) -> List[numpy.ndarray]: - """ - :param outputs: Tensor of post-processed model outputs - :return: List of numpy arrays of NMS predictions for each image in the batch - """ - # run nms in PyTorch, only post-process first output - if isinstance(outputs, numpy.ndarray): - outputs = torch.from_numpy(outputs) - nms_outputs = _non_max_suppression(outputs) - return [output.cpu().numpy() for output in nms_outputs] - - -def _non_max_suppression( - prediction, - conf_thres=0.25, - iou_thres=0.45, - classes=None, - agnostic=False, - multi_label=False, - labels=(), -): - # Ported from ultralytics/yolov5 - - nc = prediction.shape[2] - 5 # number of classes - xc = prediction[..., 4] > conf_thres # candidates - - # Checks - assert 0 <= conf_thres <= 1, ( - f"Invalid Confidence threshold {conf_thres}, " - "valid values are between 0.0 and 1.0" - ) - assert ( - 0 <= iou_thres <= 1 - ), f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0" - - # Settings - _, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height - max_det = 300 # maximum number of detections per image - max_nms = 30000 # maximum number of boxes into torchvision.ops.nms() - time_limit = 10.0 # seconds to quit after - redundant = True # require redundant detections - multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img) - merge = False # use merge-NMS - - t = time.time() - output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0] - for xi, x in enumerate(prediction): # image index, image inference - # Apply constraints - # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 - x = x[xc[xi]] # confidence - - # Cat apriori labels if autolabelling - if labels and len(labels[xi]): - label_ = labels[xi] - v = torch.zeros((len(label_), nc + 5), device=x.device) - v[:, :4] = label_[:, 1:5] # box - v[:, 4] = 1.0 # conf - v[range(len(label_)), label_[:, 0].long() + 5] = 1.0 # cls - x = torch.cat((x, 
v), 0) - - # If none remain process next image - if not x.shape[0]: - continue - - # Compute conf - x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf - - # Box (center x, center y, width, height) to (x1, y1, x2, y2) - box = _xywh2xyxy(x[:, :4]) - - # Detections matrix nx6 (xyxy, conf, cls) - if multi_label: - i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T - x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1) - else: # best class only - conf, j = x[:, 5:].max(1, keepdim=True) - x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres] - - # Filter by class - if classes is not None: - x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] - - # Apply finite constraint - # if not torch.isfinite(x).all(): - # x = x[torch.isfinite(x).all(1)] - - # Check shape - n = x.shape[0] # number of boxes - if not n: # no boxes - continue - elif n > max_nms: # excess boxes - x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence - - # Batched NMS - c = x[:, 5:6] * (0 if agnostic else max_wh) # classes - boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores - i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS - if i.shape[0] > max_det: # limit detections - i = i[:max_det] - if merge and (1 < n < 3e3): # Merge NMS (boxes merged using weighted mean) - # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4) - iou = _box_iou(boxes[i], boxes) > iou_thres # iou matrix - weights = iou * scores[None] # box weights - x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum( - 1, keepdim=True - ) # merged boxes - if redundant: - i = i[iou.sum(1) > 1] # require redundancy - - output[xi] = x[i] - if (time.time() - t) > time_limit: - print(f"WARNING: NMS time limit {time_limit}s exceeded") - break # time limit exceeded - - return output - - -def _xywh2xyxy( - x: Union[torch.Tensor, numpy.ndarray] -) -> Union[torch.Tensor, numpy.ndarray]: - # ported from ultralytics/yolov5 - # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] - # where xy1=top-left, xy2=bottom-right - y = x.clone() if isinstance(x, torch.Tensor) else numpy.copy(x) - y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x - y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y - y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x - y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y - return y - - -def _box_iou(box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor: - # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py - """ - Return intersection-over-union (Jaccard index) of boxes. - Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 
-    Arguments:
-        box1 (Tensor[N, 4])
-        box2 (Tensor[M, 4])
-    Returns:
-        iou (Tensor[N, M]): the NxM matrix containing the pairwise
-            IoU values for every element in boxes1 and boxes2
-    """
-
-    def box_area(box):
-        # box = 4xn
-        return (box[2] - box[0]) * (box[3] - box[1])
-
-    area1 = box_area(box1.T)
-    area2 = box_area(box2.T)
-
-    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
-    inter = (
-        (
-            torch.min(box1[:, None, 2:], box2[:, 2:])
-            - torch.max(box1[:, None, :2], box2[:, :2])
-        )
-        .clamp(0)
-        .prod(2)
-    )
-    return inter / (
-        area1[:, None] + area2 - inter
-    )  # iou = inter / (area1 + area2 - inter)
-
-
-def yolo_onnx_has_postprocessing(model_path: str) -> bool:
-    """
-    :param model_path: file path to YOLO ONNX model
-    :return: True if YOLO postprocessing (pre-nms) is included in the ONNX graph,
-        this is assumed to be when the first output of the model has fewer dimensions
-        than the other outputs as the grid dimensions have been flattened
-    """
-    model = onnx.load(model_path)
-
-    # get number of dimensions in each output
-    outputs_num_dims = [
-        len(output.type.tensor_type.shape.dim) for output in model.graph.output
-    ]
-
-    # assume if only one output, then it is post-processed
-    if len(outputs_num_dims) == 1:
-        return True
-
-    return all(num_dims > outputs_num_dims[0] for num_dims in outputs_num_dims[1:])
-
-
-def annotate(
-    pipeline: "YOLOPipeline",  # noqa: F821
-    image_batch: Union[List[numpy.ndarray], List[str]],
-    target_fps: float = None,
-    calc_fps: bool = False,
-    original_images: Optional[Union[List[numpy.ndarray], numpy.ndarray]] = None,
-) -> List[numpy.ndarray]:
-    """
-    Annotate and return image_batch with bounding boxes and labels
-
-    :param pipeline: A YOLOPipeline object
-    :param image_batch: A list of image files, or a batch of numpy images
-    :param target_fps: If not None, this FPS value is used to annotate the images
-    :param calc_fps: If True and target_fps is None, then the pipeline will
-        calculate the FPS
-    :param original_images: images from image_batch before any processing
-    :return: A list of annotated images
-
-    """
-
-    if not isinstance(image_batch, list):
-        image_batch = [image_batch]
-
-    if not original_images:
-        original_images = image_batch
-
-    batch_size = len(image_batch)
-    if image_batch and isinstance(image_batch[0], str):
-        original_images = [cv2.imread(image) for image in image_batch]
-
-    if target_fps is None and calc_fps:
-        start = time.time()
-
-    pipeline_outputs = pipeline(images=image_batch)
-
-    if target_fps is None and calc_fps:
-        target_fps = float(batch_size) / (time.time() - start)
-
-    annotated_images = []
-    for index, image_output in enumerate(pipeline_outputs):
-        image = original_images[index]
-        result = _annotate_image(
-            img=image,
-            boxes=image_output.boxes,
-            labels=image_output.labels,
-            scores=image_output.scores,
-            model_input_size=pipeline.input_shape,
-            images_per_sec=target_fps,
-        )
-        annotated_images.append(result)
-
-    return annotated_images
-
-
-def _annotate_image(
-    img: numpy.ndarray,
-    boxes: List[List[float]],
-    scores: List[float],
-    labels: List[str],
-    score_threshold: float = 0.35,
-    model_input_size: Tuple[int, int] = None,
-    images_per_sec: Optional[float] = None,
-) -> numpy.ndarray:
-    """
-    Draws bounding boxes on predictions of a detection model
-
-    :param img: Original image to annotate (no pre-processing needed)
-    :param boxes: List of bounding boxes (x1, y1, x2, y2)
-    :param scores: List of scores for each bounding box
-    :param labels: List of labels for each bounding box
-    :param score_threshold: minimum score a detection should have to be annotated
-        on the image. Default is 0.35
-    :param model_input_size: 2-tuple of expected input size for the given model to
-        be used for bounding box scaling with original image. Scaling will not
-        be applied if model_input_size is None. Default is None
-    :param images_per_sec: optional images per second to annotate the left corner
-        of the image with
-    :return: the original image annotated with the given bounding boxes
-    """
-    img_res = numpy.copy(img)
-
-    scale_y = img.shape[0] / (1.0 * model_input_size[0]) if model_input_size else 1.0
-    scale_x = img.shape[1] / (1.0 * model_input_size[1]) if model_input_size else 1.0
-
-    for idx in range(len(boxes)):
-        label = labels[idx]
-        if scores[idx] > score_threshold:
-            annotation_text = f"{label}: {scores[idx]:.0%}"
-
-            # bounding box points
-            left = boxes[idx][0] * scale_x
-            top = boxes[idx][1] * scale_y
-            right = boxes[idx][2] * scale_x
-            bottom = boxes[idx][3] * scale_y
-
-            # calculate text size
-            (text_width, text_height), text_baseline = cv2.getTextSize(
-                annotation_text,
-                cv2.FONT_HERSHEY_SIMPLEX,
-                0.9,  # font scale
-                2,  # thickness
-            )
-            text_height += text_baseline
-
-            # make solid background for annotation text
-            cv2.rectangle(
-                img_res,
-                (int(left), int(top) - 33),
-                (int(left) + text_width, int(top) - 28 + text_height),
-                _get_color(label),
-                thickness=-1,  # filled solid
-            )
-
-            # add white annotation text
-            cv2.putText(
-                img_res,
-                annotation_text,
-                (int(left), int(top) - 10),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                0.9,  # font scale
-                (255, 255, 255),  # white text
-                2,  # thickness
-                cv2.LINE_AA,
-            )
-
-            # draw bounding box
-            cv2.rectangle(
-                img_res,
-                (int(left), int(top)),
-                (int(right), int(bottom)),
-                _get_color(label),
-                thickness=2,
-            )
-
-    if images_per_sec is not None:
-        cv2.putText(
-            img_res,
-            f"images_per_sec: {int(images_per_sec)}",
-            (50, 50),
-            cv2.FONT_HERSHEY_SIMPLEX,
-            2.0,  # font scale
-            (245, 46, 6),  # color
-            2,  # thickness
-            cv2.LINE_AA,
-        )
-    return img_res
-
-
-def get_yolo_loader_and_saver(
-    path: str,
-    save_dir: str,
-    image_shape: Tuple[int, int] = (640, 640),
-    target_fps: Optional[float] = None,
-    no_save: bool = False,
-) -> Union[Iterable, Any, bool]:
-    """
-
-    :param path: file path to image or directory of .jpg files, a .mp4 video,
-        or an integer (i.e. 0) for web-cam
-    :param save_dir: path of directory to save to
-    :param image_shape: size of input images to the model
-    :param target_fps: fps to save potential video at
-    :param no_save: set true if not saving results of processing
-    :return: image loader iterable and result saver object for
-        images, video, or web-cam based on the path given, and a boolean value
-        that is True if the returned objects load videos
-    """
-    # video
-    if path.endswith(".mp4"):
-        loader = YoloVideoLoader(path, image_shape)
-        saver = VideoSaver(
-            save_dir,
-            loader.original_fps,
-            loader.original_frame_size,
-            target_fps,
-        )
-        return loader, saver, True
-    # webcam
-    if path.isnumeric():
-        loader = YoloWebcamLoader(int(path), image_shape)
-        saver = (
-            VideoSaver(save_dir, 30, loader.original_frame_size, None)
-            if not no_save
-            else None
-        )
-        return loader, saver, True
-    # image file(s)
-    return YoloImageLoader(path, image_shape), ImagesSaver(save_dir), False
-
-
-class YoloImageLoader:
-    """
-    Class for pre-processing and iterating over images to be used as input for YOLO
-    models
-
-    :param path: Filepath to single image file or directory of image files to load,
-        glob paths also valid
-    :param image_size: size of input images to the model
-    """
-
-    def __init__(self, path: str, image_size: Tuple[int, int] = (640, 640)):
-        self._path = path
-        self._image_size = image_size
-
-        if os.path.isdir(path):
-            self._image_file_paths = [
-                os.path.join(path, file_name) for file_name in os.listdir(path)
-            ]
-        elif "*" in path:
-            self._image_file_paths = glob.glob(path)
-        elif os.path.isfile(path):
-            # single file
-            self._image_file_paths = [path]
-        else:
-            raise ValueError(f"{path} is not a file, glob, or directory")
-
-    def __iter__(self) -> Iterator[Tuple[numpy.ndarray, numpy.ndarray]]:
-        for image_path in self._image_file_paths:
-            yield load_image(image_path, image_size=self._image_size)
-
-
-class YoloVideoLoader:
-    """
-    Class for pre-processing and iterating over video frames to be used as input for
-    YOLO models
-
-    :param path: Filepath to single video file
-    :param image_size: size of input images to the model
-    """
-
-    def __init__(self, path: str, image_size: Tuple[int, int] = (640, 640)):
-        self._path = path
-        self._image_size = image_size
-        self._vid = cv2.VideoCapture(self._path)
-        self._total_frames = int(self._vid.get(cv2.CAP_PROP_FRAME_COUNT))
-        self._fps = self._vid.get(cv2.CAP_PROP_FPS)
-
-    def __iter__(self) -> Iterator[Tuple[numpy.ndarray, numpy.ndarray]]:
-        for _ in range(self._total_frames):
-            loaded, frame = self._vid.read()
-            if not loaded:
-                break
-            yield load_image(frame, image_size=self._image_size)
-        self._vid.release()
-
-    @property
-    def original_fps(self) -> float:
-        """
-        :return: the frames per second of the video this object reads
-        """
-        return self._fps
-
-    @property
-    def original_frame_size(self) -> Tuple[int, int]:
-        """
-        :return: the original size of frames in the video this object reads
-        """
-        return (
-            int(self._vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
-            int(self._vid.get(cv2.CAP_PROP_FRAME_HEIGHT)),
-        )
-
-    @property
-    def total_frames(self) -> int:
-        """
-        :return: the total number of frames this object may load from the video
-        """
-        return self._total_frames
-
-
-class YoloWebcamLoader:
-    """
-    Class for pre-processing and iterating over webcam frames to be used as input for
-    YOLO models.
- - Adapted from: https://github.com/ultralytics/yolov5/blob/master/utils/datasets.py - - :param camera: Webcam index - :param image_size: size of input image_batch to model - """ - - def __init__(self, camera: int, image_size: Tuple[int, int] = (640, 640)): - - self._camera = camera - self._image_size = image_size - self._stream = cv2.VideoCapture(self._camera) - self._stream.set(cv2.CAP_PROP_BUFFERSIZE, 3) - - def __iter__(self) -> Iterator[Tuple[numpy.ndarray, numpy.ndarray]]: - while True: - if cv2.waitKey(1) == ord("q"): # q to quit - self._stream.release() - cv2.destroyAllWindows() - break - loaded, frame = self._stream.read() - - assert loaded, f"Could not load image from webcam {self._camera}" - - frame = cv2.flip(frame, 1) # flip left-right - yield load_image(frame, image_size=self._image_size) - - @property - def original_frame_size(self) -> Tuple[int, int]: - """ - :return: the original size of frames in the stream this object reads - """ - return ( - int(self._stream.get(cv2.CAP_PROP_FRAME_WIDTH)), - int(self._stream.get(cv2.CAP_PROP_FRAME_HEIGHT)), - ) - - -class ImagesSaver: - """ - Base class for saving YOLO model outputs. Saves each image as an individual file in - the given directory - - :param save_dir: path to directory to write to - """ - - def __init__(self, save_dir: str): - self._save_dir = save_dir - self._idx = 0 - - create_dirs(save_dir) - - def save_frame(self, image: numpy.ndarray): - """ - :param image: numpy array of image to save - """ - output_path = os.path.join(self._save_dir, f"result-{self._idx}.jpg") - cv2.imwrite(output_path, image) - self._idx += 1 - - def close(self): - """ - perform any clean-up tasks - """ - pass - - -class VideoSaver(ImagesSaver): - """ - Class for saving YOLO model outputs as a VideoFile - - :param save_dir: path to directory to write to - :param original_fps: frames per second to save video with - :param output_frame_size: size of frames to write - :param target_fps: fps target for output video. if present, video - will be written with a certain number of the original frames - evenly dropped to match the target FPS. - """ - - def __init__( - self, - save_dir: str, - original_fps: float, - output_frame_size: Tuple[int, int], - target_fps: Optional[float] = None, - ): - super().__init__(save_dir) - - self._output_frame_size = output_frame_size - self._original_fps = original_fps - - if target_fps is not None and target_fps >= original_fps: - print( - f"target_fps {target_fps} is greater than source_fps " - f"{original_fps}. 
target fps video will not be written"
-            )
-        self._target_fps = target_fps
-
-        self._file_path = os.path.join(self._save_dir, "results.mp4")
-        self._writer = cv2.VideoWriter(
-            self._file_path,
-            cv2.VideoWriter_fourcc(*"mp4v"),
-            original_fps,
-            self._output_frame_size,
-        )
-        self._n_frames = 0
-
-    def save_frame(self, image: numpy.ndarray):
-        """
-        :param image: numpy array of image to save
-        """
-        self._writer.write(image)
-        self._n_frames += 1
-
-    def close(self):
-        """
-        perform any clean-up tasks
-        """
-        self._writer.release()
-        if self._target_fps is not None and self._target_fps < self._original_fps:
-            self._write_target_fps_video()
-
-    def _write_target_fps_video(self):
-        assert self._target_fps is not None
-        num_frames_to_keep = int(
-            self._n_frames * (self._target_fps / self._original_fps)
-        )
-        # adjust target fps so we can keep the same video duration
-        adjusted_target_fps = num_frames_to_keep * (self._original_fps / self._n_frames)
-
-        # select num_frames_to_keep evenly spaced frame idxs
-        frame_idxs_to_keep = set(
-            numpy.round(numpy.linspace(0, self._n_frames, num_frames_to_keep))
-            .astype(int)
-            .tolist()
-        )
-
-        # create new video writer for adjusted video
-        vid_path = os.path.join(
-            self._save_dir, f"_results-{adjusted_target_fps:.2f}fps.mp4"
-        )
-        fps_writer = cv2.VideoWriter(
-            vid_path,
-            cv2.VideoWriter_fourcc(*"mp4v"),
-            adjusted_target_fps,
-            self._output_frame_size,
-        )
-
-        # read from original video and write to FPS adjusted video
-        saved_vid = cv2.VideoCapture(self._file_path)
-        for idx in range(self._n_frames):
-            _, frame = saved_vid.read()
-            if idx in frame_idxs_to_keep:
-                fps_writer.write(frame)
-
-        saved_vid.release()
-        fps_writer.release()
-        shutil.move(vid_path, self._file_path)  # overwrite original file
-
-
-def load_image(
-    img: Union[str, numpy.ndarray], image_size: Tuple[int, int] = (640, 640)
-) -> Tuple[numpy.ndarray, numpy.ndarray]:
-    """
-    :param img: file path to image or raw image array
-    :param image_size: target shape for image
-    :return: Image loaded into numpy and reshaped to the given shape and the original
-        image
-    """
-    img = cv2.imread(img) if isinstance(img, str) else img
-    img_resized = cv2.resize(img, image_size)
-    img_transposed = img_resized[:, :, ::-1].transpose(2, 0, 1)
-
-    return img_transposed, img
-
-
-def get_annotations_save_dir(
-    initial_save_dir: str,
-    tag: Optional[str] = None,
-    engine: Optional[str] = None,
-) -> str:
-    """
-    Returns the directory to save annotations to. If directory exists and is
-    non-empty, a number is appended to the end of the directory name.
-
-    :param initial_save_dir: Initial directory to save annotations to
-    :param tag: A tag under which to save the annotations inside `initial_save_dir`
-    :param engine: Used to generate a unique tag if it is not provided.
-    :return: A new unique dir path to save annotations to
-    """
-    name = tag or f"{engine}-annotations"
-    initial_save_dir = os.path.join(initial_save_dir, name)
-    counter = 0
-    new_save_dir = initial_save_dir
-    while Path(new_save_dir).exists() and any(Path(new_save_dir).iterdir()):
-        counter += 1
-        new_save_dir = os.path.join(initial_save_dir, f"{name}-{counter:03d}")
-
-    _LOGGER.info(f"Results will be saved to {new_save_dir}")
-    Path(new_save_dir).mkdir(parents=True, exist_ok=True)
-    return new_save_dir
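For readers who relied on the removed annotation utilities, the sketch below shows how they were typically composed into a load → infer → annotate → save loop before this change. It is illustrative only: the `deepsparse.yolo.utils` module path, the `Pipeline.create(task="yolo", ...)` construction, and the model path are assumptions and are not part of this diff.

```python
# Minimal sketch of the pre-removal annotation flow; names below are assumed
# from the deleted module and the old Pipeline API, not from this diff.
from deepsparse import Pipeline  # pre-removal API, assumed
from deepsparse.yolo.utils import (  # module removed by this change
    annotate,
    get_annotations_save_dir,
    get_yolo_loader_and_saver,
)

# unique output directory, e.g. annotation-results/deepsparse-annotations
save_dir = get_annotations_save_dir("annotation-results", engine="deepsparse")

# loader yields (model_input, original_image); saver writes annotated frames
loader, saver, is_video = get_yolo_loader_and_saver(
    "sample_images/", save_dir, image_shape=(640, 640)
)

# hypothetical model path; the "yolo" task existed only before this removal
yolo_pipeline = Pipeline.create(task="yolo", model_path="path/to/yolo.onnx")

for model_input, original_image in loader:
    # annotate() runs the pipeline and draws boxes/labels on the original frame
    annotated = annotate(
        yolo_pipeline,
        image_batch=[model_input],
        original_images=[original_image],
        calc_fps=True,
    )
    saver.save_frame(annotated[0])

saver.close()
```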