diff --git a/README.md b/README.md index 91e4d21b22..e0ea89bcc3 100644 --- a/README.md +++ b/README.md @@ -139,12 +139,12 @@ deepsparse.benchmark [-h] [-b BATCH_SIZE] [-shapes INPUT_SHAPES] ## 👩‍💻 NLP Inference Example ```python -from deepsparse.transformers import pipeline +from deepsparse import Pipeline # SparseZoo model stub or path to ONNX file model_path = "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni" -qa_pipeline = pipeline( +qa_pipeline = Pipeline.create( task="question-answering", model_path=model_path, ) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 1e19a300f4..aecf36f10d 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -131,7 +131,7 @@ def __init__( self._engine_args["scheduler"] = scheduler self._onnx_file_path = self.setup_onnx_file_path() - self._engine = self._initialize_engine() + self.engine = self._initialize_engine() pass def __call__(self, *args, **kwargs) -> BaseModel: @@ -350,13 +350,6 @@ def model_path(self) -> str: """ return self._model_path - @property - def engine(self) -> Union[Engine, ORTEngine]: - """ - :return: engine instance used for model forward pass in pipeline - """ - return self._engine - @property def engine_args(self) -> Dict[str, Any]: """ diff --git a/src/deepsparse/transformers/__init__.py b/src/deepsparse/transformers/__init__.py index 89c7eb68ef..1264aa316d 100644 --- a/src/deepsparse/transformers/__init__.py +++ b/src/deepsparse/transformers/__init__.py @@ -120,4 +120,3 @@ def _check_transformers_install(): from .helpers import * from .loaders import * from .pipelines import * -from .server import * diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 85486b312f..d1c12577cf 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -58,7 +58,7 @@ from tqdm.auto import tqdm -from deepsparse.transformers import pipeline +from deepsparse import Pipeline from datasets import load_dataset, load_metric # isort: skip @@ -79,14 +79,14 @@ def squad_eval(args): squad_metrics = load_metric("squad") # load QA pipeline - question_answer = pipeline( + question_answer = Pipeline.create( task="question-answering", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - max_length=args.max_sequence_length, + sequence_length=args.max_sequence_length, ) - print(f"Engine info: {question_answer.model}") + print(f"Engine info: {question_answer.engine}") for idx, sample in enumerate(tqdm(squad)): pred = question_answer( @@ -96,7 +96,7 @@ def squad_eval(args): ) squad_metrics.add_batch( - predictions=[{"prediction_text": pred["answer"], "id": sample["id"]}], + predictions=[{"prediction_text": pred.answer, "id": sample["id"]}], references=[{"answers": sample["answers"], "id": sample["id"]}], ) @@ -114,19 +114,19 @@ def mnli_eval(args): mnli_metrics = load_metric("glue", "mnli") # load pipeline - text_classify = pipeline( + text_classify = Pipeline.create( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - max_length=args.max_sequence_length, + sequence_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.model}") + print(f"Engine info: {text_classify.engine}") for idx, sample in enumerate(tqdm(mnli_matched)): pred = text_classify(sample["premise"], sample["hypothesis"]) mnli_metrics.add_batch( - 
predictions=[int(pred[0]["label"].split("_")[-1])], + predictions=[int(pred.labels[0].split("_")[-1])], references=[sample["label"]], ) @@ -152,20 +152,20 @@ def qqp_eval(args): qqp_metrics = load_metric("glue", "qqp") # load pipeline - text_classify = pipeline( + text_classify = Pipeline.create( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - max_length=args.max_sequence_length, + sequence_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.model}") + print(f"Engine info: {text_classify.engine}") for idx, sample in enumerate(tqdm(qqp)): pred = text_classify([[sample["question1"], sample["question2"]]]) qqp_metrics.add_batch( - predictions=[int(pred[0]["label"].split("_")[-1])], + predictions=[int(pred.labels[0].split("_")[-1])], references=[sample["label"]], ) @@ -181,14 +181,14 @@ def sst2_eval(args): sst2_metrics = load_metric("glue", "sst2") # load pipeline - text_classify = pipeline( + text_classify = Pipeline.create( task="text-classification", model_path=args.onnx_filepath, engine_type=args.engine, num_cores=args.num_cores, - max_length=args.max_sequence_length, + sequence_length=args.max_sequence_length, ) - print(f"Engine info: {text_classify.model}") + print(f"Engine info: {text_classify.engine}") for idx, sample in enumerate(tqdm(sst2)): pred = text_classify( @@ -196,7 +196,7 @@ def sst2_eval(args): ) sst2_metrics.add_batch( - predictions=[int(pred[0]["label"].split("_")[-1])], + predictions=[int(pred.labels[0].split("_")[-1])], references=[sample["label"]], ) diff --git a/src/deepsparse/transformers/pipelines.py b/src/deepsparse/transformers/pipelines.py deleted file mode 100644 index 7725a0e2c2..0000000000 --- a/src/deepsparse/transformers/pipelines.py +++ /dev/null @@ -1,1414 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Adaptation of transformers.pipelines and onnx_transformers.pipelines - -adapted from: -https://github.com/huggingface/transformers/blob/master/src/transformers/pipelines/base.py -https://github.com/patil-suraj/onnx_transformers/blob/master/onnx_transformers/pipelines.py - -""" - -import json -from abc import ABC, abstractmethod -from dataclasses import dataclass -from itertools import chain -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union - -import numpy as np -from transformers.configuration_utils import PretrainedConfig -from transformers.data import ( - SquadExample, - SquadFeatures, - squad_convert_examples_to_features, -) -from transformers.file_utils import ExplicitEnum -from transformers.models.auto import AutoConfig, AutoTokenizer -from transformers.tokenization_utils import PreTrainedTokenizer -from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy -from transformers.utils import logging - -from deepsparse import Engine, compile_model, cpu -from deepsparse.transformers.helpers import ( - fix_numpy_types, - get_onnx_path_and_configs, - overwrite_transformer_onnx_model_inputs, -) -from deepsparse.transformers.loaders import get_batch_loader - - -try: - import onnxruntime - - ort_import_error = None -except Exception as ort_import_err: - onnxruntime = None - ort_import_error = ort_import_err - -__all__ = [ - "ArgumentHandler", - "Pipeline", - "TextClassificationPipeline", - "TokenClassificationPipeline", - "QuestionAnsweringPipeline", - "pipeline", - "overwrite_transformer_onnx_model_inputs", - "SUPPORTED_ENGINES", - "SUPPORTED_TASKS", -] - -logger = logging.get_logger(__name__) if logging else None - - -class ArgumentHandler(ABC): - """ - Base interface for handling arguments for each Pipeline. - """ - - @abstractmethod - def __call__(self, *args, **kwargs): - raise NotImplementedError() - - -class DefaultArgumentHandler(ArgumentHandler): - """ - Default argument parser handling parameters for each Pipeline`. - """ - - @staticmethod - def handle_kwargs(kwargs: Dict) -> List: - """ - :param kwargs: key word arguments for a pipeline - :return: list of the processed key word arguments - """ - if len(kwargs) == 1: - output = list(kwargs.values()) - else: - output = list(chain(kwargs.values())) - - return DefaultArgumentHandler.handle_args(output) - - @staticmethod - def handle_args(args: Sequence[Any]) -> List[str]: - """ - :param args: sequence of arguments to a pipeline - :return: list of formatted, processed arguments - """ - - # Only one argument, let's do case by case - if len(args) == 1: - if isinstance(args[0], str): - return [args[0]] - elif not isinstance(args[0], list): - return list(args) - else: - return args[0] - - # Multiple arguments (x1, x2, ...) - elif len(args) > 1: - if all([isinstance(arg, str) for arg in args]): - return list(args) - - # If not instance of list, then it should be an instance of iterable - elif isinstance(args, Iterable): - return list(chain.from_iterable(chain(args))) - else: - raise ValueError( - f"Invalid input type {type(args)}. 
Pipeline supports " - "Union[str, Iterable[str]]" - ) - else: - return [] - - def __call__(self, *args, **kwargs): - if len(kwargs) > 0 and len(args) > 0: - raise ValueError("Pipeline cannot handle mixed args and kwargs") - - if len(kwargs) > 0: - return DefaultArgumentHandler.handle_kwargs(kwargs) - else: - return DefaultArgumentHandler.handle_args(args) - - -class _ScikitCompat(ABC): - """ - Interface layer for the Scikit and Keras compatibility. - """ - - @abstractmethod - def transform(self, X): - raise NotImplementedError() - - @abstractmethod - def predict(self, X): - raise NotImplementedError() - - -class Pipeline(_ScikitCompat): - """ - The Pipeline class is the class from which all pipelines inherit. - Refer to this class for methods shared across different pipelines. - This base Pipeline class provides support for multiple inference engine backends. - - Base class implementing pipelined operations. - Pipeline workflow is defined as a sequence of the following operations: - - Input -> Tokenization -> Model Inference -> - Post-Processing (task dependent) -> Output - - Pipeline supports running with the DeepSparse engine or onnxruntime. - - :param model: loaded inference engine to run the model with, can be a - deepsparse Engine or onnxruntime InferenceSession - :param tokenizer: tokenizer to be used for preprocessing - :param config: transformers model config for this model - :param engine_type: name of inference engine that is used. Options are - deepsparse and onnxruntime - :param max_length: maximum sequence length to set for model inputs by default. - default value is 128 - :param input_names: list of input names to the neural network - :param args_parser: Reference to the object in charge of parsing supplied - pipeline parameters. A default is provided if None - :param binary_output: if True, stores outputs as pickled binaries to avoid - storing large amount of textual data. Default is False - """ - - default_input_names = None - - def __init__( - self, - model: Union[Engine, "onnxruntime.InferenceSession"], - tokenizer: PreTrainedTokenizer, - config: PretrainedConfig, - engine_type: str, - max_length: int = 128, - input_names: Optional[List[str]] = None, - args_parser: ArgumentHandler = None, - binary_output: bool = False, - ): - - self.model = model - self.tokenizer = tokenizer - self.config = config - self.engine_type = engine_type - self.max_length = max_length - self.input_names = input_names - self.binary_output = binary_output - self._args_parser = args_parser or DefaultArgumentHandler() - self._framework = ( - "np" if self.engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE] else "pt" - ) - - def transform(self, X): - """ - Scikit / Keras interface to transformers' pipelines. - This method will forward to __call__(). - """ - return self(X=X) - - def predict(self, X): - """ - Scikit / Keras interface to transformers' pipelines. - This method will forward to __call__(). 
- """ - return self(X=X) - - def _parse_and_tokenize( - self, *args, padding=True, add_special_tokens=True, **kwargs - ): - # Parse arguments - inputs = self._args_parser(*args, **kwargs) - inputs = self.tokenizer( - inputs, - add_special_tokens=add_special_tokens, - return_tensors=self._framework, - padding=PaddingStrategy.MAX_LENGTH.value, - truncation=TruncationStrategy.LONGEST_FIRST.value, - ) - - return inputs - - def __call__(self, *args, **kwargs): - inputs = self._parse_and_tokenize(*args, **kwargs) - return self._forward(inputs) - - def _forward(self, inputs): - if not all(name in inputs for name in self.input_names): - raise ValueError( - f"pipeline expected arrays with names {self.input_names}, received " - f"inputs: {list(inputs.keys())}" - ) - - if self.engine_type == ORT_ENGINE: - inputs = {k: v for k, v in inputs.items() if k in self.input_names} - return self.model.run(None, inputs) - elif self.engine_type == DEEPSPARSE_ENGINE: - return self.model.run([inputs[name] for name in self.input_names]) - # TODO: torch - # with self.device_placement(): - # with torch.no_grad(): - # inputs = self.ensure_tensor_on_device(**inputs) - # predictions = self.model(**inputs)[0].cpu() - # if return_tensors: - # return predictions - # else: - # return predictions.numpy() - - -class TokenClassificationArgumentHandler(ArgumentHandler): - """ - Handles arguments for token classification. - """ - - def __call__(self, inputs: Union[str, List[str]], **kwargs): - - if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: - inputs = list(inputs) - batch_size = len(inputs) - elif isinstance(inputs, str): - inputs = [inputs] - batch_size = 1 - else: - raise ValueError("At least one input is required.") - - offset_mapping = kwargs.get("offset_mapping") - if offset_mapping: - if isinstance(offset_mapping, list) and isinstance( - offset_mapping[0], tuple - ): - offset_mapping = [offset_mapping] - if len(offset_mapping) != batch_size: - raise ValueError( - "offset_mapping should have the same batch size as the input" - ) - return inputs, offset_mapping - - -class QuestionAnsweringArgumentHandler(ArgumentHandler): - """ - QuestionAnsweringPipeline requires the user to provide multiple arguments - (i.e. 
question & context) to be mapped - to internal `transformers.SquadExample` - - QuestionAnsweringArgumentHandler manages all the possible to create a - `transformers.SquadExample` from the command-line supplied arguments - """ - - def __call__(self, *args, **kwargs): - # Position args, handling is sensibly the same as X and data, - # so forwarding to avoid duplicating - if args is not None and len(args) > 0: - if len(args) == 1: - kwargs["X"] = args[0] - else: - kwargs["X"] = list(args) - - # Generic compatibility with sklearn and Keras - # Batched data - if "X" in kwargs or "data" in kwargs: - inputs = kwargs["X"] if "X" in kwargs else kwargs["data"] - - if isinstance(inputs, dict): - inputs = [inputs] - else: - # Copy to avoid overriding arguments - inputs = [i for i in inputs] - - for i, item in enumerate(inputs): - if isinstance(item, dict): - if any(k not in item for k in ["question", "context"]): - raise KeyError( - "You need to provide a dictionary with keys " - "{question:..., context:...}" - ) - - inputs[i] = QuestionAnsweringPipeline.create_sample(**item) - - elif not isinstance(item, SquadExample): - arg_name = "X" if "X" in kwargs else "data" - raise ValueError( - f"{arg_name} argument needs to be of type " - "(list[SquadExample | dict], SquadExample, dict)" - ) - - # Tabular input - elif "question" in kwargs and "context" in kwargs: - if isinstance(kwargs["question"], str): - kwargs["question"] = [kwargs["question"]] - - if isinstance(kwargs["context"], str): - kwargs["context"] = [kwargs["context"]] - - inputs = [ - QuestionAnsweringPipeline.create_sample(q, c) - for q, c in zip(kwargs["question"], kwargs["context"]) - ] - else: - raise ValueError(f"Unknown arguments {kwargs}") - - if not isinstance(inputs, list): - inputs = [inputs] - - return inputs - - -class TextClassificationPipeline(Pipeline): - """ - Text classification pipeline using any `ModelForSequenceClassification`. - - This text classification pipeline can currently be loaded from `pipeline()` - using the following task identifier: `"text-classification"`. - - The models that this pipeline can use are models that have been fine-tuned on - a text classification task. - - :param return_all_scores: set True to return all model scores. Default False - """ - - def __init__(self, return_all_scores: bool = False, **kwargs): - super().__init__(**kwargs) - - self.return_all_scores = return_all_scores - - def __call__(self, *args, **kwargs): - """ - Classify the text(s) given as inputs. - - :param args: One or several texts (or one list of prompts) to classify - :param args: kwargs for inner call function - :return: A list or a list of list of dicts: Each result comes as list of dicts - with the following keys: - - `label` -- The label predicted. - - `score` -- The corresponding probability. 
- If ``self.return_all_scores=True``, one dictionary is returned per label - """ - outputs = super().__call__(*args, **kwargs) - - if isinstance(outputs, list) and outputs: - outputs = outputs[0] - - if self.config.num_labels == 1: - scores = 1.0 / (1.0 + np.exp(-outputs)) - else: - scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) - if self.return_all_scores: - return [ - [ - {"label": self.config.id2label[i], "score": score.item()} - for i, score in enumerate(item) - ] - for item in scores - ] - else: - return [ - { - "label": self.config.id2label[item.argmax()], - "score": item.max().item(), - } - for item in scores - ] - - -class AggregationStrategy(ExplicitEnum): - """ - All the valid aggregation strategies for TokenClassificationPipeline - """ - - NONE = "none" - SIMPLE = "simple" - FIRST = "first" - AVERAGE = "average" - MAX = "max" - - -class TokenClassificationPipeline(Pipeline): - """ - Named Entity Recognition pipeline using any `ModelForTokenClassification`. - - This token classification pipeline can currently be loaded from `pipeline()` - using the following task identifier: `"token-classification"`. - - The models that this pipeline can use are models that have been fine-tuned on - a token classification task. - - :param args_parser: argument parser to use default is - TokenClassificationArgumentHandler - :param aggregation_strategy: AggregationStrategy Enum object to determine - the pipeline aggregation strategy. Default is AggregationStrategy.NONE - :param ignore_labels: list of labels to ignore. Default is `["O"]` - """ - - default_input_names = "sequences" - - def __init__( - self, - args_parser: ArgumentHandler = None, - aggregation_strategy: AggregationStrategy = AggregationStrategy.NONE, - ignore_labels: List[str] = False, - **kwargs, - ): - super().__init__( - args_parser=args_parser or TokenClassificationArgumentHandler(), - **kwargs, - ) - - self.ignore_labels = ignore_labels or ["O"] - - if isinstance(aggregation_strategy, str): - aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()] - - if ( - aggregation_strategy - in { - AggregationStrategy.FIRST, - AggregationStrategy.MAX, - AggregationStrategy.AVERAGE, - } - and not self.tokenizer.is_fast - ): - raise ValueError( - "Slow tokenizers cannot handle subwords. Please set the " - '`aggregation_strategy` option to `"simple"` or use a fast tokenizer.' - ) - - self.aggregation_strategy = aggregation_strategy - - def __call__(self, inputs: Union[str, List[str]], **kwargs): - """ - Classify each token of the text(s) given as inputs. - - - :param inputs: One or several texts (or one list of texts) for token - classification - :return: A list or a list of list of :obj:`dict`: Each result comes as a list - of dictionaries (one for each token in the corresponding input, or each - entity if this pipeline was instantiated with an aggregation_strategy) - with the following keys: - - `word` -- The token/word classified. - - `score` -- The corresponding probability for `entity`. - - `entity` -- The entity predicted for that token/word (it is named - `entity_group` when `aggregation_strategy` is not `"none"`. - - `index` -- The index of the corresponding token in the sentence. - - `start` -- index of the start of the corresponding entity in the sentence - Only exists if the offsets are available within the tokenizer - - `end` -- The index of the end of the corresponding entity in the sentence. 
- Only exists if the offsets are available within the tokenizer - """ - - _inputs, offset_mappings = self._args_parser(inputs, **kwargs) - - answers = [] - - tokens = self.tokenizer( - _inputs, - return_tensors=self._framework, - truncation=TruncationStrategy.LONGEST_FIRST.value, - padding=PaddingStrategy.MAX_LENGTH.value, - return_special_tokens_mask=True, - return_offsets_mapping=self.tokenizer.is_fast, - ) - - if self.tokenizer.is_fast: - offset_mapping = tokens.pop("offset_mapping") - elif not offset_mappings: - offset_mapping = [None] * len(_inputs) - - special_tokens_mask = tokens.pop("special_tokens_mask") - - # Forward - _forward_pass = self._forward(tokens) - for entities_index, current_entities in enumerate(_forward_pass[0]): - input_ids = tokens["input_ids"][entities_index] - - scores = np.exp(current_entities) / np.exp(current_entities).sum( - -1, keepdims=True - ) - pre_entities = self.gather_pre_entities( - _inputs[entities_index], - input_ids, - scores, - offset_mapping[entities_index], - special_tokens_mask[entities_index], - ) - grouped_entities = self.aggregate(pre_entities, self.aggregation_strategy) - # Filter anything that is in self.ignore_labels - current_entities = [ - entity - for entity in grouped_entities - if entity.get("entity", None) not in self.ignore_labels - and entity.get("entity_group", None) not in self.ignore_labels - ] - answers.append(current_entities) - - if len(answers) == 1: - return answers[0] - return answers - - def gather_pre_entities( - self, - sentence: str, - input_ids: np.ndarray, - scores: np.ndarray, - offset_mapping: Optional[List[Tuple[int, int]]], - special_tokens_mask: np.ndarray, - ) -> List[dict]: - pre_entities = [] - for idx, token_scores in enumerate(scores): - # Filter special_tokens, they should only occur - # at the sentence boundaries since we're not encoding pairs of - # sentences so we don't have to keep track of those. 
- if special_tokens_mask[idx]: - continue - - word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) - if offset_mapping is not None: - start_ind, end_ind = offset_mapping[idx] - word_ref = sentence[start_ind:end_ind] - is_subword = len(word_ref) != len(word) - - if int(input_ids[idx]) == self.tokenizer.unk_token_id: - word = word_ref - is_subword = False - else: - start_ind = None - end_ind = None - is_subword = False - - pre_entity = { - "word": word, - "scores": token_scores, - "start": start_ind, - "end": end_ind, - "index": idx, - "is_subword": is_subword, - } - pre_entities.append(pre_entity) - return pre_entities - - def aggregate( - self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy - ) -> List[dict]: - if aggregation_strategy in { - AggregationStrategy.NONE, - AggregationStrategy.SIMPLE, - }: - entities = [] - for pre_entity in pre_entities: - entity_idx = pre_entity["scores"].argmax() - score = pre_entity["scores"][entity_idx] - entity = { - "entity": self.config.id2label[entity_idx], - "score": score, - "index": pre_entity["index"], - "word": pre_entity["word"], - "start": pre_entity["start"], - "end": pre_entity["end"], - } - entities.append(entity) - else: - entities = self.aggregate_words(pre_entities, aggregation_strategy) - - if aggregation_strategy == AggregationStrategy.NONE: - return entities - return self.group_entities(entities) - - def aggregate_word( - self, entities: List[dict], aggregation_strategy: AggregationStrategy - ) -> dict: - word = self.tokenizer.convert_tokens_to_string( - [entity["word"] for entity in entities] - ) - if aggregation_strategy == AggregationStrategy.FIRST: - scores = entities[0]["scores"] - idx = scores.argmax() - score = scores[idx] - entity = self.config.id2label[idx] - elif aggregation_strategy == AggregationStrategy.MAX: - max_entity = max(entities, key=lambda entity: entity["scores"].max()) - scores = max_entity["scores"] - idx = scores.argmax() - score = scores[idx] - entity = self.config.id2label[idx] - elif aggregation_strategy == AggregationStrategy.AVERAGE: - scores = np.stack([entity["scores"] for entity in entities]) - average_scores = np.nanmean(scores, axis=0) - entity_idx = average_scores.argmax() - entity = self.config.id2label[entity_idx] - score = average_scores[entity_idx] - else: - raise ValueError("Invalid aggregation_strategy") - new_entity = { - "entity": entity, - "score": score, - "word": word, - "start": entities[0]["start"], - "end": entities[-1]["end"], - } - return new_entity - - def aggregate_words( - self, entities: List[dict], aggregation_strategy: AggregationStrategy - ) -> List[dict]: - assert aggregation_strategy not in { - AggregationStrategy.NONE, - AggregationStrategy.SIMPLE, - }, "NONE and SIMPLE strategies are invalid" - - word_entities = [] - word_group = None - for entity in entities: - if word_group is None: - word_group = [entity] - elif entity["is_subword"]: - word_group.append(entity) - else: - word_entities.append( - self.aggregate_word(word_group, aggregation_strategy) - ) - word_group = [entity] - # Last item - word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) - return word_entities - - def group_sub_entities(self, entities: List[dict]) -> dict: - # Get the first entity in the entity group - entity = entities[0]["entity"].split("-")[-1] - scores = np.nanmean([entity["score"] for entity in entities]) - tokens = [entity["word"] for entity in entities] - - entity_group = { - "entity_group": entity, - "score": np.mean(scores), - "word": 
self.tokenizer.convert_tokens_to_string(tokens), - "start": entities[0]["start"], - "end": entities[-1]["end"], - } - return entity_group - - def get_tag(self, entity_name: str) -> Tuple[str, str]: - if entity_name.startswith("B-"): - bi = "B" - tag = entity_name[2:] - elif entity_name.startswith("I-"): - bi = "I" - tag = entity_name[2:] - else: - # It's not in B-, I- format - bi = "B" - tag = entity_name - return bi, tag - - def group_entities(self, entities: List[dict]) -> List[dict]: - - entity_groups = [] - entity_group_disagg = [] - - for entity in entities: - if not entity_group_disagg: - entity_group_disagg.append(entity) - continue - - # If the current entity is similar and adjacent to the previous entity, - # append it to the disaggregated entity group - # The split is meant to account for the "B" and "I" prefixes - # Shouldn't merge if both entities are B-type - bi, tag = self.get_tag(entity["entity"]) - last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"]) - - if tag == last_tag and bi != "B": - # Modify subword type to be previous_type - entity_group_disagg.append(entity) - else: - # If the current entity is different from the previous entity - # aggregate the disaggregated entity group - entity_groups.append(self.group_sub_entities(entity_group_disagg)) - entity_group_disagg = [entity] - if entity_group_disagg: - # it's the last entity, add it to the entity groups - entity_groups.append(self.group_sub_entities(entity_group_disagg)) - - return entity_groups - - -class QuestionAnsweringPipeline(Pipeline): - """ - Question Answering pipeline using any `ModelForQuestionAnswering` - - This question answering pipeline can currently be loaded from `pipeline()` - using the following task identifier: `"question-answering"`. - - The models that this pipeline can use are models that have been fine-tuned on - a question answering task. - - :param model: loaded inference engine to run the model with, can be a - deepsparse Engine or onnxruntime InferenceSession - :param tokenizer: tokenizer to be used for preprocessing - :param config: transformers model config for this model - :param engine_type: name of inference engine that is used. Options are - deepsparse and onnxruntime - :param input_names: list of input names to the neural network - :param args_parser: Reference to the object in charge of parsing supplied - pipeline parameters. A default is provided if None - :param binary_output: if True, stores outputs as pickled binaries to avoid - storing large amount of textual data. 
Default is False - """ - - default_input_names = "question,context" - - def __init__( - self, - model: Union[Engine, "onnxruntime.InferenceSession"], - tokenizer: PreTrainedTokenizer, - engine_type: str, - input_names: Optional[List[str]] = None, - **kwargs, - ): - super().__init__( - model=model, - tokenizer=tokenizer, - engine_type=engine_type, - args_parser=QuestionAnsweringArgumentHandler(), - input_names=input_names, - **kwargs, - ) - - @staticmethod - def create_sample( - question: Union[str, List[str]], context: Union[str, List[str]] - ) -> Union[SquadExample, List[SquadExample]]: - """ - :param question: single question or list of question strings - :param context: single context or list of context strings - :return: processed SquadExample object(s) for each question/context pair given - """ - if isinstance(question, list): - return [ - SquadExample(None, q, c, None, None, None) - for q, c in zip(question, context) - ] - else: - return SquadExample(None, question, context, None, None, None) - - def __call__(self, *args, **kwargs): - """ - Answer the question(s) given as inputs by using the context(s). - Multiple arguments can be used to pass the context, question data - - :param args: SquadExample or list of them containing the question and context - :param X: SquadExample or list of them containing the question and context - :param data: SquadExample or list of them containing the question and context - :param question: single question or list of question strings - :param context: single context or list of context strings - :param topk: the number of answers to return. Will be chosen by - order of likelihood) - :param doc_stride: if the context is too long to fit with the question for the - model, it will be split in several chunks with some overlap. This argument - controls the size of that overlap - :param max_answer_len: maximum length of predicted answers (e.g., only - answers with a shorter length are considered) - :param max_seq_len: maximum length of the total sentence (context + question) - after tokenization. The context will be split in several chunks - (using the doc_stride) if needed - :param max_question_len: maximum length of the question after tokenization. - It will be truncated if needed - :param handle_impossible_answer: whether or not we accept impossible as an - answer - :param num_spans: maximum number of span to use as input from a long - context. Default is to stride the entire context string - :param preprocessed_inputs: if provided, preprocessing will be skipped in favor - of these inputs. 
Expected format is the output of self.preprocess; a tuple - of (examples, features_list) - :return: dict or list of dictionaries, each containing the following keys: - `"score"` - The probability associated to the answer - `"start"` - The start index of the answer - `"end"` - The end index of the answer - `"answer"` - The answer to the question - """ - # Set defaults values - kwargs.setdefault("topk", 1) - kwargs.setdefault("max_answer_len", 15) - kwargs.setdefault("handle_impossible_answer", False) - kwargs.setdefault("preprocessed_inputs", None) # (examples, features_list) - - if kwargs["topk"] < 1: - raise ValueError(f"topk parameter should be >= 1 (got {kwargs['topk']})") - - if kwargs["max_answer_len"] < 1: - raise ValueError( - "max_answer_len parameter should be >= 1 " - f"(got {kwargs['max_answer_len']})" - ) - - # run pre-processing if not provided - examples, features_list = kwargs["preprocessed_inputs"] or self.preprocess( - *args, **kwargs - ) - - # forward pass and post-processing - all_answers = [] - for features, example in zip(features_list, examples): - model_input_names = self.tokenizer.model_input_names + ["input_ids"] - fw_args = { - k: [feature.__dict__[k] for feature in features] - for k in model_input_names - } - - # Manage tensor allocation on correct device - fw_args = {k: np.array(v) for (k, v) in fw_args.items()} - start, end = self._forward(fw_args)[:2] - - # TODO: torch - # fw_args = {k: torch.tensor(v, device=self.device) - # for (k, v) in fw_args.items()} - # start, end = self.model(**fw_args)[:2] - # start, end = start.cpu().numpy(), end.cpu().numpy() - - min_null_score = 1000000 # large and positive - answers = [] - for (feature, start_, end_) in zip(features, start, end): - # Ensure padded tokens & question tokens cannot belong - undesired_tokens = ( - np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask - ) - - # Generate mask - undesired_tokens_mask = undesired_tokens == 0.0 - - # Make sure non-context indexes cannot contribute to the softmax - start_ = np.where(undesired_tokens_mask, -10000.0, start_) - end_ = np.where(undesired_tokens_mask, -10000.0, end_) - - # Normalize logits and spans to retrieve the answer - start_ = np.exp( - start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)) - ) - end_ = np.exp( - end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)) - ) - - if kwargs["handle_impossible_answer"]: - min_null_score = min(min_null_score, (start_[0] * end_[0]).item()) - - # Mask CLS - start_[0] = end_[0] = 0.0 - - starts, ends, scores = self.decode( - start_, end_, kwargs["topk"], kwargs["max_answer_len"] - ) - - if not self.tokenizer.is_fast: - char_to_word = np.array(example.char_to_word_offset) - answers += [ - { - "score": score.item(), - "start": np.where( - char_to_word == feature.token_to_orig_map[s] - )[0][0].item(), - "end": np.where( - char_to_word == feature.token_to_orig_map[e] - )[0][-1].item(), - "answer": " ".join( - example.doc_tokens[ - feature.token_to_orig_map[ - s - ] : feature.token_to_orig_map[e] - + 1 - ] - ), - } - for s, e, score in zip(starts, ends, scores) - ] - else: - question_first = bool(self.tokenizer.padding_side == "right") - - # Sometimes the max probability token is in the middle of a word so: - # we start by finding the right word containing the token with - # `token_to_word` then we convert this word in a character span - answers += [ - { - "score": score.item(), - "start": feature.encoding.word_to_chars( - feature.encoding.token_to_word(s), - sequence_index=1 if question_first else 
0, - )[0], - "end": feature.encoding.word_to_chars( - feature.encoding.token_to_word(e), - sequence_index=1 if question_first else 0, - )[1], - "answer": example.context_text[ - feature.encoding.word_to_chars( - feature.encoding.token_to_word(s), - sequence_index=1 if question_first else 0, - )[0] : feature.encoding.word_to_chars( - feature.encoding.token_to_word(e), - sequence_index=1 if question_first else 0, - )[ - 1 - ] - ], - } - for s, e, score in zip(starts, ends, scores) - ] - - if kwargs["handle_impossible_answer"]: - answers.append( - {"score": min_null_score, "start": 0, "end": 0, "answer": ""} - ) - - answers = sorted(answers, key=lambda x: x["score"], reverse=True)[ - : kwargs["topk"] - ] - all_answers += answers - - if len(all_answers) == 1: - return all_answers[0] - return all_answers - - def preprocess(self, *args, **kwargs) -> Tuple[Any, Any]: - """ - preprocess the given QA model inputs using squad_convert_examples_to_features - - :param args: SquadExample or list of them containing the question and context - :param X: SquadExample or list of them containing the question and context - :param data: SquadExample or list of them containing the question and context - :param question: single question or list of question strings - :param context: single context or list of context strings - :param doc_stride: if the context is too long to fit with the question for the - model, it will be split in several chunks with some overlap. This argument - controls the size of that overlap - :param max_seq_len: maximum length of the total sentence (context + question) - after tokenization. The context will be split in several chunks - (using the doc_stride) if needed - :param max_question_len: maximum length of the question after tokenization. - It will be truncated if needed - :param num_spans: maximum number of spans to use as input from a long - context. Default is to stride the entire context string - :return: tuple of SquadExample inputs and preprocessed features list - """ - kwargs.setdefault("doc_stride", 128) - kwargs.setdefault("max_seq_len", self.max_length) - kwargs.setdefault("max_question_len", 64) - kwargs.setdefault("num_spans", None) - - # Convert inputs to features - examples = self._args_parser(*args, **kwargs) - if not self.tokenizer.is_fast: - features_list = [ - squad_convert_examples_to_features( - examples=[example], - tokenizer=self.tokenizer, - max_seq_length=kwargs["max_seq_len"], - doc_stride=kwargs["doc_stride"], - max_query_length=kwargs["max_question_len"], - padding_strategy=PaddingStrategy.MAX_LENGTH.value, - is_training=False, - tqdm_enabled=False, - ) - for example in examples - ] - else: - features_list = self._encode_features_fast(examples, **kwargs) - - if kwargs["num_spans"]: - features_list = [ - features[: kwargs["num_spans"]] for features in features_list - ] - - return examples, features_list - - def decode( - self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int - ) -> Tuple: - """ - :param start: Individual start probabilities for each token - :param end: Individual end probabilities for each token - :param topk: Indicates how many possible answer span(s) to extract from the - model output - :param max_answer_len: Maximum size of the answer to extract from the model - output - :return: probabilities for each span to be the actual answer. 
Will filter out - unwanted and impossible cases - """ - # Ensure we have batch axis - if start.ndim == 1: - start = start[None] - - if end.ndim == 1: - end = end[None] - - # Compute the score of each tuple(start, end) to be the real answer - outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) - - # Remove candidate with end < start and end - start > max_answer_len - candidates = np.tril(np.triu(outer), max_answer_len - 1) - - # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) - scores_flat = candidates.flatten() - if topk == 1: - idx_sort = [np.argmax(scores_flat)] - elif len(scores_flat) < topk: - idx_sort = np.argsort(-scores_flat) - else: - idx = np.argpartition(-scores_flat, topk)[0:topk] - idx_sort = idx[np.argsort(-scores_flat[idx])] - - start, end = np.unravel_index(idx_sort, candidates.shape)[1:] - return start, end, candidates[0, start, end] - - def span_to_answer( - self, text: str, start: int, end: int - ) -> Dict[str, Union[str, int]]: - """ - When decoding from token probabilities, this method maps token indexes to - actual word in the initial context. - - :param text: The actual context to extract the answer from - :param start: The answer starting token index - :param end: The answer end token index - :return: Dictionary containing the start, end, and answer - """ - words = [] - token_idx = char_start_idx = char_end_idx = chars_idx = 0 - - for i, word in enumerate(text.split(" ")): - token = self.tokenizer.tokenize(word) - - # Append words if they are in the span - if start <= token_idx <= end: - if token_idx == start: - char_start_idx = chars_idx - - if token_idx == end: - char_end_idx = chars_idx + len(word) - - words += [word] - - # Stop if we went over the end of the answer - if token_idx > end: - break - - # Append the subtokenization length to the running index - token_idx += len(token) - chars_idx += len(word) + 1 - - # Join text with spaces - return { - "answer": " ".join(words), - "start": max(0, char_start_idx), - "end": min(len(text), char_end_idx), - } - - def _encode_features_fast(self, examples: Any, **kwargs) -> List[SquadFeatures]: - features_list = [] - for example in examples: - # Define the side we want to truncate / pad and the text/pair sorting - question_first = bool(self.tokenizer.padding_side == "right") - - encoded_inputs = self.tokenizer( - text=example.question_text if question_first else example.context_text, - text_pair=( - example.context_text if question_first else example.question_text - ), - padding=PaddingStrategy.MAX_LENGTH.value, - truncation="only_second" if question_first else "only_first", - max_length=kwargs["max_seq_len"], - stride=kwargs["doc_stride"], - return_tensors="np", - return_token_type_ids=True, - return_overflowing_tokens=True, - return_offsets_mapping=True, - return_special_tokens_mask=True, - ) - - total_spans = len(encoded_inputs["input_ids"]) - - # p_mask: mask with 1 for token than cannot be in the answer - # We put 0 on the tokens from the context and 1 everywhere else - p_mask = np.asarray( - [ - [ - tok != 1 if question_first else 0 - for tok in encoded_inputs.sequence_ids(span_id) - ] - for span_id in range(total_spans) - ] - ) - - # keep the cls_token unmasked - if self.tokenizer.cls_token_id is not None: - cls_index = np.nonzero( - encoded_inputs["input_ids"] == self.tokenizer.cls_token_id - ) - p_mask[cls_index] = 0 - - features = [] - for span_idx in range(total_spans): - features.append( - SquadFeatures( - input_ids=encoded_inputs["input_ids"][span_idx], - 
attention_mask=encoded_inputs["attention_mask"][span_idx], - token_type_ids=encoded_inputs["token_type_ids"][span_idx], - p_mask=p_mask[span_idx].tolist(), - encoding=encoded_inputs[span_idx], - # the following values are unused for fast tokenizers - cls_index=None, - token_to_orig_map={}, - example_index=0, - unique_id=0, - paragraph_len=0, - token_is_max_context=0, - tokens=[], - start_position=0, - end_position=0, - is_impossible=False, - qas_id=None, - ) - ) - features_list.append(features) - return features_list - - -@dataclass -class TaskInfo: - """ - Information about an NLP task - - :param pipeline_constructor: reference to constructor for the given pipeline task - :param default model name: the transformers canonical name for the default model - :param base_stub: sparsezoo stub path for the base model for this task - :param default_pruned_stub: sparsezoo stub path for the default pruned model - for this task - :param default_quant_stub: sparsezoo stub path for the default quantized model - for this task - """ - - pipeline_constructor: Callable[[Any], Pipeline] - default_model_name: str - base_stub: Optional[str] = None - default_pruned_stub: Optional[str] = None - default_quant_stub: Optional[str] = None - - -# Register all the supported tasks here -SUPPORTED_TASKS = { - "ner": TaskInfo( - pipeline_constructor=TokenClassificationPipeline, - default_model_name="bert-base-uncased", - ), - "question-answering": TaskInfo( - pipeline_constructor=QuestionAnsweringPipeline, - default_model_name="bert-base-uncased", - base_stub=( - "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none" - ), - default_pruned_stub=( - "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/" - "pruned-aggressive_98" - ), - ), - "sentiment-analysis": TaskInfo( - pipeline_constructor=TextClassificationPipeline, - default_model_name="bert-base-uncased", - ), - "text-classification": TaskInfo( - pipeline_constructor=TextClassificationPipeline, - default_model_name="bert-base-uncased", - ), - "token-classification": TaskInfo( - pipeline_constructor=TokenClassificationPipeline, - default_model_name="bert-base-uncased", - ), -} - -DEEPSPARSE_ENGINE = "deepsparse" -ORT_ENGINE = "onnxruntime" - -SUPPORTED_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] - - -def pipeline( - task: str, - model_name: Optional[str] = None, - model_path: Optional[str] = None, - engine_type: str = DEEPSPARSE_ENGINE, - config: Optional[Union[str, PretrainedConfig]] = None, - tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, - max_length: int = 128, - num_cores: Optional[int] = None, - scheduler: Optional[str] = None, - batch_size: Optional[int] = 1, - **kwargs, -) -> Pipeline: - """ - Utility factory method to build a Pipeline - - :param task: name of the task to define which pipeline to create. Currently - supported task - "question-answering" - :param model_name: canonical name of the hugging face model this model is based on - :param model_path: path to model directory containing `model.onnx`, `config.json`, - and `tokenizer.json` files, ONNX model file, or SparseZoo stub - :param engine_type: inference engine name to use. Supported options are 'deepsparse' - and 'onnxruntime' - :param config: huggingface model config, if none provided, default will be used - which will be from the model name or sparsezoo stub if given for model path - :param tokenizer: huggingface tokenizer, if none provided, default will be used - :param max_length: maximum sequence length of model inputs. 
default is 128 - :param num_cores: number of CPU cores to run engine with. Default is the maximum - available - :param scheduler: The scheduler to use for the engine. Can be None, single or multi. - :param batch_size: The batch_size to use for the pipeline. Defaults to 1 - Note: `question-answering` pipeline only supports a batch_size of 1. - :param kwargs: additional key word arguments for task specific pipeline constructor - :return: Pipeline object for the given taks and model - """ - - # Retrieve the task - if task not in SUPPORTED_TASKS: - raise KeyError( - f"Unknown task {task}, available tasks are {list(SUPPORTED_TASKS.keys())}" - ) - if engine_type not in SUPPORTED_ENGINES: - raise ValueError( - f"Unsupported engine {engine_type}, supported engines " - f"are {SUPPORTED_ENGINES}" - ) - if task == "question-answering" and batch_size != 1: - raise ValueError( - f"{task} pipeline only supports batch_size 1. " - f"Supplied batch_size = {batch_size}" - ) - task_info = SUPPORTED_TASKS[task] - - model_path = model_path or _get_default_model_path(task_info) - model_name = model_name or task_info.default_model_name - - onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs(model_path) - - # default the tokenizer and config to file in model directory or given model name - config = config or config_path or model_name - tokenizer = tokenizer or tokenizer_path or model_name - - # create model - model, input_names = _create_model( - onnx_path, - engine_type, - num_cores, - max_length, - scheduler=scheduler, - batch_size=batch_size, - ) - - # Instantiate tokenizer if needed - if isinstance(tokenizer, (str, tuple)): - if isinstance(tokenizer, tuple): - # For tuple we have (tokenizer name, {kwargs}) - tokenizer_kwargs = tokenizer[1] - tokenizer_kwargs["model_max_length"] = max_length - tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1]) - else: - tokenizer = AutoTokenizer.from_pretrained( - tokenizer, model_max_length=max_length - ) - - # Instantiate config if needed - if config is not None and isinstance(config, str): - config = AutoConfig.from_pretrained(config, finetuning_task=task) - - return task_info.pipeline_constructor( - model=model, - tokenizer=tokenizer, - config=config, - engine_type=engine_type, - max_length=max_length, - input_names=input_names, - **kwargs, - ) - - -def _get_default_model_path(task_info: TaskInfo) -> str: - if cpu.cpu_vnni_compatible() and task_info.default_quant_stub: - return task_info.default_quant_stub - return task_info.default_pruned_stub or task_info.base_stub - - -def _create_model( - model_path: str, - engine_type: str, - num_cores: Optional[int], - max_length: int = 128, - scheduler: Optional[str] = None, - batch_size: int = 1, -) -> Tuple[Union[Engine, "onnxruntime.InferenceSession"], List[str]]: - onnx_path, input_names, _ = overwrite_transformer_onnx_model_inputs( - model_path, max_length=max_length - ) - - if engine_type == DEEPSPARSE_ENGINE: - model = compile_model( - onnx_path, - batch_size=batch_size, - num_cores=num_cores, - scheduler=scheduler, - ) - elif engine_type == ORT_ENGINE: - _validate_ort_import() - sess_options = onnxruntime.SessionOptions() - if num_cores is not None: - sess_options.intra_op_num_threads = num_cores - sess_options.log_severity_level = 3 - sess_options.graph_optimization_level = ( - onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - ) - - model = onnxruntime.InferenceSession(onnx_path, sess_options=sess_options) - - return model, input_names - - -def _validate_ort_import(): - if 
ort_import_error is not None: - raise ImportError( - "An exception occurred when importing onxxruntime. Please verify that " - "onnxruntime is installed in order to use the onnxruntime inference " - f"engine. \n\nException info: {ort_import_error}" - ) - - -def process_dataset( - pipeline_object: Callable, - data_path: str, - batch_size: int, - task: str, - output_path: str, -) -> None: - """ - :param pipeline_object: An instantiated pipeline Callable object - :param data_path: Path to input file, supports csv, json and text files - :param batch_size: batch_size to use for inference - :param task: The task pipeline is instantiated for - :param output_path: Path to a json file to output inference results to - """ - batch_loader = get_batch_loader( - data_file=data_path, - batch_size=batch_size, - task=task, - ) - # Wraps pipeline object to make numpy types serializable - pipeline_object = fix_numpy_types(pipeline_object) - with open(output_path, "a") as output_file: - for batch in batch_loader: - batch_output = pipeline_object(**batch) - json.dump(batch_output, output_file) - output_file.write("\n") diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py index 1afdc79cf4..2fdcd27236 100644 --- a/src/deepsparse/transformers/pipelines/pipeline.py +++ b/src/deepsparse/transformers/pipelines/pipeline.py @@ -17,7 +17,8 @@ """ -from typing import Any, List, Mapping +import warnings +from typing import Any, List, Mapping, Optional import numpy from transformers.models.auto import AutoConfig, AutoTokenizer @@ -29,7 +30,10 @@ ) -__all__ = ["TransformersPipeline"] +__all__ = [ + "TransformersPipeline", + "pipeline", +] class TransformersPipeline(Pipeline): @@ -149,3 +153,67 @@ def tokens_to_engine_input( ) return [tokens[name] for name in self.onnx_input_names] + + +def pipeline( + task: str, + model_name: Optional[str] = None, + model_path: Optional[str] = None, + engine_type: str = "deepsparse", + config: Optional[str] = None, + tokenizer: Optional[str] = None, + max_length: int = 128, + num_cores: Optional[int] = None, + scheduler: Optional[str] = None, + batch_size: Optional[int] = 1, + **kwargs, +) -> Pipeline: + """ + [DEPRECATED] - deepsparse.transformers.pipeline is deprecated. To create DeepSparse + pipelines for transformers tasks, use deepsparse.Pipeline.create(task, ...) + + Utility factory method to build a Pipeline + + :param task: name of the task to define which pipeline to create. Currently + supported task - "question-answering" + :param model_name: canonical name of the hugging face model this model is based on + :param model_path: path to model directory containing `model.onnx`, `config.json`, + and `tokenizer.json` files, ONNX model file, or SparseZoo stub + :param engine_type: inference engine name to use. Options are 'deepsparse' + and 'onnxruntime'. Default is 'deepsparse' + :param config: huggingface model config, if none provided, default will be used + which will be from the model name or sparsezoo stub if given for model path + :param tokenizer: huggingface tokenizer, if none provided, default will be used + :param max_length: maximum sequence length of model inputs. default is 128 + :param num_cores: number of CPU cores to run engine with. Default is the maximum + available + :param scheduler: The scheduler to use for the engine. Can be None, single or multi + :param batch_size: The batch_size to use for the pipeline. Defaults to 1 + Note: `question-answering` pipeline only supports a batch_size of 1. 
+ :param kwargs: additional key word arguments for task specific pipeline constructor + :return: Pipeline object for the given task and model + """ + warnings.warn( + "[DEPRECATED] - deepsparse.transformers.pipeline is deprecated. To create " + "DeepSparse pipelines for transformers tasks, use deepsparse.Pipeline.create()" + ) + + if config is not None or tokenizer is not None: + raise ValueError( + "Directly passing in a config or tokenizer to DeepSparse transformers " + "pipelines is no longer supported. config and tokenizer objects should " + "be specified by including config.json and tokenizer.json files in the " + "model directory respectively" + ) + + return Pipeline.create( + task=task, + model_path=model_path, + engine_type=engine_type, + batch_size=batch_size, + num_cores=num_cores, + scheduler=scheduler, + sequence_length=max_length, + default_model_name=model_name, + **kwargs, + ) diff --git a/src/deepsparse/transformers/pipelines/question_answering.py b/src/deepsparse/transformers/pipelines/question_answering.py index f3699d350a..2ed3b46752 100644 --- a/src/deepsparse/transformers/pipelines/question_answering.py +++ b/src/deepsparse/transformers/pipelines/question_answering.py @@ -340,7 +340,7 @@ def _tokenize(self, example: SquadExample): # keep the cls_token unmasked if self.tokenizer.cls_token_id is not None: cls_index = numpy.nonzero( - encoded_inputs["input_ids"] == self.tokenizer.cls_token_id + encoded_inputs["input_ids"][0] == self.tokenizer.cls_token_id ) p_mask[cls_index] = 0 diff --git a/src/deepsparse/transformers/server.py b/src/deepsparse/transformers/server.py deleted file mode 100644 index 59035dba80..0000000000 --- a/src/deepsparse/transformers/server.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Specs, schemas, and pipelines for use when serving transformers models -""" - -from typing import Any, Dict, List, Optional, Tuple, Union - -from deepsparse.tasks import SupportedTasks -from deepsparse.transformers.pipelines import Pipeline, pipeline - - -try: - from deepsparse.server.config import ServeModelConfig - - deepsparse_server_err = None -except Exception as _err: - deepsparse_server_err = _err - ServeModelConfig = object - -try: - from pydantic import BaseModel, Field - - pydantic_import_err = None -except Exception as _err: - pydantic_import_err = _err - BaseModel = object - Field = dict - - -__all__ = [ - "create_pipeline_definitions", - "QuestionAnsweringRequest", - "QuestionAnsweringResponse", - "TextClassificationRequest", - "TextClassificationResponse", - "TokenClassificationRequest", - "TokenClassificationResponse", -] - - -def create_pipeline_definitions( - model_config: ServeModelConfig, -) -> Tuple[Pipeline, Any, Any, Dict]: - """ - Create a pipeline definition and the supporting files for a given model config - to use for serving in the DeepSparse inference server - - :param model_config: the server model config describing the model and params - :return: a tuple containing (the pipeline to use for inference, - the expected request body, the expected response body, - any additional keyword args for use with the server) - """ - if deepsparse_server_err: - raise deepsparse_server_err - - if pydantic_import_err: - raise pydantic_import_err - - if SupportedTasks.nlp.question_answering.matches(model_config.task): - request_model = QuestionAnsweringRequest - response_model = Union[ - List[QuestionAnsweringResponse], - QuestionAnsweringResponse, - ] - kwargs = {} - elif SupportedTasks.nlp.text_classification.matches(model_config.task): - request_model = TextClassificationRequest - response_model = Union[ - List[TextClassificationResponse], List[List[TextClassificationResponse]] - ] - kwargs = {} - elif SupportedTasks.nlp.token_classification.matches(model_config.task): - request_model = TokenClassificationRequest - response_model = Union[ - List[TokenClassificationResponse], List[List[TokenClassificationResponse]] - ] - kwargs = {} - else: - raise ValueError( - f"unrecognized task given of {model_config.task} for config {model_config}" - ) - - pipeline_instance: Pipeline = pipeline( - task=model_config.task.lower().replace("_", "-"), - model_path=model_config.model_path, - engine_type=model_config.engine, - num_cores=model_config.num_cores, - scheduler=model_config.scheduler, - batch_size=model_config.batch_size, - **model_config.kwargs, - ) - - return pipeline_instance, request_model, response_model, kwargs - - -class QuestionAnsweringRequest(BaseModel): - """ - The request model for Question Answering Task - """ - - question: Union[List[str], str] = Field( - description="Either a string or a List of string questions to answer" - ) - context: Union[List[str], str] = Field( - description="Either a string or List of strings representing the context " - "for each question" - ) - - -class TokenClassificationRequest(BaseModel): - """ - Schema for TokenClassificationPipeline Request - """ - - inputs: Union[List[str], str] = Field( - description="A string or List of strings representing input to" - "TokenClassificationPipeline task" - ) - - -class TextClassificationRequest(BaseModel): - """ - Schema for TextClassificationPipeline Request - """ - - sequences: Union[List[str], str] = Field( - description="A string or List of strings representing input to" - 
"TextClassificationPipeline task" - ) - - -class QuestionAnsweringResponse(BaseModel): - """ - Schema for a result from Question Answering Task - """ - - score: float = Field(description="confidence score for prediction") - start: int = Field(description="The start index of the answer") - end: int = Field(description="The end index of the answer") - answer: str = Field(description="The predicted answer") - - -class TokenClassificationResponse(BaseModel): - """ - Schema for TokenClassificationPipeline Response - """ - - entity: str = Field( - description="The entity predicted for that token/word (it is named" - "`entity_group` when `aggregation_strategy` is not `none`." - ) - score: float = Field(description="The corresponding probability for `entity`.") - index: int = Field( - description="The index of the corresponding token in the sentence." - ) - word: str = Field(description="The token/word classified.") - start: Optional[int] = Field( - description="The index of the start of the corresponding entity in the " - "sentence. Only exists if the offsets are available within the tokenizer" - ) - end: Optional[int] = Field( - description="The index of the end of the corresponding entity in the sentence. " - "Only exists if the offsets are available within the tokenizer" - ) - - -class TextClassificationResponse(BaseModel): - """ - Schema for TextClassificationPipeline Response - """ - - label: str = Field(description="The label predicted.") - score: float = Field(description="The corresponding probability.")