From 7c1324efc9be78f999d37d4a681caa2e90ea2677 Mon Sep 17 00:00:00 2001 From: Yong Huang Date: Wed, 17 May 2023 14:33:54 +0800 Subject: [PATCH 1/2] fix readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b211226..7fff6fc 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ from batch_inference.batcher.concat_batcher import ConcatBatcher @batching(batcher=ConcatBatcher(), max_batch_size=32) class MyModel: def __init__(self, k, n): - self.weights = np.random.randn((k, n)).astype("f") + self.weights = np.random.randn(k, n).astype("f") # shape of x: [batch_size, m, k] def predict_batch(self, x): @@ -75,6 +75,7 @@ def process_request(x): y = host.predict(x) return y +host.stop() ``` **Batcher** is responsible to merge queries and split outputs. In this case ConcatBatcher will concat input tensors into a batched tensors at first dimension. We provide a set of built-in Batchers for common scenarios, and you can also implement your own Batcher. See [What is Batcher](https://microsoft.github.io/batch-inference/batcher/what_is_batcher.html) for more information. From edffffd2b52a3f231dbf5202e95d7251db668a23 Mon Sep 17 00:00:00 2001 From: Yong Huang Date: Wed, 17 May 2023 14:38:08 +0800 Subject: [PATCH 2/2] rephrase --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7fff6fc..9f93cb8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Batch Inference Toolkit -Batch Inference Toolkit(batch-inference) is a Python package that batches model input tensors coming from multiple users dynamically, executes the model, un-batches output tensors and then returns them back to each user respectively. This will improve system throughput because of better compute parallelism and better cache locality. The entire process is transparent to developers. 
+Batch Inference Toolkit (batch-inference) is a Python package that dynamically batches model input tensors coming from multiple requests, executes the model, un-batches the output tensors, and returns them to their respective requests. This improves system throughput through better compute parallelism and better cache locality. The entire process is transparent to developers. ## When to use