In [1]:
import numpy as np
import sys
import tritonclient.grpc as grpcclient
from transformers import BertTokenizer
import torch as t



In [5]:
# Tokenize both sentences in a single batched call. With padding=True the
# tokenizer pads the shorter sentence to the longer one's length and returns a
# matching attention mask (1 = real token, 0 = padding), so sentences with
# different token counts can still be sent to the model as one batch.
bt = BertTokenizer.from_pretrained("bert-base-uncased")
sent1 = "I love you very much."
sent2 = "It is a good day."
encoded = bt([sent1, sent2], padding=True, return_tensors="pt")

# Keep the per-sentence [1, seq_len] tensors that the following cells combine.
input_ids1, input_ids2 = encoded["input_ids"].split(1)
# The request declares attention_mask as FP32, so cast the tokenizer's integer
# mask to float32 before splitting it per sentence.
am1, am2 = encoded["attention_mask"].to(t.float32).split(1)

In [23]:
# Combine the per-sentence [1, seq_len] tensors into [batch, seq_len] batches.
# Concatenating along dim 0 yields the 2-D layout directly. The previous
# stack + argument-less squeeze() removed *every* size-1 axis, so the batch
# axis would also vanish if only one sentence were batched.
input_idx = t.cat((input_ids1, input_ids2), dim=0)
am = t.cat((am1, am2), dim=0)

In [24]:
# Display the batched token ids as a NumPy array to check them before sending.
input_idx.numpy()

array([[ 101, 1045, 2293, 2017, 2200, 2172, 1012,  102],
       [ 101, 2009, 2003, 1037, 2204, 2154, 1012,  102]])

In [3]:
# gRPC client for the Triton server on localhost:8001.
# SSL is disabled, so no root certificate, private key or certificate chain
# is supplied; verbose request logging is off.
triton_client = grpcclient.InferenceServerClient(
    'localhost:8001',
    verbose=False,
    ssl=False,
    root_certificates=None, private_key=None, certificate_chain=None,
)

In [4]:
# Name of the served model; passed as `model_name` to the inference and
# statistics calls further down.
model_name = 'bert'

In [18]:
# Declare the two request tensors. Shapes are read from the tensors that will
# actually be attached, so they stay in sync when the sentences (and therefore
# the batch size or sequence length) change; a hard-coded shape that disagrees
# with the data is rejected when the data is attached.
inputs = [
    grpcclient.InferInput('input_idx', list(input_idx.shape), "INT64"),
    grpcclient.InferInput('attention_mask', list(am.shape), "FP32"),
]

In [25]:
# Attach the payloads in declaration order: slot 0 receives the token ids,
# slot 1 the attention mask.
for slot, batch_tensor in enumerate((input_idx, am)):
    inputs[slot].set_data_from_numpy(batch_tensor.numpy())

In [26]:
# Request a single tensor back from the model: the one named 'output'.
outputs = [grpcclient.InferRequestedOutput('output')]

In [27]:
# Send the prepared inputs to the model and keep the response in `result`.
# No client timeout or compression is set; the request carries one custom
# header, test=1.
result = triton_client.infer(
    model_name,
    inputs,
    outputs=outputs,
    headers={'test': '1'},
    client_timeout=None,
    compression_algorithm=None,
)

infer, metadata dict_items([('test', '1')])
model_name: "bert"
inputs {
  name: "input_idx"
  datatype: "INT64"
  shape: 2
  shape: 8
}
inputs {
  name: "attention_mask"
  datatype: "FP32"
  shape: 2
  shape: 8
}
outputs {
  name: "output"
}
raw_input_contents: "e\000\000\000\000\000\000\000\025\004\000\000\000\000\000\000\365\010\000\000\000\000\000\000\341\007\000\000\000\000\000\000\230\010\000\000\000\000\000\000|\010\000\000\000\000\000\000\364\003\000\000\000\000\000\000f\000\000\000\000\000\000\000e\000\000\000\000\000\000\000\331\007\000\000\000\000\000\000\323\007\000\000\000\000\000\000\r\004\000\000\000\000\000\000\234\010\000\000\000\000\000\000j\010\000\000\000\000\000\000\364\003\000\000\000\000\000\000f\000\000\000\000\000\000\000"
raw_input_contents: "\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?\000\000\200?"

model_name:

In [28]:
# Fetch the server-side inference statistics for the model named by `model_name`.
statistics = triton_client.get_inference_statistics(model_name=model_name)

get_inference_statistics, metadata ()
name: "bert"

model_stats {
  name: "bert"
  version: "1"
  last_inference: 1658035743556
  inference_count: 2
  execution_count: 1
  inference_stats {
    success {
      count: 1
      ns: 149624647
    }
    fail {
    }
    queue {
      count: 1
      ns: 203154
    }
    compute_input {
      count: 1
      ns: 26147883
    }
    compute_infer {
      count: 1
      ns: 122818569
    }
    compute_output {
      count: 1
      ns: 113344
    }
    cache_hit {
    }
    cache_miss {
    }
  }
  batch_stats {
    batch_size: 2
    compute_input {
      count: 1
      ns: 26147883
    }
    compute_infer {
      count: 1
      ns: 122818569
    }
    compute_output {
      count: 1
      ns: 113344
    }
  }
}



In [9]:
# Print the statistics response in full.
print(statistics)

model_stats {
  name: "test"
  version: "1"
  last_inference: 1657519901208
  inference_count: 2
  execution_count: 2
  inference_stats {
    success {
      count: 2
      ns: 11747533
    }
    fail {
    }
    queue {
      count: 2
      ns: 225270
    }
    compute_input {
      count: 2
      ns: 10231600
    }
    compute_infer {
      count: 2
      ns: 967262
    }
    compute_output {
      count: 2
      ns: 46370
    }
    cache_hit {
    }
    cache_miss {
    }
  }
  batch_stats {
    batch_size: 1
    compute_input {
      count: 2
      ns: 10231600
    }
    compute_infer {
      count: 2
      ns: 967262
    }
    compute_output {
      count: 2
      ns: 46370
    }
  }
}



In [10]:
# Number of model_stats entries in the statistics response.
len(statistics.model_stats)

1

In [11]:
# Pull the tensor named 'output' out of the inference result as a NumPy array.
output_data = result.as_numpy('output')

In [12]:
# Display the model output extracted from the inference result.
output_data

array([[-0.19844893, -0.1383495 , -0.3870649 ],
       [-0.19844893, -0.1383495 , -0.3870649 ]], dtype=float32)