## OverView
In this guide, we show how to enable PyTorch model with OpenVINO, and how to optimize Vision Transformers models with quantize.

1. Setup Development Environment
2. Convert the PyTorch model to ONNX model
3. Apply Bf16 quantization using OpenVINO
3. Apply Int8 quantization using OpenVINO

## Setup Development Environment

In [1]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Convert the Pytorch model to ONNX model

In [2]:
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image
import requests
import torch
from pathlib import Path

url = 'https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg'
image = Image.open(requests.get(url, stream=True).raw)

model_id="nateraw/vit-base-beans"
model_name="vit-base-beans"
onnx_path = Path("onnx")

image_processor = ViTImageProcessor.from_pretrained(model_id)
model = ViTForImageClassification.from_pretrained(model_id)

inputs = image_processor(images=image, return_tensors="pt")

# print(model(**inputs))

torch.onnx.export(model, inputs["pixel_values"], model_name+'.onnx',
    input_names=["input"], output_names=["output"],
    dynamic_axes={'input': {0:'batch'}, 'output': {0:'batch'}})

print("Convert success!")

  from .autonotebook import tqdm as notebook_tqdm
  if num_channels != self.num_channels:
  if height != self.image_size[0] or width != self.image_size[1]:


Convert success!


## Apply Bf16 quantization using OpenVINO

### Test the performance (latency) of quantized model 

In [3]:
import numpy as np
import openvino.runtime as ov
from time import perf_counter
import numpy as np
from PIL import Image
import requests
from datasets import load_dataset

payload="https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg"
image = Image.open(requests.get(payload, stream=True).raw)

def measure_latency(model, inputs):
    # prepare date
    latencies = []
    # warm up
    for _ in range(10):
        _ = model(inputs)
    # Timed run
    for _ in range(1000):
        start_time = perf_counter()
        _ = model(inputs)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies,95)
    return f"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f};", time_p95_ms

core = ov.Core()
core.set_property("CPU", {"INFERENCE_PRECISION_HINT": "f32"})
fp32_model = core.compile_model(model_name+'.onnx', "AUTO")

core.set_property("CPU", {"INFERENCE_PRECISION_HINT": "bf16"})
bf16_model = core.compile_model(model_name+'.onnx', "AUTO")

inputs = image_processor(image, return_tensors="pt")
# Create tensor from external memory
ov_inputs=inputs["pixel_values"].numpy()
input_tensor = ov.Tensor(array=ov_inputs, shape=[1, 3, 224, 224])

print(f"benchmark with models:")
rtn_fp32_model = measure_latency(fp32_model, input_tensor)
rtn_bf16_model = measure_latency(bf16_model, input_tensor)

print(f"fp32_model: {rtn_fp32_model[0]}")
print(f"bf16_model: {rtn_bf16_model[0]}")
print(f"Improvement through quantization: {round(rtn_fp32_model[1]/rtn_bf16_model[1], 2)}x")


benchmark with models:
fp32_model: P95 latency (ms) - 16.660605580545962; Average latency (ms) - 15.12 +\- 1.59;
bf16_model: P95 latency (ms) - 8.549499016953632; Average latency (ms) - 8.00 +\- 1.64;
Improvement through quantization: 1.95x


### Test the accuraccy of quantized model 

In [4]:
from sklearn.metrics import accuracy_score
import numpy as np
import os
from datasets import load_dataset

eval_dataset = load_dataset("beans",split=["test"])[0]

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return dict(accuracy=accuracy_score(predictions, labels))

def predict(model, image):
    inputs = image_processor(image, return_tensors="pt")
    ov_inputs=inputs["pixel_values"].numpy()
    input_tensor = ov.Tensor(array=ov_inputs, shape=[1, 3, 224, 224])
    return model(input_tensor)

size = len(eval_dataset["image"])

fp32_eval_pred = ([predict(fp32_model, eval_dataset["image"][i])[fp32_model.output(0)] for i in range(size)], eval_dataset["labels"])
bf16_eval_pred = ([predict(bf16_model, eval_dataset["image"][i])[bf16_model.output(0)] for i in range(size)], eval_dataset["labels"])

fp32_accuracy = compute_metrics(fp32_eval_pred)
bf16_accuracy = compute_metrics(bf16_eval_pred)

print(f"fp32_accuracy: {fp32_accuracy['accuracy']*100:.2f}%")
print(f"bf16_accuracy: {bf16_accuracy['accuracy']*100:.2f}%")
print(f"The quantized model achieves {round(bf16_accuracy['accuracy']/fp32_accuracy['accuracy'],4)*100:.2f}% accuracy of the fp32 model")

Found cached dataset beans (/home/marvin/.cache/huggingface/datasets/beans/default/0.0.0/90c755fb6db1c0ccdad02e897a37969dbf070bed3755d4391e269ff70642d791)
100%|██████████| 1/1 [00:00<00:00, 725.41it/s]


fp32_accuracy: 96.88%
bf16_accuracy: 96.88%
The quantized model achieves 100.00% accuracy of the fp32 model


## Apply Int8 quantization using OpenVINO



### Prepare the calibration dataset

In [5]:

import os
from pathlib import Path
import warnings

import torch
from torchvision import transforms as T
from torchvision.datasets import CIFAR10

import matplotlib.pyplot as plt
import numpy as np

from openvino.runtime import Core, Tensor

warnings.filterwarnings("ignore")

# Set the data and model directories
MODEL_DIR = 'model'
CALIB_DIR = 'calib'
CIFAR_DIR = 'data/datasets/beans'
CALIB_SET_SIZE = 300
MODEL_NAME = 'vit-base-beans'

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(CALIB_DIR, exist_ok=True)
os.makedirs(CIFAR_DIR, exist_ok=True)

Downlaod the dataset

In [6]:
import cv2
from datasets import load_dataset

ds = load_dataset('beans')['train']
lbs = ds['labels']

_index = 0
_label_index = [100, 100, 100]

for idx, info in enumerate(ds):
    im = info["image"]
    label = info["labels"]
    if _label_index[label] > 0:
        im = im.resize((224, 224))
        im.save(Path(CALIB_DIR) / f'{label}_{_index}.jpg')
        _label_index[label] = _label_index[label] - 1
        _index = _index + 1

Found cached dataset beans (/home/marvin/.cache/huggingface/datasets/beans/default/0.0.0/90c755fb6db1c0ccdad02e897a37969dbf070bed3755d4391e269ff70642d791)
100%|██████████| 3/3 [00:00<00:00, 1218.92it/s]


In [7]:
onnx_model_path = Path(".") / 'vit-base-beans.onnx'
ir_model_xml = Path(MODEL_DIR) / onnx_model_path.with_suffix('.xml')
ir_model_bin = Path(MODEL_DIR) / onnx_model_path.with_suffix('.bin')

Now, convert this model into the OpenVINO IR using Model Optimizer:



In [8]:
!mo -m $onnx_model_path  --output_dir $MODEL_DIR


[ INFO ] The model was converted to IR v11, the latest model format that corresponds to the source DL framework input/output format. While IR v11 is backwards compatible with OpenVINO Inference Engine API v1.0, please use API v2.0 (as of 2022.1) to take advantage of the latest improvements in IR v11.
Find more information about API v2.0 and IR v11 at https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html
[ SUCCESS ] Generated IR version 11 model.
[ SUCCESS ] XML file: /home/marvin/workspace/greennet/vit/model/vit-base-beans.xml
[ SUCCESS ] BIN file: /home/marvin/workspace/greennet/vit/model/vit-base-beans.bin


Compress the model with the following command:

`pot -q default -m <path_to_xml> -w <path_to_bin> --engine simplified --data-source <path_to_data>`

In [9]:
!pot -q default -m $ir_model_xml -w $ir_model_bin --engine simplified --data-source $CALIB_DIR --output-dir compressed --direct-dump --name $MODEL_NAME

INFO:openvino.tools.pot.app.run:Output log dir: compressed
INFO:openvino.tools.pot.app.run:Creating pipeline:
 Algorithm: DefaultQuantization
 Parameters:
	preset                     : performance
	stat_subset_size           : 300
	target_device              : ANY
	model_type                 : None
	dump_intermediate_model    : False
	inplace_statistics         : True
	exec_log_dir               : compressed
INFO:openvino.tools.pot.data_loaders.image_loader:Layout value is set [N,C,H,W]
INFO:openvino.tools.pot.pipeline.pipeline:Inference Engine version:                2022.3.0-9052-9752fafe8eb-releases/2022/3
INFO:openvino.tools.pot.pipeline.pipeline:Model Optimizer version:                 2022.3.0-9052-9752fafe8eb-releases/2022/3
INFO:openvino.tools.pot.pipeline.pipeline:Post-Training Optimization Tool version: 2022.3.0-9052-9752fafe8eb-releases/2022/3
INFO:openvino.tools.pot.statistics.collector:Start computing statistics for algorithms : DefaultQuantization
INFO:openvino.tools.pot.

### Test the performance (latency) of quantized model


In [10]:
optimized_model_path = Path('compressed/optimized')
optimized_model_xml = optimized_model_path / '{}.xml'.format(MODEL_NAME)
optimized_model_bin = optimized_model_path / '{}.bin'.format(MODEL_NAME)

In [15]:
int8_model = core.compile_model(str(optimized_model_xml))

print(f"benchmark with models:")
rtn_fp32_model = measure_latency(fp32_model, input_tensor)
rtn_bf16_model = measure_latency(bf16_model, input_tensor)
rtn_int8_model = measure_latency(int8_model, input_tensor)

print(f"fp32_model: {rtn_fp32_model[0]}")
print(f"bf16_model: {rtn_bf16_model[0]}")
print(f"int8_model: {rtn_int8_model[0]}")
print(f"Improvement through bf16 quantization: {round(rtn_fp32_model[1]/rtn_bf16_model[1], 2)}x")
print(f"Improvement through int8 quantization: {round(rtn_fp32_model[1]/rtn_int8_model[1], 2)}x")

benchmark with models:
fp32_model: P95 latency (ms) - 16.08027223846875; Average latency (ms) - 15.07 +\- 1.23;
bf16_model: P95 latency (ms) - 8.533450739923865; Average latency (ms) - 7.91 +\- 1.04;
int8_model: P95 latency (ms) - 9.595101361628622; Average latency (ms) - 8.52 +\- 0.91;
Improvement through bf16 quantization: 1.88x
Improvement through int8 quantization: 1.68x


: 

In [12]:
# Inference FP32 model (OpenVINO IR)
!benchmark_app -m $ir_model_xml -d CPU -api async -b 1

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 130.23 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input (node: input) : f32 / [...] / [?,3,224,224]
[ INFO ] Model outputs:
[ INFO ]     output (node: output) : f32 / [...] / [?,3]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'input': [1,3,224,224]
[ INFO ] Reshape model took 6.33 ms
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     input (node

In [13]:
!benchmark_app -m $optimized_model_xml -d CPU -api async -b 1


[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 74.60 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input (node: input) : f32 / [...] / [?,3,224,224]
[ INFO ] Model outputs:
[ INFO ]     output (node: output) : f32 / [...] / [?,3]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'input': [1,3,224,224]
[ INFO ] Reshape model took 10.80 ms
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     input (node

### Test the accuraccy of quantized model 

In [14]:
ie = Core()

int8_model = ie.compile_model(str(optimized_model_xml))

size = len(eval_dataset["image"])

fp32_eval_pred = ([predict(fp32_model, eval_dataset["image"][i])[fp32_model.output(0)] for i in range(size)], eval_dataset["labels"])
int8_eval_pred = ([predict(int8_model, eval_dataset["image"][i])[int8_model.output(0)] for i in range(size)], eval_dataset["labels"])

fp32_accuracy = compute_metrics(fp32_eval_pred)
int8_accuracy = compute_metrics(int8_eval_pred)

print(f"fp32_accuracy: {fp32_accuracy['accuracy']*100:.2f}%")
print(f"int8_accuracy: {int8_accuracy['accuracy']*100:.2f}%")
print(f"The quantized model achieves {round(int8_accuracy['accuracy']/fp32_accuracy['accuracy'],4)*100:.2f}% accuracy of the fp32 model")


fp32_accuracy: 96.88%
int8_accuracy: 42.19%
The quantized model achieves 43.55% accuracy of the fp32 model


## Apply Encoder fusion

### Fp32 encoder fusion

In [2]:
# prepare env
!pushd extension && bash prepare.sh && popd

~/workspace/greennet/vit/extension ~/workspace/greennet/vit
--2023-03-22 02:09:52--  https://github.com/oneapi-src/oneDNN/releases/download/v0.21-rc/mklml_lnx_2019.0.5.20190502.tgz
Resolving child-prc.intel.com (child-prc.intel.com)... 10.239.120.55
Connecting to child-prc.intel.com (child-prc.intel.com)|10.239.120.55|:913... connected.
Proxy request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/58414589/8c825300-d8a7-11e9-918a-f6d6bce48f33?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230322%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230322T060920Z&X-Amz-Expires=300&X-Amz-Signature=95f948ebf76544dd9338759fcc8e03d77b7fafbc1db183bdf4cc855e4deb7a13&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=58414589&response-content-disposition=attachment%3B%20filename%3Dmklml_lnx_2019.0.5.20190502.tgz&response-content-type=application%2Foctet-stream [following]
--2023-03-22 02:09:53--  

In [3]:
# install custom package
!pushd extension && python setup.py install && popd

~/workspace/greennet/vit/extension ~/workspace/greennet/vit
running install
running bdist_egg
running egg_info
writing fused_bert.egg-info/PKG-INFO
writing dependency_links to fused_bert.egg-info/dependency_links.txt
writing top-level names to fused_bert.egg-info/top_level.txt
reading manifest file 'fused_bert.egg-info/SOURCES.txt'
writing manifest file 'fused_bert.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_ext
building 'fused_bert' extension
Emitting ninja build file /home/marvin/workspace/greennet/vit/extension/build/temp.linux-x86_64-cpython-39/build.ninja...
Compiling objects...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/1] c++ -MMD -MF /home/marvin/workspace/greennet/vit/extension/build/temp.linux-x86_64-cpython-39/bert.o.d -pthread -B /home/marvin/.conda/envs/ipex/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -

In [8]:

from transformers import AutoImageProcessor, ViTModel
from datasets import load_dataset

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

model.eval()

inputs = image_processor(image, return_tensors="pt")

print("output with original model: \n", model(**inputs))

No config specified, defaulting to: cats-image/image
Found cached dataset cats-image (/home/marvin/.cache/huggingface/datasets/huggingface___cats-image/image/1.9.0/68fbc793fb10cd165e490867f5d61fa366086ea40c73e549a020103dcb4f597e)
100%|██████████| 1/1 [00:00<00:00, 1100.00it/s]


output with original model: 
 BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.1559,  0.0914,  0.1518,  ..., -0.3180, -0.0859, -0.0903],
         [-0.2254,  0.0864,  0.4752,  ..., -0.1781,  0.1726,  0.1334],
         [ 0.0444,  0.0677,  0.4199,  ..., -0.2576,  0.1191,  0.0130],
         ...,
         [-0.0153, -0.0396,  0.1684,  ..., -0.1672,  0.1869,  0.1025],
         [ 0.0249, -0.0382,  0.2046,  ...,  0.0517,  0.1489,  0.1320],
         [-0.1748, -0.0254,  0.2523,  ..., -0.1474,  0.1627,  0.1325]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 5.8399e-02, -3.0683e-01,  3.1213e-01, -1.1009e-01, -1.4752e-01,
          4.9735e-01, -1.5786e-01,  4.8658e-01, -4.6255e-01,  2.4344e-01,
          2.9942e-02,  2.8738e-01, -4.8914e-01, -9.9510e-03, -2.8943e-01,
          3.1443e-01, -6.2883e-02, -2.6637e-01, -3.9652e-01,  2.9896e-01,
          2.1507e-01, -1.9265e-01,  1.1786e-01,  2.5995e-01,  3.5440e-01,
         -3.7968e-01,  4.8320e-01, -3.5686e-01,  2.3996

In [9]:
import vit_optimizer

# do encoder fusion
vit_optimizer.optimize_bert_encoder(model)
print("output with custom model: \n", model(**inputs))

output with custom model: 
 BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.1566,  0.0904,  0.1512,  ..., -0.3179, -0.0860, -0.0919],
         [-0.2235,  0.0856,  0.4779,  ..., -0.1767,  0.1711,  0.1323],
         [ 0.0449,  0.0681,  0.4221,  ..., -0.2572,  0.1173,  0.0126],
         ...,
         [-0.0152, -0.0393,  0.1688,  ..., -0.1690,  0.1859,  0.1011],
         [ 0.0236, -0.0377,  0.2051,  ...,  0.0520,  0.1476,  0.1315],
         [-0.1741, -0.0263,  0.2539,  ..., -0.1462,  0.1610,  0.1299]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 5.6516e-02, -3.0602e-01,  3.1479e-01, -1.1303e-01, -1.5046e-01,
          4.9822e-01, -1.5958e-01,  4.8751e-01, -4.6317e-01,  2.4204e-01,
          2.7707e-02,  2.8867e-01, -4.9035e-01, -1.0389e-02, -2.9009e-01,
          3.1144e-01, -6.2464e-02, -2.6714e-01, -3.9563e-01,  3.0180e-01,
          2.1344e-01, -1.9213e-01,  1.1921e-01,  2.5933e-01,  3.5225e-01,
         -3.8258e-01,  4.8355e-01, -3.5428e-01,  2.4096e-

### Int8 encoder fusion

In [11]:
# int8 quantization with encoder fusion

import torch
from transformers import AutoImageProcessor, ViTModel
from datasets import load_dataset

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

model.eval()

inputs = image_processor(image, return_tensors="pt")

# Dynamic quantization with PT
q_model = torch.quantization.quantize_dynamic(model)

print("output with original int8 model: \n", q_model(**inputs))

No config specified, defaulting to: cats-image/image
Found cached dataset cats-image (/home/marvin/.cache/huggingface/datasets/huggingface___cats-image/image/1.9.0/68fbc793fb10cd165e490867f5d61fa366086ea40c73e549a020103dcb4f597e)
100%|██████████| 1/1 [00:00<00:00, 812.22it/s]


output with original int8 model: 
 BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.0671,  0.0577,  0.1704,  ..., -0.2897, -0.0215, -0.1593],
         [-0.1750,  0.1396,  0.4689,  ..., -0.2322,  0.2679,  0.2359],
         [ 0.0272,  0.1598,  0.4527,  ..., -0.2294,  0.1809,  0.1539],
         ...,
         [ 0.1208, -0.0062,  0.2898,  ..., -0.1514,  0.1938,  0.0468],
         [-0.0377, -0.0633,  0.1613,  ..., -0.1985,  0.0992,  0.2628],
         [-0.1818, -0.0094,  0.2225,  ..., -0.2125,  0.1303,  0.1951]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-1.6607e-01, -2.5335e-01,  2.7456e-01,  1.9225e-01, -8.4645e-02,
          4.7062e-01, -1.2369e-01,  4.2519e-01, -6.2605e-01,  2.7125e-01,
          5.8259e-02,  3.5742e-01, -4.4340e-01, -6.6103e-02, -2.4886e-01,
          8.0751e-03,  1.0978e-01, -2.5643e-01, -3.0650e-01,  4.2634e-01,
          2.0308e-01, -1.0710e-01,  1.5625e-01,  4.5200e-01,  3.6681e-01,
         -4.1965e-01,  3.2702e-01, -2.4375e-01,  3

In [12]:

import vit_optimizer
vit_optimizer.optimize_bert_encoder(model, is_int8=True)

# Dynamic quantization with PT
model = torch.quantization.quantize_dynamic(model)

print("output with custom int8 model: \n", q_model(**inputs))

output with custom int8 model: 
 BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.0671,  0.0577,  0.1704,  ..., -0.2897, -0.0215, -0.1593],
         [-0.1750,  0.1396,  0.4689,  ..., -0.2322,  0.2679,  0.2359],
         [ 0.0272,  0.1598,  0.4527,  ..., -0.2294,  0.1809,  0.1539],
         ...,
         [ 0.1208, -0.0062,  0.2898,  ..., -0.1514,  0.1938,  0.0468],
         [-0.0377, -0.0633,  0.1613,  ..., -0.1985,  0.0992,  0.2628],
         [-0.1818, -0.0094,  0.2225,  ..., -0.2125,  0.1303,  0.1951]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-1.6607e-01, -2.5335e-01,  2.7456e-01,  1.9225e-01, -8.4645e-02,
          4.7062e-01, -1.2369e-01,  4.2519e-01, -6.2605e-01,  2.7125e-01,
          5.8259e-02,  3.5742e-01, -4.4340e-01, -6.6103e-02, -2.4886e-01,
          8.0751e-03,  1.0978e-01, -2.5643e-01, -3.0650e-01,  4.2634e-01,
          2.0308e-01, -1.0710e-01,  1.5625e-01,  4.5200e-01,  3.6681e-01,
         -4.1965e-01,  3.2702e-01, -2.4375e-01,  3.4