# Compare Transfomer Models Before and After Compression

In [2]:
# import timeit
import os
import timeit
from sentence_transformers import SentenceTransformer, util

In [3]:
st_time = timeit.default_timer()
model = SentenceTransformer("msmarco-distilbert-base-tas-b")
sentences = [
    "the fifty mannequin heads floating in the pool kind of freaked them out",
    "she swore she just saw her sushi move",
    "he embraced his new life as an eggplant",
    "my dentist tells me that chewing bricks is very bad for your teeth",
    "the dental specialist recommended an immediate stop to flossing with construction materials"
]

embeddings = model.encode(sentences)
print(embeddings.shape)
print("Time taken: ", timeit.default_timer() - st_time)

  return torch._C._cuda_getDeviceCount() > 0


(5, 768)
Time taken:  1.5632251650094986


In [8]:
from pathlib import Path
from typing import List

from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

In [9]:
import torch
import torch.nn.functional as F
from transformers import Pipeline


class SentenceEmbeddingPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        # we don't have any hyperameters to sanitize
        self.preprocess_params = {}
        return self.preprocess_params, {}, {}

    def preprocess(self, inputs):
        encoded_inputs = self.tokenizer(
            inputs, padding=True, truncation=True, return_tensors="pt"
        )
        return encoded_inputs

    def _forward(self, model_inputs):
        outputs = self.model(**model_inputs)
        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}

    def postprocess(self, model_outputs):
        # Perform pooling
        sentence_embeddings = self.mean_pooling(
            model_outputs["outputs"], model_outputs["attention_mask"]
        )
        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[
            0
        ]  # First element of model_output contains all token embeddings
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )


In [25]:
onnx_path = Path("../ml/model/sentence-transformers/msmarco-distilbert-base-tas-b.onnx")
model = ORTModelForFeatureExtraction.from_pretrained(onnx_path)
tokenizer = AutoTokenizer.from_pretrained(onnx_path)
pipeline = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

def generate_embeddings(inputs: List[str]):
    embeddings = pipeline(inputs)
    return embeddings

In [28]:
st_time = timeit.default_timer()
onnx_embeddings = generate_embeddings(sentences)
print(len(onnx_embeddings))
print("Time taken: ", timeit.default_timer() - st_time)

5
Time taken:  0.04071255400776863


In [29]:
onnx_embeddings[0].shape

torch.Size([1, 768])