# Автоматическая компрессия моделей (Optimum)

In [66]:
import torch
from datasets import load_dataset
from evaluate import load
from itertools import islice
from tqdm import tqdm
from time import time
from collections import defaultdict
import itertools

from transformers import WhisperForConditionalGeneration, WhisperProcessor

from optimum.pipelines import pipeline as optimum_pipeline

from transformers import pipeline as transformers_pipeline
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq, ORTQuantizer, AutoQuantizationConfig

In [39]:
NUM_EXAMPLES = 50
librispeech_test_clean = load_dataset("librispeech_asr", "clean", split=f"test[:{NUM_EXAMPLES}]")
warmup_samples = islice(librispeech_test_clean, 10)

## Базовая модель

In [50]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to("cpu")

In [51]:
def map_to_pred(batch, model):
    audio = batch["audio"]
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features

    with torch.no_grad():
        predicted_ids = model.generate(input_features.to("cpu"))[0]
    transcription = processor.decode(predicted_ids)
    return processor.tokenizer._normalize(transcription)

In [52]:
res = defaultdict(list)

#WARMUP
for el in tqdm(warmup_samples):
    map_to_pred(el, model)

#INFERENCE
t = time()
for el in tqdm(librispeech_test_clean):
    res["reference"].append(processor.tokenizer._normalize(el['text']))
    res["prediction"].append(map_to_pred(el, model))

total_t = time() - t
t = (total_t / NUM_EXAMPLES)
print(f"total time on 50 examples: {total_t}")
print(f"avg. time on example: {t}")

wer = load("wer")
print(f"WER: {100 * wer.compute(references=res['reference'], predictions=res['prediction'])}")

cer = load("cer")
print(f"CER: {100 * cer.compute(references=res['reference'], predictions=res['prediction'])}")

0it [00:00, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [02:13<00:00,  2.66s/it]


total time on 50 examples: 133.24286723136902
avg. time on example: 2.6648573446273804
WER: 3.982777179763186
CER: 1.340373679935012


## Inference pipelines with the ONNX Runtime accelerator

In [54]:
onnx_asr_pipe = optimum_pipeline(
  "automatic-speech-recognition",
  model="openai/whisper-small",
  accelerator="ort",
  #device="cpu",
)

Framework not specified. Using pt to export to ONNX.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> False
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> True
  if input_shape[-1] > 1:
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> True
Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `decoder_input_ids`.
  and 

In [55]:
feature_extractor = onnx_asr_pipe.tokenizer.feature_extractor
tokenizer = onnx_asr_pipe.tokenizer.tokenizer

onnx_asr_pipe.feature_extractor = feature_extractor
onnx_asr_pipe.tokenizer = tokenizer

In [56]:
res = defaultdict(list)

#WARMUP
for el in tqdm(warmup_samples):
    onnx_asr_pipe(el["audio"]["array"])

#INFERENCE
t = time()
for el in tqdm(librispeech_test_clean):
    #print(el)
    res["reference"].append(processor.tokenizer._normalize(el['text']))
    res["prediction"].append(processor.tokenizer._normalize(onnx_asr_pipe(el["audio"]["array"])['text']))

total_t = time() - t
t = (total_t / NUM_EXAMPLES)
print(f"total time on 50 examples: {total_t}")
print(f"avg. time on example: {t}")

wer = load("wer")
print(f"WER: {100 * wer.compute(references=res['reference'], predictions=res['prediction'])}")

cer = load("cer")
print(f"CER: {100 * cer.compute(references=res['reference'], predictions=res['prediction'])}")

0it [00:00, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:34<00:00,  1.88s/it]


total time on 50 examples: 94.12685203552246
avg. time on example: 1.8825370407104491
WER: 3.982777179763186
CER: 1.340373679935012


## Optimum Inference with ONNX Runtime

In [57]:
model = ORTModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small", export=True) # ONNX checkpoint
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

onnx_asr = transformers_pipeline("automatic-speech-recognition", model=model, 
                                 tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)

Framework not specified. Using pt to export to ONNX.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> False
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> True
  if input_shape[-1] > 1:
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> True
Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `decoder_input_ids`.
  and 

In [58]:
res = defaultdict(list)

#WARMUP
for el in tqdm(warmup_samples):
    onnx_asr(el["audio"]["array"])

#INFERENCE
t = time()
for el in tqdm(librispeech_test_clean):
    #print(el)
    res["reference"].append(processor.tokenizer._normalize(el['text']))
    res["prediction"].append(processor.tokenizer._normalize(onnx_asr(el["audio"]["array"])['text']))

total_t = time() - t
t = (total_t / NUM_EXAMPLES)
print(f"total time on 50 examples: {total_t}")
print(f"avg. time on example: {t}")

wer = load("wer")
print(f"WER: {100 * wer.compute(references=res['reference'], predictions=res['prediction'])}")

cer = load("cer")
print(f"CER: {100 * cer.compute(references=res['reference'], predictions=res['prediction'])}")

0it [00:00, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:31<00:00,  1.82s/it]


total time on 50 examples: 91.01328778266907
avg. time on example: 1.8202657556533814
WER: 3.982777179763186
CER: 1.340373679935012


## Quantization

In [63]:
onnx_model = ORTModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small", export=True)
model_dir = onnx_model.model_save_dir
model_dir
#quantizer = ORTQuantizer.from_pretrained(ort_model)

Framework not specified. Using pt to export to ONNX.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> False
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> True
  if input_shape[-1] > 1:
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> True
Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `decoder_input_ids`.
  and 

WindowsPath('C:/Users/Arina/AppData/Local/Temp/tmp00i4nwai')

In [64]:
# Create encoder quantizer
encoder_quantizer = ORTQuantizer.from_pretrained(model_dir, file_name="encoder_model.onnx")

# Create decoder quantizer
decoder_quantizer = ORTQuantizer.from_pretrained(model_dir, file_name="decoder_model.onnx")

# Create decoder with past key values quantizer
decoder_wp_quantizer = ORTQuantizer.from_pretrained(model_dir, file_name="decoder_with_past_model.onnx")

# Create Quantizer list
quantizer = [encoder_quantizer, decoder_quantizer, decoder_wp_quantizer]

In [76]:
dqconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)

In [77]:
for q in quantizer:
    q.quantize(save_dir="./quantized_model",quantization_config=dqconfig)

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/u8, channel-wise: False)
Quantizing model...
Saving quantized model at: quantized_model (external data format: False)
Configuration saved in quantized_model\ort_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/u8, channel-wise: False)
Quantizing model...
Saving quantized model at: quantized_model (external data format: False)
Configuration saved in quantized_model\ort_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Creating dynamic quantizer: QOperator (mode: I

In [80]:
quantized_model = ORTModelForSpeechSeq2Seq.from_pretrained('./quantized_model')
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

onnx_quantized_asr = transformers_pipeline("automatic-speech-recognition", model=quantized_model, 
                                 tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Generation config file not found, using a generation config created from the model config.


In [81]:
res = defaultdict(list)

#WARMUP
for el in tqdm(warmup_samples):
    onnx_asr(el["audio"]["array"])

#INFERENCE
t = time()
for el in tqdm(librispeech_test_clean):
    #print(el)
    res["reference"].append(processor.tokenizer._normalize(el['text']))
    res["prediction"].append(processor.tokenizer._normalize(onnx_quantized_asr(el["audio"]["array"])['text']))

total_t = time() - t
t = (total_t / NUM_EXAMPLES)
print(f"total time on 50 examples: {total_t}")
print(f"avg. time on example: {t}")

wer = load("wer")
print(f"WER: {100 * wer.compute(references=res['reference'], predictions=res['prediction'])}")

cer = load("cer")
print(f"CER: {100 * cer.compute(references=res['reference'], predictions=res['prediction'])}")

0it [00:00, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:19<00:00,  1.59s/it]


total time on 50 examples: 79.68720746040344
avg. time on example: 1.5937441492080688
WER: 4.090419806243272
CER: 1.421608448415922
