In [1]:
from transformers.pipelines import pipeline
from keybert import KeyBERT
from onclusiveml.ml_compile import CompiledPipeline
import torch_neuron

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
hf_pipeline = pipeline(task='feature-extraction',model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [3]:
try:
    compiled_pipeline_short = CompiledPipeline.from_pretrained('compiled_pipeline_short')
    print('Reloaded')
except:
    print('Recompiling')
    compiled_pipeline_short = CompiledPipeline.from_pipeline(pipeline=hf_pipeline,max_length=25, batch_size=1,neuron=True,validate_compilation=False)
    compiled_pipeline_short.save_pretrained('compiled_pipeline_short')
    print('Recompiled and exported')

Reloaded


In [4]:
try:
    compiled_pipeline_medium = CompiledPipeline.from_pretrained('compiled_pipeline_medium')
    print('Reloaded')
except:
    print('Recompiling')
    compiled_pipeline_medium = CompiledPipeline.from_pipeline(pipeline=hf_pipeline,max_length=250, batch_size=1,neuron=True,validate_compilation=False)
    compiled_pipeline_medium.save_pretrained('compiled_pipeline_medium')
    print('Recompiled and exported')

Reloaded


In [5]:
try:
    compiled_pipeline_long = CompiledPipeline.from_pretrained('compiled_pipeline_long')
    print('Reloaded')
except:
    print('Recompiling')
    compiled_pipeline_long = CompiledPipeline.from_pipeline(pipeline=hf_pipeline,max_length=500, batch_size=1,neuron=True,validate_compilation=False)
    compiled_pipeline_long.save_pretrained('compiled_pipeline_long')
    print('Recompiled and exported')

Reloaded


In [6]:
short_doc = 'this is a test sample for latency benchmarking ' * 2
short_medium = short_doc * 5
medium_doc = short_doc * 10
medium_long_doc = short_doc * 15
long_doc = medium_doc * 3

docs ={
    'doc_short':[short_doc,] * 5,
    'doc_short_medium': [short_medium,] * 5,
    'doc_medium':[medium_doc,] * 5,
    'doc_medium_long': [medium_long_doc,] * 5,
    'doc_long': [long_doc,] * 5
}

for doc in docs:
    print(len(docs[doc][0].split()))

16
80
160
240
480


In [7]:
for doc in docs:
    print(len(hf_pipeline.tokenizer(docs[doc][0],truncation=True)['input_ids']))

22
102
202
302
512


In [8]:
pipeline_samples = {
    'uncompiled':hf_pipeline,
    'compiled_short':compiled_pipeline_short,
    'compiled_medium':compiled_pipeline_medium,
    'compiled_long':compiled_pipeline_long
}

(compiled_pipeline_short.compiled_pipeline.model.compilation_specs['tracing__max_length'],
compiled_pipeline_medium.compiled_pipeline.model.compilation_specs['tracing__max_length'],
compiled_pipeline_long.compiled_pipeline.model.compilation_specs['tracing__max_length'])

(25, 250, 500)

In [9]:
pipeline_results = {
    'uncompiled':{},
    'compiled_short':{},
    'compiled_medium':{},
    'compiled_long':{},
    'n_tokens': {}
}

In [10]:
import time

n_runs = 50

for doc_scenario in docs.keys():
    
    print(f'Document: {doc_scenario}')
    pipeline_results['n_tokens'][doc_scenario] = len(hf_pipeline.tokenizer(docs[doc_scenario][0],truncation=True)['input_ids'])
    
    for pipeline_scenario in pipeline_samples.keys():
        
        print(f'Pipeline: {pipeline_scenario}')
        
        start = time.time()
        
        for i in range(n_runs):
            pipeline_samples[pipeline_scenario](docs[doc_scenario],truncation=True)
            
        pipeline_results[pipeline_scenario][doc_scenario] = (time.time() - start) / n_runs
        
        print(f'Pipeline {pipeline_scenario} duration: {pipeline_results[pipeline_scenario][doc_scenario]}')
        print('---------------------------------------------------------------')
        time.sleep(2)
        
    print(f'=============================================================')

Document: doc_short
Pipeline: uncompiled
Pipeline uncompiled duration: 0.047734155654907226
---------------------------------------------------------------
Pipeline: compiled_short
Pipeline compiled_short duration: 0.015323305130004882
---------------------------------------------------------------
Pipeline: compiled_medium
Pipeline compiled_medium duration: 0.07378016948699952
---------------------------------------------------------------
Pipeline: compiled_long
Pipeline compiled_long duration: 0.23576385021209717
---------------------------------------------------------------
Document: doc_short_medium
Pipeline: uncompiled
Pipeline uncompiled duration: 0.07844781398773193
---------------------------------------------------------------
Pipeline: compiled_short
Pipeline compiled_short duration: 0.015454797744750977
---------------------------------------------------------------
Pipeline: compiled_medium
Pipeline compiled_medium duration: 0.07171805858612061
---------------------------

In [11]:
import pandas as pd

results_df = pd.DataFrame(pipeline_results)
results_df

Unnamed: 0,uncompiled,compiled_short,compiled_medium,compiled_long,n_tokens
doc_short,0.047734,0.015323,0.07378,0.235764,22
doc_short_medium,0.078448,0.015455,0.071718,0.230418,102
doc_medium,0.113761,0.016572,0.072155,0.23318,202
doc_medium_long,0.160617,0.017675,0.073349,0.231643,302
doc_long,0.295232,0.02075,0.076787,0.232976,512


In [12]:
for pipeline_scenario in ('uncompiled','compiled_short','compiled_medium','compiled_long'):
    results_df[f'{pipeline_scenario}_milliseconds_per_non_trivial_token'] = (results_df[pipeline_scenario] / results_df['n_tokens']) * 1000

In [13]:
results_df.iloc[:,4:]

Unnamed: 0,n_tokens,uncompiled_milliseconds_per_non_trivial_token,compiled_short_milliseconds_per_non_trivial_token,compiled_medium_milliseconds_per_non_trivial_token,compiled_long_milliseconds_per_non_trivial_token
doc_short,22,2.169734,0.696514,3.353644,10.716539
doc_short_medium,102,0.769096,0.151518,0.703118,2.258997
doc_medium,202,0.563174,0.082041,0.357202,1.154356
doc_medium_long,302,0.531845,0.058528,0.242878,0.767028
doc_long,512,0.576624,0.040528,0.149974,0.455032


- n_tokens ~ 20 -> compile at 25 gives 3x-4x speedup
- n_tokens ~ 100 -> compile at 250 gives ~ 1.5 speedup
- n_tokens ~ 200 - 300 -> compile at 250 gives ~ 2x speedup
- n_tokens >= 400 -> compile at 512 is ~ 1x