In [2]:
from transformers.pipelines import pipeline
from keybert import KeyBERT
from onclusiveml.ml_compile import CompiledPipeline
import torch_neuron

In [3]:
hf_pipeline = pipeline(task='feature-extraction',model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [4]:
try:
    compiled_pipeline_short = CompiledPipeline.from_pretrained('compiled_pipeline_short')
    print('Reloaded')
except:
    print('Recompiling')
    compiled_pipeline_short = CompiledPipeline.from_pipeline(pipeline=hf_pipeline,max_length=25, batch_size=1,neuron=True,validate_compilation=False)
    compiled_pipeline_short.save_pretrained('compiled_pipeline_short')
    print('Recompiled and exported')

Reloaded


In [5]:
try:
    compiled_pipeline_medium = CompiledPipeline.from_pretrained('compiled_pipeline_medium')
    print('Reloaded')
except:
    print('Recompiling')
    compiled_pipeline_medium = CompiledPipeline.from_pipeline(pipeline=hf_pipeline,max_length=250, batch_size=1,neuron=True,validate_compilation=False)
    compiled_pipeline_medium.save_pretrained('compiled_pipeline_medium')
    print('Recompiled and exported')

Reloaded


In [6]:
try:
    compiled_pipeline_long = CompiledPipeline.from_pretrained('compiled_pipeline_long')
    print('Reloaded')
except:
    print('Recompiling')
    compiled_pipeline_long = CompiledPipeline.from_pipeline(pipeline=hf_pipeline,max_length=500, batch_size=1,neuron=True,validate_compilation=False)
    compiled_pipeline_long.save_pretrained('compiled_pipeline_long')
    print('Recompiled and exported')

Reloaded


In [15]:
short_doc = 'this is a test sample for latency benchmarking ' * 2
short_medium = short_doc * 5
medium_doc = short_doc * 10
medium_long_doc = short_doc * 15
long_doc = medium_doc * 3

docs ={
    'doc_short':[short_doc,] * 5,
    'doc_short_medium': [short_medium,] * 5,
    'doc_medium':[medium_doc,] * 5,
    'doc_medium_long': [medium_long_doc,] * 5,
    'doc_long': [long_doc,] * 5
}

for doc in docs:
    print(len(docs[doc][0].split()))

16
80
160
240
480


In [16]:
for doc in docs:
    print(len(hf_pipeline.tokenizer(docs[doc][0],truncation=True)['input_ids']))

22
102
202
302
512


In [17]:
pipeline_samples = {
    'uncompiled':hf_pipeline,
    'compiled_short':compiled_pipeline_short,
    'compiled_medium':compiled_pipeline_medium,
    'compiled_long':compiled_pipeline_long
}

(compiled_pipeline_short.compiled_pipeline.model.compilation_specs['tracing__max_length'],
compiled_pipeline_medium.compiled_pipeline.model.compilation_specs['tracing__max_length'],
compiled_pipeline_long.compiled_pipeline.model.compilation_specs['tracing__max_length'])

(25, 250, 500)

In [18]:
pipeline_results = {
    'uncompiled':{},
    'compiled_short':{},
    'compiled_medium':{},
    'compiled_long':{},
    'n_tokens': {}
}

In [19]:
import time

n_runs = 50

for doc_scenario in docs.keys():
    
    print(f'Document: {doc_scenario}')
    pipeline_results['n_tokens'][doc_scenario] = len(hf_pipeline.tokenizer(docs[doc_scenario][0],truncation=True)['input_ids'])
    
    for pipeline_scenario in pipeline_samples.keys():
        
        print(f'Pipeline: {pipeline_scenario}')
        
        start = time.time()
        
        for i in range(n_runs):
            pipeline_samples[pipeline_scenario](docs[doc_scenario],truncation=True)
            
        pipeline_results[pipeline_scenario][doc_scenario] = (time.time() - start) / n_runs
        
        print(f'Pipeline {pipeline_scenario} duration: {pipeline_results[pipeline_scenario][doc_scenario]}')
        print('---------------------------------------------------------------')
        time.sleep(2)
        
    print(f'=============================================================')

Document: doc_short
Pipeline: uncompiled
Pipeline uncompiled duration: 0.049095802307128907
---------------------------------------------------------------
Pipeline: compiled_short
Pipeline compiled_short duration: 0.014618668556213379
---------------------------------------------------------------
Pipeline: compiled_medium
Pipeline compiled_medium duration: 0.07433931827545166
---------------------------------------------------------------
Pipeline: compiled_long
Pipeline compiled_long duration: 0.23350279331207274
---------------------------------------------------------------
Document: doc_short_medium
Pipeline: uncompiled
Pipeline uncompiled duration: 0.08046710014343261
---------------------------------------------------------------
Pipeline: compiled_short
Pipeline compiled_short duration: 0.015561342239379883
---------------------------------------------------------------
Pipeline: compiled_medium
Pipeline compiled_medium duration: 0.07355178833007812
---------------------------

In [20]:
import pandas as pd

results_df = pd.DataFrame(pipeline_results)
results_df

Unnamed: 0,uncompiled,compiled_short,compiled_medium,compiled_long,n_tokens
doc_short,0.049096,0.014619,0.074339,0.233503,22
doc_short_medium,0.080467,0.015561,0.073552,0.233884,102
doc_medium,0.114147,0.016665,0.07757,0.234562,202
doc_medium_long,0.164947,0.017795,0.073568,0.232279,302
doc_long,0.3122,0.020887,0.076984,0.239388,512


In [21]:
for pipeline_scenario in ('uncompiled','compiled_short','compiled_medium','compiled_long'):
    results_df[f'{pipeline_scenario}_milliseconds_per_non_trivial_token'] = (results_df[pipeline_scenario] / results_df['n_tokens']) * 1000

In [22]:
results_df.iloc[:,4:]

Unnamed: 0,n_tokens,uncompiled_milliseconds_per_non_trivial_token,compiled_short_milliseconds_per_non_trivial_token,compiled_medium_milliseconds_per_non_trivial_token,compiled_long_milliseconds_per_non_trivial_token
doc_short,22,2.231627,0.664485,3.37906,10.613763
doc_short_medium,102,0.788893,0.152562,0.721096,2.292977
doc_medium,202,0.565086,0.082502,0.384008,1.161198
doc_medium_long,302,0.546181,0.058923,0.243602,0.769135
doc_long,512,0.609766,0.040795,0.150359,0.467554


- n_tokens ~ 20 -> compile at 25 gives 3x-4x speedup
- n_tokens ~ 100 -> compile at 250 gives ~ 1.5 speedup
- n_tokens ~ 200 - 300 -> compile at 250 gives ~ 2x speedup
- n_tokens >= 400 -> compile at 512 is ~ 1x