In [1]:
from time import perf_counter
from time import time

import torch
import torch.neuron
from transformers import AutoTokenizer, AutoModel
from transformers.pipelines import pipeline
from keybert import KeyBERT

In [2]:
# --- initialize models
# Tokenizer and encoder are loaded from the same checkpoint and then wrapped
# together in a feature-extraction pipeline.
checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
hf_pipeline = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

In [3]:
# --- create sample input documents
# Three multi-line text documents (what appears to be the same passage on
# supervised learning in English, German and Spanish). The list is passed as-is
# to the pipelines and to KeyBERT in the cells below. The strings are runtime
# input: their line breaks and leading whitespace are part of the data.
sample_inputs = [
    '''Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs. It infers a
         function from labeled training data consisting of a set of training examples.
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal).
         A supervised learning algorithm analyzes the training data and produces an inferred function,
         which can be used for mapping new examples. An optimal scenario will allow for the
         algorithm to correctly determine the class labels for unseen instances. This requires
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).''',
    '''Überwachtes Lernen ist die maschinelle Lernaufgabe, eine Funktion zu lernen, die
         ordnet eine Eingabe einer Ausgabe basierend auf beispielhaften Eingabe-Ausgabe-Paaren zu. Es folgert a
         Funktion aus beschrifteten Trainingsdaten, die aus einer Reihe von Trainingsbeispielen bestehen.
         Beim überwachten Lernen ist jedes Beispiel ein Paar, das aus einem Eingabeobjekt besteht
         (typischerweise ein Vektor) und einem gewünschten Ausgangswert (auch Überwachungssignal genannt).
         Ein überwachter Lernalgorithmus analysiert die Trainingsdaten und erzeugt eine abgeleitete Funktion.
         die zum Mapping neuer Beispiele verwendet werden können. Ein optimales Szenario ermöglicht die
         Algorithmus, um die Klassenbezeichnungen für unsichtbare Instanzen korrekt zu bestimmen. Dafür braucht man
         den Lernalgorithmus zum Verallgemeinern der Trainingsdaten auf ungesehene Situationen in a
         'vernünftiger' Weg (siehe induktive Vorspannung).''',
    '''El aprendizaje supervisado es la tarea de aprendizaje automático de aprender una función que
         asigna una entrada a una salida en función de pares de entrada-salida de ejemplo. Se infiere un
         función a partir de datos de entrenamiento etiquetados que consisten en un conjunto de ejemplos de entrenamiento.
         En el aprendizaje supervisado, cada ejemplo es un par que consta de un objeto de entrada
         (típicamente un vector) y un valor de salida deseado (también llamado señal de supervisión).
         Un algoritmo de aprendizaje supervisado analiza los datos de entrenamiento y produce una función inferida,
         que se puede utilizar para mapear nuevos ejemplos. Un escenario óptimo permitirá que la
         algoritmo para determinar correctamente las etiquetas de clase para instancias no vistas. Esto requiere
         el algoritmo de aprendizaje para generalizar a partir de los datos de entrenamiento a situaciones no vistas en un
         manera 'razonable' (ver sesgo inductivo).'''
]

In [4]:
# Prepare example inputs for the neuron trace performed in the next cell:
# tokenize one sample document with fixed-length settings and run the eager
# model once on it as a reference.
tokenizer_settings = {
    'max_length':10,
    'padding':'max_length',
    'truncation':True,
    'add_special_tokens':False
}

tokenized_inputs = tokenizer(sample_inputs[0], return_tensors='pt',**tokenizer_settings)
# Tuple of example tensors, in the positional order handed to the tracer.
tokenized_inputs_for_tracing = tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'], tokenized_inputs['token_type_ids']
# Reference forward pass only; no gradients are needed, so skip autograd
# bookkeeping instead of building a graph that is never used.
with torch.no_grad():
    model_predictions = model(**tokenized_inputs)

In [5]:
# Trace the eager model with the example tensors, then serialize the traced
# module next to the notebook; the file name is prefixed with the max_length
# used for tokenization.
# NOTE(review): strict=False is forwarded to the tracer — presumably to allow
# the model's non-tuple outputs; confirm against the torch.neuron docs.
traced_model_path = f"./{tokenizer_settings['max_length']}_neuron_traced_paraphrase-multilingual-MiniLM-L12-v2.pt"
neuron_model = torch.neuron.trace(model, tokenized_inputs_for_tracing, strict=False)
neuron_model.save(traced_model_path)
type(neuron_model)

INFO:Neuron:There are 3 ops of 1 different types in the TorchScript that are not compiled by neuron-cc: aten::embedding, (For more information see https://github.com/aws/aws-neuron-sdk/blob/master/release-notes/neuron-cc-ops/neuron-cc-ops-pytorch.md)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 563, fused = 546, percent fused = 96.98%


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


INFO:Neuron:Compiling function _NeuronGraph$688 with neuron-cc
INFO:Neuron:Compiling with command line: '/home/ec2-user/repositories/ml-serve-app/pytorch_venv/bin/neuron-cc compile /tmp/tmp2123eycx/graph_def.pb --framework TENSORFLOW --pipeline compile SaveTemps --output /tmp/tmp2123eycx/graph_def.neff --io-config {"inputs": {"0:0": [[1, 10, 384], "float32"], "1:0": [[1, 1, 1, 10], "float32"]}, "outputs": ["batchnorm_24/add_1:0", "Tanh_12:0"]} --verbose 35'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
....
Compiler status PASS


INFO:Neuron:Number of arithmetic operators (post-compilation) before = 563, compiled = 546, percent compiled = 96.98%
INFO:Neuron:The neuron partitioner created 1 sub-graphs
INFO:Neuron:Neuron successfully compiled 1 sub-graphs, Total fused subgraphs = 1, Percent of model sub-graphs successfully compiled = 100.0%
INFO:Neuron:Compiled these operators (and operator counts) to Neuron:
INFO:Neuron: => aten::Int: 96
INFO:Neuron: => aten::add: 36
INFO:Neuron: => aten::contiguous: 12
INFO:Neuron: => aten::div: 12
INFO:Neuron: => aten::dropout: 37
INFO:Neuron: => aten::gelu: 12
INFO:Neuron: => aten::layer_norm: 25
INFO:Neuron: => aten::linear: 73
INFO:Neuron: => aten::matmul: 24
INFO:Neuron: => aten::permute: 48
INFO:Neuron: => aten::select: 1
INFO:Neuron: => aten::size: 96
INFO:Neuron: => aten::slice: 1
INFO:Neuron: => aten::softmax: 12
INFO:Neuron: => aten::tanh: 1
INFO:Neuron: => aten::transpose: 12
INFO:Neuron: => aten::view: 48
INFO:Neuron:Not compiled operators (and operator counts) to N

torch.jit._trace.TopLevelTracedModule

In [6]:
# Reload the serialized traced model from disk, replacing the in-memory traced
# module, and display its type.
traced_model_path = f"./{tokenizer_settings['max_length']}_neuron_traced_paraphrase-multilingual-MiniLM-L12-v2.pt"
neuron_model = torch.jit.load(traced_model_path)
type(neuron_model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

torch.jit._script.RecursiveScriptModule

In [7]:
# Run the reloaded traced model once on the same example tensors that were
# used for tracing.
neuron_predictions_2 = neuron_model(*tokenized_inputs_for_tracing)

In [8]:
# Embed all sample documents with the HF pipeline, forwarding the fixed-length
# tokenizer settings through tokenize_kwargs.
pipeline_predictions = hf_pipeline(sample_inputs, tokenize_kwargs=tokenizer_settings)

In [9]:
# pipeline_predictions is nested as n_batch x 1 x n_tokens x n_embed, so this is
# the number of token embeddings returned for the first document.
# NOTE(review): the recorded output is 177 although tokenizer_settings sets
# max_length to 10 — confirm that tokenize_kwargs is actually applied by this
# pipeline version.
len(pipeline_predictions[0][0]) # n_batch x 1 x n_tokens x n_embed


177

### HF pipeline and HF KeyBERT latencies

In [10]:
# Average latency of the HF pipeline over n_test calls on all sample documents,
# with the fixed-length tokenizer settings forwarded via tokenize_kwargs.
n_test = 50

# perf_counter is a monotonic, high-resolution clock meant for measuring short
# durations; time() follows the wall clock and can be adjusted mid-run.
hf_start = perf_counter()
for i in range(n_test):
    hf_pipeline(sample_inputs, tokenize_kwargs = tokenizer_settings)
hf_end = perf_counter()

print(f'Average latency hf pipeline with fixed tokenization length: {(hf_end - hf_start)/n_test * 1000}ms')

Average latency hf pipeline with fixed tokenization length: 73.57719898223877ms


In [11]:
# Average latency of the HF pipeline over n_test calls on all sample documents,
# without any tokenizer settings (the pipeline's default tokenization).
n_test = 50

# perf_counter is a monotonic, high-resolution clock meant for measuring short
# durations; time() follows the wall clock and can be adjusted mid-run.
hf_start = perf_counter()
for i in range(n_test):
    hf_pipeline(sample_inputs)
hf_end = perf_counter()

print(f'Average latency hf pipeline with dynamic tokenization length: {(hf_end - hf_start)/n_test * 1000}ms')

Average latency hf pipeline with dynamic tokenization length: 69.37009334564209ms


In [12]:
# Average latency of KeyBERT keyword extraction backed by the plain HF pipeline,
# over n_test calls on all sample documents.
hf_keybert = KeyBERT(model=hf_pipeline)

n_test = 50

# perf_counter is a monotonic, high-resolution clock meant for measuring short
# durations; time() follows the wall clock and can be adjusted mid-run.
hf_start = perf_counter()
for i in range(n_test):
    hf_keywords = hf_keybert.extract_keywords(sample_inputs, keyphrase_ngram_range=(1, 1), stop_words=None)
hf_end = perf_counter()

# tokenizer_settings is never passed on this path, so the run is labelled as
# dynamic rather than fixed tokenization length.
# NOTE(review): this also means the comparison with the neuron KeyBERT run
# (compiled with tokenizer_settings) is not like-for-like — confirm intent.
print(f'Average latency hf keybert with dynamic tokenization length: {(hf_end - hf_start)/n_test * 1000}ms')

Average latency hf keybert with fixed tokenization length: 1786.524739265442ms


### Neuron pipeline and neuron KeyBERT latencies

In [13]:
import os
import sys

# Location of the ml-mesh checkout that provides compile_pipeline. It can be
# overridden with the ML_MESH_REPO environment variable; the default is the
# original hard-coded path.
ml_mesh_repo = os.environ.get('ML_MESH_REPO', '/home/ec2-user/repositories/ml-mesh/')
# Only extend sys.path once so that re-running this cell stays idempotent.
if ml_mesh_repo not in sys.path:
    sys.path.append(ml_mesh_repo)
# Must come after the sys.path change above, which makes the package importable.
from libs.keyword.onclusiveml.keyword.compile_pipeline import compile_pipeline

# Build the neuron-backed pipeline from the HF pipeline, the traced model and
# the fixed-length tokenizer settings.
# NOTE(review): compile_pipeline is project code not visible here — presumably
# it swaps the traced model into the pipeline; confirm against its source.
neuron_pipeline = compile_pipeline(
    pipeline=hf_pipeline,
    traced_model=neuron_model,
    tokenizer_settings=tokenizer_settings,
)

In [14]:
# Average latency of the neuron-backed pipeline over n_test calls on all
# sample documents.
n_test = 50

# perf_counter is a monotonic, high-resolution clock meant for measuring short
# durations; time() follows the wall clock and can be adjusted mid-run.
neuron_start = perf_counter()
for i in range(n_test):
    neuron_pipeline(sample_inputs)
neuron_end = perf_counter()

print(f'Average latency neuron pipeline with fixed tokenization length: {(neuron_end - neuron_start)/n_test * 1000}ms')

Average latency neuron pipeline with fixed tokenization length: 8.94350528717041ms


In [15]:
# Average latency of KeyBERT keyword extraction backed by the neuron pipeline,
# over n_test calls on all sample documents.
neuron_keybert = KeyBERT(model=neuron_pipeline)

n_test = 50

# perf_counter is a monotonic, high-resolution clock meant for measuring short
# durations; time() follows the wall clock and can be adjusted mid-run.
neuron_start = perf_counter()
for i in range(n_test):
    neuron_keywords = neuron_keybert.extract_keywords(sample_inputs, keyphrase_ngram_range=(1, 1), stop_words=None)
neuron_end = perf_counter()

print(f'Average latency neuron keybert with fixed tokenization length: {(neuron_end - neuron_start)/n_test * 1000}ms')

Average latency neuron keybert with fixed tokenization length: 670.1397895812988ms


In [16]:
# Display the type of neuron_model again (the same check as right after
# torch.jit.load above).
type(neuron_model)

torch.jit._script.RecursiveScriptModule