In [1]:
from transformers import AutoTokenizer, AutoModel, PreTrainedModel
from transformers.pipelines import pipeline, Pipeline
import torch
import torch.neuron

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
hf_pipeline = pipeline("feature-extraction", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
hf_pipeline_tokenizer = hf_pipeline.tokenizer
hf_pipeline_model = hf_pipeline.model

In [3]:
tokens = hf_pipeline_tokenizer('this is a test sententce for tokenization.',return_tensors='pt',padding='max_length',max_length=50)
tokens['input_ids'].size(), tokens['attention_mask'].size()

(torch.Size([1, 50]), torch.Size([1, 50]))

In [4]:
hf_pipeline_model_output = hf_pipeline_model(**tokens)

In [5]:
model = AutoModel.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", return_dict=False) # if no task is specified, the instantiated model returns the last hidden state needed for feature extraction
model.eval()
model_output = model(**{'input_ids':tokens['input_ids'],'attention_mask':tokens['attention_mask']})

In [6]:
hf_pipeline_output = hf_pipeline('this is a test sententce for tokenization.',return_tensors='pt',tokenize_kwargs={'padding':'max_length','max_length':50})

In [7]:
hf_pipeline_model_output[0].size(), model_output[0].size(), hf_pipeline_output[0].size()

(torch.Size([1, 50, 384]), torch.Size([1, 50, 384]), torch.Size([50, 384]))

In [8]:
hf_pipeline_model_output[1].size(), model_output[1].size(), # hf_pipeline_output[1].size() doesnt exist

(torch.Size([1, 384]), torch.Size([1, 384]))

In [9]:
torch.all(hf_pipeline_model_output[0].eq(model_output[0])), torch.all(hf_pipeline_model_output[0].eq(hf_pipeline_output[0]))

(tensor(True), tensor(True))

In [10]:
torch.all(hf_pipeline_model_output[1].eq(model_output[1])) # hf_pipeline_output[1].size() doesnt exist

tensor(True)

In [11]:
tracing_inputs = tokens['input_ids'], tokens['attention_mask']
tracing_inputs

(tensor([[    0,   903,    83,    10,  3034,  9325,  2517,   329,   100,    47,
           1098, 47691,     5,     2,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]]))

In [12]:
traced_model = torch.neuron.trace(model, example_inputs=tracing_inputs, compiler_args=['--fast-math','none']) # ['--fast-math','fp32-cast-matmult']

INFO:Neuron:There are 3 ops of 1 different types in the TorchScript that are not compiled by neuron-cc: aten::embedding, (For more information see https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/compiler/neuron-cc/neuron-cc-ops/neuron-cc-ops-pytorch.html)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 571, fused = 546, percent fused = 95.62%


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


INFO:Neuron:Compiler args type is <class 'list'> value is ['--fast-math', 'none']
INFO:Neuron:Compiling function _NeuronGraph$686 with neuron-cc
INFO:Neuron:Compiling with command line: '/home/ec2-user/repositories/ml-mesh/.venv/bin/neuron-cc compile /tmp/tmptw0y277p/graph_def.pb --framework TENSORFLOW --pipeline compile SaveTemps --output /tmp/tmptw0y277p/graph_def.neff --io-config {"inputs": {"0:0": [[1, 50, 384], "float32"], "1:0": [[1, 1, 1, 50], "float32"]}, "outputs": ["BertEncoder_51/BertLayer_35/BertOutput_5/LayerNorm_7/aten_layer_norm/batchnorm/add_1:0", "BertPooler_52/Tanh_11/aten_tanh/Tanh:0"]} --fast-math none --verbose 35'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
....
Compiler status PASS


INFO:Neuron:Number of arithmetic operators (post-compilation) before = 571, compiled = 546, percent compiled = 95.62%
INFO:Neuron:The neuron partitioner created 1 sub-graphs
INFO:Neuron:Neuron successfully compiled 1 sub-graphs, Total fused subgraphs = 1, Percent of model sub-graphs successfully compiled = 100.0%
INFO:Neuron:Compiled these operators (and operator counts) to Neuron:
INFO:Neuron: => aten::Int: 96
INFO:Neuron: => aten::add: 36
INFO:Neuron: => aten::contiguous: 12
INFO:Neuron: => aten::div: 12
INFO:Neuron: => aten::dropout: 37
INFO:Neuron: => aten::gelu: 12
INFO:Neuron: => aten::layer_norm: 25
INFO:Neuron: => aten::linear: 73
INFO:Neuron: => aten::matmul: 24
INFO:Neuron: => aten::permute: 48
INFO:Neuron: => aten::select: 1
INFO:Neuron: => aten::size: 96
INFO:Neuron: => aten::slice: 1
INFO:Neuron: => aten::softmax: 12
INFO:Neuron: => aten::tanh: 1
INFO:Neuron: => aten::transpose: 12
INFO:Neuron: => aten::view: 48
INFO:Neuron:Not compiled operators (and operator counts) to N

In [13]:
traced_model_output = traced_model(*tracing_inputs)
traced_model_output[0].size(), traced_model_output[1].size()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

(torch.Size([1, 50, 384]), torch.Size([1, 384]))

In [14]:
model_output[0], traced_model_output[0]

(tensor([[[ 0.0597,  0.6885, -0.1838,  ...,  0.0782,  0.2163, -0.2030],
          [-0.4493,  0.7935, -0.2839,  ...,  0.4988,  0.5003,  0.4248],
          [-0.4368,  0.7815, -0.3105,  ...,  0.3446,  0.4974,  0.0964],
          ...,
          [ 0.0673,  0.1325, -0.1477,  ...,  0.0170, -0.1464,  0.1434],
          [ 0.0546,  0.1333, -0.0947,  ...,  0.0360, -0.1197,  0.1503],
          [ 0.0159,  0.1558, -0.0797,  ...,  0.0489, -0.0968,  0.1202]]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[[ 0.0600,  0.6861, -0.1808,  ...,  0.0760,  0.2152, -0.2020],
          [-0.4481,  0.7930, -0.2825,  ...,  0.5024,  0.4955,  0.4217],
          [-0.4355,  0.7804, -0.3085,  ...,  0.3451,  0.4957,  0.0955],
          ...,
          [ 0.0680,  0.1299, -0.1483,  ...,  0.0167, -0.1490,  0.1402],
          [ 0.0531,  0.1313, -0.0929,  ...,  0.0373, -0.1207,  0.1483],
          [ 0.0166,  0.1549, -0.0795,  ...,  0.0500, -0.0997,  0.1182]]]))

In [29]:
torch.allclose(traced_model_output[0],model_output[0], atol=1e-02, rtol=1e-02)

True

In [15]:
type(traced_model) # traced model as output by tracing method

torch.jit._trace.TopLevelTracedModule

In [16]:
type(model) # original model as initialized by the huggingface from_pretrained API

transformers.models.bert.modeling_bert.BertModel

In [17]:
traced_model.save('neuron_traced_model.pt')

In [18]:
reloaded_traced_model = torch.jit.load('neuron_traced_model.pt')


In [19]:
type(reloaded_traced_model) # reloaded traced model

torch.jit._script.RecursiveScriptModule

In [20]:
model.save_pretrained('./test_model')

In [21]:
model_reloaded = AutoModel.from_pretrained('./test_model')

In [22]:
type(model_reloaded)

transformers.models.bert.modeling_bert.BertModel

In [23]:
reloaded_model_outputs = model_reloaded(**tokens)

In [24]:
torch.all(reloaded_model_outputs[0].eq(model_output[0]))

tensor(True)

In [31]:
reloaded_traced_model_outputs = reloaded_traced_model(*tracing_inputs)

In [32]:
torch.all(traced_model_output[0].eq(reloaded_traced_model_outputs[0]))

tensor(True)