In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.neuron
from compiled_model import CompiledModel, compile_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
tokenizer.save_pretrained('./hf_tokenizer')

model = AutoModel.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", return_dict=True)
model.save_pretrained('./hf_model')

In [3]:
SEQUENCE_MAX_LENGTH = 50

tokens = tokenizer('this is a test sententce for tokenization.',return_tensors='pt',padding='max_length',max_length=SEQUENCE_MAX_LENGTH)
tokens['input_ids'].size(), tokens['attention_mask'].size()

(torch.Size([1, 50]), torch.Size([1, 50]))

In [4]:
model_output = model(input_ids=tokens['input_ids'],attention_mask=tokens['attention_mask'])
model_output[0].size(), model_output[1].size()

(torch.Size([1, 50, 384]), torch.Size([1, 384]))

### 1. Test `.from_model` + inference

In [5]:
compiled_model = CompiledModel.from_model(model, max_length=SEQUENCE_MAX_LENGTH, strict=False)
compiled_model.model, compiled_model.config, compiled_model.compilation_specs, compiled_model.__class__

INFO:Neuron:There are 3 ops of 1 different types in the TorchScript that are not compiled by neuron-cc: aten::embedding, (For more information see https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/compiler/neuron-cc/neuron-cc-ops/neuron-cc-ops-pytorch.html)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 571, fused = 546, percent fused = 95.62%


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


INFO:Neuron:Compiler args type is <class 'list'> value is ['--fast-math', 'none']
INFO:Neuron:Compiling function _NeuronGraph$686 with neuron-cc
INFO:Neuron:Compiling with command line: '/home/ec2-user/repositories/ml-mesh/.venv/bin/neuron-cc compile /tmp/tmp75eh5rmu/graph_def.pb --framework TENSORFLOW --pipeline compile SaveTemps --output /tmp/tmp75eh5rmu/graph_def.neff --io-config {"inputs": {"0:0": [[1, 50, 384], "float32"], "1:0": [[1, 1, 1, 50], "float32"]}, "outputs": ["BertEncoder_51/BertLayer_35/BertOutput_5/LayerNorm_7/aten_layer_norm/batchnorm/add_1:0", "BertPooler_52/Tanh_11/aten_tanh/Tanh:0"]} --fast-math none --verbose 35'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
....
Compiler status PASS


INFO:Neuron:Number of arithmetic operators (post-compilation) before = 571, compiled = 546, percent compiled = 95.62%
INFO:Neuron:The neuron partitioner created 1 sub-graphs
INFO:Neuron:Neuron successfully compiled 1 sub-graphs, Total fused subgraphs = 1, Percent of model sub-graphs successfully compiled = 100.0%
INFO:Neuron:Compiled these operators (and operator counts) to Neuron:
INFO:Neuron: => aten::Int: 96
INFO:Neuron: => aten::add: 36
INFO:Neuron: => aten::contiguous: 12
INFO:Neuron: => aten::div: 12
INFO:Neuron: => aten::dropout: 37
INFO:Neuron: => aten::gelu: 12
INFO:Neuron: => aten::layer_norm: 25
INFO:Neuron: => aten::linear: 73
INFO:Neuron: => aten::matmul: 24
INFO:Neuron: => aten::permute: 48
INFO:Neuron: => aten::select: 1
INFO:Neuron: => aten::size: 96
INFO:Neuron: => aten::slice: 1
INFO:Neuron: => aten::softmax: 12
INFO:Neuron: => aten::tanh: 1
INFO:Neuron: => aten::transpose: 12
INFO:Neuron: => aten::view: 48
INFO:Neuron:Not compiled operators (and operator counts) to N

(AwsNeuronGraphModule(
   original_name=AwsNeuronGraphModule
   (_NeuronGraph#82): NeuronModuleV2(original_name=NeuronModuleV2)
 ),
 BertConfig {
   "_name_or_path": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
   "architectures": [
     "BertModel"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 384,
   "initializer_range": 0.02,
   "intermediate_size": 1536,
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
   "model_type": "bert",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
   "transformers_version": "4.27.1",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 250037
 },
 {'tracing_kwargs': {'strict': False,
   'dynamic_batch_size': True,
   'compiler_args': ['--fast-math', 'none']},
  'tracing__ba

In [6]:
compiled_model_output = compiled_model(input_ids=tokens['input_ids'],attention_mask=tokens['attention_mask'])
compiled_model_output[0].size(), compiled_model_output[1].size()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

(torch.Size([1, 50, 384]), torch.Size([1, 384]))

In [7]:
model_output[0], compiled_model_output[0]

(tensor([[[ 0.0597,  0.6885, -0.1838,  ...,  0.0782,  0.2163, -0.2030],
          [-0.4493,  0.7935, -0.2839,  ...,  0.4988,  0.5003,  0.4248],
          [-0.4368,  0.7815, -0.3105,  ...,  0.3446,  0.4974,  0.0964],
          ...,
          [ 0.0673,  0.1325, -0.1477,  ...,  0.0170, -0.1464,  0.1434],
          [ 0.0546,  0.1333, -0.0947,  ...,  0.0360, -0.1197,  0.1503],
          [ 0.0159,  0.1558, -0.0797,  ...,  0.0489, -0.0968,  0.1202]]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[[ 0.0600,  0.6861, -0.1808,  ...,  0.0760,  0.2152, -0.2020],
          [-0.4481,  0.7930, -0.2825,  ...,  0.5024,  0.4955,  0.4217],
          [-0.4355,  0.7804, -0.3085,  ...,  0.3451,  0.4957,  0.0955],
          ...,
          [ 0.0680,  0.1299, -0.1483,  ...,  0.0167, -0.1490,  0.1402],
          [ 0.0531,  0.1313, -0.0929,  ...,  0.0373, -0.1207,  0.1483],
          [ 0.0166,  0.1549, -0.0795,  ...,  0.0500, -0.0997,  0.1182]]]))

In [8]:
torch.allclose(model_output[0],compiled_model_output[0], atol=1e-02, rtol=1e-02)

True

In [9]:
compiled_model.compilation_specs

{'tracing_kwargs': {'strict': False,
  'dynamic_batch_size': True,
  'compiler_args': ['--fast-math', 'none']},
 'tracing__batch_size': 1,
 'tracing__max_length': 50,
 'tracing__neuron': True}

### Test `.save_pretrained` & `load_pretrained` + inference

In [10]:
compiled_model.save_pretrained('./compiled_model')

In [11]:
compiled_model_reloaded = CompiledModel.from_pretrained('./compiled_model')

In [12]:
compiled_model_reloaded_output = compiled_model_reloaded(tokens['input_ids'],tokens['attention_mask'])
compiled_model_reloaded_output[0].size(), compiled_model_reloaded_output[1].size()

Model output is a dictionary. Converting


(torch.Size([1, 50, 384]), torch.Size([1, 384]))

In [13]:
compiled_model_reloaded_output[0], compiled_model_reloaded_output[0]

(tensor([[[ 0.0600,  0.6861, -0.1808,  ...,  0.0760,  0.2152, -0.2020],
          [-0.4481,  0.7930, -0.2825,  ...,  0.5024,  0.4955,  0.4217],
          [-0.4355,  0.7804, -0.3085,  ...,  0.3451,  0.4957,  0.0955],
          ...,
          [ 0.0680,  0.1299, -0.1483,  ...,  0.0167, -0.1490,  0.1402],
          [ 0.0531,  0.1313, -0.0929,  ...,  0.0373, -0.1207,  0.1483],
          [ 0.0166,  0.1549, -0.0795,  ...,  0.0500, -0.0997,  0.1182]]]),
 tensor([[[ 0.0600,  0.6861, -0.1808,  ...,  0.0760,  0.2152, -0.2020],
          [-0.4481,  0.7930, -0.2825,  ...,  0.5024,  0.4955,  0.4217],
          [-0.4355,  0.7804, -0.3085,  ...,  0.3451,  0.4957,  0.0955],
          ...,
          [ 0.0680,  0.1299, -0.1483,  ...,  0.0167, -0.1490,  0.1402],
          [ 0.0531,  0.1313, -0.0929,  ...,  0.0373, -0.1207,  0.1483],
          [ 0.0166,  0.1549, -0.0795,  ...,  0.0500, -0.0997,  0.1182]]]))

In [14]:
torch.all(compiled_model_output[0].eq(compiled_model_reloaded_output[0]))

tensor(True)

In [15]:
compiled_model.compilation_specs, compiled_model_reloaded.compilation_specs

({'tracing_kwargs': {'strict': False,
   'dynamic_batch_size': True,
   'compiler_args': ['--fast-math', 'none']},
  'tracing__batch_size': 1,
  'tracing__max_length': 50,
  'tracing__neuron': True},
 {'tracing_kwargs': {'strict': False,
   'dynamic_batch_size': True,
   'compiler_args': ['--fast-math', 'none']},
  'tracing__batch_size': 1,
  'tracing__max_length': 50,
  'tracing__neuron': True})

In [16]:
compiled_model.compilation_specs == compiled_model_reloaded.compilation_specs

True