## library imports

In [1]:
import json
import os

import torch
import transformers
import onnx
import onnxruntime.training.onnxblock as onnxblock
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## generating artifacts

In [9]:
from transformers import MobileBertConfig, MobileBertModel

# Base name used for the files written under training_artifacts/.
model_name = 'mobilebert-uncased'

# Build a MobileBERT from a config with only 4 hidden layers rather than
# loading the pretrained checkpoint, i.e. instead of:
#   model = transformers.AutoModel.from_pretrained('google/mobilebert-uncased')
config = MobileBertConfig(num_hidden_layers=4)
model = MobileBertModel(config)

In [11]:
# Build a random dummy batch to feed the model during ONNX export.
#
# The model expects:
#   input_ids      -- torch.LongTensor of shape (batch size, seq len)
#   attention_mask -- torch.FloatTensor of shape (batch size, seq len)
#   token_type_ids -- torch.LongTensor of shape (batch size, seq len)

num_seq, seq_len, vocab = 2, 150, 20000

# Token ids are drawn uniformly from [0, vocab).
input_ids = torch.randint(0, vocab, (num_seq, seq_len), requires_grad=False)
# Every position is attended to, and every token is assigned segment id 1.
attention_mask = torch.ones(num_seq, seq_len, dtype=torch.float, requires_grad=False)
token_type_ids = torch.ones(num_seq, seq_len, dtype=torch.long, requires_grad=False)


In [7]:
# The exporter does not create missing directories, so make sure the output
# folder exists before writing into it.
os.makedirs("training_artifacts", exist_ok=True)

# Export the model in training mode, with its parameters stored in the file
# and constant folding disabled.
torch.onnx.export(
    model,
    (input_ids, attention_mask, token_type_ids),
    f"training_artifacts/{model_name}.onnx",
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    output_names=["output"],
    # Only axis 0 (the number of sequences) is declared dynamic for each input.
    dynamic_axes={
        "input_ids": {0: "num_seq"},
        "attention_mask": {0: "num_seq"},
        "token_type_ids": {0: "num_seq"},
    },
    export_params=True,
    do_constant_folding=False,
    training=torch.onnx.TrainingMode.TRAINING,
)



In [4]:
# Experiment: export the model with a large batch (1000 sequences) and the
# exporter's default settings (no dynamic axes, no explicit training mode).
# NOTE: this rebinds num_seq, input_ids, attention_mask and token_type_ids.
num_seq = 1000
seq_len = 150
vocab = 20000
input_ids = torch.randint(vocab, (num_seq, seq_len), requires_grad=False)
attention_mask = torch.ones((num_seq, seq_len), dtype=torch.float, requires_grad=False)
token_type_ids = torch.ones((num_seq, seq_len), dtype=torch.long, requires_grad=False)

# Written to its own file: exporting to f"training_artifacts/{model_name}.onnx"
# here would overwrite the training-mode export that the artifact-generation
# cell loads when the notebook is run from top to bottom.
torch.onnx.export(model, (input_ids, attention_mask, token_type_ids),
                  f"training_artifacts/{model_name}_fixed_batch.onnx",
                  input_names=["input_ids", "attention_mask", "token_type_ids"],
                  output_names=["output"])

  torch.tensor(1000),
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


In [4]:
# List every parameter together with whether it is trainable.
for param_name, parameter in model.named_parameters():
    print(param_name, parameter.requires_grad)

# List the names of the model's registered buffers.
for buffer_name, _buffer in model.named_buffers():
    print(buffer_name)

embeddings.word_embeddings.weight True
embeddings.position_embeddings.weight True
embeddings.token_type_embeddings.weight True
embeddings.embedding_transformation.weight True
embeddings.embedding_transformation.bias True
embeddings.LayerNorm.bias True
embeddings.LayerNorm.weight True
encoder.layer.0.attention.self.query.weight True
encoder.layer.0.attention.self.query.bias True
encoder.layer.0.attention.self.key.weight True
encoder.layer.0.attention.self.key.bias True
encoder.layer.0.attention.self.value.weight True
encoder.layer.0.attention.self.value.bias True
encoder.layer.0.attention.output.dense.weight True
encoder.layer.0.attention.output.dense.bias True
encoder.layer.0.attention.output.LayerNorm.bias True
encoder.layer.0.attention.output.LayerNorm.weight True
encoder.layer.0.intermediate.dense.weight True
encoder.layer.0.intermediate.dense.bias True
encoder.layer.0.output.dense.weight True
encoder.layer.0.output.dense.bias True
encoder.layer.0.output.LayerNorm.bias True
encoder.

In [8]:
class MobileBERTWithLoss(onnxblock.TrainingModel):
    """Training block that applies a cross-entropy loss to a named model output."""

    def __init__(self):
        super().__init__()
        # Loss block applied in build().
        self.loss = onnxblock.loss.CrossEntropyLoss()

    def build(self, loss_node_input_name):
        """
        Applies the cross-entropy loss block to `loss_node_input_name`.

        Returns whatever the loss block returns (presumably the name of the
        loss output -- confirm against the onnxblock documentation).
        """
        print('inside mobilebertwithloss build')
        loss_result = self.loss(loss_node_input_name)
        return loss_result


# Load the ONNX model that was exported earlier; the loss is attached to it below.
onnx_model = onnx.load(f"training_artifacts/{model_name}.onnx")
print('done loading onnx model')
# Filled in by the two `with` blocks further down.
eval_model = None
optimizer_model = None

# Mirror each PyTorch parameter's requires_grad flag onto the training block.
training_block = MobileBERTWithLoss()
for name, param in model.named_parameters():
    if param.requires_grad:
        training_block.requires_grad(name)
    else:
        training_block.requires_grad(name, False)

# Explicitly mark 'embeddings.position_ids' as not requiring gradients.
# NOTE(review): presumably this is a buffer rather than a parameter, so the loop
# above does not cover it -- confirm against model.named_buffers().
training_block.requires_grad('embeddings.position_ids', False)

print('done creating trainingblock')

# Name given to the model output at export time (output_names=["output"]).
inference_model_output_name = "output"
with onnxblock.onnx_model(onnx_model) as model_accessor:
    print('within "with" block')
    # Calling the block runs MobileBERTWithLoss.build on the named output.
    loss_output_name = training_block(inference_model_output_name)
    print('training block initialized')
    eval_model = model_accessor.eval_model
    print('eval model created')

print('inference model done')

# Build the AdamW optimizer model from the training block's parameters.
# optimizer_outputs is not used again in the visible cells.
optimizer_block = onnxblock.optim.AdamW()
with onnxblock.onnx_model() as model_accessor:
    optimizer_outputs = optimizer_block(training_block.parameters())
    optimizer_model = model_accessor.model
print('optimizer model done')

done loading onnx model
done creating trainingblock
within "with" block
inside mobilebertwithloss build


2023-03-29 18:46:29.569226336 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer '/embeddings/Transpose_output_0'. It is no longer used by any node.
2023-03-29 18:46:29.569262834 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer '/embeddings/Slice_2_output_0'. It is no longer used by any node.
2023-03-29 18:46:29.569268734 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer '/embeddings/Reshape_output_0'. It is no longer used by any node.
2023-03-29 18:46:29.569284234 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer '/embeddings/Gather_1_output_0'. It is no longer used by any node.
2023-03-29 18:46:29.569288533 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer '/embeddings/Mul_output_0'. It is no longer used by any node.
2023-03-29 18:46:29.569303433 [I:onnxruntime:D

KeyboardInterrupt: 

leanUnusedInitializersAndNodeArgs] Removing initializer '/embeddings/Constant_25_output_0'. It is no longer used by any node.
2023-03-29 18:46:29.569719615 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_1243'. It is no longer used by any node.
2023-03-29 18:46:29.569723715 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer '/embeddings/Constant_11_output_0'. It is no longer used by any node.
2023-03-29 18:46:29.569727915 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_2603'. It is no longer used by any node.
2023-03-29 18:46:29.569731714 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_1241'. It is no longer used by any node.
2023-03-29 18:46:29.569735514 [I:onnxruntime:Default, graph.cc:3493 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx

In [28]:

# Save the training block's parameters as a checkpoint.
onnxblock.save_checkpoint(training_block.parameters(), f"training_artifacts/{model_name}.ckpt")

# Save the three ONNX models, each to its own file.
models_and_paths = [
    (onnx_model, f"training_artifacts/{model_name}_training.onnx"),
    (eval_model, f"training_artifacts/{model_name}_eval.onnx"),
    (optimizer_model, f"training_artifacts/{model_name}_optimizer.onnx"),
]
for onnx_artifact, artifact_path in models_and_paths:
    onnx.save(onnx_artifact, artifact_path)

## generating tokens

In [2]:
def tokenize_function(examples, pad_to_len):
    """
    Tokenizes the non-empty strings of a batch with the google/mobilebert-uncased tokenizer.

    examples: batch with a "text" entry holding a list of strings; the entry is
        replaced in place by the list without its empty strings
    pad_to_len: length every sequence is padded or truncated to

    Returns the tokenizer output for the remaining strings, including the special tokens mask.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained("google/mobilebert-uncased")

    # filter out empty strings; the batch itself is updated
    examples["text"] = [line for line in examples["text"] if len(line) > 0]
    sentences = examples["text"]

    return tokenizer(
        sentences,
        return_special_tokens_mask=True,
        padding="max_length",
        max_length=pad_to_len,
        truncation=True,
    )

def generate_tokens(corpus, pad_to_len=150):
    """
    Takes in a Dataset with a "text" feature and tokenizes it in batches.

    corpus: Dataset with a "text" feature
    pad_to_len: length every tokenized sequence is padded or truncated to.
        It has to be fixed before the batching happens so that every batch produces
        consistently sized rows in the resulting tensor. The default of 150 is
        shortened for demonstration purposes; a corpus-wide value could be derived
        up front, e.g. max([len(sent) for sent in corpus["text"]]).

    Returns a Dataset with the following features: text, input_ids, token_type_ids, attention_mask, special_tokens_mask
    """
    return corpus.map(tokenize_function, batched=True, fn_kwargs={"pad_to_len": pad_to_len})

def generate_json_dict(token_dataset):
    """
    Takes in a Dataset with the following features: text, input_ids, token_type_ids, attention_mask, special_tokens_mask

    Basically changes the 2d Python lists into two fields: a shape & a flattened list, for easier conversion to OnnxValues

    Returns a dictionary with the following keys: input_ids, input_ids_shape, token_type_ids, token_type_ids_shape,
    attention_mask, attention_mask_shape, special_tokens_mask, special_tokens_mask_shape

    Each "<name>_shape" entry is [number of rows, length of the first row]; an empty column yields [0, 0].
    """
    json_dict = {}
    keys_to_convert = ["input_ids", "token_type_ids", "attention_mask", "special_tokens_mask"]

    for key_name in keys_to_convert:
        # look the column up once instead of once per use
        rows = token_dataset[key_name]
        num_rows = len(rows)
        # an empty column has no first row to measure
        row_len = len(rows[0]) if num_rows > 0 else 0

        # add field for the shape of the tensor
        json_dict[key_name + "_shape"] = [num_rows, row_len]
        # flatten list
        json_dict[key_name] = [num for sent in rows for num in sent]

    return json_dict


In [3]:
# Dataset and configuration to load.
dataset_name = "wikitext"
dataset_config = "wikitext-2-v1"

# corpus = type DatasetDict with three Datasets: test, train, validation
corpus = load_dataset(path=dataset_name, name=dataset_config)

Found cached dataset wikitext (/home/carolinezhu/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
100%|██████████| 3/3 [00:00<00:00, 733.57it/s]


In [15]:
# Exploration notes on the raw training split, corpus["train"]["text"]:
#   max([len(sent) for sent in corpus["train"]["text"]])  -> 3837
#   len(corpus["train"]["text"])                          -> 36718

# Tokenize the training split.
train_test = generate_tokens(corpus["train"])

                                                                  

In [16]:
# Size checks on the tokenized training split.
train_input_ids = train_test["input_ids"]

# number of rows (the recorded run printed 23767)
print(len(train_input_ids))
# length of the first row (the recorded run printed 150)
print(len(train_input_ids[0]))

# longest row across the split (150 in the recorded run)
max(len(sent) for sent in train_input_ids)

23767
150


150

In [4]:
# Tokenize each split and convert it straight into its JSON-ready dictionary.
test_tokens = generate_json_dict(generate_tokens(corpus["test"]))
train_tokens = generate_json_dict(generate_tokens(corpus["train"]))
validation_tokens = generate_json_dict(generate_tokens(corpus["validation"]))

Loading cached processed dataset at /home/carolinezhu/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-b83d710dd1d17dd2.arrow
Loading cached processed dataset at /home/carolinezhu/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-f91b7e5c1178149a.arrow
Loading cached processed dataset at /home/carolinezhu/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-aa24fc21c9608d06.arrow


In [None]:
# Token dictionaries to write out, and the JSON file each one goes to.
# The two lists are paired by position.
token_dicts = [test_tokens, train_tokens, validation_tokens]
file_names = ["test_tokens.json", "train_tokens.json", "validation_tokens.json"]

def write_dicts_to_files(file_names, dicts):
    """
    Writes each dictionary to its own JSON file.

    file_names: list of output file paths
    dicts: list of JSON-serializable dictionaries; dicts[i] is written to file_names[i]

    Raises ValueError if the two lists have different lengths. The check happens
    before any file is opened, so a mismatch never leaves partial output behind.
    """
    if len(file_names) != len(dicts):
        raise ValueError(
            f"file_names and dicts must have the same length, got {len(file_names)} and {len(dicts)}"
        )

    for file_name, token_dict in zip(file_names, dicts):
        with open(file_name, "w") as json_file:
            json.dump(token_dict, json_file)

# Write each token dictionary to its JSON file.
write_dicts_to_files(file_names=file_names, dicts=token_dicts)