## library imports

In [39]:
import torch
import transformers
import onnx
import onnxruntime.training.onnxblock as onnxblock
from datasets import load_dataset
import json
import random
import re

## generating artifacts

In [40]:
from transformers import MobileBertConfig
config = MobileBertConfig(num_hidden_layers=2)
model = transformers.MobileBertForMaskedLM.from_pretrained('google/mobilebert-uncased', config=config)
# model = transformers.AutoModel.from_pretrained('google/mobilebert-uncased')
model_name = 'mobilebert-uncased'

Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForMaskedLM: ['mobilebert.encoder.layer.16.attention.self.value.weight', 'mobilebert.encoder.layer.2.bottleneck.attention.dense.bias', 'mobilebert.encoder.layer.18.bottleneck.attention.LayerNorm.bias', 'mobilebert.encoder.layer.6.ffn.0.output.dense.bias', 'mobilebert.encoder.layer.23.ffn.0.output.dense.weight', 'mobilebert.encoder.layer.5.bottleneck.input.dense.bias', 'mobilebert.encoder.layer.12.bottleneck.attention.dense.weight', 'mobilebert.encoder.layer.14.ffn.2.intermediate.dense.bias', 'mobilebert.encoder.layer.22.bottleneck.input.dense.bias', 'cls.seq_relationship.weight', 'mobilebert.encoder.layer.10.ffn.2.output.LayerNorm.weight', 'mobilebert.encoder.layer.16.output.bottleneck.dense.bias', 'mobilebert.encoder.layer.17.ffn.0.intermediate.dense.weight', 'mobilebert.encoder.layer.12.ffn.1.output.dense.bias', 'mobilebert.encoder.layer.2.ffn.2.output.LayerNorm.weight', 'mobil

In [41]:
tokenizer = transformers.AutoTokenizer.from_pretrained("google/mobilebert-uncased")
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

In [42]:
class FlatModel(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *local_inputs):
        return self.model(inputs.input_ids, inputs.attention_mask, inputs.token_type_ids, labels=labels)

model = FlatModel(model)

In [43]:
torch.onnx.export(
    model,
    (inputs["input_ids"], 
      inputs["attention_mask"],
      inputs["token_type_ids"],
      labels),
    f"model.onnx",
    input_names=["input_ids", "attention_mask", "token_type_ids", "labels"],
    output_names=["loss", "logits"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "token_type_ids": {0: "batch_size", 1: "sequence_length"},
        "labels": {0: "batch_size", 1: "sequence_length"},
        "logits ": {0: "batch_size", 1: "sequence_length"}
    },
    export_params=True,
    do_constant_folding=False,
    training=torch.onnx.TrainingMode.TRAINING,
)

  torch.tensor(1000),
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


verbose: False, log level: Level.ERROR



In [44]:
from onnxruntime.training import artifacts
import onnx

requires_grad = []
frozen_params = []
for name, param in model.named_parameters():
    if param.requires_grad:
        requires_grad.append(name)
    else:
        frozen_params.append(name)

for name, param in model.named_buffers():
    frozen_params.append(name)

model = onnx.load("model.onnx")


artifacts.generate_artifacts(
    model,
    requires_grad=requires_grad,
    frozen_params=frozen_params,
    optimizer=artifacts.OptimType.AdamW,
)


2023-04-06 20:28:03.888548843 [I:onnxruntime:Default, graph.cc:3546 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/cls/predictions/transform/LayerNorm/Constant_output_0'. It is no longer used by any node.
2023-04-06 20:28:03.888612040 [I:onnxruntime:Default, graph.cc:3546 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/cls/predictions/transform/LayerNorm/Constant_1_output_0'. It is no longer used by any node.
2023-04-06 20:28:03.893525029 [I:onnxruntime:Default, graph.cc:3546 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/mobilebert/embeddings/Transpose_output_0'. It is no longer used by any node.
2023-04-06 20:28:03.893539528 [I:onnxruntime:Default, graph.cc:3546 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/mobilebert/embeddings/Slice_2_output_0'. It is no longer used by any node.
2023-04-06 20:28:03.893544728 [I:onnxruntime:Default, graph.cc:3546 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/m

In [32]:
# create the random input

# expects
# input_ids = torch.LongTensor of shape (batch size, seq len)
# attention_mask = torch.FloatTensor of shape (batch size, seq len)
# token_type_ids = torch.LongTensor of shape (bs, seq len)

num_seq = 25
seq_len = 150
vocab = 20000
input_ids = torch.randint(vocab, (num_seq, seq_len))
attention_mask = torch.ones((num_seq, seq_len), dtype=torch.float)
token_type_ids = torch.ones((num_seq, seq_len), dtype=torch.long)



## generating tokens

In [7]:
def tokenize_function(examples, pad_to_len):
    tokenizer = transformers.AutoTokenizer.from_pretrained("google/mobilebert-uncased")
    # filter out empty strings to remove unnecessary processing
    examples["text"] = [sent for sent in examples["text"] if len(sent) > 0]
    labels = tokenizer(examples["text"], padding="max_length", max_length=pad_to_len, truncation=True, return_tensors="pt")
    masked_examples = [mask(sent, pad_to_len) for sent in examples["text"]]
    inputs = tokenizer(masked_examples, padding="max_length", max_length=pad_to_len, truncation=True, return_tensors="pt")
    labels = torch.where(inputs["input_ids"] == tokenizer.mask_token_id, labels["input_ids"], -100)
    inputs["labels"] = labels
    return inputs

def mask(sent, pad_to_len):
    sent_words = sent.split()
    mask_index = random.randint(0, min(len(sent_words), pad_to_len) - 1)
    # replace random index with mask word, leaving punctuation as is
    # ... this preprocessing means that the token masked might be the <unk> word
    masked_words = [sent_words[ind] if ind != mask_index else re.sub("[a-zA-Z']+", "[MASK]", sent_words[ind]) for ind in range(len(sent_words))]
    return ' '.join(masked_words)

def generate_tokens(corpus):
    """
    Takes in a Dataset with a "text" feature.

    Returns a Dataset with the following features: text, input_ids, token_type_ids, attention_mask, special_tokens_mask
    """
    # pad_to_len must be calculated before the batching happens to create consistent sizes in the resulting tensor
    # pad_to_len = max([len(sent) for sent in corpus["text"]])
    pad_to_len = 80 # shortened for demonstration purposes
    return corpus.map(tokenize_function, batched=True, fn_kwargs={"pad_to_len": pad_to_len})

def generate_json_dict(token_dataset):
    """
    Takes in a Dataset with the following features: text, input_ids, token_type_ids, attention_mask, special_tokens_mask

    Basically changes the 2d Python lists into two fields: a shape & a flattened list, for easier conversion to OnnxValues

    Returns a dictionary with the following keys: input_ids, input_size, token_type_ids, token_type_size, attention_mask, attention_mask_size, special_tokens_mask, special_tokens_size
    """
    json_dict = {}
    keys_to_convert = ["input_ids", "token_type_ids", "attention_mask", "labels"]

    for key_name in keys_to_convert:
        # add field for the shape of the tensor
        json_dict[key_name + "_shape"] = [len(token_dataset[key_name]), len(token_dataset[key_name][0])]
        # flatten list
        json_dict[key_name] = [num for sent in token_dataset[key_name] for num in sent]
    
    return json_dict


In [8]:
dataset_name = "wikitext" 
dataset_config = "wikitext-2-v1"
# corpus = type DatasetDict with three Datasets: test, train, validation
corpus = load_dataset(dataset_name, dataset_config)

100%|██████████| 3/3 [00:00<00:00, 693.92it/s]


In [9]:
test_tokens_dataset = generate_tokens(corpus["test"])
test_tokens = generate_json_dict(test_tokens_dataset)
# corpus["train"]["text"] = corpus["train"]["text"][:5000]
train_tokens_dataset = generate_tokens(corpus["train"])
train_tokens = generate_json_dict(train_tokens_dataset)
validation_tokens_dataset = generate_tokens(corpus["validation"])
validation_tokens = generate_json_dict(validation_tokens_dataset)



In [85]:
# write all the tokens to a json file
file_names = ["test_tokens.json", "train_tokens.json", "validation_tokens.json"]
token_dicts = [test_tokens, train_tokens, validation_tokens]

def write_dicts_to_files(file_names, dicts):
    # assumes file_names and dicts are 2 lists w/ the same lengths
    for i in range(len(file_names)):
        with open(file_names[i], "w") as json_file:
            json.dump(dicts[i], json_file)

write_dicts_to_files(file_names, token_dicts)

In [10]:
import onnxruntime.training.api as orttraining
import os

checkpoint_state = orttraining.CheckpointState(
    os.path.join(os.getcwd(), "checkpoint")
)

model = orttraining.Module(
    os.path.join(os.getcwd(), "training_model.onnx"),
    checkpoint_state,
    os.path.join(os.getcwd(), "eval_model.onnx"),
)

optimizer = orttraining.Optimizer(
    os.path.join(os.getcwd(), "optimizer_model.onnx"), model
)



In [11]:
import numpy as np
# this runs one epoch... w c# version, maybe run 2 epochs & linear learning rate scheduler
# should be declared and used across the num of epochs... so maybe lr scheduler
# can be passed in as an optional argument?
def api_train(model, inputs, optimizer, batch_size):
    """
    Does one epoch of training on CPU

    Args:
        model - ORTModule
        inputs - Dictionary
        optimizer - ORTTraining Optimizer
        steps - int
    """
    # loss reset and accumulated every epoch
    total_loss = 0

    model.lazy_reset_grad()
    model.train()

    size = len(inputs["input_ids"])

    steps = int(size / batch_size)
    start_batch = 0
    end_batch = start_batch + batch_size

    for step in range(steps):
        if start_batch >= size or end_batch >= size:
            break
        
        # unpack and define inputs from inputs
        input_ids = np.array(inputs["input_ids"][start_batch:end_batch])
        print("input ids shape", input_ids.shape)
        attention_mask = np.array(inputs["attention_mask"][start_batch:end_batch])
        token_type_ids = np.array(inputs["token_type_ids"][start_batch:end_batch])
        labels = np.array(inputs["labels"][start_batch:end_batch])
        input_np_list = [input_ids, attention_mask, token_type_ids, labels]

        outputs = model(input_np_list)
        # returns array of NaN of shape [item, 3d list]
        loss = outputs[0]
        total_loss += loss.item() # .item() returns python value from the tensor
        print(outputs)

        # torch.nn.utils.clip_grad_norm_(model.get_contiguous_parameters(), 1.0) # prevent exploding gradients

        optimizer.step() 

        start_batch += batch_size
        end_batch = min(end_batch + batch_size, size)

    avg_loss = total_loss / steps 
    return avg_loss

api_train(model, test_tokens_dataset, optimizer, 10)

input ids shape (10, 80)


RuntimeError: /bert_ort/carolinezhu/ort/onnxruntime/orttraining/orttraining/training_api/module.cc:438 onnxruntime::common::Status onnxruntime::training::api::Module::TrainStep(const std::vector<OrtValue>&, std::vector<OrtValue>&) [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Got invalid dimensions for input: onnx::Reshape_3 for the following indices
 index: 0 Got: 10 Expected: 1
 index: 1 Got: 80 Expected: 9
 Please fix either the inputs or the model.


In [14]:
type(test_tokens_dataset["input_ids"])

list

In [None]:
from onnxruntime import InferenceSession

session = InferenceSession("training_model.onnx", providers=["CPUExecutionProvider"])