From 0a5445ce1bae29dfca1d3b07433a5abbeb898805 Mon Sep 17 00:00:00 2001
From: Chen Fu <1316708+chenfucn@users.noreply.github.com>
Date: Fri, 19 Aug 2022 09:12:54 -0700
Subject: [PATCH 1/3] add gpt2 qdq example

---
 .../language_model/gpt2/data_utils.py        | 110 ++++++++++++++++++
 .../language_model/gpt2/generate_inputs.py   |  82 +++++++++++++
 .../language_model/gpt2/gpt2_input_reader.py |  34 ++++++
 quantization/language_model/gpt2/run_qdq.py  |  52 +++++++++
 .../language_model/gpt2/run_qdq_debug.py     |  86 ++++++++++++++
 5 files changed, 364 insertions(+)
 create mode 100644 quantization/language_model/gpt2/data_utils.py
 create mode 100644 quantization/language_model/gpt2/generate_inputs.py
 create mode 100644 quantization/language_model/gpt2/gpt2_input_reader.py
 create mode 100644 quantization/language_model/gpt2/run_qdq.py
 create mode 100644 quantization/language_model/gpt2/run_qdq_debug.py

diff --git a/quantization/language_model/gpt2/data_utils.py b/quantization/language_model/gpt2/data_utils.py
new file mode 100644
index 000000000..bcee384a8
--- /dev/null
+++ b/quantization/language_model/gpt2/data_utils.py
@@ -0,0 +1,110 @@
+import random
+import torch
+from transformers import AutoTokenizer
+from typing import Sequence, Tuple
+
+EXAMPLE_Text = ["best hotel in bay area", "here is an example of gpt2 model"]
+
+
+def get_tokenizer(model_name_or_path: str, cache_dir: str):
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
+    tokenizer.padding_side = "left"
+    tokenizer.pad_token = tokenizer.eos_token
+    return tokenizer
+
+
+def get_example_inputs(
+    model_name_or_path: str,
+    cache_dir: str,
+    num_attention_heads: int,
+    num_layer: int,
+    hidden_size: int,
+    device: str,
+    prompt_text: Sequence[str] = EXAMPLE_Text,
+):
+    tokenizer = get_tokenizer(model_name_or_path, cache_dir)
+    encodings_dict = tokenizer.batch_encode_plus(prompt_text, padding=True)
+
+    input_ids = torch.tensor(encodings_dict["input_ids"], dtype=torch.int32)
+    attention_mask = torch.tensor(encodings_dict["attention_mask"], dtype=torch.int32)
+    position_ids = attention_mask.long().cumsum(-1) - 1
+    position_ids.masked_fill_(position_ids < 0, 0)
+    position_ids = position_ids.to(torch.int32)
+
+    # Empty Past State for generating first word
+    empty_past = []
+    batch_size = input_ids.size(0)
+    sequence_length = input_ids.size(1)
+    past_shape = [
+        2,
+        batch_size,
+        num_attention_heads,
+        0,
+        hidden_size // num_attention_heads,
+    ]
+    for i in range(num_layer):
+        empty_past.append(torch.empty(past_shape).type(torch.float32).to(device))
+
+    return input_ids, attention_mask, position_ids, empty_past
+
+
+def get_dummy_inputs(
+    batch_size: int,
+    past_sequence_length: int,
+    sequence_length: int,
+    num_attention_heads: int,
+    hidden_size: int,
+    num_layer: int,
+    vocab_size: int,
+    device: torch.device,
+    has_position_ids: bool = True,
+    has_attention_mask: bool = True,
+    input_ids_dtype: torch.dtype = torch.int64,
+    position_ids_dtype: torch.dtype = torch.int64,
+    attention_mask_dtype: torch.dtype = torch.int64,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Create random inputs for GPT2 model.
+    Returns torch tensors of input_ids, position_ids, attention_mask and a list of past state tensors.
+    """
+    past_shape = [
+        2,
+        batch_size,
+        num_attention_heads,
+        past_sequence_length,
+        int(hidden_size / num_attention_heads),
+    ]
+
+    past = [
+        (torch.rand(past_shape, dtype=torch.float32, device=device) * 2.0 - 1.0)
+        for _ in range(num_layer)
+    ]
+    input_ids = torch.randint(
+        low=0,
+        high=vocab_size - 1,
+        size=(batch_size, sequence_length),
+        dtype=input_ids_dtype,
+        device=device,
+    )
+
+    attention_mask = None
+    if has_attention_mask:
+        total_sequence_length = past_sequence_length + sequence_length
+        attention_mask = torch.ones(
+            [batch_size, total_sequence_length],
+            dtype=attention_mask_dtype,
+            device=device,
+        )
+        if total_sequence_length >= 2:
+            padding_position = random.randint(
+                0, total_sequence_length - 1
+            )  # test input with padding.
+            attention_mask[:, padding_position] = 0
+
+    # Deduce position_ids from attention mask
+    position_ids = None
+    if has_position_ids:
+        position_ids = attention_mask.long().cumsum(-1) - 1
+        position_ids.masked_fill_(position_ids < 0, 0)
+        position_ids = position_ids[:, past_sequence_length:].to(position_ids_dtype)
+
+    return (input_ids, attention_mask, position_ids, past)
diff --git a/quantization/language_model/gpt2/generate_inputs.py b/quantization/language_model/gpt2/generate_inputs.py
new file mode 100644
index 000000000..681df0c17
--- /dev/null
+++ b/quantization/language_model/gpt2/generate_inputs.py
@@ -0,0 +1,82 @@
+import argparse
+import logging
+import numpy
+import torch
+from pathlib import Path
+
+import data_utils
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output_dir",
+        default="./test_input",
+        help="Specify the destination folder of randomly generated input data sets.",
+    )
+
+    parser.add_argument(
+        "--num_batches",
+        type=int,
+        choices=range(2, 500),
+        default=10,
+        help="Specify how many batches of input data sets to generate.",
+    )
+    parser.add_argument("--batch_size", type=int, default=2, help="Input batch size")
+    parser.add_argument("--past_sequence_length", type=int, default=4)
+    parser.add_argument("--sequence_length", type=int, default=2)
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    # Process input parameters and setup model input data reader
+    args = get_args()
+
+    # Prepare output folder for storing input data files
+    output_folder = Path(args.output_dir)
+    if not output_folder.exists():
+        output_folder.mkdir()
+    elif not output_folder.is_dir():
+        logging.error(f"File '{str(output_folder)}' exists and is not a folder!")
+        return
+
+    # Generate num_batches sets of input data
+    num_batches = 1 if args.num_batches < 1 else args.num_batches
+    for batch_id in range(num_batches):
+        data_file = output_folder / f"batch_{batch_id}.npz"
+        if data_file.exists():
+            logging.error(
+                f"File '{data_file}' exists! Can't write generated input data!"
+            )
+            return
+
+        input_ids, attention_mask, position_ids, past = data_utils.get_dummy_inputs(
+            batch_size=args.batch_size,
+            past_sequence_length=args.past_sequence_length,
+            sequence_length=args.sequence_length,
+            num_attention_heads=16,
+            hidden_size=1024,
+            num_layer=24,
+            vocab_size=50257,
+            device="cpu",
+            has_position_ids=True,
+            has_attention_mask=True,
+            input_ids_dtype=torch.int64,
+            position_ids_dtype=torch.int64,
+            attention_mask_dtype=torch.int64,
+        )
+        ort_inputs = {
+            "input_ids": numpy.ascontiguousarray(input_ids.cpu().numpy()),
+            "attention_mask": numpy.ascontiguousarray(attention_mask.cpu().numpy()),
+            "position_ids": numpy.ascontiguousarray(position_ids.cpu().numpy()),
+        }
+        for i, past_i in enumerate(past):
+            ort_inputs[f"past_{i}"] = numpy.ascontiguousarray(past_i.cpu().numpy())
+
+        numpy.savez(str(data_file), **ort_inputs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/quantization/language_model/gpt2/gpt2_input_reader.py b/quantization/language_model/gpt2/gpt2_input_reader.py
new file mode 100644
index 000000000..926e419cf
--- /dev/null
+++ b/quantization/language_model/gpt2/gpt2_input_reader.py
@@ -0,0 +1,34 @@
+import numpy
+from onnxruntime.quantization import CalibrationDataReader
+from pathlib import Path
+
+
+class Gpt2InputReader(CalibrationDataReader):
+    def __init__(self, data_folder: str):
+        self.batch_id = 0
+        self.input_folder = Path(data_folder)
+
+        if not self.input_folder.is_dir():
+            raise RuntimeError(
+                f"Can't find input data directory: {str(self.input_folder)}"
+            )
+        data_file = self.input_folder / f"batch_{self.batch_id}.npz"
+        if not data_file.exists():
+            raise RuntimeError(f"No data files found under '{self.input_folder}'")
+
+    def get_next(self):
+        self.input_dict = None
+        data_file = self.input_folder / f"batch_{self.batch_id}.npz"
+        if not data_file.exists():
+            return None
+        self.batch_id += 1
+
+        self.input_dict = {}
+        npy_file = numpy.load(data_file)
+        for name in npy_file.files:
+            self.input_dict[name] = npy_file[name]
+
+        return self.input_dict
+
+    def rewind(self):
+        self.batch_id = 0
diff --git a/quantization/language_model/gpt2/run_qdq.py b/quantization/language_model/gpt2/run_qdq.py
new file mode 100644
index 000000000..38e4ed17c
--- /dev/null
+++ b/quantization/language_model/gpt2/run_qdq.py
@@ -0,0 +1,52 @@
+import argparse
+from onnxruntime.quantization import QuantFormat, QuantType, quantize_static
+
+import gpt2_input_reader
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_model",
+        default="gpt2_medium_fp32.onnx",
+        help="Path to float 32 gpt-2 model.",
+    )
+    parser.add_argument(
+        "--output_model", required=False, help="Path to quantized model"
+    )
+    parser.add_argument(
+        "--calibrate_dataset",
+        default="./test_input",
+        help="Specify the destination folder of input data sets.",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    input_model_path = args.input_model
+    output_model_path = args.output_model
+    if not output_model_path:
+        output_model_path = (
+            input_model_path[: -len(".onnx")]
+            if input_model_path.endswith(".onnx")
+            else input_model_path
+        )
+        output_model_path += "_qdq.onnx"
+
+    calibration_dataset_path = args.calibrate_dataset
+    input_reader = gpt2_input_reader.Gpt2InputReader(calibration_dataset_path)
+    quantize_static(
+        input_model_path,
+        output_model_path,
+        input_reader,
+        quant_format=QuantFormat.QDQ,
+        per_channel=False,
+        weight_type=QuantType.QInt8,
+    )
+    print("Calibrated and quantized model saved.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/quantization/language_model/gpt2/run_qdq_debug.py b/quantization/language_model/gpt2/run_qdq_debug.py
new file mode 100644
index 000000000..4aad527aa
--- /dev/null
+++ b/quantization/language_model/gpt2/run_qdq_debug.py
@@ -0,0 +1,86 @@
+import argparse
+import onnx
+from onnxruntime.quantization.qdq_loss_debug import (
+    collect_activations, compute_activation_error, compute_weight_error, create_activation_matching,
+    create_weight_matching, modify_model_output_intermediate_tensors)
+
+import gpt2_input_reader
+
+
+def _generate_aug_model_path(model_path: str) -> str:
+    aug_model_path = (
+        model_path[: -len(".onnx")] if model_path.endswith(".onnx") else model_path
+    )
+    return aug_model_path + ".save_tensors.onnx"
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--float_model", required=True, help="Path to original 32 bit floating point model"
+    )
+    parser.add_argument("--qdq_model", required=True, help="Path to qdq model")
+    parser.add_argument(
+        "--calibrate_dataset", default="./test_input", help="calibration data set"
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    float_model_path = args.float_model
+    qdq_model_path = args.qdq_model
+    calibration_dataset_path = args.calibrate_dataset
+
+    print("------------------------------------------------\n")
+    print("Comparing weights of float model vs qdq model.....")
+
+    matched_weights = create_weight_matching(float_model_path, qdq_model_path)
+    weights_error = compute_weight_error(matched_weights)
+    for weight_name, err in weights_error.items():
+        print(f"Cross model error of '{weight_name}': {err}\n")
+
+    print("------------------------------------------------\n")
+    print("Augmenting models to save intermediate activations......")
+
+    aug_float_model = modify_model_output_intermediate_tensors(float_model_path)
+    aug_float_model_path = _generate_aug_model_path(float_model_path)
+    onnx.save(
+        aug_float_model,
+        aug_float_model_path,
+        save_as_external_data=False,
+    )
+    del aug_float_model
+
+    aug_qdq_model = modify_model_output_intermediate_tensors(qdq_model_path)
+    aug_qdq_model_path = _generate_aug_model_path(qdq_model_path)
+    onnx.save(
+        aug_qdq_model,
+        aug_qdq_model_path,
+        save_as_external_data=False,
+    )
+    del aug_qdq_model
+
+    print("------------------------------------------------\n")
+    print("Running the augmented floating point model to collect activations......")
+    input_data_reader = gpt2_input_reader.Gpt2InputReader(calibration_dataset_path)
+    float_activations = collect_activations(aug_float_model_path, input_data_reader)
+
+    print("------------------------------------------------\n")
+    print("Running the augmented qdq model to collect activations......")
+    input_data_reader.rewind()
+    qdq_activations = collect_activations(aug_qdq_model_path, input_data_reader)
+
+    print("------------------------------------------------\n")
+    print("Comparing activations of float model vs qdq model......")
+
+    act_matching = create_activation_matching(qdq_activations, float_activations)
+    act_error = compute_activation_error(act_matching)
+    for act_name, err in act_error.items():
+        print(f"Cross model error of '{act_name}': {err['xmodel_err']} \n")
+        print(f"QDQ error of '{act_name}': {err['qdq_err']} \n")
+
+
+if __name__ == "__main__":
+    main()

From 5c74e612925113b5cf1d459c86d7bb7b930538b9 Mon Sep 17 00:00:00 2001
From: Chen Fu <1316708+chenfucn@users.noreply.github.com>
Date: Tue, 6 Sep 2022 09:54:56 -0700
Subject: [PATCH 2/3] preprocess module name change

---
 .../image_classification/cpu/ReadMe.md      |  6 +--
 quantization/language_model/gpt2/ReadMe.md  | 47 +++++++++++++++++++
 quantization/language_model/gpt2/run_qdq.py |  5 +-
 3 files changed, 53 insertions(+), 5 deletions(-)
 create mode 100644 quantization/language_model/gpt2/ReadMe.md

diff --git a/quantization/image_classification/cpu/ReadMe.md b/quantization/image_classification/cpu/ReadMe.md
index fe22b94ba..9a3e41808 100644
--- a/quantization/image_classification/cpu/ReadMe.md
+++ b/quantization/image_classification/cpu/ReadMe.md
@@ -13,7 +13,7 @@ Pre-processing prepares a float32 model for quantization. Run the following comm
 model `mobilenetv2-7.onnx`.
 
 ```console
-python -m onnxruntime.quantization.shape_inference --input mobilenetv2-7.onnx --output mobilenetv2-7-infer.onnx
+python -m onnxruntime.quantization.qdq_preprocess --input mobilenetv2-7.onnx --output mobilenetv2-7-infer.onnx
 ```
 
 The pre-processing consists of the following optional steps
@@ -30,7 +30,7 @@ merged Convolution + BatchNormalization node. It is highly recommended to run
 model optimization in pre-processing instead of in quantization. To learn more about each of
 these steps and finer controls, run:
 ```console
-python -m onnxruntime.quantization.shape_inference --help
+python -m onnxruntime.quantization.qdq_preprocess --help
 ```
 
 ## Quantization
@@ -76,7 +76,7 @@ For instance, you have a model `abc_float32_model.onnx`, and a quantized model
 by default. You can run the following code to produce an optimized float32 model:
 
 ```console
-python -m onnxruntime.quantization.shape_inference --input abc_float32_model.onnx --output abc_optimized.onnx --skip_symbolic_shape True
+python -m onnxruntime.quantization.qdq_preprocess --input abc_float32_model.onnx --output abc_optimized.onnx --skip_symbolic_shape True
 ```
 
 Then run the debugger comparing `abc_optimized.onnx` with `abc_quantized.onnx`.
diff --git a/quantization/language_model/gpt2/ReadMe.md b/quantization/language_model/gpt2/ReadMe.md
new file mode 100644
index 000000000..d08249ace
--- /dev/null
+++ b/quantization/language_model/gpt2/ReadMe.md
@@ -0,0 +1,47 @@
+# GPT-2-medium Quantization Example
+
+This folder contains example code for quantizing the GPT-2-medium model. It is by and large similar to
+[this example](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/quantization/image_classification/cpu).
+
+## Obtaining the 32-bit floating point model
+
+ONNX Runtime provides tools for converting GPT-2 models to ONNX. Run:
+
+```console
+python -m onnxruntime.transformers.models.gpt2.convert_to_onnx -m gpt2-medium --output gpt2_medium_fp32.onnx -o -p fp32
+```
+
+
+## Preparing the floating point model for quantization
+
+Here we pre-process the model, essentially running shape inference and model optimization, both of
+which may improve the performance of quantization.
+
+```console
+python -m onnxruntime.quantization.qdq_preprocess --input gpt2_medium_fp32.onnx --output gpt2_medium_fp32_preprocessed.onnx
+```
+
+## Quantize
+
+We use static quantization here, for which a calibration data set is required. You can run
+`generate_inputs.py` to generate random dummy inputs for GPT-2-medium, as shown below.
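+The invocation below is only an illustration; the argument values shown are simply the
+script's defaults, and the GPT-2-medium head count, layer count, hidden size, and vocabulary
+size are hard-coded inside `generate_inputs.py`:
+
+```console
+python generate_inputs.py --output_dir ./test_input --num_batches 10 --batch_size 2 --past_sequence_length 4 --sequence_length 2
+```
+
+Each batch is saved as a `batch_<n>.npz` file in the output folder; these are the files that
+`gpt2_input_reader.py` later feeds to the calibrator.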
+See the Python source code for finer control options.
+
+With the calibration data set in place, run the following command to invoke the quantization tool. It
+runs the model over the provided data set, computes quantization parameters for each
+weight and activation tensor, and outputs the quantized model:
+
+```console
+python run_qdq.py --input_model gpt2_medium_fp32_preprocessed.onnx --output_model gpt2_medium_quant.onnx --calibrate_dataset ./test_input
+```
+
+## Quantization Debugging
+
+The Python file `run_qdq_debug.py` showcases how to use our quantization debugging API to match up
+corresponding weight/activation tensors between the floating point and quantized models. Run:
+
+```console
+python run_qdq_debug.py --float_model gpt2_medium_fp32_preprocessed.onnx --qdq_model gpt2_medium_quant.onnx --calibrate_dataset ./test_input
+```
+
diff --git a/quantization/language_model/gpt2/run_qdq.py b/quantization/language_model/gpt2/run_qdq.py
index 38e4ed17c..0335cc4b2 100644
--- a/quantization/language_model/gpt2/run_qdq.py
+++ b/quantization/language_model/gpt2/run_qdq.py
@@ -8,11 +8,12 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--input_model",
-        default="gpt2_medium_fp32.onnx",
+        default="gpt2_medium_fp32_preprocessed.onnx",
         help="Path to float 32 gpt-2 model.",
     )
     parser.add_argument(
-        "--output_model", required=False, help="Path to quantized model"
+        "--output_model", required=False, help="Path to quantized model",
+        default="gpt2_medium_fp32_quant.onnx"
     )
     parser.add_argument(
         "--calibrate_dataset",

From 802af773e27c544115058bc01f4e29b517b4ef4f Mon Sep 17 00:00:00 2001
From: Chen Fu <1316708+chenfucn@users.noreply.github.com>
Date: Tue, 6 Sep 2022 10:55:09 -0700
Subject: [PATCH 3/3] rename preprocessor

---
 quantization/image_classification/cpu/ReadMe.md | 6 +++---
 quantization/language_model/gpt2/ReadMe.md      | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/quantization/image_classification/cpu/ReadMe.md b/quantization/image_classification/cpu/ReadMe.md
index 9a3e41808..d4734af6c 100644
--- a/quantization/image_classification/cpu/ReadMe.md
+++ b/quantization/image_classification/cpu/ReadMe.md
@@ -13,7 +13,7 @@ Pre-processing prepares a float32 model for quantization. Run the following comm
 model `mobilenetv2-7.onnx`.
 
 ```console
-python -m onnxruntime.quantization.qdq_preprocess --input mobilenetv2-7.onnx --output mobilenetv2-7-infer.onnx
+python -m onnxruntime.quantization.preprocess --input mobilenetv2-7.onnx --output mobilenetv2-7-infer.onnx
 ```
 
 The pre-processing consists of the following optional steps
@@ -30,7 +30,7 @@ merged Convolution + BatchNormalization node. It is highly recommended to run
 model optimization in pre-processing instead of in quantization. To learn more about each of
 these steps and finer controls, run:
 ```console
-python -m onnxruntime.quantization.qdq_preprocess --help
+python -m onnxruntime.quantization.preprocess --help
 ```
 
 ## Quantization
@@ -76,7 +76,7 @@ For instance, you have a model `abc_float32_model.onnx`, and a quantized model
 by default. You can run the following code to produce an optimized float32 model:
 
 ```console
-python -m onnxruntime.quantization.qdq_preprocess --input abc_float32_model.onnx --output abc_optimized.onnx --skip_symbolic_shape True
+python -m onnxruntime.quantization.preprocess --input abc_float32_model.onnx --output abc_optimized.onnx --skip_symbolic_shape True
 ```
 
 Then run the debugger comparing `abc_optimized.onnx` with `abc_quantized.onnx`.
diff --git a/quantization/language_model/gpt2/ReadMe.md b/quantization/language_model/gpt2/ReadMe.md
index d08249ace..12d207409 100644
--- a/quantization/language_model/gpt2/ReadMe.md
+++ b/quantization/language_model/gpt2/ReadMe.md
@@ -18,7 +18,7 @@ Here we pre-process the model, essentially running shape inference and model optimi
 which may improve the performance of quantization.
 
 ```console
-python -m onnxruntime.quantization.qdq_preprocess --input gpt2_medium_fp32.onnx --output gpt2_medium_fp32_preprocessed.onnx
+python -m onnxruntime.quantization.preprocess --input gpt2_medium_fp32.onnx --output gpt2_medium_fp32_preprocessed.onnx
 ```
 
 ## Quantize