# Convert Finetuned FLAN-T5 Model from PyTorch format to ONNX

Export the finetuned checkpoint for production deployment by saving it in a common standard format: ONNX.

ONNX is compatible with multiple serving runtimes. It acts as an intermediate representation that can be run independently of the toolkit/framework in which the original model was written.

`ONNX` is the acronym for `Open Neural Network Exchange`.

In [None]:
# import libraries
try:
    import os

    import torch
    from dotenv import dotenv_values
    from optimum.onnxruntime import ORTModelForSeq2SeqLM
    from transformers import AutoTokenizer
except ImportError as e:
    # The rest of the notebook depends on these names, so report the missing
    # package and stop here instead of failing later with an unrelated NameError.
    print(f"Exception during library import {e}")
    raise

# load dotenv
# Read the key/value pairs from the "localenv" file into a plain dict.
config_env: dict = dotenv_values("localenv")

# load configuration parameters
# dotenv_values maps a key declared without a value to None, and a `.get`
# default only applies when the key is missing altogether. Falling back with
# `or` also covers None/empty values, which would otherwise produce a broken
# output path (None + "/onnx" or just "/onnx").
CONFIG_FILE: str = config_env.get("PARAMETER_FILE") or "parameters.yaml"
OUTPUT_DIR: str = config_env.get("OUTPUT_DIR") or "flan-finetuned-ita"

## 1. Load & Convert Model via Optimum

In [None]:
# Destination folder for the exported ONNX artifacts: an "onnx" sub-folder
# of the finetuned checkpoint directory. Created if it does not exist yet.
ONNX_DIR: str = "/".join((OUTPUT_DIR, "onnx"))
os.makedirs(ONNX_DIR, exist_ok=True)

In [None]:
# load model from local path via Optimum ONNX Optimizer
# The tokenizer is loaded from the same checkpoint directory; export=True is
# passed so that the checkpoint is converted to ONNX while loading.
try:
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
    model = ORTModelForSeq2SeqLM.from_pretrained(
        OUTPUT_DIR,
        export=True,
    )
except Exception as e:
    # Without `model`/`tokenizer` the following cells cannot work: report the
    # problem and stop here rather than continuing with undefined names.
    print(f"Exception during model export: {e}")
    raise

## 2. Save The Model!

In [None]:
# save onnx to disk
# Model and tokenizer are written to the same folder so it can be loaded
# as a self-contained checkpoint later on.
try:
    model.save_pretrained(ONNX_DIR)
    tokenizer.save_pretrained(ONNX_DIR)
except Exception as e:
    # A failed save may leave an incomplete export on disk. Stop here so the
    # test cells below do not run against it; the in-memory objects are kept
    # (the clean up below is skipped) so the save can be retried.
    print(e)
    raise

# clean up
del model
del tokenizer

## 3. Test the Converted Model

Run inference with the ONNX model to make sure it still works as expected.

In [None]:
# Synthetic test data: use the same sentences that were used to test the PyTorch model.
# Italian examples containing PII (names, street addresses, phone numbers,
# an e-mail address, a date of birth, a fiscal code and a card number).
test_sentences = [
    "Il signor Alessandro Bianchi abita in Via Nazionale 45, Milano.",
    "Per contattare Giulia Rossi chiamare il 339-8765432 o scrivere a giulia.rossi@email.it",
    "Il paziente Marco Esposito, nato il 25/08/1982, codice fiscale SPSMRC82M25H501Z.",
    "Pagamento con carta 5123-4567-8901-2345 intestata a Francesca Lombardi.",
    "Contattare la dottoressa Elena Ricci al numero 02-12345678, ufficio in Corso Italia 88, Roma.",
]

In [None]:
# load the onnx model
# Reload model and tokenizer from the exported folder (not from the objects
# used for the export) so that the test exercises what was written to disk.
try:
    onnx_model = ORTModelForSeq2SeqLM.from_pretrained(ONNX_DIR)
    onnx_tokenizer = AutoTokenizer.from_pretrained(ONNX_DIR)
except Exception as e:
    # The inference pipeline below needs both objects: report the error and
    # stop here instead of failing later with a NameError.
    print(e)
    raise

In [None]:
# Build the inference pipeline on top of the reloaded ONNX model.
from transformers import pipeline

# "text2text-generation" is the name of the task supported by Seq2Seq models such as T5
gen = pipeline(
    task="text2text-generation",
    model=onnx_model,
    tokenizer=onnx_tokenizer,
)

In [None]:
# Run every test sentence through the pipeline and print the raw output,
# generating at most 50 new tokens per sentence.
for sentence in test_sentences:
    print(gen(sentence, max_new_tokens=50))

In [None]:
# Drop the references to the ONNX model, its tokenizer and the pipeline.
del onnx_model, onnx_tokenizer, gen