In [1]:
# ! pip install onnxruntime-genai==0.6.0 -U

In [2]:
# Import the onnxruntime_genai library, which is used for running generative AI models with ONNX Runtime
import onnxruntime_genai as og

# Import the argparse library to handle command-line arguments
import argparse

# Import the time library to work with time-related functions
import time

In [3]:
# Directory of the ONNX model to load. Going by the directory name, this is the
# Phi-3.5-mini-instruct build for CPU and mobile devices: int4 precision,
# AWQ quantization with block size 128, accuracy level 4.
MODEL_PATH = '../../Models/Phi-3.5-mini-instruct-onnx/cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4'

# Create an instance of the Model class from the onnxruntime_genai library,
# loading it from the directory above
model = og.Model(MODEL_PATH)


In [4]:
# Create the Tokenizer for the previously loaded model; it is used later to
# encode the prompt string into input tokens
tokenizer = og.Tokenizer(model)

# Create a tokenizer stream from that tokenizer. It is not used to tokenize input
# (the prompt is encoded with tokenizer.encode); presumably it is meant for decoding
# generated tokens one at a time via tokenizer_stream.decode — confirm against the
# onnxruntime_genai documentation
tokenizer_stream = tokenizer.create_stream()

In [5]:
# Search configuration for generation, later unpacked as keyword arguments into
# the generator parameters:
#   max_length  - caps the generation length at 1024 tokens
#   temperature - 0.6; controls the randomness of the output (lower values are
#                 more deterministic, higher values more random)
search_options = dict(max_length=1024, temperature=0.6)

In [6]:
# Create the GeneratorParams object for the previously loaded model
params = og.GeneratorParams(model)

# Optional, left disabled: request CUDA graph usage with a maximum batch size of 1.
# NOTE(review): the model above is loaded from a cpu_and_mobile directory, so this
# presumably does not apply to this notebook — confirm before enabling.
# params.try_use_cuda_graph_with_max_batch_size(1)

# Apply the entries of search_options (max_length, temperature) as keyword arguments
params.set_search_options(**search_options)

In [7]:
# Chat-formatted prompt: a system message and a user question, each terminated by
# <|end|>, followed by the <|assistant|> tag that marks where the assistant's
# reply begins
prompt = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"

# Encode the prompt string into input tokens using the tokenizer
input_tokens = tokenizer.encode(prompt)




In [8]:
# Create the Generator from the previously loaded model and the configured parameters
generator = og.Generator(model, params)

# Hand the encoded prompt tokens to the generator before the generation loop starts
generator.append_tokens(input_tokens)

In [9]:
# Stream the response: produce one token per iteration until the generator
# reports that it is done
while not generator.is_done():
    generator.generate_next_token()

    # Only one prompt was supplied, so take the first entry of the returned tokens
    new_token = generator.get_next_tokens()[0]

    # Decode through the tokenizer stream rather than tokenizer.decode: decoding
    # each token in isolation can print U+FFFD replacement characters (one is
    # visible at the end of the previously recorded output), whereas the stream
    # is the library's interface for incremental, token-by-token decoding.
    # end='' keeps the text on one line; flush=True shows each piece immediately.
    print(tokenizer_stream.decode(new_token), end='', flush=True)


 Hello! I'm Phi, an AI language model created by Microsoft. I exist purely in the digital realm and don't have personal experiences, but I'm here to help answer your questions and assist with any information or tasks you need. How can I help you today?�