In [None]:
# Notebook parameters.
# Question that is inserted into the chat template as the user message.
text = "Who is Isaac Newton?"
# Execution provider name; compared against "QNNExecutionProvider" further
# down to decide whether genai_config.json is rewritten before loading.
ExecutionProvider = "OpenVINOExecutionProvider"
# Directory handed to og.Model (and searched for genai_config.json).
model_folder = "./model"

In [None]:
import onnxruntime_genai as og
import json
import time
from pathlib import Path

def get_session_options(obj):
    """Yield every value stored under a ``"session_options"`` key in *obj*.

    Nested dicts and lists are walked depth-first in iteration order. A
    matched value is yielded as-is and is not searched any further, so a
    ``"session_options"`` key nested inside another one is not reported
    separately.

    Args:
        obj: JSON-like data (dicts, lists and scalars). Anything that is
            neither a dict nor a list yields nothing.

    Yields:
        The value of each ``"session_options"`` key. These are the objects
        held by *obj* itself, not copies, so mutating them mutates *obj*.
    """
    # isinstance (rather than an exact type check) also descends into dict
    # and list subclasses such as OrderedDict.
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key == "session_options":
                yield value
            else:
                yield from get_session_options(value)
    elif isinstance(obj, list):
        for item in obj:
            yield from get_session_options(item)


def remove_provider_options(model_path):
    """Blank out the provider options in a model's ``genai_config.json``.

    For every ``session_options`` entry that contains ``provider_options``,
    each element of that list keeps its keys but every value is replaced by
    an empty dict. The file is then rewritten in place with 4-space
    indentation.

    Args:
        model_path: Directory that contains ``genai_config.json``.

    Raises:
        OSError: If the config file cannot be read or written.
        json.JSONDecodeError: If the config file is not valid JSON.
    """
    genai_config_path = Path(model_path) / "genai_config.json"
    data = json.loads(genai_config_path.read_text(encoding="utf-8"))
    for session_option in get_session_options(data):
        if "provider_options" in session_option:
            session_option["provider_options"] = [
                {provider: {} for provider in opts}
                for opts in session_option["provider_options"]
            ]

    # Serialize before touching the file so a serialization error cannot
    # leave a truncated config behind; write_text also closes the handle.
    serialized = json.dumps(data, indent=4)
    genai_config_path.write_text(serialized, encoding="utf-8")

# For the QNN execution provider only, blank out the provider options stored
# in genai_config.json before the model is loaded.
if ExecutionProvider == "QNNExecutionProvider":
    remove_provider_options(model_folder)

# Load the base model and tokenizer
model = og.Model(model_folder)
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

# Set the max length to something sensible by default,
# since otherwise it will be set to the entire context length
search_options = {"max_length": 200}

chat_template = "<|user|>\n{input} <|end|>\n<|assistant|>"

# Generate prompt (prompt template + input)
prompt = chat_template.format(input=text)

# Encode the prompt using the tokenizer
input_tokens = tokenizer.encode(prompt)

# Create params and generator
params = og.GeneratorParams(model)
params.set_search_options(**search_options)
generator = og.Generator(model, params)

# Append input tokens to the generator
generator.append_tokens(input_tokens)

print("")
print("Output: ", end="", flush=True)

# Duration, in seconds, of each generate_next_token() call.
token_times = []

# Stream the output
while not generator.is_done():
    # perf_counter is monotonic and high-resolution, so an interval can
    # neither be skewed by system clock adjustments nor round down to zero
    # the way differences of time.time() can.
    start_time = time.perf_counter()
    generator.generate_next_token()
    token_times.append(time.perf_counter() - start_time)

    # Only the generation call is timed; decoding and printing are excluded.
    # Index 0 takes the first returned token (presumably the only sequence
    # in this run -- confirm if batching is ever added).
    new_token = generator.get_next_tokens()[0]
    print(tokenizer_stream.decode(new_token), end="", flush=True)

print()

# Calculate and display timing statistics
if token_times:
    total_tokens = len(token_times)
    total_time = sum(token_times)
    avg_time = total_time / total_tokens

    print(f"Total tokens generated: {total_tokens}")
    print(f"Average time per token: {avg_time:.4f} seconds")
    print(f"Tokens per second: {total_tokens / total_time:.2f}")

# Drop the generator reference explicitly once generation is finished.
del generator
