In [1]:
import onnxruntime_genai as og
import argparse
import time

In [7]:
print("Loading model...")
app_started_timestamp = time.time()

model = og.Model(f'example-models\phi2-int4-cpu')
model_loaded_timestamp  = time.time()

print("Model loaded in {:.2f} seconds".format(model_loaded_timestamp - app_started_timestamp))



Loading model...
Model loaded in 41.10 seconds


In [10]:
print("Loading tokenizer...")
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

print("Tokenizer created")


system_prompt = "You are a helpful assistant. Answer in one sentence."
text = "What is Dilithium?"

input_tokens = tokenizer.encode(system_prompt + text)

prompt_length = len(input_tokens)

Loading tokenizer...
Tokenizer created


In [19]:
started_timestamp = time.time()

print("Creating generator ...")
params = og.GeneratorParams(model)
params.set_search_options({"do_sample": False, "max_length": 2028, "min_length": 0, "top_p": 0.9, "top_k": 40, "temperature": 1.0, "repetition_penalty": 1.0})
params.input_ids = input_tokens
generator = og.Generator(model, params)
print("Generator created")

first = True
new_tokens = []

while not generator.is_done():
    generator.compute_logits()
    generator.generate_next_token()
    if first:
        first_token_timestamp = time.time()
        first = False

    new_token = generator.get_next_tokens()[0]
    print(tokenizer_stream.decode(new_token), end="")
    new_tokens.append(new_token)

print()
run_time = time.time() - started_timestamp
print(f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(first_token_timestamp - started_timestamp):.2f}s, New tokens per second: {len(new_tokens)/run_time:.2f} tps")


Creating generator ...
Generator created

A: Dilithium is a fictional substance in the Star Trek universe that is used as a propellant and a power source for spaceships.

Prompt tokens: 17, New tokens: 32, Time to first: 1.32s, New tokens per second: 4.29 tps


In [None]:
# Compare with llama.cpp.
