# Experiment: ONNX CUDA

In [1]:
import os
import nvidia.cuda_runtime.bin
import nvidia.cudnn.bin
import onnxruntime as ort
from pprint import pprint
from onnx_utils import Model
from unittest import TestCase
# Standalone TestCase instance; it is used only for its assert* helper
# methods (assertIn, assertEqual, ...) in the validation cell, not to run a test suite.
test = TestCase()

Add CUDA 11.8 and cuDNN 8.9.2.26 to `PATH`

In [5]:
# Put the directories of the imported `nvidia.cuda_runtime.bin` and
# `nvidia.cudnn.bin` packages at the front of PATH (CUDA runtime first, cuDNN second).
nvidia_bin_dirs = [
    os.path.dirname(nvidia.cuda_runtime.bin.__file__),
    os.path.dirname(nvidia.cudnn.bin.__file__),
]

# Drop any earlier copies of these two entries so that re-running the cell
# does not keep growing PATH; all other entries keep their relative order.
inherited_dirs = [
    entry for entry in os.environ['PATH'].split(os.pathsep)
    if entry not in nvidia_bin_dirs
]

# os.pathsep is ';' on Windows (same result as before) and ':' elsewhere.
os.environ['PATH'] = os.pathsep.join(nvidia_bin_dirs + inherited_dirs)

# If running CUDA and cuDNN from the virtual environment does not work, perform a system-wide install and update the PATH as follows (adjusting paths as needed):
# - CUDA 11.8: https://developer.nvidia.com/cuda-11-8-0-download-archive
# - cuDNN 8.9.2.26: https://developer.nvidia.com/rdp/cudnn-archive
# os.environ['PATH'] = r'C:\Tools\cudnn-windows-x86_64-8.9.2.26_cuda11-archive\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;' + os.environ['PATH']

Validate the installation

In [7]:
# The installed onnxruntime build must expose the CUDA execution provider
# and report a GPU device.
test.assertIn('CUDAExecutionProvider', ort.get_available_providers())
test.assertEqual(ort.get_device(), 'GPU')

# Constructing a session is not proof that CUDA is in use: a constructor never
# returns None, and onnxruntime can fall back to the CPU provider when the CUDA
# provider fails to initialise (e.g. CUDA/cuDNN libraries not found on PATH).
# Check the providers the session actually registered instead.
session = ort.InferenceSession(
    'cuda/cuda-int4-rtn-block-32/phi3-mini-4k-instruct-cuda-int4-rtn-block-32.onnx',
    providers=['CUDAExecutionProvider'])
test.assertIn('CUDAExecutionProvider', session.get_providers())

# The session was only needed for the check; drop the reference so its
# resources can be released before the model is loaded again later.
del session

Generate text

In [8]:
%%time

# Build the `onnx_utils.Model` wrapper from the CUDA int4 model directory.
# NOTE(review): presumably this loads the ONNX model and its tokenizer from
# that directory — confirm in onnx_utils.
model = Model('cuda/cuda-int4-rtn-block-32')

CPU times: total: 1.89 s
Wall time: 3.81 s


In [9]:
%%time

# Run one prompt through the model, then show the generated text and the
# statistics reported with it, each followed by an empty line.
output = model.generate('Tell a joke')

print(output.text, end='\n\n')
pprint(output.stats)
print()

 Here's a light-hearted joke for you:

Why don't scientists trust atoms?

Because they make up everything!

Remember, humor is subjective, so I hope you enjoy it!

{'average_time_per_token': 0.02680287187513386,
 'generation_time': 1.7148828000063077,
 'input_token_count': 13,
 'input_tokens_per_second': 14.666796336169563,
 'output_token_count': 51,
 'output_tokens_per_second': 61.51782569412104,
 'time_to_first_token': 0.8863558000011835,
 'tokenization_time': 0.0005010000022593886,
 'total_time': 1.715383800008567}

CPU times: total: 1.16 s
Wall time: 1.72 s
