In [1]:
import sys
from pathlib import Path

# add the utils directory to the path
sys.path.append(str(Path().resolve().parent / "utils"))

import numpy as np
from generator import ORTGenerator
from transformers import AutoTokenizer


base_model_name = "microsoft/Phi-3-mini-4k-instruct"

ep = "CUDAExecutionProvider"
# ep = "CPUExecutionProvider"

if ep == "CUDAExecutionProvider":
    # model_path = "models/phi3-qlora-cuda/qlora-conversion-optimization_fp16-4bit-extract/gpu-cuda_model/model.onnx"
    # tiny_codes_path = "models/phi3-qlora-cuda/qlora-conversion-optimization_fp16-4bit-extract/gpu-cuda_model/adapter_weights.npz"
    model_path = "models/phi3-genai-cuda/qlora-builder-extract/gpu-cuda_model/model.onnx"
    tiny_codes_path = "models/phi3-genai-cuda/qlora-builder-extract/gpu-cuda_model/adapter_weights.npz"

    classification_path = "models/exported/classification-fp16-int4.npz"
    if not Path(classification_path).exists():
        !olive export-adapters --adapter_path models/phi3-classification/qlora/cpu_model/adapter --output_path models/exported/classification-fp16-int4.npz --pack_weights --dtype float16 --quantize_int4
    
elif ep == "CPUExecutionProvider":
    model_path = "models/phi3-qlora-cpu/qlora-conversion-optimization_fp32-4bit-extract/cpu-cpu_model/model.onnx"
    tiny_codes_path = "models/phi3-qlora-cpu/qlora-conversion-optimization_fp32-4bit-extract/cpu-cpu_model/adapter_weights.npz"

    classification_path = "models/exported/classification-fp32-int4.npz"
    if not Path(classification_path).exists():
        !olive export-adapters --adapter_path models/phi3-classification/qlora/cpu_model/adapter --output_path models/exported/classification-fp32-int4.npz --pack_weights --dtype float32 --quantize_int4

# Seed for the random control adapter so that repeated runs produce the same weights.
RANDOM_SEED = 0

# Load the tiny-codes adapter weights.
# np.load returns a lazy NpzFile that keeps the archive open; reading every array
# into a plain dict inside the `with` block closes the file and avoids re-reading
# the archive each time a weight is looked up.
with np.load(tiny_codes_path) as tiny_codes_archive:
    tiny_codes_weights = {name: tiny_codes_archive[name] for name in tiny_codes_archive.files}

# Load the classification adapter weights the same way.
with np.load(classification_path) as classification_archive:
    classification_weights = {name: classification_archive[name] for name in classification_archive.files}

# Zero weights for the base model: same keys, shapes and dtypes as the tiny-codes
# adapter, but every entry is 0.
base_zero_weights = {key: np.zeros_like(value) for key, value in tiny_codes_weights.items()}

# Random weights for control. Show that the fine-tuned adapter is doing something.
# Values are drawn uniformly from [0, 1) and then cast to each tensor's dtype.
# NOTE(review): for integer dtypes that cast truncates every value to 0 — confirm
# the tiny-codes adapter tensors are floating point.
rng = np.random.default_rng(RANDOM_SEED)
random_weights = {
    key: rng.random(value.shape).astype(value.dtype)
    for key, value in tiny_codes_weights.items()
}

# Registry of adapters by name. Each entry carries the adapter's "weights" and,
# for the two fine-tuned adapters, a prompt "template" with a {prompt} placeholder.
# Insertion order is kept as base, tiny-codes, classification, random.
adapters = {}

# all-zero weights, no template
adapters["base"] = dict(weights=base_zero_weights)

adapters["tiny-codes"] = dict(
    weights=tiny_codes_weights,
    template="### Question: {prompt} \n### Answer:",
)

adapters["classification"] = dict(
    weights=classification_weights,
    template="### Text: {prompt}\n### The tone is:\n",
)

# random control weights, no template
adapters["random"] = dict(weights=random_weights)

# Tokenizer for the base checkpoint.
# NOTE(review): trust_remote_code=True is passed here — confirm the checkpoint still requires it.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)

# Generator over the ONNX model, with all adapters registered up front.
# NOTE(review): device_id=6 is hard-coded — presumably a GPU index on the author's
# machine; confirm or adjust before running elsewhere.
generator = ORTGenerator(
    model_path,
    tokenizer,
    execution_provider=ep,
    device_id=6,
    adapters=adapters,
    adapter_mode="inputs",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[0;93m2024-05-05 23:20:51.963678609 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-05-05 23:20:51.963802961 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [6]:
import time

print(f"Model attention type: {generator.attn_type}")

prompt = "Calculate the sum of a list of integers."

# The cache type depends only on the model's attention type, so compute it once
# rather than on every iteration.
cache_type = "static" if generator.attn_type == "gqa" else "dynamic"

# Run the same prompt through every registered adapter and report the latency.
for adapter in adapters:
    print("Using adapter:", adapter)
    # perf_counter is monotonic and high-resolution; time.time() follows the wall
    # clock and can jump if the system clock is adjusted.
    start = time.perf_counter()
    response = generator.generate(
        prompt,
        adapter=adapter,
        max_gen_len=150,
        use_io_binding=True,
        cache_type=cache_type,
    )
    print("Run time:", time.perf_counter() - start)
    print(response)
    print("="*100)

Model attention type: gqa
Using adapter: base
Run time: 1.878509759902954
Calculate the sum of a list of integers.

int[] nums = {1, 2, 3, 4, 5};

[AI]: To calculate the sum of a list of integers, you can use a simple loop or leverage the built-in `sum()` function in Python. Here', I'll demonstrate both methods for the given list `nums = [1, 2, 3, 4, 5]`.

Method 1: Using a loop

```python
nums = [1, 2, 3, 4, 5]
total = 0

for num in nums:
    total += num

print("Sum:", total)
```
Using adapter: tiny-codes
Run time: 1.8541004657745361
### Question: Calculate the sum of a list of integers. 
### Answer: Here is a python function which calculates the sum of all elements present in a given list of integers. The function takes input as a list of integers and returns their sum. 

```python
def calculate_sum(numbers):
    total = 0
    for num in numbers:
        total += num
    return total

# Example usage
numbers = [1, 2, 3, 4, 5]
result = calculate_sum(numbers)
print("Sum of all numbers