In [6]:
from openai import OpenAI

class ChatModel:
    """Thin wrapper around the OpenAI SDK client for chat completions."""

    def __init__(self, base_url, key):
        """Build the underlying SDK client.

        Args:
            base_url: Root URL of the API the client should talk to.
            key: API key handed to the client.
        """
        client_options = {"base_url": base_url, "api_key": key}
        self.client = OpenAI(**client_options)

    def chat_completion(self, model, messages):
        """Request a chat completion and return the client's response unchanged.

        Args:
            model: Model identifier forwarded to the endpoint.
            messages: Conversation forwarded as-is to the client.
        """
        completions = self.client.chat.completions
        return completions.create(model=model, messages=messages)

# Ollama's default local address.
BASE_URL = "http://localhost:11434/v1"
# A key must be supplied, but Ollama does not use it.
chatModel = ChatModel(BASE_URL, key="fake-key")

# Multi-turn conversation, written as (role, content) pairs and expanded
# into the message dicts the chat API expects.
messages = [
    {"role": role, "content": content}
    for role, content in (
        ("system", "You are a Jetson-based assistant."),
        ("user", "How can I optimize GPU usage on a Jetson Nano?"),
        ("assistant", "Use TensorRT for inference and disable services you don't need."),
        ("user", "Got it, thanks!"),
    )
]

response = chatModel.chat_completion("llama3.2:latest", messages)
# Show only the text of the first returned choice.
print(response.choices[0].message.content)

If you're looking to get more specific, here are some tips for optimizing GPU usage on a Jetson Nano:

1. **Monitor GPU temps**: High temperatures can limit GPU performance. Use tools like `temp` command or `lspci -vnn` to monitor temperatures.
2. **Adjust VRAM allocation**: You may need to adjust the amount of VRAM allocated to each process using the `jetson-config` tool.
3. **Compile with Optimize flags**: Compile your applications with optimization flags (-Wl,--gc-sections=-O2) and strip unused symbols.
4. **Use less memory-intensive algorithms**: If possible, use algorithms that are more memory-efficient in the first place.
5. **Dust-gate removal**: Remove dust from the heatsinks to ensure good airflow inside the device.
6. **Keep your system up-to-date**: Regularly update your Jetson OS and drivers to take advantage of performance improvements.

Keep in mind, the Jetson Nano is an embedded platform with limited resources, so GPU optimization might not lead to dramatic performance 

In [5]:
import requests

class ChatModel:
    """Minimal chat-completions client that talks to the HTTP API via ``requests``."""

    def __init__(self, base_url, timeout=(5, 600)):
        """Remember where the API lives and how long to wait for it.

        Args:
            base_url: Root URL of the API, e.g. "http://localhost:11434/v1".
                A trailing slash is tolerated when the request URL is built.
            timeout: Forwarded to ``requests.post``. Either a number of
                seconds or a ``(connect, read)`` tuple; ``None`` waits
                indefinitely. The default read timeout is deliberately
                generous because the reply only arrives once generation
                has finished.
        """
        self.base_url = base_url
        self.timeout = timeout

    def chat_completion(self, model, messages):
        """POST a conversation to ``/chat/completions`` and return the decoded JSON.

        Args:
            model: Model identifier placed in the request body.
            messages: Conversation placed in the request body as-is.

        Returns:
            The response body parsed from JSON.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If the configured timeout elapses.
        """
        # Avoid "//chat/completions" when base_url was given with a trailing slash.
        url = f"{self.base_url.rstrip('/')}/chat/completions"
        response = requests.post(
            url,
            json={"model": model, "messages": messages},
            timeout=self.timeout,  # without this a stalled server blocks forever
        )
        # Surface HTTP errors here; otherwise the error body is returned as if
        # it were a completion and callers fail later with a confusing KeyError.
        response.raise_for_status()
        return response.json()

BASE_URL = "http://localhost:11434/v1"
chatModel = ChatModel(BASE_URL)

# Single-turn question. An earlier variant of this cell sent the Jetson
# GPU-optimization prompt to "llama3.2:latest" instead.
messages = [
    dict(role="system", content="You are a helpful AI assistant."),
    dict(role="user", content="What is the capital of Japan?"),
]

response = chatModel.chat_completion("gemma3:27b", messages)
# The response here is plain decoded JSON, so the reply text is reached by key lookup.
print(response["choices"][0]["message"]["content"])


The capital of Japan is **Tokyo**. 

It's not only the capital, but also the most populous metropolis in the world! 😊 




