In [1]:
from transformers import pipeline
import torch

# Local Inference

In [None]:
# model_id = "openai/gpt-oss-20b" # too large
model_id = "unsloth/gpt-oss-20b-GGUF" # 4-bit quantization

In [None]:
from llama_cpp import Llama


def check_gpu_usage():
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"  Memory used: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
            print(f"  Memory total: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
            print()
    else:
        print("No CUDA devices available")

print("Before loading model:")
check_gpu_usage()


In [None]:
# Download and use GGUF model
llm = Llama.from_pretrained(
    repo_id="unsloth/gpt-oss-20b-GGUF",
    filename="gpt-oss-20b-Q3_K_S.gguf",  # Choose appropriate quantization
    verbose=True,
    n_gpu_layers=-1
)

# Check after loading
print("\nAfter loading model:")
check_gpu_usage()



In [None]:
from dotenv import load_dotenv

load_dotenv()

# HuggingFace How to use OpenAI's GPT OSS

In [2]:
import os
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

client=OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.getenv("HF_TOKEN")
)

response = client.chat.completions.create(
    model= "openai/gpt-oss-120b:cerebras",
    messages=[{"role": "user", "content":"Tell me a fun fact about the Eiffel Tower."}]
)

print(response.choices[0].message.content)

Here’s a quirky tidbit: **the Eiffel Tower actually “grows” in the summer!**  

Because it’s made of iron, the tower expands with the heat—about **6 centimeters (≈2.5 inches)** taller on a blazing July day, and then contracts back to its original height when it cools down. So the same iron lady you see in photos can be a little bit taller depending on the season!


In [3]:
import os
from openai import OpenAI

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
]

response = client.chat.completions.create(
    model="openai/gpt-oss-120b:cerebras",
    messages=[{"role": "user", "content": "What is the weather in Paris in Celsius?"}],
    tools=tools,
    tool_choice="auto",
)

# The response will contain the tool_calls object if the model decides to use the tool
print(response.choices[0].message)

BadRequestError: Error code: 400 - {'message': 'Model generated a tool call with name "get_current_weather<|constrain|>json" that is not in the tools list: [\'get_current_weather\']', 'type': 'invalid_request_error', 'param': 'tools', 'code': 'wrong_api_format'}

In [None]:
import json
import os

from openai import OpenAI


client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.getenv("HF_TOKEN"),
)

# Force the model to output a JSON object
response = client.chat.completions.create(
    model="openai/gpt-oss-120b:cerebras",
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant designed to output JSON.",
        },
        {
            "role": "user",
            "content": "Extract the name, city, and profession from the following sentence: 'Amélie is a chef who lives in Paris.'",
        },
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "person",
            "schema": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "city": {"type": "string"},
                    "profession": {"type": "string"},
                },
                "required": ["name", "city", "profession"],
            },
        },
    },
)

# The output is a valid JSON string that can be easily parsed
output_json_string = response.choices[0].message.content
parsed_output = json.loads(output_json_string)

print(parsed_output)

In [None]:
import os
from openai import OpenAI


client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.getenv("HF_TOKEN"),
)

# Set stream=True to receive a stream of semantic events
stream = client.responses.create(
    model="openai/gpt-oss-120b:fireworks-ai",
    input="Tell me a short story about a robot who discovers music.",
    stream=True,
)

# Iterate over the events in the stream
for event in stream:
    print(event)

In [None]:
from openai import OpenAI
import os

client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.getenv("HF_TOKEN"),
)

tools = [
    {
        "type": "function",
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location", "unit"],
        },
    }
]

response = client.responses.create(
    model="openai/gpt-oss-120b:fireworks-ai", 
    tools=tools,
    input="What is the weather like in Boston today?",
    tool_choice="auto",
)

print(response)

In [None]:
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.getenv("HF_TOKEN"),
)

response = client.responses.create(
    model="openai/gpt-oss-120b:fireworks-ai",
    input="What transport protocols are supported in the 2025-03-26 version of the MCP spec?",
    tools=[
        {
            "type": "mcp",
            "server_label": "deepwiki",
            "server_url": "https://mcp.deepwiki.com/mcp",
            "require_approval": "never",
        },
    ],
)

print(response)