# Encoding/Decoding

`LLMCompletion` and `LLMEmbedding` each expose a `tokenizer` property that provides the tokenizer corresponding to the underlying model.


In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

import os

from dotenv import load_dotenv
from graphrag_llm.completion import LLMCompletion, create_completion
from graphrag_llm.config import AuthMethod, ModelConfig

# Pull any GRAPHRAG_* settings defined in a local .env file into the environment.
load_dotenv()

api_key = os.getenv("GRAPHRAG_API_KEY")
# The same environment variable supplies both the model name and the deployment name.
model_name = os.getenv("GRAPHRAG_MODEL", "gpt-4o")
# With no API key configured, authenticate via Azure managed identity instead.
auth_method = AuthMethod.ApiKey if api_key else AuthMethod.AzureManagedIdentity

model_config = ModelConfig(
    model_provider="azure",
    model=model_name,
    azure_deployment_name=model_name,
    api_base=os.getenv("GRAPHRAG_API_BASE"),
    api_version=os.getenv("GRAPHRAG_API_VERSION", "2025-04-01-preview"),
    api_key=api_key,
    auth_method=auth_method,
)
llm_completion: LLMCompletion = create_completion(model_config)

# Round-trip a sample string through the tokenizer exposed by the completion client.
sample_text = "Hello, world!"
encoded = llm_completion.tokenizer.encode(sample_text)
print(f"Encoded tokens: {encoded}")
print(f"Number of tokens: {len(encoded)}")

# OR: let the tokenizer count the tokens directly.
token_count = llm_completion.tokenizer.num_tokens(sample_text)
print(f"Number of tokens: {token_count}")

decoded = llm_completion.tokenizer.decode(encoded)
print(f"Decoded text: {decoded}")

Encoded tokens: [9906, 11, 1917, 0]
Number of tokens: 4
Number of tokens: 4
Decoded text: Hello, world!


## Standalone Tokenizer


In [2]:
from graphrag_llm.config import TokenizerConfig, TokenizerType
from graphrag_llm.tokenizer import create_tokenizer

# Build a tokenizer straight from a config, without creating a completion client.
litellm_tokenizer_config = TokenizerConfig(
    type=TokenizerType.LiteLLM,
    model_id="openai/text-embedding-3-small",
)
tokenizer = create_tokenizer(litellm_tokenizer_config)

# Encode a sample string, report the token count, then decode it back to text.
sample_text = "Hello, world!"
encoded = tokenizer.encode(sample_text)
print(f"Encoded tokens: {encoded}")
print(f"Number of tokens: {len(encoded)}")

decoded = tokenizer.decode(encoded)
print(f"Decoded text: {decoded}")

Encoded tokens: [9906, 11, 1917, 0]
Number of tokens: 4
Decoded text: Hello, world!


## Tiktoken

By default, `LLMCompletion` and `LLMEmbedding` use a litellm-based tokenizer, which supports the 100+ models that litellm supports. Alternatively, you may use a tiktoken-based tokenizer by specifying a tokenizer type of `TokenizerType.Tiktoken` and providing an `encoding_name` in the config.


In [3]:
# Select the tiktoken backend by naming the encoding explicitly.
tiktoken_tokenizer_config = TokenizerConfig(
    type=TokenizerType.Tiktoken,
    encoding_name="o200k_base",
)
tokenizer = create_tokenizer(tiktoken_tokenizer_config)

sample_text = "Hello, world!"
encoded = tokenizer.encode(sample_text)
print(f"Encoded tokens: {encoded}")

# Using with LLMCompletion: pass the tokenizer in when creating the completion client.
llm_completion: LLMCompletion = create_completion(model_config, tokenizer=tokenizer)

encoded = llm_completion.tokenizer.encode(sample_text)
print(f"Encoded tokens: {encoded}")

Encoded tokens: [13225, 11, 2375, 0]
Encoded tokens: [13225, 11, 2375, 0]
