# Rate Limiting

Rate limiting is disabled by default. When enabled, requests can be limited by requests per period, by tokens per period, or by both.


In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

import json
import os
import time

from dotenv import load_dotenv
from graphrag_llm.completion import LLMCompletion, create_completion
from graphrag_llm.config import AuthMethod, ModelConfig, RateLimitConfig, RateLimitType

# Load settings from a local .env file (if present) into the process environment.
load_dotenv()

api_key = os.getenv("GRAPHRAG_API_KEY")
model_name = os.getenv("GRAPHRAG_MODEL", "gpt-4o")

# Use the API key when one is configured; otherwise use Azure managed identity.
auth_method = AuthMethod.ApiKey if api_key else AuthMethod.AzureManagedIdentity

# Sliding-window limiter: at most 3 requests per 60-second period, i.e. about
# one request every 20 seconds -- the timing assertion below relies on this.
rate_limit = RateLimitConfig(
    type=RateLimitType.SlidingWindow,
    period_in_seconds=60,
    requests_per_period=3,
)

model_config = ModelConfig(
    model_provider="azure",
    model=model_name,
    # The same GRAPHRAG_MODEL value is used as the Azure deployment name.
    azure_deployment_name=model_name,
    api_base=os.getenv("GRAPHRAG_API_BASE"),
    api_version=os.getenv("GRAPHRAG_API_VERSION", "2025-04-01-preview"),
    api_key=api_key,
    auth_method=auth_method,
    rate_limit=rate_limit,
)

llm_completion: LLMCompletion = create_completion(model_config)

# Send the same prompt twice back to back and time the pair of calls.
question = "What is the capital of France?"
start_time = time.time()
for _attempt in range(2):
    response = llm_completion.completion(messages=question)
end_time = time.time()

total_time = end_time - start_time
assert total_time >= 20, "Rate limiting did not work as expected."

# Report the wall-clock time alongside the metrics recorded for this model.
metrics_store = llm_completion.metrics_store
print(f"Time taken for two requests: {total_time:.2f} seconds")
print(f"Metrics for: {metrics_store.id}")
print(json.dumps(metrics_store.get_metrics(), indent=2))

Time taken for two requests: 20.87 seconds
Metrics for: azure/gpt-4o
{
  "attempted_request_count": 2,
  "successful_response_count": 2,
  "failed_response_count": 0,
  "failure_rate": 0.0,
  "requests_with_retries": 0,
  "retries": 0,
  "retry_rate": 0.0,
  "compute_duration_seconds": 3.534508228302002,
  "compute_duration_per_response_seconds": 1.767254114151001,
  "cache_hit_rate": 0.0,
  "streaming_responses": 0,
  "responses_with_tokens": 2,
  "prompt_tokens": 28,
  "completion_tokens": 16,
  "total_tokens": 44,
  "tokens_per_response": 22.0,
  "responses_with_cost": 2,
  "input_cost": 7.000000000000001e-05,
  "output_cost": 0.00016,
  "total_cost": 0.00023,
  "cost_per_response": 0.000115
}


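Notice that the `compute_duration_seconds` in the metrics only tracks how long the network requests actually take and does not track paused periods that occur due to rate limits. In the output above, the two requests took about 20.87 seconds of wall-clock time, but only about 3.53 seconds are reported as compute duration.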
